From 1451e9ef33979c0cef2edaf19f921b549092bb9e Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Wed, 7 Jan 2026 18:18:39 +0100 Subject: [PATCH 001/240] prepare the new importing step for cross linking data without actually addding any functionality to it --- backend/protzilla/all_steps.py | 1 + .../importing/cross_linking_import.py | 33 +++++++++++++++++++ backend/protzilla/methods/importing.py | 33 +++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 backend/protzilla/importing/cross_linking_import.py diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 5f9872623..1b4dad4d4 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -14,6 +14,7 @@ importing.EvidenceImport, importing.ExampleDatasetImport, importing.FastaImport, + importing.CrossLinkingImport, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, data_preprocessing.FilterByProteinsCount, diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py new file mode 100644 index 000000000..1dab20166 --- /dev/null +++ b/backend/protzilla/importing/cross_linking_import.py @@ -0,0 +1,33 @@ +""" +This module contains the code to parse a file containing cross linking data. +""" + +import logging +from pathlib import Path +import pandas as pd +import traceback + +from backend.protzilla.utilities import format_trace + +def cross_linking_import( + file_path: Path, +) -> dict: + try: + df = pd.read_csv( + file_path, + sep="\t", + low_memory=False, + na_values=["", 0], + keep_default_na=True, + ) + except Exception as e: + msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid cross linking file." 
+ return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) \ No newline at end of file diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 3cc2fa973..b04b1895c 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -15,6 +15,7 @@ from backend.protzilla.steps import Step, StepManager from protzilla.importing.example_dataset_import import example_dataset_import from protzilla.importing.fasta_import import fasta_import +from protzilla.importing.cross_linking_import import cross_linking_import from protzilla.importing.import_utils import ( AggregationMethods, FeatureOrientationType, @@ -399,3 +400,35 @@ def create_form(self): ) calc_method = staticmethod(example_dataset_import) + +class CrossLinkingImport(ImportingStep): + display_name = "Cross Linking Data Import" + operation = "Cross Linking Data Import" + method_description = "Import a file containing cross linking data" + + output_keys = ["crossLinking_df"] + + def create_form(self): + return Form( + label="Cross Linking Data Import", + input_fields=[ + FileInput( + name="file_path", + label="Cross Linking Data file (.xlsx oder .csv)", + value=None, + ), + DropdownField( + name="", + label="", + #value=IntensityType.IBAQ.value, + #options=IntensityType, + ), + CheckboxField( + name="", + label="", + value=False, + ), + ], + ) + + calc_method = staticmethod(cross_linking_import) From 992bcb8ac4fd70829bdddc5bff9ff33911dfc1fd Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 7 Jan 2026 18:53:50 +0100 Subject: [PATCH 002/240] feat: add alphafold prediction load step --- backend/protzilla/all_steps.py | 1 + .../alphafold_protein_structure_load.py | 41 +++++++++++++++++++ backend/protzilla/methods/data_analysis.py | 31 ++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 backend/protzilla/data_analysis/alphafold_protein_structure_load.py 
diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 5f9872623..710820622 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -65,6 +65,7 @@ data_analysis.PTMOverviewVisualization, data_analysis.PTMBarVisualization, data_analysis.PTMDetailsVisualization, + data_analysis.AlphaFoldPredictionLoad, data_preprocessing.ImputationByMinPerSample, data_integration.EnrichmentAnalysisGOAnalysisWithString, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr, diff --git a/backend/protzilla/data_analysis/alphafold_protein_structure_load.py b/backend/protzilla/data_analysis/alphafold_protein_structure_load.py new file mode 100644 index 000000000..16f6c5317 --- /dev/null +++ b/backend/protzilla/data_analysis/alphafold_protein_structure_load.py @@ -0,0 +1,41 @@ +import pandas as pd +import requests + +def fetch_af_protein_structure(uniprot: str) -> dict: + """ + Fetch AlphaFold protein structure prediction data from the AlphaFold Database. 
+ + :param uniprot: UniProt accession or protein ID + :type uniprot: str + :return: Dictionary containing af_structure_df with structure metadata + :rtype: dict + :raises ValueError: If no AlphaFold predictions are found for the given protein + """ + url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot}" + records = requests.get(url, timeout=30).json() + if not records: + raise ValueError(f"No AlphaFold DB predictions for {uniprot}") + r = records[0] + + data = { + "entryId": r.get("entryId"), + "uniprotAccession": r.get("uniprotAccession"), + "uniprotId": r.get("uniprotId"), + "modelCreatedDate": r.get("modelCreatedDate"), + "latestVersion": r.get("latestVersion"), + "uniprotStart": r.get("uniprotStart"), + "uniprotEnd": r.get("uniprotEnd"), + "sequenceLength": len(r.get("uniprotSequence", "")) + if isinstance(r.get("uniprotSequence"), str) + else None, + } + + """ Other things we could get: + "pdbUrl": r.get("pdbUrl"), + "cifUrl": r.get("cifUrl"), + "paeDocUrl": r.get("paeDocUrl"), + "plddtDocUrl": r.get("plddtDocUrl"), """ + + af_structure_df = pd.DataFrame([data]) + + return {"af_structure_df": af_structure_df} diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 75b9e1bd6..e396e1a42 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -7,6 +7,9 @@ hierarchical_agglomerative_clustering, k_means, ) +from backend.protzilla.data_analysis.alphafold_protein_structure_load import ( + fetch_af_protein_structure, +) from backend.protzilla.data_analysis.differential_expression_anova import anova from backend.protzilla.data_analysis.differential_expression_kruskal_wallis import ( kruskal_wallis_test_on_ptm_data, @@ -2453,3 +2456,31 @@ def create_form(self): label="PTM Details Visualization", input_fields=_PTMVisualizationWithGroups.get_form_fields(), ) + + +class AlphaFoldPredictionLoad(DataAnalysisStep): + display_name = "AlphaFold DB Prediction Load" + 
operation = "Protein Structure Analysis" + method_description = "Loads the predicted structure of the protein with the given protein ID out of the AlphaFold DB." + + output_keys = [ + "af_structure_df", + ] + + plot_method = None + + def create_form(self): + return Form( + label="AlphaFold DB Prediction Load", + input_fields=[ + TextField( + name="uniprot", + label="Protein ID", + ), + ], + ) + + calc_method = staticmethod(fetch_af_protein_structure) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + return inputs From c35ec57542c64d171cfd797c04fe6b91952f810c Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 8 Jan 2026 17:52:17 +0100 Subject: [PATCH 003/240] feat: add creation of cross-linking dataframe from csm and proteomediscoverer xlinkx format --- backend/main/views.py | 8 +- backend/protzilla/all_steps.py | 2 +- .../importing/cross_linking_import.py | 153 ++++++++++++++++-- backend/protzilla/methods/importing.py | 9 +- 4 files changed, 155 insertions(+), 17 deletions(-) diff --git a/backend/main/views.py b/backend/main/views.py index 50df32aa1..eff8e49f4 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -48,7 +48,13 @@ database_metadata_path = EXTERNAL_DATA_PATH / "internal" / "metadata" / "uniprot.json" -dataframes = ["protein_df", "metadata_df", "peptide_df", "modification_df"] +dataframes = [ + "protein_df", + "metadata_df", + "peptide_df", + "modification_df", + "crosslinking_df", +] @ensure_csrf_cookie diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 1b4dad4d4..ca55d5945 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -14,7 +14,7 @@ importing.EvidenceImport, importing.ExampleDatasetImport, importing.FastaImport, - importing.CrossLinkingImport, + importing.CrossLinkingImport, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, data_preprocessing.FilterByProteinsCount, diff --git 
a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index 1dab20166..9261f5dab 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -6,20 +6,151 @@ from pathlib import Path import pandas as pd import traceback +import requests from backend.protzilla.utilities import format_trace -def cross_linking_import( - file_path: Path, -) -> dict: + +def get_gene_name_from_protein_id(protein_id): + return "placeholder" + url = f"https://rest.uniprot.org/uniprotkb/{protein_id}" + params = {"fields": "gene_names", "format": "json"} + + response = requests.get(url, params=params) + response.raise_for_status() # Fehler werfen, wenn etwas schief geht + + data = response.json() + + gene_name = data["genes"][0]["geneName"]["value"] + + return gene_name + + +def get_protein_ids_from_gene_name(gene_name): + return "placeholder" + url = "https://rest.uniprot.org/uniprotkb/search" + params = { + "query": f"gene:{gene_name} AND organism_id:9606 AND reviewed:true", + "format": "list", + "includeIsoform": "true", + } + response = requests.get(url, params=params) + response.raise_for_status() # Fehler werfen, wenn etwas schief geht + + all_ids = response.text.strip().split("\n") + protein_ids = [i for i in all_ids if "-" not in i] + list_of_protein_isoforms = [i for i in all_ids if "-" in i] + + return protein_ids, list_of_protein_isoforms + + +def remove_brackets_from_peptide(peptide: str) -> str: + return peptide.replace("[", "").replace("]", "") + + +def get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format( + peptide: str, +) -> int: + return peptide.find("[") + + +rename_columns_csm_format = { + "Crosslink Type": "Is_intra_crosslink", + "PepSeq1": "Peptide1", + "PepSeq2": "Peptide2", + "PepPos1": "Peptide_position1", + "PepPos2": "Peptide_position2", + "LinkPos1": "CL_position1", + "LinkPos2": "CL_position2", + "PEP": "Q_value", +} + 
+rename_columns_proteomediscoverer_xlinkx_format = { + "Accession A": "Protein_id1", + "Accession B": "Protein_id2", + "Crosslink Type": "Is_intra_crosslink", + "Sequence A": "Peptide1", + "Sequence B": "Peptide2", + "Position A": "Peptide_position1", + "Position B": "Peptide_position2", + "Q-value": "Q_value", +} + +columns_in_cross_linking_df = [ + "Protein1", + "Protein2", + "Protein_id1", + "Protein_id2", + "Is_intra_crosslink", + "Crosslinker", + "Peptide1", + "Peptide2", + "Peptide_position1", # ToDo: check, dass wirklich immer 0-basiert + "Peptide_position2", + "CL_position1", # ToDo: check, dass wirklich immer 0-basiert + "CL_position2", + "Q_value", +] + + +def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: + df = pd.read_excel(file_path).rename( + columns=rename_columns_proteomediscoverer_xlinkx_format + ) + + df["CL_position1"] = df["Peptide1"].apply( + get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format + ) + df["CL_position2"] = df["Peptide2"].apply( + get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format + ) + + df["Peptide1"] = df["Peptide1"].apply(remove_brackets_from_peptide).astype("string") + df["Peptide2"] = df["Peptide2"].apply(remove_brackets_from_peptide).astype("string") + + df["Protein1"] = df["Protein_id1"].apply(get_gene_name_from_protein_id) + df["Protein2"] = df["Protein_id2"].apply(get_gene_name_from_protein_id) + + df["Is_intra_crosslink"] = df["Is_intra_crosslink"].eq("Intra") + + return normalize_crosslinking_df(df) + + +def read_csm_file(file_path: Path) -> pd.DataFrame: + df = pd.read_csv(file_path, low_memory=False).rename( + columns=rename_columns_csm_format + ) + + df["Protein_id1"] = df["Protein1"].apply(get_protein_ids_from_gene_name) + df["Protein_id2"] = df["Protein2"].apply(get_protein_ids_from_gene_name) + + df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) + + return normalize_crosslinking_df(df) + + +def normalize_crosslinking_df(df: 
pd.DataFrame) -> pd.DataFrame: + df = df.astype( + { + "Protein1": "string", + "Protein2": "string", + "Is_intra_crosslink": "bool", + "Crosslinker": "string", + "Peptide1": "string", + "Peptide2": "string", + "Q_value": "Float64", + } + ) + return df.loc[:, columns_in_cross_linking_df] + + +def cross_linking_import(file_path: Path) -> dict: try: - df = pd.read_csv( - file_path, - sep="\t", - low_memory=False, - na_values=["", 0], - keep_default_na=True, - ) + if file_path.suffix == ".csv": + df = read_csm_file(file_path) + elif file_path.suffix == ".xlsx": + df = read_ProteomeDiscoverer_XlinkX_file(file_path) + return dict(crosslinking_df=df) # kann sein, dass df nicht existiert except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid cross linking file." return dict( @@ -30,4 +161,4 @@ def cross_linking_import( trace=format_trace(traceback.format_exception(e)), ) ] - ) \ No newline at end of file + ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index b04b1895c..691571067 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -401,12 +401,13 @@ def create_form(self): calc_method = staticmethod(example_dataset_import) + class CrossLinkingImport(ImportingStep): display_name = "Cross Linking Data Import" operation = "Cross Linking Data Import" method_description = "Import a file containing cross linking data" - output_keys = ["crossLinking_df"] + output_keys = ["crosslinking_df"] def create_form(self): return Form( @@ -420,8 +421,8 @@ def create_form(self): DropdownField( name="", label="", - #value=IntensityType.IBAQ.value, - #options=IntensityType, + # value=IntensityType.IBAQ.value, + # options=IntensityType, ), CheckboxField( name="", @@ -431,4 +432,4 @@ def create_form(self): ], ) - calc_method = staticmethod(cross_linking_import) + calc_method = staticmethod(cross_linking_import) From 
1a6e0116070618c93e6ffcccd6ca8cb6801d3c06 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 8 Jan 2026 18:43:41 +0100 Subject: [PATCH 004/240] feat: change alphafold structure prediction load so it saves all data across runs permanently in userdata --- .../alphafold_protein_structure_load.py | 143 +++++++++++++----- 1 file changed, 107 insertions(+), 36 deletions(-) diff --git a/backend/protzilla/data_analysis/alphafold_protein_structure_load.py b/backend/protzilla/data_analysis/alphafold_protein_structure_load.py index 16f6c5317..cc950537a 100644 --- a/backend/protzilla/data_analysis/alphafold_protein_structure_load.py +++ b/backend/protzilla/data_analysis/alphafold_protein_structure_load.py @@ -1,41 +1,112 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + import pandas as pd import requests -def fetch_af_protein_structure(uniprot: str) -> dict: - """ - Fetch AlphaFold protein structure prediction data from the AlphaFold Database. +from backend.protzilla.constants import paths +from backend.protzilla.constants.protzilla_logging import logger + + +def _download_file(session: requests.Session, url: str, dest: Path) -> Path | None: + try: + dest.parent.mkdir(parents=True, exist_ok=True) + with session.get(url, stream=True, timeout=60) as r: + r.raise_for_status() + with open(dest, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + logger.info("Downloaded %s -> %s", url, dest) + return dest + except requests.RequestException: + logger.exception("Failed to download %s", url) + return None + except OSError: + logger.exception("Failed to write file %s", dest) + return None - :param uniprot: UniProt accession or protein ID - :type uniprot: str - :return: Dictionary containing af_structure_df with structure metadata - :rtype: dict - :raises ValueError: If no AlphaFold predictions are found for the given protein - """ + +def fetch_af_protein_structure( + uniprot: str, download_files: bool = 
True +) -> dict[str, Any]: url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot}" - records = requests.get(url, timeout=30).json() - if not records: - raise ValueError(f"No AlphaFold DB predictions for {uniprot}") - r = records[0] - - data = { - "entryId": r.get("entryId"), - "uniprotAccession": r.get("uniprotAccession"), - "uniprotId": r.get("uniprotId"), - "modelCreatedDate": r.get("modelCreatedDate"), - "latestVersion": r.get("latestVersion"), - "uniprotStart": r.get("uniprotStart"), - "uniprotEnd": r.get("uniprotEnd"), - "sequenceLength": len(r.get("uniprotSequence", "")) - if isinstance(r.get("uniprotSequence"), str) - else None, - } - - """ Other things we could get: - "pdbUrl": r.get("pdbUrl"), - "cifUrl": r.get("cifUrl"), - "paeDocUrl": r.get("paeDocUrl"), - "plddtDocUrl": r.get("plddtDocUrl"), """ - - af_structure_df = pd.DataFrame([data]) - - return {"af_structure_df": af_structure_df} + + with requests.Session() as session: + try: + resp = session.get(url, timeout=30) + resp.raise_for_status() + records = resp.json() + except requests.RequestException as e: + raise RuntimeError(f"AlphaFold request failed for {uniprot}: {e}") from e + except ValueError as e: + raise RuntimeError(f"AlphaFold returned non-JSON for {uniprot}: {e}") from e + + if not isinstance(records, list) or not records: + raise ValueError(f"No AlphaFold DB predictions for {uniprot}") + + r = records[0] + if not isinstance(r, dict): + raise RuntimeError(f"Unexpected AlphaFold payload for {uniprot}") + + data: dict[str, Any] = { + "entryId": r.get("entryId"), + "uniprotAccession": r.get("uniprotAccession"), + "uniprotId": r.get("uniprotId"), + "modelCreatedDate": r.get("modelCreatedDate"), + "latestVersion": r.get("latestVersion"), + "uniprotStart": r.get("uniprotStart"), + "uniprotEnd": r.get("uniprotEnd"), + "sequenceLength": len(r["uniprotSequence"]) + if isinstance(r.get("uniprotSequence"), str) + else None, + } + + for key in ("pdbUrl", "cifUrl", "paeDocUrl", "plddtDocUrl"): + 
if isinstance(r.get(key), str) and r.get(key): + data[key] = r[key] + + # prefer reading the existing AlphaFold metadata CSV into the dataframe + meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" + meta_dir.mkdir(parents=True, exist_ok=True) + metadata_csv = meta_dir / "alphafold_metadata.csv" + + new_row = pd.DataFrame([data]) + try: + if metadata_csv.exists(): + existing = pd.read_csv(metadata_csv, dtype=str) + acc = data.get("uniprotAccession") + if acc and "uniprotAccession" in existing.columns: + existing = existing[existing["uniprotAccession"] != acc] + combined = pd.concat([existing, new_row], ignore_index=True) + else: + combined = new_row + + combined.to_csv(metadata_csv, index=False) + logger.info("Wrote AlphaFold metadata to %s", metadata_csv) + af_structure_df = combined + except Exception: + logger.exception( + "Failed to write AlphaFold metadata CSV to %s", metadata_csv + ) + af_structure_df = new_row + + downloaded: dict[str, str] = {} + if download_files: + target_dir = meta_dir / (data.get("uniprotAccession") or uniprot) + for key in ("cifUrl", "pdbUrl", "paeDocUrl", "plddtDocUrl"): + urlval = data.get(key) + if isinstance(urlval, str) and urlval: + fname = urlval.split("?")[0].rstrip("/").split("/")[-1] + dest = target_dir / fname + saved = _download_file(session, urlval, dest) + if saved: + downloaded[key] = str(saved) + + return { + "af_structure_df": af_structure_df, + "metadata_csv": str(metadata_csv), + "downloaded_files": downloaded, + } From 5f40d137db77e1948754649074bf27c3f5995f13 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 12 Jan 2026 14:02:59 +0100 Subject: [PATCH 005/240] refactor: add messages regarding import success for user, no German comments, remove unnecessary contents of form, 1-based CL positions --- .../importing/cross_linking_import.py | 54 +++++-------------- backend/protzilla/importing/import_utils.py | 39 ++++++++++++++ backend/protzilla/methods/importing.py | 13 +---- 3 files changed, 53 insertions(+), 53 
deletions(-) diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index 9261f5dab..1d1d6d619 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -9,6 +9,11 @@ import requests from backend.protzilla.utilities import format_trace +from backend.protzilla.importing.import_utils import ( + columns_in_cross_linking_df, + rename_columns_csm_format, + rename_columns_proteomediscoverer_xlinkx_format, +) def get_gene_name_from_protein_id(protein_id): @@ -51,46 +56,7 @@ def remove_brackets_from_peptide(peptide: str) -> str: def get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format( peptide: str, ) -> int: - return peptide.find("[") - - -rename_columns_csm_format = { - "Crosslink Type": "Is_intra_crosslink", - "PepSeq1": "Peptide1", - "PepSeq2": "Peptide2", - "PepPos1": "Peptide_position1", - "PepPos2": "Peptide_position2", - "LinkPos1": "CL_position1", - "LinkPos2": "CL_position2", - "PEP": "Q_value", -} - -rename_columns_proteomediscoverer_xlinkx_format = { - "Accession A": "Protein_id1", - "Accession B": "Protein_id2", - "Crosslink Type": "Is_intra_crosslink", - "Sequence A": "Peptide1", - "Sequence B": "Peptide2", - "Position A": "Peptide_position1", - "Position B": "Peptide_position2", - "Q-value": "Q_value", -} - -columns_in_cross_linking_df = [ - "Protein1", - "Protein2", - "Protein_id1", - "Protein_id2", - "Is_intra_crosslink", - "Crosslinker", - "Peptide1", - "Peptide2", - "Peptide_position1", # ToDo: check, dass wirklich immer 0-basiert - "Peptide_position2", - "CL_position1", # ToDo: check, dass wirklich immer 0-basiert - "CL_position2", - "Q_value", -] + return peptide.find("[") + 1 # 1-based index def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: @@ -150,7 +116,8 @@ def cross_linking_import(file_path: Path) -> dict: df = read_csm_file(file_path) elif file_path.suffix == ".xlsx": 
df = read_ProteomeDiscoverer_XlinkX_file(file_path) - return dict(crosslinking_df=df) # kann sein, dass df nicht existiert + else: + raise ValueError(f"Unsupported file type: {file_path.suffix}") except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid cross linking file." return dict( @@ -162,3 +129,8 @@ def cross_linking_import(file_path: Path) -> dict: ) ] ) + msg = f"Successfully imported data of {len(df)} cross-links." + return dict( + crosslinking_df=df, + messages=[dict(level=logging.INFO, msg=msg)], + ) diff --git a/backend/protzilla/importing/import_utils.py b/backend/protzilla/importing/import_utils.py index 6b9fce095..f5d0420ec 100644 --- a/backend/protzilla/importing/import_utils.py +++ b/backend/protzilla/importing/import_utils.py @@ -14,3 +14,42 @@ class AggregationMethods(Enum): sum = "Sum" median = "Median" mean = "Mean" + + +rename_columns_csm_format = { + "Crosslink Type": "Is_intra_crosslink", + "PepSeq1": "Peptide1", + "PepSeq2": "Peptide2", + "PepPos1": "Peptide_position1", + "PepPos2": "Peptide_position2", + "LinkPos1": "CL_position1", + "LinkPos2": "CL_position2", + "PEP": "Q_value", +} + +rename_columns_proteomediscoverer_xlinkx_format = { + "Accession A": "Protein_id1", + "Accession B": "Protein_id2", + "Crosslink Type": "Is_intra_crosslink", + "Sequence A": "Peptide1", + "Sequence B": "Peptide2", + "Position A": "Peptide_position1", + "Position B": "Peptide_position2", + "Q-value": "Q_value", +} + +columns_in_cross_linking_df = [ + "Protein1", + "Protein2", + "Protein_id1", + "Protein_id2", + "Is_intra_crosslink", + "Crosslinker", + "Peptide1", + "Peptide2", + "Peptide_position1", + "Peptide_position2", + "CL_position1", + "CL_position2", + "Q_value", +] diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 691571067..f8d180655 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ 
-415,20 +415,9 @@ def create_form(self): input_fields=[ FileInput( name="file_path", - label="Cross Linking Data file (.xlsx oder .csv)", + label="Cross Linking Data file (.xlsx or .csv)", value=None, ), - DropdownField( - name="", - label="", - # value=IntensityType.IBAQ.value, - # options=IntensityType, - ), - CheckboxField( - name="", - label="", - value=False, - ), ], ) From 988305e628f34a826d5d25ddb0209ad592d095ab Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 12 Jan 2026 14:15:32 +0100 Subject: [PATCH 006/240] fix: move alphafold loading step into importing and some minor fixes --- backend/protzilla/all_steps.py | 2 +- .../alphafold_protein_structure_load.py | 30 +++++++++--------- backend/protzilla/methods/data_analysis.py | 31 ------------------- backend/protzilla/methods/importing.py | 31 +++++++++++++++++++ 4 files changed, 46 insertions(+), 48 deletions(-) rename backend/protzilla/{data_analysis => importing}/alphafold_protein_structure_load.py (82%) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 710820622..4cf2842f7 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -14,6 +14,7 @@ importing.EvidenceImport, importing.ExampleDatasetImport, importing.FastaImport, + importing.AlphaFoldPredictionLoad, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, data_preprocessing.FilterByProteinsCount, @@ -65,7 +66,6 @@ data_analysis.PTMOverviewVisualization, data_analysis.PTMBarVisualization, data_analysis.PTMDetailsVisualization, - data_analysis.AlphaFoldPredictionLoad, data_preprocessing.ImputationByMinPerSample, data_integration.EnrichmentAnalysisGOAnalysisWithString, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr, diff --git a/backend/protzilla/data_analysis/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py similarity index 82% rename from 
backend/protzilla/data_analysis/alphafold_protein_structure_load.py rename to backend/protzilla/importing/alphafold_protein_structure_load.py index cc950537a..552a24a18 100644 --- a/backend/protzilla/data_analysis/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -29,9 +29,7 @@ def _download_file(session: requests.Session, url: str, dest: Path) -> Path | No return None -def fetch_af_protein_structure( - uniprot: str, download_files: bool = True -) -> dict[str, Any]: +def fetch_alphafold_protein_structure(uniprot: str) -> dict[str, Any]: url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot}" with requests.Session() as session: @@ -86,27 +84,27 @@ def fetch_af_protein_structure( combined.to_csv(metadata_csv, index=False) logger.info("Wrote AlphaFold metadata to %s", metadata_csv) - af_structure_df = combined + alphafold_df = combined except Exception: logger.exception( "Failed to write AlphaFold metadata CSV to %s", metadata_csv ) - af_structure_df = new_row + alphafold_df = new_row downloaded: dict[str, str] = {} - if download_files: - target_dir = meta_dir / (data.get("uniprotAccession") or uniprot) - for key in ("cifUrl", "pdbUrl", "paeDocUrl", "plddtDocUrl"): - urlval = data.get(key) - if isinstance(urlval, str) and urlval: - fname = urlval.split("?")[0].rstrip("/").split("/")[-1] - dest = target_dir / fname - saved = _download_file(session, urlval, dest) - if saved: - downloaded[key] = str(saved) + + target_dir = meta_dir / (data.get("uniprotAccession") or uniprot) + for key in ("cifUrl", "pdbUrl", "paeDocUrl", "plddtDocUrl"): + urlval = data.get(key) + if isinstance(urlval, str) and urlval: + fname = urlval.split("?")[0].rstrip("/").split("/")[-1] + dest = target_dir / fname + saved = _download_file(session, urlval, dest) + if saved: + downloaded[key] = str(saved) return { - "af_structure_df": af_structure_df, + "alphafold_df": alphafold_df, "metadata_csv": str(metadata_csv), "downloaded_files": 
downloaded, } diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index e396e1a42..75b9e1bd6 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -7,9 +7,6 @@ hierarchical_agglomerative_clustering, k_means, ) -from backend.protzilla.data_analysis.alphafold_protein_structure_load import ( - fetch_af_protein_structure, -) from backend.protzilla.data_analysis.differential_expression_anova import anova from backend.protzilla.data_analysis.differential_expression_kruskal_wallis import ( kruskal_wallis_test_on_ptm_data, @@ -2456,31 +2453,3 @@ def create_form(self): label="PTM Details Visualization", input_fields=_PTMVisualizationWithGroups.get_form_fields(), ) - - -class AlphaFoldPredictionLoad(DataAnalysisStep): - display_name = "AlphaFold DB Prediction Load" - operation = "Protein Structure Analysis" - method_description = "Loads the predicted structure of the protein with the given protein ID out of the AlphaFold DB." 
- - output_keys = [ - "af_structure_df", - ] - - plot_method = None - - def create_form(self): - return Form( - label="AlphaFold DB Prediction Load", - input_fields=[ - TextField( - name="uniprot", - label="Protein ID", - ), - ], - ) - - calc_method = staticmethod(fetch_af_protein_structure) - - def insert_dataframes(self, steps: StepManager, inputs) -> dict: - return inputs diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 3cc2fa973..140fd8468 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -11,6 +11,9 @@ max_quant_import, ms_fragger_import, ) +from backend.protzilla.importing.alphafold_protein_structure_load import ( + fetch_alphafold_protein_structure, +) from backend.protzilla.importing.peptide_import import peptide_import, evidence_import from backend.protzilla.steps import Step, StepManager from protzilla.importing.example_dataset_import import example_dataset_import @@ -399,3 +402,31 @@ def create_form(self): ) calc_method = staticmethod(example_dataset_import) + + +class AlphaFoldPredictionLoad(ImportingStep): + display_name = "AlphaFold DB Prediction Load" + operation = "Protein Structure Import" + method_description = "Loads the predicted structure of the protein with the given protein ID out of the AlphaFold DB." 
+ + output_keys = [ + "alphafold_df", + ] + + plot_method = None + + def create_form(self): + return Form( + label="AlphaFold DB Prediction Load", + input_fields=[ + TextField( + name="uniprot", + label="Protein ID", + ), + ], + ) + + calc_method = staticmethod(fetch_alphafold_protein_structure) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + return inputs From a207bc2ec926237d956f6cad17d507b8a0e46adc Mon Sep 17 00:00:00 2001 From: Tarek Massini Date: Mon, 12 Jan 2026 14:41:32 +0100 Subject: [PATCH 007/240] fix: formatting --- .../importing/alphafold_protein_structure_load.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 552a24a18..8fc9059cf 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -57,9 +57,11 @@ def fetch_alphafold_protein_structure(uniprot: str) -> dict[str, Any]: "latestVersion": r.get("latestVersion"), "uniprotStart": r.get("uniprotStart"), "uniprotEnd": r.get("uniprotEnd"), - "sequenceLength": len(r["uniprotSequence"]) - if isinstance(r.get("uniprotSequence"), str) - else None, + "sequenceLength": ( + len(r["uniprotSequence"]) + if isinstance(r.get("uniprotSequence"), str) + else None + ), } for key in ("pdbUrl", "cifUrl", "paeDocUrl", "plddtDocUrl"): From 9649e0d33152f63f9334f70428d5821a9bd8d962 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 12 Jan 2026 15:10:21 +0100 Subject: [PATCH 008/240] fix: remove unnecessary function and add step to test --- backend/protzilla/methods/importing.py | 3 --- backend/tests/main/test_views_helper.py | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 140fd8468..661821a73 100644 --- a/backend/protzilla/methods/importing.py +++ 
b/backend/protzilla/methods/importing.py @@ -427,6 +427,3 @@ def create_form(self): ) calc_method = staticmethod(fetch_alphafold_protein_structure) - - def insert_dataframes(self, steps: StepManager, inputs) -> dict: - return inputs diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index d40de2d75..f574970e9 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -13,6 +13,7 @@ def test_get_all_possible_step_names(): "EvidenceImport", "ExampleDatasetImport", "FastaImport", + "AlphaFoldPredictionLoad", "FilterProteinsBySamplesMissing", "FilterProteinsBySilacRatios", "FilterByProteinsCount", From cdfa15fdb6c4b116ff0f40563bcbae68ec1aeebe Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 13 Jan 2026 09:28:37 +0100 Subject: [PATCH 009/240] add error handling and cache for uniprot lookup --- .../importing/cross_linking_import.py | 238 ++++++++++++++++-- 1 file changed, 211 insertions(+), 27 deletions(-) diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index 1d1d6d619..e3e40f696 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -4,6 +4,7 @@ import logging from pathlib import Path +from typing import Callable, Tuple import pandas as pd import traceback import requests @@ -16,37 +17,136 @@ ) +def get_protein_designation(designation_lookup_cache, protein_designation, uniprot_lookup_func): + if designation_lookup_cache[protein_designation]: + success, new_protein_designation, error = True, designation_lookup_cache[protein_designation], None + else: + success, new_protein_designation, error = uniprot_lookup_func(protein_designation) + if success: + designation_lookup_cache[protein_designation] = new_protein_designation + return success, new_protein_designation, error + + def 
get_gene_name_from_protein_id(protein_id): - return "placeholder" + """ + Retrieves the gene name for a given Protein ID from UniProt. + + Parameters: + protein_id (str): The UniProt accession ID (e.g. "Q92878"). + + Returns: + success (bool): True if the lookup succeeded, False otherwise + gene_name (str or None): Official gene name if successful, else None + error (str or None): Error code/message if failed, else None + """ + #return "placeholder" url = f"https://rest.uniprot.org/uniprotkb/{protein_id}" params = {"fields": "gene_names", "format": "json"} - response = requests.get(url, params=params) - response.raise_for_status() # Fehler werfen, wenn etwas schief geht - - data = response.json() + try: + response = requests.get(url, params=params) + response.raise_for_status() - gene_name = data["genes"][0]["geneName"]["value"] + data = response.json() + gene_name = data.get("genes", [{}])[0].get("geneName", {}).get("value") - return gene_name + if gene_name: + return True, gene_name, None + else: + return False, None, "NO_GENE_NAME_FOUND" + + except requests.exceptions.Timeout: + return False, None, "TIMEOUT" + + except requests.exceptions.HTTPError as e: + return False, None, f"HTTP_{e.response.status_code}" + + except requests.exceptions.RequestException: + return False, None, "REQUEST_ERROR" + + except ValueError: + return False, None, "INVALID_JSON" def get_protein_ids_from_gene_name(gene_name): - return "placeholder" + """ + Retrieves UniProt protein IDs for a given human gene name. + + Parameters: + gene_name (str): The gene symbol to look up (e.g. 
"RAD50") + + Returns: + success (bool): True if lookup succeeded, False otherwise + data (dict or None): { + "protein_ids" (list of str): all protein IDs without any isoform information, + "list_of_protein_isoforms" (list of str): all isomform IDs + } if success else None + error (str or None): error code/message if failed, else None + """ + #return "placeholder" url = "https://rest.uniprot.org/uniprotkb/search" params = { "query": f"gene:{gene_name} AND organism_id:9606 AND reviewed:true", "format": "list", "includeIsoform": "true", } - response = requests.get(url, params=params) - response.raise_for_status() # Fehler werfen, wenn etwas schief geht + try: + response = requests.get(url, params=params, timeout=1) + response.raise_for_status() + + all_ids = response.text.strip().split("\n") + protein_ids = [i for i in all_ids if "-" not in i] + list_of_protein_isoforms = [i for i in all_ids if "-" in i] + + if not protein_ids: + return False, None, "NO_PROTEIN_ID_FOUND" + else: + return True, { + "protein_ids": protein_ids, + "list_of_protein_isoforms": list_of_protein_isoforms + }, None + + except requests.exceptions.Timeout: + return False, None, "TIMEOUT" + + except requests.exceptions.HTTPError as e: + return False, None, f"HTTP_{e.response.status_code}" + + except requests.exceptions.RequestException: + return False, None, "REQUEST_ERROR" + + +def iterate_for_protein_designation(df, protein_designation, uniprot_lookup_func): + good_rows = [] + failed_rows = [] + protein_designation_cache = {} + + for _, row in df.iterrows(): + row_dict = row.to_dict() + + success1, new_protein_designation1, error1 = get_protein_designation(protein_designation_cache, row[protein_designation + "1"], uniprot_lookup_func) + success2, new_protein_designation2, error2 = get_protein_designation(protein_designation_cache, row[protein_designation + "2"], uniprot_lookup_func) + + errors_occurred = {} + if not success1: + errors_occurred["Protein1_error"] = error1 + if not success2: + 
errors_occurred["Protein2_error"] = error2 + + if errors_occurred: + failed_row = row_dict.copy() + failed_row.update(errors_occurred) + failed_rows.append(failed_row) + else: + row_dict[protein_designation + "1"] = new_protein_designation1 + row_dict[protein_designation + "2"] = new_protein_designation2 - all_ids = response.text.strip().split("\n") - protein_ids = [i for i in all_ids if "-" not in i] - list_of_protein_isoforms = [i for i in all_ids if "-" in i] + good_rows.append(row_dict) - return protein_ids, list_of_protein_isoforms + good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) + failed_df = pd.DataFrame(failed_rows) + + return good_df, failed_df def remove_brackets_from_peptide(peptide: str) -> str: @@ -74,25 +174,96 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: df["Peptide1"] = df["Peptide1"].apply(remove_brackets_from_peptide).astype("string") df["Peptide2"] = df["Peptide2"].apply(remove_brackets_from_peptide).astype("string") - df["Protein1"] = df["Protein_id1"].apply(get_gene_name_from_protein_id) - df["Protein2"] = df["Protein_id2"].apply(get_gene_name_from_protein_id) - df["Is_intra_crosslink"] = df["Is_intra_crosslink"].eq("Intra") - return normalize_crosslinking_df(df) + """good_rows = [] + failed_rows = [] + gene_names_cache = {} + + for _, row in df.iterrows(): + row_dict = row.to_dict() + + success1, gene_name1, error1 = get_protein_designation(gene_names_cache, row["Protein_id1"]) + success2, gene_name2, error2 = get_protein_designation(gene_names_cache, row["Protein_id2"]) + + errors_occurred = {} + if not success1: + errors_occurred["Protein1_error"] = error1 + if not success2: + errors_occurred["Protein2_error"] = error2 + + if errors_occurred: + failed_row = row_dict.copy() + failed_row.update(errors_occurred) + failed_rows.append(failed_row) + else: + row_dict["Protein_id1"] = gene_name1 + row_dict["Protein_id2"] = gene_name2 + + good_rows.append(row_dict) + + #df["Protein1"] = 
df["Protein_id1"].apply(get_gene_name_from_protein_id) + #df["Protein2"] = df["Protein_id2"].apply(get_gene_name_from_protein_id) + + good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) + failed_df = pd.DataFrame(failed_rows)""" + + good_df, failed_df = iterate_for_protein_designation(df, "Protein_id", get_gene_name_from_protein_id) + + return good_df, failed_df def read_csm_file(file_path: Path) -> pd.DataFrame: + """ + Returns two DataFrames: + - normalized_df: only rows with successful UniProt lookups + - failed_df: rows where UniProt lookup failed, including error messages + """ df = pd.read_csv(file_path, low_memory=False).rename( columns=rename_columns_csm_format ) - df["Protein_id1"] = df["Protein1"].apply(get_protein_ids_from_gene_name) - df["Protein_id2"] = df["Protein2"].apply(get_protein_ids_from_gene_name) - df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) - return normalize_crosslinking_df(df) + """good_rows = [] + failed_rows = [] + + for _, row in df.iterrows(): + row_dict = row.to_dict() + + success1, data1, error1 = get_protein_ids_from_gene_name(row["Protein1"]) + success2, data2, error2 = get_protein_ids_from_gene_name(row["Protein2"]) + + errors_occurred = {} + if not success1: + errors_occurred["Protein1_error"] = error1 + if not success2: + errors_occurred["Protein2_error"] = error2 + + if errors_occurred: + failed_row = row_dict.copy() + failed_row.update(errors_occurred) + failed_rows.append(failed_row) + else: + row_dict["Protein_id1"] = data1 + row_dict["Protein_id2"] = data2 + + #row_dict["Is_intra_crosslink"] = row["Protein1"] == row["Protein2"] + + good_rows.append(row_dict) + + good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) + failed_df = pd.DataFrame(failed_rows)""" + + good_df, failed_df = iterate_for_protein_designation(df, "Protein", get_protein_ids_from_gene_name) + + #df["Protein_id1"] = df["Protein1"].apply(get_protein_ids_from_gene_name) + #df["Protein_id2"] = 
df["Protein2"].apply(get_protein_ids_from_gene_name) + + #df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) + + #return normalize_crosslinking_df(df) + return good_df, failed_df def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: @@ -113,9 +284,11 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: def cross_linking_import(file_path: Path) -> dict: try: if file_path.suffix == ".csv": - df = read_csm_file(file_path) + good_df, failed_df = read_csm_file(file_path) + #df = read_csm_file(file_path) elif file_path.suffix == ".xlsx": - df = read_ProteomeDiscoverer_XlinkX_file(file_path) + good_df, failed_df = read_ProteomeDiscoverer_XlinkX_file(file_path) + #df = read_ProteomeDiscoverer_XlinkX_file(file_path) else: raise ValueError(f"Unsupported file type: {file_path.suffix}") except Exception as e: @@ -129,8 +302,19 @@ def cross_linking_import(file_path: Path) -> dict: ) ] ) - msg = f"Successfully imported data of {len(df)} cross-links." + if failed_df.empty: + msg = f"Successfully imported data of {len(good_df)} cross-links." + messages = [dict(level=logging.INFO, msg=msg)] + else: + msg = f"Warning: {len(failed_df)} rows failed to import, however {len(good_df)} cross-links were successfully imported." 
+ messages = [ + dict(level=logging.WARNING, msg=msg), + dict(level=logging.WARNING, msg=f"Failed rows:\n{failed_df}") + ] + return dict( - crosslinking_df=df, - messages=[dict(level=logging.INFO, msg=msg)], + crosslinking_df=good_df, + messages=messages + #crosslinking_df=df, + #messages=[dict(level=logging.INFO, msg=msg)], ) From 33c6fd499a584e91249176ea997eac3dc9773116 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 13 Jan 2026 12:10:50 +0100 Subject: [PATCH 010/240] fix: remove urls from metadata download and change alphafold_df to only show the protein metadata that was just uploaded --- .../importing/alphafold_protein_structure_load.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 8fc9059cf..bf0272a3a 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -64,40 +64,37 @@ def fetch_alphafold_protein_structure(uniprot: str) -> dict[str, Any]: ), } + files_urls: dict[str, Any] = {} + for key in ("pdbUrl", "cifUrl", "paeDocUrl", "plddtDocUrl"): if isinstance(r.get(key), str) and r.get(key): - data[key] = r[key] + files_urls[key] = r[key] # prefer reading the existing AlphaFold metadata CSV into the dataframe meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" meta_dir.mkdir(parents=True, exist_ok=True) metadata_csv = meta_dir / "alphafold_metadata.csv" - new_row = pd.DataFrame([data]) + alphafold_df = pd.DataFrame([data]) try: if metadata_csv.exists(): existing = pd.read_csv(metadata_csv, dtype=str) acc = data.get("uniprotAccession") if acc and "uniprotAccession" in existing.columns: existing = existing[existing["uniprotAccession"] != acc] - combined = pd.concat([existing, new_row], ignore_index=True) - else: - combined = new_row + combined = pd.concat([existing, alphafold_df], ignore_index=True) 
combined.to_csv(metadata_csv, index=False) logger.info("Wrote AlphaFold metadata to %s", metadata_csv) - alphafold_df = combined except Exception: logger.exception( "Failed to write AlphaFold metadata CSV to %s", metadata_csv ) - alphafold_df = new_row - downloaded: dict[str, str] = {} target_dir = meta_dir / (data.get("uniprotAccession") or uniprot) for key in ("cifUrl", "pdbUrl", "paeDocUrl", "plddtDocUrl"): - urlval = data.get(key) + urlval = files_urls.get(key) if isinstance(urlval, str) and urlval: fname = urlval.split("?")[0].rstrip("/").split("/")[-1] dest = target_dir / fname From 5fb8200bc5a2b47aa6d835d78a2efe0b16aed7cf Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:28:53 +0100 Subject: [PATCH 011/240] use batch request instead of cache for uniprot lookup --- .../importing/cross_linking_import.py | 322 +++++++++++------- 1 file changed, 195 insertions(+), 127 deletions(-) diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index e3e40f696..021538738 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -3,8 +3,8 @@ """ import logging -from pathlib import Path -from typing import Callable, Tuple +from pathlib import Path +from collections import defaultdict import pandas as pd import traceback import requests @@ -16,7 +16,7 @@ rename_columns_proteomediscoverer_xlinkx_format, ) - +""" def get_protein_designation(designation_lookup_cache, protein_designation, uniprot_lookup_func): if designation_lookup_cache[protein_designation]: success, new_protein_designation, error = True, designation_lookup_cache[protein_designation], None @@ -25,107 +25,226 @@ def get_protein_designation(designation_lookup_cache, protein_designation, unipr if success: designation_lookup_cache[protein_designation] = new_protein_designation return success, new_protein_designation, 
error +""" + + +def aggregate_data(df: pd.DataFrame, column: str) -> set: + """ + Extracts unique values from two DataFrame columns and returns them as a set. + + Parameters: + df (pd.DataFrame): Input DataFrame + column (str): Column name + + Returns: + set: Unique values from the columns + """ + return set( + df[[column + "1", column + "2"]] + .stack() + .astype(str) + .str.strip() + ) -def get_gene_name_from_protein_id(protein_id): +def get_gene_name_from_protein_ids(protein_ids: set): """ - Retrieves the gene name for a given Protein ID from UniProt. + Retrieves the gene names for a given set of Protein IDs in a batch from UniProt. Parameters: - protein_id (str): The UniProt accession ID (e.g. "Q92878"). + protein_ids (set): Set of UniProt accession IDs (e.g. {"Q92878", "P51587"}). Returns: - success (bool): True if the lookup succeeded, False otherwise - gene_name (str or None): Official gene name if successful, else None - error (str or None): Error code/message if failed, else None + dict: Mapping protein_id -> (success, gene_name, error) + success (bool): True if the lookup for that protein_id succeeded, False otherwise + gene_name (str or None): Official gene name if successful, else None + error (str or None): Error code/message if failed, else None """ #return "placeholder" - url = f"https://rest.uniprot.org/uniprotkb/{protein_id}" - params = {"fields": "gene_names", "format": "json"} + + results = {} + if not protein_ids: + return results + + url = f"https://rest.uniprot.org/uniprotkb/search" + params = { + "query": " OR ".join(f"accession:{pid}" for pid in protein_ids), + "fields": "accesssion,gene_names", + "format": "json" + } try: - response = requests.get(url, params=params) + response = requests.get(url, params=params, timeout=15) response.raise_for_status() - data = response.json() - gene_name = data.get("genes", [{}])[0].get("geneName", {}).get("value") - if gene_name: - return True, gene_name, None - else: - return False, None, 
"NO_GENE_NAME_FOUND" + for entry in data.get("results", []): + protein_id = entry.get("primaryAccession") + output = entry.get("genes", [{}]) + gene_name = output[0].get("geneName", {}).get("value") if output else None + + #gene_name = data.get("genes", [{}])[0].get("geneName", {}).get("value") + + if gene_name: + results[protein_id] = (True, gene_name, None) + #return True, gene_name, None + else: + results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") + #return False, None, "NO_GENE_NAME_FOUND" + + for pid in protein_ids: + if pid not in results: + results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") except requests.exceptions.Timeout: - return False, None, "TIMEOUT" + for pid in protein_ids: + results[pid] = (False, None, "TIMEOUT") + #return False, None, "TIMEOUT" except requests.exceptions.HTTPError as e: - return False, None, f"HTTP_{e.response.status_code}" + for pid in protein_ids: + results[pid] = (False, None, f"HTTP_{e.response.status_code}") + #return False, None, f"HTTP_{e.response.status_code}" except requests.exceptions.RequestException: - return False, None, "REQUEST_ERROR" + for pid in protein_ids: + results[pid] = (False, None, "REQUEST_ERROR") + #return False, None, "REQUEST_ERROR" except ValueError: - return False, None, "INVALID_JSON" + for pid in protein_ids: + results[pid] = (False, None, "INVALID_JSON") + #return False, None, "INVALID_JSON" + + return results -def get_protein_ids_from_gene_name(gene_name): +def get_protein_ids_from_gene_name(gene_names: set): """ - Retrieves UniProt protein IDs for a given human gene name. + Retrieves UniProt protein IDs for a given set of human gene names as a batch query. Parameters: - gene_name (str): The gene symbol to look up (e.g. "RAD50") + gene_names (set): Set of gene symbols to look up (e.g. 
{"RAD50", "MRE11"}) Returns: - success (bool): True if lookup succeeded, False otherwise - data (dict or None): { - "protein_ids" (list of str): all protein IDs without any isoform information, - "list_of_protein_isoforms" (list of str): all isomform IDs - } if success else None - error (str or None): error code/message if failed, else None + dict: Mapping gene_name -> (success, data, error) + success (bool): True if lookup for this gene_name succeeded, False otherwise + data (dict or None): { + "protein_ids" (list of str): all protein IDs without any isoform information, + "list_of_protein_isoforms" (list of str): all isomform IDs + } if success else None + error (str or None): error code/message if failed, else None """ #return "placeholder" + + results = {} + if not gene_names: + return results + + query = " OR ".join(f"gene:{g}" for g in gene_names) + url = "https://rest.uniprot.org/uniprotkb/search" params = { - "query": f"gene:{gene_name} AND organism_id:9606 AND reviewed:true", - "format": "list", + "query": f"({query}) AND organism_id:9606 AND reviewed:true", + "format": "tsv", + "fields": "accession,genes", "includeIsoform": "true", } try: - response = requests.get(url, params=params, timeout=1) - response.raise_for_status() - - all_ids = response.text.strip().split("\n") - protein_ids = [i for i in all_ids if "-" not in i] - list_of_protein_isoforms = [i for i in all_ids if "-" in i] + response = requests.get(url, params=params, timeout=15) + response.raise_for_status() - if not protein_ids: - return False, None, "NO_PROTEIN_ID_FOUND" - else: - return True, { - "protein_ids": protein_ids, - "list_of_protein_isoforms": list_of_protein_isoforms - }, None + output = defaultdict(lambda: { + "protein_ids": [], + "list_of_protein_isoforms": [] + }) + + lines = response.text.strip().split("\n") + header = lines[0].split("\t") + protein_id_idx = header.index("Entry") + gene_name_idx = header.index("Gene Names") + + for line in lines[1:]: + parts = 
line.split("\t") + protein_id = parts[protein_id_idx] + output_gene_names = parts[gene_name_idx].split() + + for g in output_gene_names: + if g in gene_names: + if "-" in protein_id: + output[g]["list_of_protein_isoforms"].append(protein_id) + else: + output[g]["protein_ids"].append(protein_id) + + for gn in gene_names: + data = output.get(gn) + + if not data or not data["protein_ids"]: + results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") + #return False, None, "NO_PROTEIN_ID_FOUND" + else: + results[gn] = (True, data, None) + #return True, { + # "protein_ids": protein_ids, + # "list_of_protein_isoforms": list_of_protein_isoforms + #}, None + + return results except requests.exceptions.Timeout: - return False, None, "TIMEOUT" + for g in gene_names: + results[g] = (False, None, "TIMEOUT") + return results + #return False, None, "TIMEOUT" except requests.exceptions.HTTPError as e: - return False, None, f"HTTP_{e.response.status_code}" + for g in gene_names: + results[g] = (False, None, f"HTTP_{e.response.status_code}") + return results + #return False, None, f"HTTP_{e.response.status_code}" except requests.exceptions.RequestException: - return False, None, "REQUEST_ERROR" + for g in gene_names: + results[g] = (False, None, "REQUEST_ERROR") + return results + #return False, None, "REQUEST_ERROR" + +def iterate_for_protein_designation( + df, + protein_designation, + uniprot_lookup_results, + value_extractor=lambda x: x +): + """ + Iterates over a DataFrame and adds the missing protein designations to the dataframe using + precomputed lookup results. (either protein ids or gene names are included in the imported + data and the other is added to the data frame here) -def iterate_for_protein_designation(df, protein_designation, uniprot_lookup_func): + Parameters: + df (pd.DataFrame) + protein_designation (str): existing protein designation, e.g. 
"Protein_id" or "Protein" + uniprot_lookup_results (dict): + Mapping key -> (success, data, error) + value_extractor (callable): + function(data) -> value to store in DataFrame cell + + Returns: + good_df (pd.DataFrame): Rows with successful lookups + failed_df (pd.DataFrame): Rows with lookup errors + """ good_rows = [] failed_rows = [] - protein_designation_cache = {} for _, row in df.iterrows(): row_dict = row.to_dict() - success1, new_protein_designation1, error1 = get_protein_designation(protein_designation_cache, row[protein_designation + "1"], uniprot_lookup_func) - success2, new_protein_designation2, error2 = get_protein_designation(protein_designation_cache, row[protein_designation + "2"], uniprot_lookup_func) + success1, data1, error1 = uniprot_lookup_results.get( + row[protein_designation + "1"], (False, None, "NOT_LOOKED_UP") + ) + success2, data2, error2 = uniprot_lookup_results.get( + row[protein_designation + "2"], (False, None, "NOT_LOOKED_UP") + ) errors_occurred = {} if not success1: @@ -138,10 +257,9 @@ def iterate_for_protein_designation(df, protein_designation, uniprot_lookup_func failed_row.update(errors_occurred) failed_rows.append(failed_row) else: - row_dict[protein_designation + "1"] = new_protein_designation1 - row_dict[protein_designation + "2"] = new_protein_designation2 - - good_rows.append(row_dict) + row_dict[protein_designation + "1"] = value_extractor(data1) + row_dict[protein_designation + "2"] = value_extractor(data2) + good_rows.append(row_dict) good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) failed_df = pd.DataFrame(failed_rows) @@ -176,39 +294,15 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Is_intra_crosslink"].eq("Intra") - """good_rows = [] - failed_rows = [] - gene_names_cache = {} - - for _, row in df.iterrows(): - row_dict = row.to_dict() - - success1, gene_name1, error1 = get_protein_designation(gene_names_cache, row["Protein_id1"]) - success2, 
gene_name2, error2 = get_protein_designation(gene_names_cache, row["Protein_id2"]) - - errors_occurred = {} - if not success1: - errors_occurred["Protein1_error"] = error1 - if not success2: - errors_occurred["Protein2_error"] = error2 - - if errors_occurred: - failed_row = row_dict.copy() - failed_row.update(errors_occurred) - failed_rows.append(failed_row) - else: - row_dict["Protein_id1"] = gene_name1 - row_dict["Protein_id2"] = gene_name2 - - good_rows.append(row_dict) - - #df["Protein1"] = df["Protein_id1"].apply(get_gene_name_from_protein_id) - #df["Protein2"] = df["Protein_id2"].apply(get_gene_name_from_protein_id) - - good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) - failed_df = pd.DataFrame(failed_rows)""" - - good_df, failed_df = iterate_for_protein_designation(df, "Protein_id", get_gene_name_from_protein_id) + #unique_protein_ids = set(df[["Protein_id1", "Protein_id2"]].stack().astype(str).str.strip()) + unique_protein_ids = aggregate_data(df, "Protein_id") + uniprot_lookup_results = get_gene_name_from_protein_ids(unique_protein_ids) + good_df, failed_df = iterate_for_protein_designation( + df, + "Protein_id", + uniprot_lookup_results, + value_extractor=lambda x: x + ) return good_df, failed_df @@ -225,44 +319,18 @@ def read_csm_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) - """good_rows = [] - failed_rows = [] - - for _, row in df.iterrows(): - row_dict = row.to_dict() - - success1, data1, error1 = get_protein_ids_from_gene_name(row["Protein1"]) - success2, data2, error2 = get_protein_ids_from_gene_name(row["Protein2"]) - - errors_occurred = {} - if not success1: - errors_occurred["Protein1_error"] = error1 - if not success2: - errors_occurred["Protein2_error"] = error2 - - if errors_occurred: - failed_row = row_dict.copy() - failed_row.update(errors_occurred) - failed_rows.append(failed_row) - else: - row_dict["Protein_id1"] = data1 - row_dict["Protein_id2"] = data2 - - 
#row_dict["Is_intra_crosslink"] = row["Protein1"] == row["Protein2"] - - good_rows.append(row_dict) - - good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) - failed_df = pd.DataFrame(failed_rows)""" - - good_df, failed_df = iterate_for_protein_designation(df, "Protein", get_protein_ids_from_gene_name) - - #df["Protein_id1"] = df["Protein1"].apply(get_protein_ids_from_gene_name) - #df["Protein_id2"] = df["Protein2"].apply(get_protein_ids_from_gene_name) - - #df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) + unique_gene_names = aggregate_data(df, "Protein") + uniprot_lookup_results = get_protein_ids_from_gene_name(unique_gene_names) + # In our UniProt lookup we already get all isoforms of the respective gene name. + # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. + # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. + good_df, failed_df = iterate_for_protein_designation( + df, + "Protein", + uniprot_lookup_results, + value_extractor=lambda x: x["protein_ids"][0] if x else None + ) - #return normalize_crosslinking_df(df) return good_df, failed_df From 1149761be6999d55db0946014f259c4ab5a635ff Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 14 Jan 2026 16:04:50 +0100 Subject: [PATCH 012/240] feat: add checkbox for option to save data across runs, change API requests to pull only data we want, extend alphafold_protein_structure_load so input sequence can be saved as fasta --- .../alphafold_protein_structure_load.py | 103 +++++++++++------- backend/protzilla/methods/importing.py | 5 + 2 files changed, 66 insertions(+), 42 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index bf0272a3a..4769b0264 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ 
b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -5,6 +5,8 @@ import pandas as pd import requests +from textwrap import wrap + from backend.protzilla.constants import paths from backend.protzilla.constants.protzilla_logging import logger @@ -29,7 +31,18 @@ def _download_file(session: requests.Session, url: str, dest: Path) -> Path | No return None -def fetch_alphafold_protein_structure(uniprot: str) -> dict[str, Any]: +def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str: + VALID_AA = set("ACDEFGHIKLMNPQRSTVWYBXZJUO*-") + if not seq or any(c.isspace() for c in seq): + raise ValueError("Sequence must be a single, whitespace-free string.") + seq = seq.upper() + bad = set(seq) - VALID_AA + if bad: + raise ValueError(f"Invalid characters in sequence: {''.join(sorted(bad))}") + return ">" + header + "\n" + "\n".join(wrap(seq, width)) + "\n" + + +def fetch_alphafold_protein_structure(uniprot: str, persistUploads: bool,) -> dict[str, Any]: url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot}" with requests.Session() as session: @@ -50,57 +63,63 @@ def fetch_alphafold_protein_structure(uniprot: str) -> dict[str, Any]: raise RuntimeError(f"Unexpected AlphaFold payload for {uniprot}") data: dict[str, Any] = { - "entryId": r.get("entryId"), + "entryID": r.get("uniprotAccession"), "uniprotAccession": r.get("uniprotAccession"), - "uniprotId": r.get("uniprotId"), "modelCreatedDate": r.get("modelCreatedDate"), - "latestVersion": r.get("latestVersion"), - "uniprotStart": r.get("uniprotStart"), - "uniprotEnd": r.get("uniprotEnd"), - "sequenceLength": ( - len(r["uniprotSequence"]) - if isinstance(r.get("uniprotSequence"), str) - else None - ), + "gene": r.get("gene"), + "alphafold_version": r.get("toolUsed"), } + seq_tmp = r.get("sequence") + files_urls: dict[str, Any] = {} - for key in ("pdbUrl", "cifUrl", "paeDocUrl", "plddtDocUrl"): + for key in ("cifUrl", "paeDocUrl", "plddtDocUrl"): if isinstance(r.get(key), str) and 
r.get(key): files_urls[key] = r[key] - # prefer reading the existing AlphaFold metadata CSV into the dataframe - meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" - meta_dir.mkdir(parents=True, exist_ok=True) - metadata_csv = meta_dir / "alphafold_metadata.csv" - alphafold_df = pd.DataFrame([data]) - try: - if metadata_csv.exists(): - existing = pd.read_csv(metadata_csv, dtype=str) - acc = data.get("uniprotAccession") - if acc and "uniprotAccession" in existing.columns: - existing = existing[existing["uniprotAccession"] != acc] - combined = pd.concat([existing, alphafold_df], ignore_index=True) - - combined.to_csv(metadata_csv, index=False) - logger.info("Wrote AlphaFold metadata to %s", metadata_csv) - except Exception: - logger.exception( - "Failed to write AlphaFold metadata CSV to %s", metadata_csv - ) - downloaded: dict[str, str] = {} - - target_dir = meta_dir / (data.get("uniprotAccession") or uniprot) - for key in ("cifUrl", "pdbUrl", "paeDocUrl", "plddtDocUrl"): - urlval = files_urls.get(key) - if isinstance(urlval, str) and urlval: - fname = urlval.split("?")[0].rstrip("/").split("/")[-1] - dest = target_dir / fname - saved = _download_file(session, urlval, dest) - if saved: - downloaded[key] = str(saved) + acc = data.get("uniprotAccession") + if persistUploads: + # prefer reading the existing AlphaFold metadata CSV into the dataframe + meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" + meta_dir.mkdir(parents=True, exist_ok=True) + metadata_csv = meta_dir / "alphafold_metadata.csv" + + try: + if metadata_csv.exists(): + existing = pd.read_csv(metadata_csv, dtype=str) + if acc and "uniprotAccession" in existing.columns: + existing = existing[existing["uniprotAccession"] != acc] + combined = pd.concat([existing, alphafold_df], ignore_index=True) + combined.to_csv(metadata_csv, index=False) + else: + alphafold_df.to_csv(metadata_csv, index=False) + logger.info("Wrote AlphaFold metadata to %s", metadata_csv) + except Exception: + logger.exception( + "Failed 
to write AlphaFold metadata CSV to %s", metadata_csv + ) + downloaded: dict[str, str] = {} + + target_dir = meta_dir / (acc or uniprot) + + for key in ("cifUrl", "pdbUrl", "paeDocUrl", "plddtDocUrl"): + urlval = files_urls.get(key) + if isinstance(urlval, str) and urlval: + fname = urlval.split("?")[0].rstrip("/").split("/")[-1] + dest = target_dir / fname + saved = _download_file(session, urlval, dest) + if saved: + downloaded[key] = str(saved) + + sequence = to_fasta(seq=seq_tmp, header=uniprot) + dest = target_dir / f"{uniprot.upper()}.fasta" + + with open(dest, "w") as f: + f.write(sequence) + else: + pass return { "alphafold_df": alphafold_df, diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 27df39e9e..da33bfabc 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -426,6 +426,11 @@ def create_form(self): name="uniprot", label="Protein ID", ), + CheckboxField( + name="persistUploads", + label="Upload should be saved persistently across runs", + value=True, + ), ], ) From 7bae8a32298748d6d832fea3a833a0792c8c8981 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 14 Jan 2026 18:18:05 +0100 Subject: [PATCH 013/240] [wip] feat: add handling of files when not permanently saved --- .../alphafold_protein_structure_load.py | 197 ++++++++++++++---- backend/protzilla/methods/importing.py | 6 +- requirements.txt | 1 + 3 files changed, 160 insertions(+), 44 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 4769b0264..78cee956d 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -6,6 +6,9 @@ import pandas as pd import requests from textwrap import wrap +from CifFile import ReadCif +import shutil +import tempfile from backend.protzilla.constants import paths @@ -42,6 
+45,149 @@ def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str return ">" + header + "\n" + "\n".join(wrap(seq, width)) + "\n" +def fasta_to_dataframe(fasta_path: str): + records = [] + seq_id = None + seq = [] + + with open(fasta_path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + if line.startswith(">"): + if seq_id is not None: + sequence = "".join(seq) + records.append({ + "id": seq_id, + "sequence": sequence, + "length": len(sequence), + }) + seq_id = line[1:].split()[0] + seq = [] + else: + seq.append(line) + + if seq_id is not None: + sequence = "".join(seq) + records.append({ + "id": seq_id, + "sequence": sequence, + "length": len(sequence), + }) + + return pd.DataFrame(records) + + +def cif_to_dataframe(cif_path): + cif = ReadCif(str(cif_path)) + block = cif.first_block() + + df = pd.DataFrame({ + "label": block["_atom_site_label"], + "element": block["_atom_site_type_symbol"], + "x": block["_atom_site_fract_x"], + "y": block["_atom_site_fract_y"], + "z": block["_atom_site_fract_z"], + }) + + return df + + +def _handle_alphafold_files( + session: requests.Session, + files_urls: dict[str, Any], + uniprot: str, + seq: str, + metadata_df: pd.DataFrame, + acc: str, + persist_upload: bool = False, +) -> tuple[ + dict[str, str], + Path, + pd.DataFrame, + pd.DataFrame, + pd.DataFrame, + pd.DataFrame, +]: + """ + Either the files are persistently saved on disk and loaded into dataframes or only loaded into dataframes to be used only for the current run. 
+ """ + cif_df = None + pae_df = None + plddt_df = None + sequence_df = None + + meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" + target_dir = meta_dir / (acc or uniprot) + downloaded: dict[str, str] = {} + + temp_dir = None + work_dir = target_dir + + if persist_upload: + target_dir.mkdir(parents=True, exist_ok=True) + else: + temp_dir = Path(tempfile.mkdtemp()) + work_dir = temp_dir + + try: + if persist_upload and metadata_df is not None: + meta_dir.mkdir(parents=True, exist_ok=True) + metadata_csv = meta_dir / "alphafold_metadata.csv" + try: + if metadata_csv.exists(): + existing = pd.read_csv(metadata_csv, dtype=str) + if acc and "uniprotAccession" in existing.columns: + existing = existing[existing["uniprotAccession"] != acc] + combined = pd.concat([existing, metadata_df], ignore_index=True) + combined.to_csv(metadata_csv, index=False) + else: + metadata_df.to_csv(metadata_csv, index=False) + logger.info("Wrote AlphaFold metadata to %s", metadata_csv) + except Exception: + logger.exception( + "Failed to write AlphaFold metadata CSV to %s", metadata_csv + ) + + for key in ("cifUrl", "paeDocUrl", "plddtDocUrl"): + urlval = files_urls.get(key) + if isinstance(urlval, str) and urlval: + fname = urlval.split("?")[0].rstrip("/").split("/")[-1] + dest = work_dir / fname + saved = _download_file(session, urlval, dest) + if saved: + downloaded[key] = str(saved) + try: + if key == "cifUrl": + cif_df = cif_to_dataframe(saved) + elif key == "paeDocUrl": + pae_df = pd.read_json(saved) + elif key == "plddtDocUrl": + plddt_df = pd.read_json(saved) + except Exception: + logger.exception("Failed to load %s into dataframe", key) + + sequence = to_fasta(seq=seq, header=uniprot) + fasta_dest = work_dir / f"{uniprot.upper()}.fasta" + try: + fasta_dest.parent.mkdir(parents=True, exist_ok=True) + with open(fasta_dest, "w") as f: + f.write(sequence) + logger.info("Wrote FASTA sequence to %s", fasta_dest) + sequence_df = fasta_to_dataframe(str(fasta_dest)) + except OSError: + 
logger.exception("Failed to write FASTA file %s", fasta_dest) + except Exception: + logger.exception("Failed to create sequence dataframe") + + finally: + if temp_dir is not None: + shutil.rmtree(temp_dir, ignore_errors=True) + + return cif_df, pae_df, plddt_df, sequence_df + + def fetch_alphafold_protein_structure(uniprot: str, persistUploads: bool,) -> dict[str, Any]: url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot}" @@ -78,51 +224,16 @@ def fetch_alphafold_protein_structure(uniprot: str, persistUploads: bool,) -> di if isinstance(r.get(key), str) and r.get(key): files_urls[key] = r[key] - alphafold_df = pd.DataFrame([data]) + metadata_df = pd.DataFrame([data]) acc = data.get("uniprotAccession") - if persistUploads: - # prefer reading the existing AlphaFold metadata CSV into the dataframe - meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" - meta_dir.mkdir(parents=True, exist_ok=True) - metadata_csv = meta_dir / "alphafold_metadata.csv" - try: - if metadata_csv.exists(): - existing = pd.read_csv(metadata_csv, dtype=str) - if acc and "uniprotAccession" in existing.columns: - existing = existing[existing["uniprotAccession"] != acc] - combined = pd.concat([existing, alphafold_df], ignore_index=True) - combined.to_csv(metadata_csv, index=False) - else: - alphafold_df.to_csv(metadata_csv, index=False) - logger.info("Wrote AlphaFold metadata to %s", metadata_csv) - except Exception: - logger.exception( - "Failed to write AlphaFold metadata CSV to %s", metadata_csv - ) - downloaded: dict[str, str] = {} - - target_dir = meta_dir / (acc or uniprot) - - for key in ("cifUrl", "pdbUrl", "paeDocUrl", "plddtDocUrl"): - urlval = files_urls.get(key) - if isinstance(urlval, str) and urlval: - fname = urlval.split("?")[0].rstrip("/").split("/")[-1] - dest = target_dir / fname - saved = _download_file(session, urlval, dest) - if saved: - downloaded[key] = str(saved) - - sequence = to_fasta(seq=seq_tmp, header=uniprot) - dest = target_dir / f"{uniprot.upper()}.fasta" - - 
with open(dest, "w") as f: - f.write(sequence) - else: - pass + cif_df, pae_df, plddt_df, sequence_df = _handle_alphafold_files(session=session, files_urls=files_urls, uniprot=uniprot, seq=seq_tmp, metadata_df=metadata_df, acc=acc, persist_upload=persistUploads) + return { - "alphafold_df": alphafold_df, - "metadata_csv": str(metadata_csv), - "downloaded_files": downloaded, + "metadata_df": metadata_df, + "cif_df": cif_df, + "pae_df": pae_df, + "plddt_df": plddt_df, + "sequence_df": sequence_df, } diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index da33bfabc..9d0570941 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -413,7 +413,11 @@ class AlphaFoldPredictionLoad(ImportingStep): method_description = "Loads the predicted structure of the protein with the given protein ID out of the AlphaFold DB." output_keys = [ - "alphafold_df", + "metadata_df", + "cif_df", + "pae_df", + "plddt_df", + "sequence_df", ] plot_method = None diff --git a/requirements.txt b/requirements.txt index dc2f81796..87d0e1cf7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,6 +24,7 @@ plotly==6.4.0 pre-commit==4.4.0 git+https://git@github.com/hendraet/ptm-visualization.git@main#egg=protein_sequencing psutil==7.1.3 +PyCifRW==5.0.1 pydeseq2==0.5.3 pytest==9.0.1 pytest-cov==7.0.0 From bb0873b84fff8d6688c15335f1747464454de8a6 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Thu, 15 Jan 2026 11:47:50 +0100 Subject: [PATCH 014/240] solve bugs so crosslinking data import with UniProt lookup roughly works now --- .../importing/cross_linking_import.py | 84 ++++++++++++++----- 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index 021538738..cb931bc88 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ 
b/backend/protzilla/importing/cross_linking_import.py @@ -8,6 +8,7 @@ import pandas as pd import traceback import requests +import re from backend.protzilla.utilities import format_trace from backend.protzilla.importing.import_utils import ( @@ -42,6 +43,7 @@ def aggregate_data(df: pd.DataFrame, column: str) -> set: return set( df[[column + "1", column + "2"]] .stack() + .dropna() .astype(str) .str.strip() ) @@ -66,10 +68,31 @@ def get_gene_name_from_protein_ids(protein_ids: set): if not protein_ids: return results + # Regex for valid accession input directly from UniProt + # A batch request containing an id that doesn't match this regex, + # leads to an http 400 for the whole request. + valid_id_pattern = re.compile( + r"^(?:" + r"[OPQ][0-9][A-Z0-9]{3}[0-9]" + r"|" + r"[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}" + r")$" + ) + + valid_ids = set() + for pid in protein_ids: + if valid_id_pattern.match(pid): + valid_ids.add(pid) + else: + results[pid] = (False, None, "NOT_A_VALID_PROTEIN_ID") + + if not valid_ids: + return results + url = f"https://rest.uniprot.org/uniprotkb/search" params = { - "query": " OR ".join(f"accession:{pid}" for pid in protein_ids), - "fields": "accesssion,gene_names", + "query": " OR ".join(f"accession:{pid}" for pid in valid_ids), + "fields": "accession,gene_primary", "format": "json" } @@ -85,6 +108,8 @@ def get_gene_name_from_protein_ids(protein_ids: set): #gene_name = data.get("genes", [{}])[0].get("geneName", {}).get("value") + # If there is more than one protein id for a gene name, + # we only store the last one that was found. 
if gene_name: results[protein_id] = (True, gene_name, None) #return True, gene_name, None @@ -92,27 +117,27 @@ def get_gene_name_from_protein_ids(protein_ids: set): results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") #return False, None, "NO_GENE_NAME_FOUND" - for pid in protein_ids: + for pid in valid_ids: if pid not in results: results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") except requests.exceptions.Timeout: - for pid in protein_ids: + for pid in valid_ids: results[pid] = (False, None, "TIMEOUT") #return False, None, "TIMEOUT" except requests.exceptions.HTTPError as e: - for pid in protein_ids: + for pid in valid_ids: results[pid] = (False, None, f"HTTP_{e.response.status_code}") #return False, None, f"HTTP_{e.response.status_code}" except requests.exceptions.RequestException: - for pid in protein_ids: + for pid in valid_ids: results[pid] = (False, None, "REQUEST_ERROR") #return False, None, "REQUEST_ERROR" except ValueError: - for pid in protein_ids: + for pid in valid_ids: results[pid] = (False, None, "INVALID_JSON") #return False, None, "INVALID_JSON" @@ -141,13 +166,24 @@ def get_protein_ids_from_gene_name(gene_names: set): if not gene_names: return results - query = " OR ".join(f"gene:{g}" for g in gene_names) + # Filter decoy Proteins, because we cannot process them decently? 
+ valid_gene_names = set() + for name in gene_names: + if name.startswith("decoy:"): + results[name] = (False, None, "IS_DECOY_PROTEIN") + else: + valid_gene_names.add(name) + + if not valid_gene_names: + return results + + query = " OR ".join(f"gene:{g}" for g in valid_gene_names) url = "https://rest.uniprot.org/uniprotkb/search" params = { "query": f"({query}) AND organism_id:9606 AND reviewed:true", "format": "tsv", - "fields": "accession,genes", + "fields": "accession,gene_primary", "includeIsoform": "true", } try: @@ -162,7 +198,7 @@ def get_protein_ids_from_gene_name(gene_names: set): lines = response.text.strip().split("\n") header = lines[0].split("\t") protein_id_idx = header.index("Entry") - gene_name_idx = header.index("Gene Names") + gene_name_idx = header.index("Gene Names (primary)") for line in lines[1:]: parts = line.split("\t") @@ -170,13 +206,13 @@ def get_protein_ids_from_gene_name(gene_names: set): output_gene_names = parts[gene_name_idx].split() for g in output_gene_names: - if g in gene_names: + if g in valid_gene_names: if "-" in protein_id: output[g]["list_of_protein_isoforms"].append(protein_id) else: output[g]["protein_ids"].append(protein_id) - for gn in gene_names: + for gn in valid_gene_names: data = output.get(gn) if not data or not data["protein_ids"]: @@ -192,19 +228,19 @@ def get_protein_ids_from_gene_name(gene_names: set): return results except requests.exceptions.Timeout: - for g in gene_names: + for g in valid_gene_names: results[g] = (False, None, "TIMEOUT") return results #return False, None, "TIMEOUT" except requests.exceptions.HTTPError as e: - for g in gene_names: + for g in valid_gene_names: results[g] = (False, None, f"HTTP_{e.response.status_code}") return results #return False, None, f"HTTP_{e.response.status_code}" except requests.exceptions.RequestException: - for g in gene_names: + for g in valid_gene_names: results[g] = (False, None, "REQUEST_ERROR") return results #return False, None, "REQUEST_ERROR" @@ -212,7 
+248,8 @@ def get_protein_ids_from_gene_name(gene_names: set): def iterate_for_protein_designation( df, - protein_designation, + existing_designation, + new_designation, uniprot_lookup_results, value_extractor=lambda x: x ): @@ -240,10 +277,10 @@ def iterate_for_protein_designation( row_dict = row.to_dict() success1, data1, error1 = uniprot_lookup_results.get( - row[protein_designation + "1"], (False, None, "NOT_LOOKED_UP") + row[existing_designation + "1"], (False, None, "NOT_LOOKED_UP") ) success2, data2, error2 = uniprot_lookup_results.get( - row[protein_designation + "2"], (False, None, "NOT_LOOKED_UP") + row[existing_designation + "2"], (False, None, "NOT_LOOKED_UP") ) errors_occurred = {} @@ -257,10 +294,13 @@ def iterate_for_protein_designation( failed_row.update(errors_occurred) failed_rows.append(failed_row) else: - row_dict[protein_designation + "1"] = value_extractor(data1) - row_dict[protein_designation + "2"] = value_extractor(data2) + row_dict[new_designation + "1"] = value_extractor(data1) + row_dict[new_designation + "2"] = value_extractor(data2) good_rows.append(row_dict) + #print(df.columns) + + good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) failed_df = pd.DataFrame(failed_rows) @@ -300,6 +340,7 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: good_df, failed_df = iterate_for_protein_designation( df, "Protein_id", + "Protein", uniprot_lookup_results, value_extractor=lambda x: x ) @@ -310,7 +351,7 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: def read_csm_file(file_path: Path) -> pd.DataFrame: """ Returns two DataFrames: - - normalized_df: only rows with successful UniProt lookups + - good_df: only rows with successful UniProt lookups - failed_df: rows where UniProt lookup failed, including error messages """ df = pd.read_csv(file_path, low_memory=False).rename( @@ -327,6 +368,7 @@ def read_csm_file(file_path: Path) -> pd.DataFrame: good_df, failed_df = 
iterate_for_protein_designation( df, "Protein", + "Protein_id", uniprot_lookup_results, value_extractor=lambda x: x["protein_ids"][0] if x else None ) From 76a21902ecf648c10e0905ad8d996c19e8e33631 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Thu, 15 Jan 2026 14:09:04 +0100 Subject: [PATCH 015/240] refactor uniprot lookup as part of crosslinking import --- .../importing/cross_linking_import.py | 358 +++++++++--------- 1 file changed, 183 insertions(+), 175 deletions(-) diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index cb931bc88..3afd1f280 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -17,17 +17,6 @@ rename_columns_proteomediscoverer_xlinkx_format, ) -""" -def get_protein_designation(designation_lookup_cache, protein_designation, uniprot_lookup_func): - if designation_lookup_cache[protein_designation]: - success, new_protein_designation, error = True, designation_lookup_cache[protein_designation], None - else: - success, new_protein_designation, error = uniprot_lookup_func(protein_designation) - if success: - designation_lookup_cache[protein_designation] = new_protein_designation - return success, new_protein_designation, error -""" - def aggregate_data(df: pd.DataFrame, column: str) -> set: """ @@ -48,6 +37,79 @@ def aggregate_data(df: pd.DataFrame, column: str) -> set: .str.strip() ) +def validate_data_before_lookup(data_for_lookup:set, is_valid_function, error_code: str): + """ + Splits input values into valid and invalid ones. + + Invalid values are directly written to results with the given error code. 
+ + Returns: + valid_data (set) + results (dict): value -> (False, None, error_code) + """ + valid_data = set() + results = {} + + if not data_for_lookup: + return valid_data, results + + for data in data_for_lookup: + if is_valid_function(data): + valid_data.add(data) + else: + results[data] = (False, None, error_code) + + return valid_data, results + + +def build_uniprot_search_params( + data_for_lookup:set, + field_of_existing_data: str, + *, + extra_query: str | None = None, + response_format: str, + fields: str, + include_isoforms: bool = False, +): + """ + Builds UniProt search URL and params. + """ + uniprot_search_url = "https://rest.uniprot.org/uniprotkb/search" + + base_query = " OR ".join(f"{field_of_existing_data}:{data}" for data in data_for_lookup) + + if extra_query: + base_query = f"({base_query}) AND {extra_query}" + + params = { + "query": base_query, + "format": response_format, + "fields": fields, + } + + if include_isoforms: + params["includeIsoform"] = "true" + + return uniprot_search_url, params + + +def execute_uniprot_request(url, params, valid_data, results): + try: + response = requests.get(url, params=params, timeout=15) + response.raise_for_status() + return response + + except requests.exceptions.Timeout: + error = "TIMEOUT" + except requests.exceptions.HTTPError as e: + error = f"HTTP_{e.response.status_code}" + except requests.exceptions.RequestException: + error = "REQUEST_ERROR" + + for data in valid_data: + results[data] = (False, None, error) + return None + def get_gene_name_from_protein_ids(protein_ids: set): """ @@ -62,15 +124,9 @@ def get_gene_name_from_protein_ids(protein_ids: set): gene_name (str or None): Official gene name if successful, else None error (str or None): Error code/message if failed, else None """ - #return "placeholder" - - results = {} - if not protein_ids: - return results - # Regex for valid accession input directly from UniProt # A batch request containing an id that doesn't match this regex, - # leads 
to an http 400 for the whole request. + # leads to an http 400 for the whole request. valid_id_pattern = re.compile( r"^(?:" r"[OPQ][0-9][A-Z0-9]{3}[0-9]" @@ -79,69 +135,45 @@ def get_gene_name_from_protein_ids(protein_ids: set): r")$" ) - valid_ids = set() - for pid in protein_ids: - if valid_id_pattern.match(pid): - valid_ids.add(pid) - else: - results[pid] = (False, None, "NOT_A_VALID_PROTEIN_ID") + valid_ids, results = validate_data_before_lookup( + protein_ids, + is_valid_function=lambda pid: bool(valid_id_pattern.match(pid)), + error_code="NOT_A_VALID_PROTEIN_ID" + ) if not valid_ids: return results - url = f"https://rest.uniprot.org/uniprotkb/search" - params = { - "query": " OR ".join(f"accession:{pid}" for pid in valid_ids), - "fields": "accession,gene_primary", - "format": "json" - } + url, params = build_uniprot_search_params( + valid_ids, + field_of_existing_data="accession", + response_format="json", + fields="accession,gene_primary" + ) - try: - response = requests.get(url, params=params, timeout=15) - response.raise_for_status() - data = response.json() - - for entry in data.get("results", []): - protein_id = entry.get("primaryAccession") - output = entry.get("genes", [{}]) - gene_name = output[0].get("geneName", {}).get("value") if output else None - - #gene_name = data.get("genes", [{}])[0].get("geneName", {}).get("value") - - # If there is more than one protein id for a gene name, - # we only store the last one that was found. 
- if gene_name: - results[protein_id] = (True, gene_name, None) - #return True, gene_name, None - else: - results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") - #return False, None, "NO_GENE_NAME_FOUND" - - for pid in valid_ids: - if pid not in results: - results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") - - except requests.exceptions.Timeout: - for pid in valid_ids: - results[pid] = (False, None, "TIMEOUT") - #return False, None, "TIMEOUT" - - except requests.exceptions.HTTPError as e: - for pid in valid_ids: - results[pid] = (False, None, f"HTTP_{e.response.status_code}") - #return False, None, f"HTTP_{e.response.status_code}" - - except requests.exceptions.RequestException: - for pid in valid_ids: - results[pid] = (False, None, "REQUEST_ERROR") - #return False, None, "REQUEST_ERROR" - - except ValueError: - for pid in valid_ids: - results[pid] = (False, None, "INVALID_JSON") - #return False, None, "INVALID_JSON" + response = execute_uniprot_request(url, params, valid_ids, results) + if response is None: + return results - return results + data = response.json() + + for entry in data.get("results", []): + protein_id = entry.get("primaryAccession") + output = entry.get("genes", [{}]) + gene_name = output[0].get("geneName", {}).get("value") if output else None + + # If there is more than one protein id for a gene name, + # we only store the last one that was found. 
+ if gene_name: + results[protein_id] = (True, gene_name, None) + else: + results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") + + for pid in valid_ids: + if pid not in results: + results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") + + return results def get_protein_ids_from_gene_name(gene_names: set): @@ -160,90 +192,60 @@ def get_protein_ids_from_gene_name(gene_names: set): } if success else None error (str or None): error code/message if failed, else None """ - #return "placeholder" - - results = {} - if not gene_names: - return results - # Filter decoy Proteins, because we cannot process them decently? - valid_gene_names = set() - for name in gene_names: - if name.startswith("decoy:"): - results[name] = (False, None, "IS_DECOY_PROTEIN") - else: - valid_gene_names.add(name) + valid_gene_names, results = validate_data_before_lookup( + gene_names, + is_valid_function=lambda name: not name.startswith("decoy:"), + error_code="IS_DECOY_PROTEIN" + ) if not valid_gene_names: return results - query = " OR ".join(f"gene:{g}" for g in valid_gene_names) + url, params = build_uniprot_search_params( + valid_gene_names, + field_of_existing_data="gene", + extra_query="organism_id:9606 AND reviewed:true", + response_format="tsv", + fields="accession,gene_primary", + include_isoforms=True, + ) + + response = execute_uniprot_request(url, params, valid_gene_names, results) + if response is None: + return results - url = "https://rest.uniprot.org/uniprotkb/search" - params = { - "query": f"({query}) AND organism_id:9606 AND reviewed:true", - "format": "tsv", - "fields": "accession,gene_primary", - "includeIsoform": "true", - } - try: - response = requests.get(url, params=params, timeout=15) - response.raise_for_status() + output = defaultdict(lambda: { + "protein_ids": [], + "list_of_protein_isoforms": [] + }) + + lines = response.text.strip().split("\n") + header = lines[0].split("\t") + protein_id_idx = header.index("Entry") + gene_name_idx = header.index("Gene Names 
(primary)") + + for line in lines[1:]: + parts = line.split("\t") + protein_id = parts[protein_id_idx] + output_gene_names = parts[gene_name_idx].split() + + for g in output_gene_names: + if g in valid_gene_names: + if "-" in protein_id: + output[g]["list_of_protein_isoforms"].append(protein_id) + else: + output[g]["protein_ids"].append(protein_id) + + for gn in valid_gene_names: + data = output.get(gn) + + if not data or not data["protein_ids"]: + results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") + else: + results[gn] = (True, data, None) - output = defaultdict(lambda: { - "protein_ids": [], - "list_of_protein_isoforms": [] - }) - - lines = response.text.strip().split("\n") - header = lines[0].split("\t") - protein_id_idx = header.index("Entry") - gene_name_idx = header.index("Gene Names (primary)") - - for line in lines[1:]: - parts = line.split("\t") - protein_id = parts[protein_id_idx] - output_gene_names = parts[gene_name_idx].split() - - for g in output_gene_names: - if g in valid_gene_names: - if "-" in protein_id: - output[g]["list_of_protein_isoforms"].append(protein_id) - else: - output[g]["protein_ids"].append(protein_id) - - for gn in valid_gene_names: - data = output.get(gn) - - if not data or not data["protein_ids"]: - results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") - #return False, None, "NO_PROTEIN_ID_FOUND" - else: - results[gn] = (True, data, None) - #return True, { - # "protein_ids": protein_ids, - # "list_of_protein_isoforms": list_of_protein_isoforms - #}, None - - return results - - except requests.exceptions.Timeout: - for g in valid_gene_names: - results[g] = (False, None, "TIMEOUT") - return results - #return False, None, "TIMEOUT" - - except requests.exceptions.HTTPError as e: - for g in valid_gene_names: - results[g] = (False, None, f"HTTP_{e.response.status_code}") - return results - #return False, None, f"HTTP_{e.response.status_code}" - - except requests.exceptions.RequestException: - for g in valid_gene_names: - results[g] = 
(False, None, "REQUEST_ERROR") - return results - #return False, None, "REQUEST_ERROR" + return results def iterate_for_protein_designation( @@ -298,14 +300,29 @@ def iterate_for_protein_designation( row_dict[new_designation + "2"] = value_extractor(data2) good_rows.append(row_dict) - #print(df.columns) - - good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) failed_df = pd.DataFrame(failed_rows) return good_df, failed_df +def get_missing_protein_designation( + df: pd.DataFrame, + existing_column: str, + missing_column: str, + uniprot_lookup_function, + value_extractor=lambda x: x +): + unique_existing_designations = aggregate_data(df, existing_column) + uniprot_lookup_results = uniprot_lookup_function(unique_existing_designations) + good_df, failed_df = iterate_for_protein_designation( + df, + existing_column, + missing_column, + uniprot_lookup_results, + value_extractor + ) + return good_df, failed_df + def remove_brackets_from_peptide(peptide: str) -> str: return peptide.replace("[", "").replace("]", "") @@ -334,14 +351,11 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Is_intra_crosslink"].eq("Intra") - #unique_protein_ids = set(df[["Protein_id1", "Protein_id2"]].stack().astype(str).str.strip()) - unique_protein_ids = aggregate_data(df, "Protein_id") - uniprot_lookup_results = get_gene_name_from_protein_ids(unique_protein_ids) - good_df, failed_df = iterate_for_protein_designation( - df, - "Protein_id", - "Protein", - uniprot_lookup_results, + good_df, failed_df = get_missing_protein_designation( + df=df, + existing_column="Protein_id", + missing_column="Protein", + uniprot_lookup_function=get_gene_name_from_protein_ids, value_extractor=lambda x: x ) @@ -360,17 +374,15 @@ def read_csm_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) - unique_gene_names = aggregate_data(df, "Protein") - uniprot_lookup_results = 
get_protein_ids_from_gene_name(unique_gene_names) # In our UniProt lookup we already get all isoforms of the respective gene name. # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. - good_df, failed_df = iterate_for_protein_designation( - df, - "Protein", - "Protein_id", - uniprot_lookup_results, - value_extractor=lambda x: x["protein_ids"][0] if x else None + good_df, failed_df = get_missing_protein_designation( + df=df, + existing_column="Protein", + missing_column="Protein_id", + uniprot_lookup_function=get_protein_ids_from_gene_name, + value_extractor=lambda x: x["protein_ids"][0] if x else None ) return good_df, failed_df @@ -395,10 +407,8 @@ def cross_linking_import(file_path: Path) -> dict: try: if file_path.suffix == ".csv": good_df, failed_df = read_csm_file(file_path) - #df = read_csm_file(file_path) elif file_path.suffix == ".xlsx": good_df, failed_df = read_ProteomeDiscoverer_XlinkX_file(file_path) - #df = read_ProteomeDiscoverer_XlinkX_file(file_path) else: raise ValueError(f"Unsupported file type: {file_path.suffix}") except Exception as e: @@ -425,6 +435,4 @@ def cross_linking_import(file_path: Path) -> dict: return dict( crosslinking_df=good_df, messages=messages - #crosslinking_df=df, - #messages=[dict(level=logging.INFO, msg=msg)], ) From 7abf0b4d4116c8e65836647a7026bd7b6e6eeae3 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Fri, 16 Jan 2026 14:27:37 +0100 Subject: [PATCH 016/240] feature: add forms for entering lengths of cross linkers and accepted deviation --- backend/protzilla/all_steps.py | 1 + .../data_analysis/cross_linking_validation.py | 15 ++++++++ backend/protzilla/form.py | 5 +++ backend/protzilla/methods/data_analysis.py | 38 +++++++++++++++++++ 4 files changed, 59 insertions(+) create mode 100644 backend/protzilla/data_analysis/cross_linking_validation.py 
diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index ca55d5945..4bdca092c 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -66,6 +66,7 @@ data_analysis.PTMOverviewVisualization, data_analysis.PTMBarVisualization, data_analysis.PTMDetailsVisualization, + data_analysis.CrossLinkingValidationWithAngstromDeviation, data_preprocessing.ImputationByMinPerSample, data_integration.EnrichmentAnalysisGOAnalysisWithString, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr, diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py new file mode 100644 index 000000000..24c58d627 --- /dev/null +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -0,0 +1,15 @@ +""" +This module contains the code to parse a file containing cross linking data. +""" + +import logging + +from pathlib import Path + +import traceback + +from backend.protzilla.utilities import format_trace + + +def validate_cross_linking_with_angstrom_deviation(): + pass diff --git a/backend/protzilla/form.py b/backend/protzilla/form.py index 8bf480c29..ed5099896 100644 --- a/backend/protzilla/form.py +++ b/backend/protzilla/form.py @@ -184,6 +184,7 @@ class HeaderInfoField: MultiSelectField, DropdownField, FileInput, + FloatField, ] StructuralField = Union[FormDivider, InfoField, HeaderInfoField] @@ -219,6 +220,10 @@ def update_values(self, values: Dict[str, Any]) -> None: def apply_modification(self, run: Run) -> None: self.modify_form(run) + def add_field(self, new_field: InputField): + self.input_fields.append(new_field) + self._field_map[new_field.name] = new_field + def __getitem__(self, fieldname: str) -> InputField: "to do form[fieldname] to get the field object" diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 75b9e1bd6..990ff4b69 100644 --- a/backend/protzilla/methods/data_analysis.py +++ 
b/backend/protzilla/methods/data_analysis.py @@ -60,6 +60,10 @@ from protzilla.data_analysis.ptm_visualization.ptm_overview_plot import ( get_detected_modifications, ) +from protzilla.data_analysis.cross_linking_validation import ( + validate_cross_linking_with_angstrom_deviation, +) +from backend.protzilla.run import Run class TTestType(Enum): @@ -2453,3 +2457,37 @@ def create_form(self): label="PTM Details Visualization", input_fields=_PTMVisualizationWithGroups.get_form_fields(), ) + + +class CrossLinkingValidationWithAngstromDeviation(DataAnalysisStep): + display_name = "Ångström Deviation" + operation = "Cross Linking Validation" + method_description = "Validates cross links based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. (in Ångström)" + + def create_form(self): + return Form( + label="Ångström Deviation", + input_fields=[ + FloatField( + name="accepted_deviation", + label="Accepted deviation in Ångström", + min=0, + value=0.20, + ), + ], + ) + + def modify_form(self, form: Form, run: Run) -> None: + cross_linker = ["cross_linker1", "cross_linker2"] + for cl in cross_linker: + field_name = f"length_of_{cl}" + if field_name not in form: + field = FloatField( + name=field_name, + label=f"Length of {cl} in Ångström", + min=0, + value=1.0, + ) + form.add_field(field) + + calc_method = staticmethod(validate_cross_linking_with_angstrom_deviation) From 00c1a371e22baae489ec1e2ff73abc784b4872e2 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Fri, 16 Jan 2026 15:22:52 +0100 Subject: [PATCH 017/240] feature: fields for crosslinker lengths are now based on the used crosslinkers in the uploaded data --- backend/protzilla/form_helper.py | 10 ++++++++++ backend/protzilla/methods/data_analysis.py | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/form_helper.py b/backend/protzilla/form_helper.py index 60655d53f..fe9817533 100644 
--- a/backend/protzilla/form_helper.py +++ b/backend/protzilla/form_helper.py @@ -53,3 +53,13 @@ def get_choices_for_metadata_non_sample_columns( ) -> list[Option]: metadata_choices = get_choices_for_metadata(run, instance_identifier) return [c for c in metadata_choices if c.label != "Sample"] + + +def get_crosslinker_names_from_crosslinker_df(run: Run) -> list[str]: + df = run.steps.get_step_output( + Step, output_key="crosslinking_df" + ) # first step that returns a crosslinking_df + if df is None or "Crosslinker" not in df.columns: + return [] + crosslinkers = df["Crosslinker"].dropna().unique() + return crosslinkers diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 990ff4b69..d70e33f95 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -64,6 +64,7 @@ validate_cross_linking_with_angstrom_deviation, ) from backend.protzilla.run import Run +from backend.protzilla.form_helper import get_crosslinker_names_from_crosslinker_df class TTestType(Enum): @@ -2478,7 +2479,7 @@ def create_form(self): ) def modify_form(self, form: Form, run: Run) -> None: - cross_linker = ["cross_linker1", "cross_linker2"] + cross_linker = get_crosslinker_names_from_crosslinker_df(run) for cl in cross_linker: field_name = f"length_of_{cl}" if field_name not in form: @@ -2491,3 +2492,12 @@ def modify_form(self, form: Form, run: Run) -> None: form.add_field(field) calc_method = staticmethod(validate_cross_linking_with_angstrom_deviation) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["crosslinking_df"] = steps.get_step_output( + Step, + "crosslinking_df", + ) + if inputs.get("crosslinking_df") is None: + raise ValueError("No cross linking data found.") + return inputs From e6f309f870fffc5cca7c0c2cc47e1144e2eab471 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Fri, 16 Jan 2026 22:56:39 +0100 Subject: 
[PATCH 018/240] fix too much filtering of not processable rows --- .../importing/cross_linking_import.py | 131 +++++++++++++++++- 1 file changed, 126 insertions(+), 5 deletions(-) diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index 3afd1f280..cf11fee8d 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -111,6 +111,82 @@ def execute_uniprot_request(url, params, valid_data, results): return None +def fallback_single_lookup(query: str, query_type: str): + try: + if query_type == "genes": + url = f"https://rest.uniprot.org/uniprotkb/{query}" + params = { + "fields": "gene_primary", + "format": "json" + } + elif query_type == "results": + url = "https://rest.uniprot.org/uniprotkb/search" + params = { + "query": f"gene_exact:{query}", + "format": "json", + "fields": "accession,gene_primary", + "size": 500 + } + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + output = data.get(query_type, []) + return output if output else None + + except requests.exceptions.RequestException: + return None + except (KeyError, TypeError): + return None + + +""" +def fallback(protein_id: str): + url = f"https://rest.uniprot.org/uniprotkb/{protein_id}" + params = { + "fields": "gene_primary", + "format": "json" + } + + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + + data = response.json() + + genes = data.get("genes", []) + if not genes: + return None + + return genes[0].get("geneName", {}).get("value") +""" +""" +def fallback_gene(gene_name): + url = "https://rest.uniprot.org/uniprotkb/search" + params = { + "query": f"gene_exact:{gene_name}", + "format": "json", + "fields": "accession,gene_primary" + } + response = requests.get(url, params=params) + response.raise_for_status() + data = response.json() + ids = data.get("results", []) + if 
not ids: + return None + + inner_dict = { + "protein_ids": [], + "list_of_protein_isoforms": [] + } + + for entry in ids: + accession = entry.get("primaryAccession") + if accession: + inner_dict["protein_ids"].append(accession) + + return inner_dict if inner_dict["protein_ids"] else None +""" + + def get_gene_name_from_protein_ids(protein_ids: set): """ Retrieves the gene names for a given set of Protein IDs in a batch from UniProt. @@ -171,7 +247,13 @@ def get_gene_name_from_protein_ids(protein_ids: set): for pid in valid_ids: if pid not in results: - results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") + output = fallback_single_lookup(pid, "genes") + gene_name = output[0].get("geneName", {}).get("value") + #gene_name = fallback(pid) + if gene_name: + results[pid] = (True, gene_name, None) + else: + results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") return results @@ -195,7 +277,7 @@ def get_protein_ids_from_gene_name(gene_names: set): # Filter decoy Proteins, because we cannot process them decently? 
valid_gene_names, results = validate_data_before_lookup( gene_names, - is_valid_function=lambda name: not name.startswith("decoy:"), + is_valid_function=lambda name: not name.startswith("DECOY:"), error_code="IS_DECOY_PROTEIN" ) @@ -204,7 +286,7 @@ def get_protein_ids_from_gene_name(gene_names: set): url, params = build_uniprot_search_params( valid_gene_names, - field_of_existing_data="gene", + field_of_existing_data="gene_exact", extra_query="organism_id:9606 AND reviewed:true", response_format="tsv", fields="accession,gene_primary", @@ -241,7 +323,24 @@ def get_protein_ids_from_gene_name(gene_names: set): data = output.get(gn) if not data or not data["protein_ids"]: - results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") + output = fallback_single_lookup(gn, "results") + inner_dict = { + "protein_ids": [], + "list_of_protein_isoforms": [] + } + if output and isinstance(output, list): + for entry in output: + if not isinstance(entry, dict): + continue + pid = entry.get("primaryAccession") + if pid: + inner_dict["protein_ids"].append(pid) + protein_id = inner_dict if inner_dict["protein_ids"] else None + #protein_id = fallback_gene(gn) + if protein_id: + results[gn] = (True, protein_id, None) + else: + results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") else: results[gn] = (True, data, None) @@ -324,6 +423,15 @@ def get_missing_protein_designation( return good_df, failed_df +def normalize_gene_name_column(df, columns: list[str]): + for col in columns: + df[col] = df[col].astype("string").str.upper() + return df + +def remove_isoform_from_protein_id(protein_id: str) -> str: + return protein_id.split('-', 1)[0] + + def remove_brackets_from_peptide(peptide: str) -> str: return peptide.replace("[", "").replace("]", "") @@ -351,6 +459,13 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Is_intra_crosslink"].eq("Intra") + # Right now we remove the isoform ending from every protein_id (if necessary), + # because we 
cannot process isoforms properly. + # If we ever wanted to add an "Isoforms" column, we need to store the original value from + # the "Protein_id1/2" column in the "Isoforms" column first, before removing the isoform ending. + df["Protein_id1"] = df["Protein_id1"].apply(remove_isoform_from_protein_id).astype("string") + df["Protein_id2"] = df["Protein_id2"].apply(remove_isoform_from_protein_id).astype("string") + good_df, failed_df = get_missing_protein_designation( df=df, existing_column="Protein_id", @@ -374,6 +489,8 @@ def read_csm_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) + df = normalize_gene_name_column(df, ["Protein1", "Protein2"]) + # In our UniProt lookup we already get all isoforms of the respective gene name. # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. 
@@ -431,8 +548,12 @@ def cross_linking_import(file_path: Path) -> dict: dict(level=logging.WARNING, msg=msg), dict(level=logging.WARNING, msg=f"Failed rows:\n{failed_df}") ] + pd.set_option("display.max_columns", None) + failed_df.to_csv("failed_rows.csv", index=False) + print("Failed rows saved to failed_rows.csv") + #print(f"Failed rows:\n{failed_df}") return dict( crosslinking_df=good_df, messages=messages - ) + ) \ No newline at end of file From 839694d3bbf6f5e7d8df027b03c99066bd0b2395 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 17 Jan 2026 13:15:45 +0100 Subject: [PATCH 019/240] fix: fix cif import and conversion to dataframe, polish up code and add docstrings where needed --- .../alphafold_protein_structure_load.py | 173 ++++++++++++++---- backend/protzilla/methods/importing.py | 2 +- requirements.txt | 2 +- 3 files changed, 136 insertions(+), 41 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 78cee956d..66cde9df1 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -1,21 +1,28 @@ from __future__ import annotations +import shutil +import tempfile from pathlib import Path +from textwrap import wrap from typing import Any +import gemmi import pandas as pd import requests -from textwrap import wrap -from CifFile import ReadCif -import shutil -import tempfile - from backend.protzilla.constants import paths from backend.protzilla.constants.protzilla_logging import logger def _download_file(session: requests.Session, url: str, dest: Path) -> Path | None: + """ + Download a file from a URL and save it to the specified destination path. 
+ + :param session: The requests session to use for the download + :param url: The URL of the file to download + :param dest: The destination path where the file should be saved + :return: The destination path if successful, None otherwise + """ try: dest.parent.mkdir(parents=True, exist_ok=True) with session.get(url, stream=True, timeout=60) as r: @@ -35,6 +42,15 @@ def _download_file(session: requests.Session, url: str, dest: Path) -> Path | No def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str: + """ + Convert a protein sequence to FASTA format. + + :param seq: The protein sequence to convert + :param header: The header line for the FASTA record (default: "protein_sequence") + :param width: The maximum line width for sequence wrapping (default: 60) + :return: The sequence in FASTA format + :raises ValueError: If the sequence contains invalid characters or whitespace + """ VALID_AA = set("ACDEFGHIKLMNPQRSTVWYBXZJUO*-") if not seq or any(c.isspace() for c in seq): raise ValueError("Sequence must be a single, whitespace-free string.") @@ -45,7 +61,13 @@ def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str return ">" + header + "\n" + "\n".join(wrap(seq, width)) + "\n" -def fasta_to_dataframe(fasta_path: str): +def fasta_to_dataframe(fasta_path: str) -> pd.DataFrame: + """ + Parse a FASTA file and convert it to a DataFrame. 
+ + :param fasta_path: The path to the FASTA file + :return: A DataFrame with columns 'id', 'sequence', and 'length' + """ records = [] seq_id = None seq = [] @@ -58,11 +80,13 @@ def fasta_to_dataframe(fasta_path: str): if line.startswith(">"): if seq_id is not None: sequence = "".join(seq) - records.append({ - "id": seq_id, - "sequence": sequence, - "length": len(sequence), - }) + records.append( + { + "id": seq_id, + "sequence": sequence, + "length": len(sequence), + } + ) seq_id = line[1:].split()[0] seq = [] else: @@ -70,28 +94,61 @@ def fasta_to_dataframe(fasta_path: str): if seq_id is not None: sequence = "".join(seq) - records.append({ - "id": seq_id, - "sequence": sequence, - "length": len(sequence), - }) + records.append( + { + "id": seq_id, + "sequence": sequence, + "length": len(sequence), + } + ) return pd.DataFrame(records) -def cif_to_dataframe(cif_path): - cif = ReadCif(str(cif_path)) - block = cif.first_block() +def read_alphafold_mmcif(path: str) -> pd.DataFrame: + """ + Parse an AlphaFold mmCIF (Macromolecular Crystallographic Information File) file. - df = pd.DataFrame({ - "label": block["_atom_site_label"], - "element": block["_atom_site_type_symbol"], - "x": block["_atom_site_fract_x"], - "y": block["_atom_site_fract_y"], - "z": block["_atom_site_fract_z"], - }) + :param path: The path to the mmCIF file + :return: A DataFrame containing the atom site information from the CIF file + :raises FileNotFoundError: If the file does not exist + :raises IsADirectoryError: If the path points to a directory instead of a file + :raises ValueError: If no CIF blocks are found in the file + """ + p = Path(path) + if not p.exists(): + raise FileNotFoundError(f"File not found: {p}") + if p.is_dir(): + raise IsADirectoryError(f"Expected a file path, got a directory: {p}") + + doc = gemmi.cif.read_file(str(p)) + if len(doc) == 0: + raise ValueError(f"No CIF blocks found in file: {p}") + + block = doc.sole_block() + + cat_name = "_atom_site." 
+ if cat_name not in block.get_mmcif_category_names(): + return pd.DataFrame() + + table = block.find_mmcif_category(cat_name) + if table is None: + return pd.DataFrame() + + columns = list(table.tags) + nrows = len(table) + data = {} + for j, col in enumerate(columns): + col_values = [] + for i in range(nrows): + row = table[i] + if j < len(row): + col_values.append(row[j]) + else: + col_values.append(None) + data[col] = col_values - return df + return pd.DataFrame(data) def _handle_alphafold_files( @@ -103,15 +160,26 @@ def _handle_alphafold_files( acc: str, persist_upload: bool = False, ) -> tuple[ - dict[str, str], - Path, - pd.DataFrame, - pd.DataFrame, - pd.DataFrame, - pd.DataFrame, + pd.DataFrame | None, + pd.DataFrame | None, + pd.DataFrame | None, + pd.DataFrame | None, ]: """ - Either the files are persistently saved on disk and loaded into dataframes or only loaded into dataframes to be used only for the current run. + Download AlphaFold structure files and convert them to DataFrames. + + Files can either be persistently saved to disk or only loaded into memory for the current run. + The function downloads CIF, PAE, and pLDDT files, converts them to DataFrames, and optionally + saves metadata to a CSV file. 
+ + :param session: The requests session to use for downloading files + :param files_urls: Dictionary containing URLs for CIF, PAE, and pLDDT files + :param uniprot: The UniProt ID of the protein + :param seq: The protein sequence + :param metadata_df: DataFrame containing AlphaFold metadata + :param acc: The accession number (used for directory naming) + :param persist_upload: If True, files are saved persistently; if False, only loaded into memory + :return: Tuple of (cif_df, pae_df, plddt_df, sequence_df) or None values for failed loads """ cif_df = None pae_df = None @@ -160,7 +228,14 @@ def _handle_alphafold_files( downloaded[key] = str(saved) try: if key == "cifUrl": - cif_df = cif_to_dataframe(saved) + try: + cif_df = read_alphafold_mmcif(str(saved)) + except Exception: + logger.exception( + "Failed to load CIF into dataframe. Path=%s", + str(saved), + ) + raise elif key == "paeDocUrl": pae_df = pd.read_json(saved) elif key == "plddtDocUrl": @@ -188,9 +263,22 @@ def _handle_alphafold_files( return cif_df, pae_df, plddt_df, sequence_df -def fetch_alphafold_protein_structure(uniprot: str, persistUploads: bool,) -> dict[str, Any]: - url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot}" +def fetch_alphafold_protein_structure( + uniprot: str, persist_uploads: bool +) -> dict[str, Any]: + """ + Fetch AlphaFold protein structure data from the AlphaFold Database API. + Retrieves metadata and structure files (CIF, PAE, pLDDT) from the AlphaFold Database + for the given UniProt ID. Optionally persists the downloaded files to disk. 
+ + :param uniprot: The UniProt ID of the protein + :param persist_uploads: If True, files are saved persistently; if False, only loaded into memory + :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data + :raises RuntimeError: If the API request fails or returns invalid data + :raises ValueError: If no predictions are found for the given UniProt ID + """ + url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot}" with requests.Session() as session: try: resp = session.get(url, timeout=30) @@ -227,8 +315,15 @@ def fetch_alphafold_protein_structure(uniprot: str, persistUploads: bool,) -> di metadata_df = pd.DataFrame([data]) acc = data.get("uniprotAccession") - cif_df, pae_df, plddt_df, sequence_df = _handle_alphafold_files(session=session, files_urls=files_urls, uniprot=uniprot, seq=seq_tmp, metadata_df=metadata_df, acc=acc, persist_upload=persistUploads) - + cif_df, pae_df, plddt_df, sequence_df = _handle_alphafold_files( + session=session, + files_urls=files_urls, + uniprot=uniprot, + seq=seq_tmp, + metadata_df=metadata_df, + acc=acc, + persist_upload=persist_uploads, + ) return { "metadata_df": metadata_df, diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 9d0570941..087a764e7 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -431,7 +431,7 @@ def create_form(self): label="Protein ID", ), CheckboxField( - name="persistUploads", + name="persist_uploads", label="Upload should be saved persistently across runs", value=True, ), diff --git a/requirements.txt b/requirements.txt index 87d0e1cf7..8caedb5b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ dash-bio==1.0.2 debugpy==1.8.18 Django==5.2.8 django-cors-headers==4.9.0 +gemmi==0.6.6 gseapy==1.1.10 isort==7.0.0 joblib==1.5.2 @@ -24,7 +25,6 @@ plotly==6.4.0 pre-commit==4.4.0 
git+https://git@github.com/hendraet/ptm-visualization.git@main#egg=protein_sequencing psutil==7.1.3 -PyCifRW==5.0.1 pydeseq2==0.5.3 pytest==9.0.1 pytest-cov==7.0.0 From 054e3725647f1e263c4a4698d6bb7c4986e895b0 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Sat, 17 Jan 2026 13:48:16 +0100 Subject: [PATCH 020/240] fix bugs in uniprot fallback lookup --- .../importing/cross_linking_import.py | 193 +++++++----------- 1 file changed, 71 insertions(+), 122 deletions(-) diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index cf11fee8d..5bfb43132 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -111,80 +111,65 @@ def execute_uniprot_request(url, params, valid_data, results): return None -def fallback_single_lookup(query: str, query_type: str): - try: - if query_type == "genes": - url = f"https://rest.uniprot.org/uniprotkb/{query}" - params = { - "fields": "gene_primary", - "format": "json" - } - elif query_type == "results": - url = "https://rest.uniprot.org/uniprotkb/search" - params = { - "query": f"gene_exact:{query}", - "format": "json", - "fields": "accession,gene_primary", - "size": 500 - } - response = requests.get(url, params=params, timeout=10) - response.raise_for_status() - data = response.json() - output = data.get(query_type, []) - return output if output else None - - except requests.exceptions.RequestException: - return None - except (KeyError, TypeError): - return None +def process_uniprot_response_containing_gene_names(response, results): + data = response.json() + for entry in data.get("results", []): + protein_id = entry.get("primaryAccession") + output = entry.get("genes", [{}]) + gene_name = output[0].get("geneName", {}).get("value") if output else None -""" -def fallback(protein_id: str): - url = f"https://rest.uniprot.org/uniprotkb/{protein_id}" - params 
= { - "fields": "gene_primary", - "format": "json" - } + if gene_name: + results[protein_id] = (True, gene_name, None) + else: + results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") - response = requests.get(url, params=params, timeout=10) - response.raise_for_status() - data = response.json() +def process_uniprot_response_containing_protein_ids(response, valid_input, isFallback: bool): + output = defaultdict(lambda: { + "protein_ids": [], + "list_of_protein_isoforms": [] + }) - genes = data.get("genes", []) - if not genes: - return None + lines = response.text.strip().split("\n") + header = lines[0].split("\t") + protein_id_idx = header.index("Entry") + gene_name_idx = header.index("Gene Names (primary)") - return genes[0].get("geneName", {}).get("value") -""" -""" -def fallback_gene(gene_name): - url = "https://rest.uniprot.org/uniprotkb/search" - params = { - "query": f"gene_exact:{gene_name}", - "format": "json", - "fields": "accession,gene_primary" - } - response = requests.get(url, params=params) - response.raise_for_status() - data = response.json() - ids = data.get("results", []) - if not ids: - return None + for line in lines[1:]: + parts = line.split("\t") + protein_id = parts[protein_id_idx] + output_gene_names = parts[gene_name_idx].split() - inner_dict = { - "protein_ids": [], - "list_of_protein_isoforms": [] - } + for g in output_gene_names: + if g in valid_input: + if "-" in protein_id: + output[g]["list_of_protein_isoforms"].append(protein_id) + else: + output[g]["protein_ids"].append(protein_id) + elif isFallback: + if "-" in protein_id: + output[valid_input]["list_of_protein_isoforms"].append(protein_id) + else: + output[valid_input]["protein_ids"].append(protein_id) + return output - for entry in ids: - accession = entry.get("primaryAccession") - if accession: - inner_dict["protein_ids"].append(accession) - return inner_dict if inner_dict["protein_ids"] else None -""" +def fallback_single_lookup(query: str, query_type: str, results): + if 
query_type == "get_gene_name": + url = f"https://rest.uniprot.org/uniprotkb/{query}" + params = { + "fields": "gene_primary", + "format": "json" + } + elif query_type == "get_protein_ids": + url = "https://rest.uniprot.org/uniprotkb/search" + params = { + "query": f"gene_exact:{query}", + "format": "tsv", + "fields": "accession,gene_primary" + } + return execute_uniprot_request(url, params, query, results) def get_gene_name_from_protein_ids(protein_ids: set): @@ -231,30 +216,21 @@ def get_gene_name_from_protein_ids(protein_ids: set): if response is None: return results - data = response.json() - - for entry in data.get("results", []): - protein_id = entry.get("primaryAccession") - output = entry.get("genes", [{}]) - gene_name = output[0].get("geneName", {}).get("value") if output else None - - # If there is more than one protein id for a gene name, - # we only store the last one that was found. - if gene_name: - results[protein_id] = (True, gene_name, None) - else: - results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") - + process_uniprot_response_containing_gene_names(response, results) + for pid in valid_ids: if pid not in results: - output = fallback_single_lookup(pid, "genes") - gene_name = output[0].get("geneName", {}).get("value") - #gene_name = fallback(pid) + + response = fallback_single_lookup(pid, "get_gene_name", results) + data = response.json() + processed_data = data.get("genes", []) + gene_name = (processed_data[0].get("geneName", {}).get("value") if processed_data else None) + if gene_name: results[pid] = (True, gene_name, None) else: results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") - + return results @@ -296,51 +272,24 @@ def get_protein_ids_from_gene_name(gene_names: set): response = execute_uniprot_request(url, params, valid_gene_names, results) if response is None: return results - - output = defaultdict(lambda: { - "protein_ids": [], - "list_of_protein_isoforms": [] - }) - - lines = response.text.strip().split("\n") - header = 
lines[0].split("\t") - protein_id_idx = header.index("Entry") - gene_name_idx = header.index("Gene Names (primary)") - - for line in lines[1:]: - parts = line.split("\t") - protein_id = parts[protein_id_idx] - output_gene_names = parts[gene_name_idx].split() - - for g in output_gene_names: - if g in valid_gene_names: - if "-" in protein_id: - output[g]["list_of_protein_isoforms"].append(protein_id) - else: - output[g]["protein_ids"].append(protein_id) + + output = process_uniprot_response_containing_protein_ids(response, valid_gene_names, False) for gn in valid_gene_names: data = output.get(gn) - if not data or not data["protein_ids"]: - output = fallback_single_lookup(gn, "results") - inner_dict = { - "protein_ids": [], - "list_of_protein_isoforms": [] - } - if output and isinstance(output, list): - for entry in output: - if not isinstance(entry, dict): - continue - pid = entry.get("primaryAccession") - if pid: - inner_dict["protein_ids"].append(pid) - protein_id = inner_dict if inner_dict["protein_ids"] else None - #protein_id = fallback_gene(gn) - if protein_id: + + response = fallback_single_lookup(gn, "get_protein_ids", results) + if response is not None: + new_output = process_uniprot_response_containing_protein_ids(response, gn, True) + protein_id = new_output.get(gn) + else: + protein_id = None + if protein_id: results[gn] = (True, protein_id, None) else: results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") + else: results[gn] = (True, data, None) @@ -548,10 +497,10 @@ def cross_linking_import(file_path: Path) -> dict: dict(level=logging.WARNING, msg=msg), dict(level=logging.WARNING, msg=f"Failed rows:\n{failed_df}") ] + # TODO: Implement display of failed rows (Issue #194) pd.set_option("display.max_columns", None) failed_df.to_csv("failed_rows.csv", index=False) print("Failed rows saved to failed_rows.csv") - #print(f"Failed rows:\n{failed_df}") return dict( crosslinking_df=good_df, From d6cca48287f8fe4550aea59eda9fcc419ab069e8 Mon Sep 17 00:00:00 2001 
From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Sat, 17 Jan 2026 14:02:22 +0100 Subject: [PATCH 021/240] format backend with black --- .../importing/cross_linking_import.py | 244 +++++++++--------- 1 file changed, 124 insertions(+), 120 deletions(-) diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index 5bfb43132..8a3a60eb9 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -3,7 +3,7 @@ """ import logging -from pathlib import Path +from pathlib import Path from collections import defaultdict import pandas as pd import traceback @@ -30,14 +30,13 @@ def aggregate_data(df: pd.DataFrame, column: str) -> set: set: Unique values from the columns """ return set( - df[[column + "1", column + "2"]] - .stack() - .dropna() - .astype(str) - .str.strip() + df[[column + "1", column + "2"]].stack().dropna().astype(str).str.strip() ) -def validate_data_before_lookup(data_for_lookup:set, is_valid_function, error_code: str): + +def validate_data_before_lookup( + data_for_lookup: set, is_valid_function, error_code: str +): """ Splits input values into valid and invalid ones. @@ -51,32 +50,34 @@ def validate_data_before_lookup(data_for_lookup:set, is_valid_function, error_co results = {} if not data_for_lookup: - return valid_data, results - + return valid_data, results + for data in data_for_lookup: if is_valid_function(data): valid_data.add(data) - else: + else: results[data] = (False, None, error_code) return valid_data, results def build_uniprot_search_params( - data_for_lookup:set, + data_for_lookup: set, field_of_existing_data: str, *, - extra_query: str | None = None, + extra_query: str | None = None, response_format: str, - fields: str, + fields: str, include_isoforms: bool = False, -): +): """ Builds UniProt search URL and params. 
""" uniprot_search_url = "https://rest.uniprot.org/uniprotkb/search" - base_query = " OR ".join(f"{field_of_existing_data}:{data}" for data in data_for_lookup) + base_query = " OR ".join( + f"{field_of_existing_data}:{data}" for data in data_for_lookup + ) if extra_query: base_query = f"({base_query}) AND {extra_query}" @@ -90,20 +91,20 @@ def build_uniprot_search_params( if include_isoforms: params["includeIsoform"] = "true" - return uniprot_search_url, params + return uniprot_search_url, params -def execute_uniprot_request(url, params, valid_data, results): - try: +def execute_uniprot_request(url, params, valid_data, results): + try: response = requests.get(url, params=params, timeout=15) - response.raise_for_status() - return response - - except requests.exceptions.Timeout: + response.raise_for_status() + return response + + except requests.exceptions.Timeout: error = "TIMEOUT" - except requests.exceptions.HTTPError as e: + except requests.exceptions.HTTPError as e: error = f"HTTP_{e.response.status_code}" - except requests.exceptions.RequestException: + except requests.exceptions.RequestException: error = "REQUEST_ERROR" for data in valid_data: @@ -111,7 +112,7 @@ def execute_uniprot_request(url, params, valid_data, results): return None -def process_uniprot_response_containing_gene_names(response, results): +def process_uniprot_response_containing_gene_names(response, results): data = response.json() for entry in data.get("results", []): @@ -119,20 +120,19 @@ def process_uniprot_response_containing_gene_names(response, results): output = entry.get("genes", [{}]) gene_name = output[0].get("geneName", {}).get("value") if output else None - if gene_name: + if gene_name: results[protein_id] = (True, gene_name, None) else: results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") -def process_uniprot_response_containing_protein_ids(response, valid_input, isFallback: bool): - output = defaultdict(lambda: { - "protein_ids": [], - "list_of_protein_isoforms": [] - }) 
+def process_uniprot_response_containing_protein_ids( + response, valid_input, isFallback: bool +): + output = defaultdict(lambda: {"protein_ids": [], "list_of_protein_isoforms": []}) lines = response.text.strip().split("\n") - header = lines[0].split("\t") + header = lines[0].split("\t") protein_id_idx = header.index("Entry") gene_name_idx = header.index("Gene Names (primary)") @@ -152,22 +152,19 @@ def process_uniprot_response_containing_protein_ids(response, valid_input, isFal output[valid_input]["list_of_protein_isoforms"].append(protein_id) else: output[valid_input]["protein_ids"].append(protein_id) - return output + return output def fallback_single_lookup(query: str, query_type: str, results): if query_type == "get_gene_name": url = f"https://rest.uniprot.org/uniprotkb/{query}" - params = { - "fields": "gene_primary", - "format": "json" - } + params = {"fields": "gene_primary", "format": "json"} elif query_type == "get_protein_ids": url = "https://rest.uniprot.org/uniprotkb/search" params = { "query": f"gene_exact:{query}", "format": "tsv", - "fields": "accession,gene_primary" + "fields": "accession,gene_primary", } return execute_uniprot_request(url, params, query, results) @@ -180,13 +177,13 @@ def get_gene_name_from_protein_ids(protein_ids: set): protein_ids (set): Set of UniProt accession IDs (e.g. {"Q92878", "P51587"}). Returns: - dict: Mapping protein_id -> (success, gene_name, error) + dict: Mapping protein_id -> (success, gene_name, error) success (bool): True if the lookup for that protein_id succeeded, False otherwise gene_name (str or None): Official gene name if successful, else None error (str or None): Error code/message if failed, else None """ - # Regex for valid accession input directly from UniProt - # A batch request containing an id that doesn't match this regex, + # Regex for valid accession input directly from UniProt + # A batch request containing an id that doesn't match this regex, # leads to an http 400 for the whole request. 
valid_id_pattern = re.compile( r"^(?:" @@ -199,17 +196,17 @@ def get_gene_name_from_protein_ids(protein_ids: set): valid_ids, results = validate_data_before_lookup( protein_ids, is_valid_function=lambda pid: bool(valid_id_pattern.match(pid)), - error_code="NOT_A_VALID_PROTEIN_ID" + error_code="NOT_A_VALID_PROTEIN_ID", ) if not valid_ids: return results - + url, params = build_uniprot_search_params( - valid_ids, + valid_ids, field_of_existing_data="accession", response_format="json", - fields="accession,gene_primary" + fields="accession,gene_primary", ) response = execute_uniprot_request(url, params, valid_ids, results) @@ -217,20 +214,24 @@ def get_gene_name_from_protein_ids(protein_ids: set): return results process_uniprot_response_containing_gene_names(response, results) - - for pid in valid_ids: - if pid not in results: + + for pid in valid_ids: + if pid not in results: response = fallback_single_lookup(pid, "get_gene_name", results) data = response.json() processed_data = data.get("genes", []) - gene_name = (processed_data[0].get("geneName", {}).get("value") if processed_data else None) + gene_name = ( + processed_data[0].get("geneName", {}).get("value") + if processed_data + else None + ) if gene_name: results[pid] = (True, gene_name, None) else: results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") - + return results @@ -238,28 +239,28 @@ def get_protein_ids_from_gene_name(gene_names: set): """ Retrieves UniProt protein IDs for a given set of human gene names as a batch query. - Parameters: + Parameters: gene_names (set): Set of gene symbols to look up (e.g. 
{"RAD50", "MRE11"}) - + Returns: dict: Mapping gene_name -> (success, data, error) success (bool): True if lookup for this gene_name succeeded, False otherwise data (dict or None): { - "protein_ids" (list of str): all protein IDs without any isoform information, + "protein_ids" (list of str): all protein IDs without any isoform information, "list_of_protein_isoforms" (list of str): all isomform IDs } if success else None error (str or None): error code/message if failed, else None """ - # Filter decoy Proteins, because we cannot process them decently? + # Filter decoy Proteins, because we cannot process them decently? valid_gene_names, results = validate_data_before_lookup( gene_names, is_valid_function=lambda name: not name.startswith("DECOY:"), - error_code="IS_DECOY_PROTEIN" + error_code="IS_DECOY_PROTEIN", ) if not valid_gene_names: return results - + url, params = build_uniprot_search_params( valid_gene_names, field_of_existing_data="gene_exact", @@ -268,24 +269,28 @@ def get_protein_ids_from_gene_name(gene_names: set): fields="accession,gene_primary", include_isoforms=True, ) - + response = execute_uniprot_request(url, params, valid_gene_names, results) if response is None: return results - - output = process_uniprot_response_containing_protein_ids(response, valid_gene_names, False) + + output = process_uniprot_response_containing_protein_ids( + response, valid_gene_names, False + ) for gn in valid_gene_names: - data = output.get(gn) - if not data or not data["protein_ids"]: - + data = output.get(gn) + if not data or not data["protein_ids"]: + response = fallback_single_lookup(gn, "get_protein_ids", results) - if response is not None: - new_output = process_uniprot_response_containing_protein_ids(response, gn, True) + if response is not None: + new_output = process_uniprot_response_containing_protein_ids( + response, gn, True + ) protein_id = new_output.get(gn) - else: + else: protein_id = None - if protein_id: + if protein_id: results[gn] = (True, 
protein_id, None) else: results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") @@ -293,19 +298,19 @@ def get_protein_ids_from_gene_name(gene_names: set): else: results[gn] = (True, data, None) - return results - - + return results + + def iterate_for_protein_designation( - df, - existing_designation, - new_designation, - uniprot_lookup_results, - value_extractor=lambda x: x + df, + existing_designation, + new_designation, + uniprot_lookup_results, + value_extractor=lambda x: x, ): """ Iterates over a DataFrame and adds the missing protein designations to the dataframe using - precomputed lookup results. (either protein ids or gene names are included in the imported + precomputed lookup results. (either protein ids or gene names are included in the imported data and the other is added to the data frame here) Parameters: @@ -334,12 +339,12 @@ def iterate_for_protein_designation( ) errors_occurred = {} - if not success1: + if not success1: errors_occurred["Protein1_error"] = error1 - if not success2: + if not success2: errors_occurred["Protein2_error"] = error2 - if errors_occurred: + if errors_occurred: failed_row = row_dict.copy() failed_row.update(errors_occurred) failed_rows.append(failed_row) @@ -353,32 +358,30 @@ def iterate_for_protein_designation( return good_df, failed_df + def get_missing_protein_designation( - df: pd.DataFrame, - existing_column: str, - missing_column: str, - uniprot_lookup_function, - value_extractor=lambda x: x -): + df: pd.DataFrame, + existing_column: str, + missing_column: str, + uniprot_lookup_function, + value_extractor=lambda x: x, +): unique_existing_designations = aggregate_data(df, existing_column) uniprot_lookup_results = uniprot_lookup_function(unique_existing_designations) good_df, failed_df = iterate_for_protein_designation( - df, - existing_column, - missing_column, - uniprot_lookup_results, - value_extractor + df, existing_column, missing_column, uniprot_lookup_results, value_extractor ) - return good_df, failed_df + return 
good_df, failed_df def normalize_gene_name_column(df, columns: list[str]): for col in columns: df[col] = df[col].astype("string").str.upper() - return df + return df + def remove_isoform_from_protein_id(protein_id: str) -> str: - return protein_id.split('-', 1)[0] + return protein_id.split("-", 1)[0] def remove_brackets_from_peptide(peptide: str) -> str: @@ -408,26 +411,30 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Is_intra_crosslink"].eq("Intra") - # Right now we remove the isoform ending from every protein_id (if necessary), - # because we cannot process isoforms properly. - # If we ever wanted to add an "Isoforms" column, we need to store the original value from - # the "Protein_id1/2" column in the "Isoforms" column first, before removing the isoform ending. - df["Protein_id1"] = df["Protein_id1"].apply(remove_isoform_from_protein_id).astype("string") - df["Protein_id2"] = df["Protein_id2"].apply(remove_isoform_from_protein_id).astype("string") + # Right now we remove the isoform ending from every protein_id (if necessary), + # because we cannot process isoforms properly. + # If we ever wanted to add an "Isoforms" column, we need to store the original value from + # the "Protein_id1/2" column in the "Isoforms" column first, before removing the isoform ending. 
+ df["Protein_id1"] = ( + df["Protein_id1"].apply(remove_isoform_from_protein_id).astype("string") + ) + df["Protein_id2"] = ( + df["Protein_id2"].apply(remove_isoform_from_protein_id).astype("string") + ) good_df, failed_df = get_missing_protein_designation( - df=df, - existing_column="Protein_id", - missing_column="Protein", - uniprot_lookup_function=get_gene_name_from_protein_ids, - value_extractor=lambda x: x + df=df, + existing_column="Protein_id", + missing_column="Protein", + uniprot_lookup_function=get_gene_name_from_protein_ids, + value_extractor=lambda x: x, ) return good_df, failed_df def read_csm_file(file_path: Path) -> pd.DataFrame: - """ + """ Returns two DataFrames: - good_df: only rows with successful UniProt lookups - failed_df: rows where UniProt lookup failed, including error messages @@ -440,18 +447,18 @@ def read_csm_file(file_path: Path) -> pd.DataFrame: df = normalize_gene_name_column(df, ["Protein1", "Protein2"]) - # In our UniProt lookup we already get all isoforms of the respective gene name. - # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. - # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. + # In our UniProt lookup we already get all isoforms of the respective gene name. + # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. + # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. 
good_df, failed_df = get_missing_protein_designation( - df=df, - existing_column="Protein", - missing_column="Protein_id", - uniprot_lookup_function=get_protein_ids_from_gene_name, - value_extractor=lambda x: x["protein_ids"][0] if x else None + df=df, + existing_column="Protein", + missing_column="Protein_id", + uniprot_lookup_function=get_protein_ids_from_gene_name, + value_extractor=lambda x: x["protein_ids"][0] if x else None, ) - return good_df, failed_df + return good_df, failed_df def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: @@ -491,18 +498,15 @@ def cross_linking_import(file_path: Path) -> dict: if failed_df.empty: msg = f"Successfully imported data of {len(good_df)} cross-links." messages = [dict(level=logging.INFO, msg=msg)] - else: + else: msg = f"Warning: {len(failed_df)} rows failed to import, however {len(good_df)} cross-links were successfully imported." messages = [ dict(level=logging.WARNING, msg=msg), - dict(level=logging.WARNING, msg=f"Failed rows:\n{failed_df}") + dict(level=logging.WARNING, msg=f"Failed rows:\n{failed_df}"), ] # TODO: Implement display of failed rows (Issue #194) pd.set_option("display.max_columns", None) failed_df.to_csv("failed_rows.csv", index=False) print("Failed rows saved to failed_rows.csv") - - return dict( - crosslinking_df=good_df, - messages=messages - ) \ No newline at end of file + + return dict(crosslinking_df=good_df, messages=messages) From 23728b30a233407768ed879bd758f886777e5791 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Sat, 17 Jan 2026 14:15:38 +0100 Subject: [PATCH 022/240] update test test_get_all_possible_step_names --- backend/tests/main/test_views_helper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index d40de2d75..37b762e14 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -13,6 +13,7 @@ 
def test_get_all_possible_step_names(): "EvidenceImport", "ExampleDatasetImport", "FastaImport", + "CrossLinkingImport", "FilterProteinsBySamplesMissing", "FilterProteinsBySilacRatios", "FilterByProteinsCount", From af58ee0aa7245eff030b50e54ed9ea128252fe34 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 19 Jan 2026 14:20:02 +0100 Subject: [PATCH 023/240] fix: move download_files into networking.py, use fasta_import to turn fasta files into dfs and remove the passing around of sessions --- .../alphafold_protein_structure_load.py | 125 ++++-------------- backend/protzilla/networking.py | 32 +++++ 2 files changed, 55 insertions(+), 102 deletions(-) create mode 100644 backend/protzilla/networking.py diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 66cde9df1..b31543abd 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -12,33 +12,8 @@ from backend.protzilla.constants import paths from backend.protzilla.constants.protzilla_logging import logger - - -def _download_file(session: requests.Session, url: str, dest: Path) -> Path | None: - """ - Download a file from a URL and save it to the specified destination path. 
- - :param session: The requests session to use for the download - :param url: The URL of the file to download - :param dest: The destination path where the file should be saved - :return: The destination path if successful, None otherwise - """ - try: - dest.parent.mkdir(parents=True, exist_ok=True) - with session.get(url, stream=True, timeout=60) as r: - r.raise_for_status() - with open(dest, "wb") as f: - for chunk in r.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - logger.info("Downloaded %s -> %s", url, dest) - return dest - except requests.RequestException: - logger.exception("Failed to download %s", url) - return None - except OSError: - logger.exception("Failed to write file %s", dest) - return None +from backend.protzilla.importing.fasta_import import fasta_import +from backend.protzilla.networking import download_file_from_url def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str: @@ -51,58 +26,15 @@ def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str :return: The sequence in FASTA format :raises ValueError: If the sequence contains invalid characters or whitespace """ - VALID_AA = set("ACDEFGHIKLMNPQRSTVWYBXZJUO*-") + VALID_AMINO_ACID = set("ACDEFGHIKLMNPQRSTVWYBXZJUO*-") if not seq or any(c.isspace() for c in seq): raise ValueError("Sequence must be a single, whitespace-free string.") seq = seq.upper() - bad = set(seq) - VALID_AA + bad = set(seq) - VALID_AMINO_ACID if bad: raise ValueError(f"Invalid characters in sequence: {''.join(sorted(bad))}") - return ">" + header + "\n" + "\n".join(wrap(seq, width)) + "\n" - - -def fasta_to_dataframe(fasta_path: str) -> pd.DataFrame: - """ - Parse a FASTA file and convert it to a DataFrame. 
- - :param fasta_path: The path to the FASTA file - :return: A DataFrame with columns 'id', 'sequence', and 'length' - """ - records = [] - seq_id = None - seq = [] - - with open(fasta_path, "r") as f: - for line in f: - line = line.strip() - if not line: - continue - if line.startswith(">"): - if seq_id is not None: - sequence = "".join(seq) - records.append( - { - "id": seq_id, - "sequence": sequence, - "length": len(sequence), - } - ) - seq_id = line[1:].split()[0] - seq = [] - else: - seq.append(line) - - if seq_id is not None: - sequence = "".join(seq) - records.append( - { - "id": seq_id, - "sequence": sequence, - "length": len(sequence), - } - ) - - return pd.DataFrame(records) + joined = "\n".join(wrap(seq, width)) + return f">alpha|{header}\n{joined}\n" def read_alphafold_mmcif(path: str) -> pd.DataFrame: @@ -151,20 +83,14 @@ def read_alphafold_mmcif(path: str) -> pd.DataFrame: return pd.DataFrame(data) -def _handle_alphafold_files( - session: requests.Session, +def handle_alphafold_files( files_urls: dict[str, Any], uniprot: str, seq: str, metadata_df: pd.DataFrame, acc: str, persist_upload: bool = False, -) -> tuple[ - pd.DataFrame | None, - pd.DataFrame | None, - pd.DataFrame | None, - pd.DataFrame | None, -]: +) -> dict[str, pd.DataFrame | None]: """ Download AlphaFold structure files and convert them to DataFrames. @@ -172,7 +98,6 @@ def _handle_alphafold_files( The function downloads CIF, PAE, and pLDDT files, converts them to DataFrames, and optionally saves metadata to a CSV file. 
- :param session: The requests session to use for downloading files :param files_urls: Dictionary containing URLs for CIF, PAE, and pLDDT files :param uniprot: The UniProt ID of the protein :param seq: The protein sequence @@ -187,14 +112,14 @@ def _handle_alphafold_files( sequence_df = None meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" - target_dir = meta_dir / (acc or uniprot) + target_dir = meta_dir / uniprot downloaded: dict[str, str] = {} temp_dir = None - work_dir = target_dir if persist_upload: target_dir.mkdir(parents=True, exist_ok=True) + work_dir = target_dir else: temp_dir = Path(tempfile.mkdtemp()) work_dir = temp_dir @@ -223,19 +148,12 @@ def _handle_alphafold_files( if isinstance(urlval, str) and urlval: fname = urlval.split("?")[0].rstrip("/").split("/")[-1] dest = work_dir / fname - saved = _download_file(session, urlval, dest) + saved = download_file_from_url(urlval, dest) if saved: downloaded[key] = str(saved) try: if key == "cifUrl": - try: - cif_df = read_alphafold_mmcif(str(saved)) - except Exception: - logger.exception( - "Failed to load CIF into dataframe. 
Path=%s", - str(saved), - ) - raise + cif_df = read_alphafold_mmcif(saved) elif key == "paeDocUrl": pae_df = pd.read_json(saved) elif key == "plddtDocUrl": @@ -250,7 +168,8 @@ def _handle_alphafold_files( with open(fasta_dest, "w") as f: f.write(sequence) logger.info("Wrote FASTA sequence to %s", fasta_dest) - sequence_df = fasta_to_dataframe(str(fasta_dest)) + fasta_dict = fasta_import(str(fasta_dest)) + sequence_df = fasta_dict["fasta_df"] except OSError: logger.exception("Failed to write FASTA file %s", fasta_dest) except Exception: @@ -260,7 +179,10 @@ def _handle_alphafold_files( if temp_dir is not None: shutil.rmtree(temp_dir, ignore_errors=True) - return cif_df, pae_df, plddt_df, sequence_df + return {"cif_df":cif_df, + "pae_df": pae_df, + "plddt_df": plddt_df, + "sequence_df": sequence_df} def fetch_alphafold_protein_structure( @@ -315,8 +237,7 @@ def fetch_alphafold_protein_structure( metadata_df = pd.DataFrame([data]) acc = data.get("uniprotAccession") - cif_df, pae_df, plddt_df, sequence_df = _handle_alphafold_files( - session=session, + alpha_dfs = handle_alphafold_files( files_urls=files_urls, uniprot=uniprot, seq=seq_tmp, @@ -327,8 +248,8 @@ def fetch_alphafold_protein_structure( return { "metadata_df": metadata_df, - "cif_df": cif_df, - "pae_df": pae_df, - "plddt_df": plddt_df, - "sequence_df": sequence_df, + "cif_df": alpha_dfs["cif_df"], + "pae_df": alpha_dfs["pae_df"], + "plddt_df": alpha_dfs["plddt_df"], + "sequence_df": alpha_dfs["sequence_df"], } diff --git a/backend/protzilla/networking.py b/backend/protzilla/networking.py new file mode 100644 index 000000000..b91eb8d11 --- /dev/null +++ b/backend/protzilla/networking.py @@ -0,0 +1,32 @@ +import requests +from pathlib import Path +from backend.protzilla.constants.protzilla_logging import logger + + +def download_file_from_url(url: str, dest: Path) -> Path | None: + """ + Download a file from a URL and save it to the specified destination path. 
+ + :param url: The URL of the file to download + :param dest: The destination path where the file should be saved + :return: The destination path if successful, None otherwise + """ + with requests.Session() as session: + + r = session.get(url, timeout=30) + r.raise_for_status() + + try: + dest.parent.mkdir(parents=True, exist_ok=True) + with open(dest, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + logger.info("Downloaded %s -> %s", url, dest) + return dest + except requests.RequestException: + logger.exception("Failed to download %s", url) + return None + except OSError: + logger.exception("Failed to write file %s", dest) + return None From 211a2bfbc106471714f08e0208574eead06404de Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 19 Jan 2026 15:48:02 +0100 Subject: [PATCH 024/240] feat: basic distance calculation of distances in AlphaFold based on Ca-atom --- .../data_analysis/cross_linking_validation.py | 192 +++++++++++++++++- backend/protzilla/form_helper.py | 2 +- .../importing/cross_linking_import.py | 186 ++++++++--------- backend/protzilla/methods/data_analysis.py | 26 ++- backend/protzilla/steps.py | 16 +- 5 files changed, 304 insertions(+), 118 deletions(-) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index 24c58d627..ae4a88071 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -1,15 +1,189 @@ -""" -This module contains the code to parse a file containing cross linking data. 
-""" +import pandas as pd +import math +from plotly.graph_objects import Figure -import logging +from protzilla.importing.alphafold_protein_structure_load import ( + fetch_alphafold_protein_structure, +) +from protzilla.data_preprocessing.plots import create_bar_plot -from pathlib import Path -import traceback +def get_coordinates_of_ca_atom_from_cif_df( + cif_df: pd.DataFrame, amino_acid_position: int +) -> tuple[float, float, float]: + """ + Extract the 3D coordinates of the C-alpha (CA) atom for a given amino acid + position from CIF-derived DataFrame. -from backend.protzilla.utilities import format_trace + :param cif_df: DataFrame containing atomic data parsed from a CIF file. Must include the columns + "_atom_site.label_atom_id", "_atom_site.label_seq_id", + "_atom_site.Cartn_x", "_atom_site.Cartn_y", and "_atom_site.Cartn_z". + :param amino_acid_position: The sequence position of the amino acid whose C-alpha (CA) atom + coordinates should be extracted. + :return: A tuple (x, y, z) of floats representing the Cartesian coordinates of the C-alpha atom. + :raises ValueError: If no C-alpha atom is found for the given amino acid position. + """ + cif_df = cif_df[cif_df["_atom_site.label_atom_id"] == "CA"] + cif_df = cif_df[ + cif_df["_atom_site.label_seq_id"].astype(int) == amino_acid_position + ] -def validate_cross_linking_with_angstrom_deviation(): - pass + if cif_df.empty: + raise ValueError( + f"No central Ca atom found for amino acid at position {amino_acid_position}." 
+ ) + + row = cif_df.iloc[0] + + x = float(row["_atom_site.Cartn_x"]) + y = float(row["_atom_site.Cartn_y"]) + z = float(row["_atom_site.Cartn_z"]) + + return x, y, z + + +def get_distance_between_two_amino_acids_in_angstrom( + position1: int, position2: int, cif_df: pd.DataFrame +) -> float: + x1, y1, z1 = get_coordinates_of_ca_atom_from_cif_df(cif_df, position1) + x2, y2, z2 = get_coordinates_of_ca_atom_from_cif_df(cif_df, position2) + + distance = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2 + (z2 - z1) ** 2) + + return distance + + +def get_position_of_amino_acid_crosslinker_bound_to( + protein_sequence: str, + peptide_sequence: str, + crosslinker_position_within_peptide: int, +) -> int: + """Returns which amino acid the cross-linker bound to, 1-based.""" + peptide_start_position = protein_sequence.find(peptide_sequence) + 1 + if peptide_start_position == 0: + raise ValueError( + f"Peptide {peptide_sequence} was not found in protein sequence" + ) + return peptide_start_position + crosslinker_position_within_peptide - 1 + + +def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( + fasta_df: pd.DataFrame, cif_df: pd.DataFrame, crosslink +) -> float: + amino_acid_crosslinker1_is_bound_to = ( + get_position_of_amino_acid_crosslinker_bound_to( + protein_sequence=fasta_df.iloc[0]["sequence"], + peptide_sequence=crosslink.Peptide1, + crosslinker_position_within_peptide=crosslink.CL_position1, + ) + ) + amino_acid_crosslinker2_is_bound_to = ( + get_position_of_amino_acid_crosslinker_bound_to( + protein_sequence=fasta_df.iloc[0]["sequence"], + peptide_sequence=crosslink.Peptide2, + crosslinker_position_within_peptide=crosslink.CL_position2, + ) + ) + distance_in_alphafold = get_distance_between_two_amino_acids_in_angstrom( + amino_acid_crosslinker1_is_bound_to, + amino_acid_crosslinker2_is_bound_to, + cif_df, + ) + return distance_in_alphafold + + +def validate_with_angstrom_deviation( + crosslinking_df: pd.DataFrame, protein_to_validate: str, **kwargs +) 
-> tuple[int, int]: + """ + Validates cross-links by comparing the cross-linker + lengths with the distances between the linked amino acids in the AlphaFold + protein structure. A cross-link is regarded as valid if the distance between the connected amino acids in AlphaFold + is less than the cross-linker length + the allowed deviation. + + :param crosslinking_df: DataFrame containing cross-linking data. + :param protein_to_validate: UniProt ID of the protein to validate. + :param kwargs: Dynamically generated keyword arguments containing: + - length_of_: float, the length of the crosslinker in Ångström + - accepted_deviation_for_: float, accepted deviation in Ångström + for the respective crosslinker + :return: Tuple (valid_cross_links, invalid_cross_links), counts of cross-links that + pass or fail the distance validation. + :raises KeyError: If a required crosslinker field is missing in kwargs. + :raises ValueError: If peptide sequences cannot be matched to the protein sequence. + """ + alphafold_data = fetch_alphafold_protein_structure( + uniprot=protein_to_validate, persist_uploads=False + ) + cif_df = alphafold_data["cif_df"] + fasta_df = alphafold_data["sequence_df"] + valid_cross_links = 0 + invalid_cross_links = 0 + for crosslink in crosslinking_df.itertuples(index=False): + if ( + crosslink.Protein_id1 != protein_to_validate + or crosslink.Protein_id2 != protein_to_validate + ): + continue + + distance_in_alphafold = ( + get_distance_between_crosslinker_connected_amino_acids_in_alphafold( + fasta_df, cif_df, crosslink + ) + ) + + try: + crosslinker_length = kwargs[f"length_of_{crosslink.Crosslinker}"] + accepted_deviation = kwargs[ + f"accepted_deviation_for_{crosslink.Crosslinker}" + ] + except KeyError as e: + missing_key = e.args[0] + raise KeyError( + f"Missing required field '{missing_key}' for crosslinker '{crosslink.Crosslinker}'. " + "Please check that your form includes the correct fields." 
+ ) + upper_limit_on_allowed_distance = crosslinker_length + accepted_deviation + + if distance_in_alphafold <= upper_limit_on_allowed_distance: + valid_cross_links += 1 + else: + invalid_cross_links += 1 + + return valid_cross_links, invalid_cross_links + + +def bar_plot_of_valid_crosslinks( + crosslinking_df: pd.DataFrame, protein_to_validate: str, **kwargs +) -> list[Figure]: + """ + Creates a bar plot summarizing the number of valid and invalid cross-links + based on their distances in the AlphaFold structure compared to cross-linker + lengths and allowed deviations. + + :param crosslinking_df: DataFrame containing cross-linking data. + :param protein_to_validate: UniProt ID of the protein to validate. + :param kwargs: Dynamically generated keyword arguments containing crosslinker + lengths and accepted deviations, passed to + validate_with_angstrom_deviation: + - length_of_: float + - accepted_deviation_for_: float + :return: List containing a single bar plot object representing counts of + valid and invalid cross-links. + :raises KeyError: If a required crosslinker field is missing in kwargs. 
+ """ + valid_crosslinks, invalid_crosslinks = validate_with_angstrom_deviation( + crosslinking_df, protein_to_validate, **kwargs + ) + return [ + create_bar_plot( + values_of_sectors=[ + valid_crosslinks, + invalid_crosslinks, + ], + names_of_sectors=["Valid Cross-Links", "Invalid Cross-Links"], + heading="Cross-Links used for Validation", + y_title="Number of Cross-Links", + ) + ] diff --git a/backend/protzilla/form_helper.py b/backend/protzilla/form_helper.py index fe9817533..d04922863 100644 --- a/backend/protzilla/form_helper.py +++ b/backend/protzilla/form_helper.py @@ -58,7 +58,7 @@ def get_choices_for_metadata_non_sample_columns( def get_crosslinker_names_from_crosslinker_df(run: Run) -> list[str]: df = run.steps.get_step_output( Step, output_key="crosslinking_df" - ) # first step that returns a crosslinking_df + ) if df is None or "Crosslinker" not in df.columns: return [] crosslinkers = df["Crosslinker"].dropna().unique() diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/cross_linking_import.py index 3afd1f280..cd00c29e9 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/cross_linking_import.py @@ -3,7 +3,7 @@ """ import logging -from pathlib import Path +from pathlib import Path from collections import defaultdict import pandas as pd import traceback @@ -30,14 +30,13 @@ def aggregate_data(df: pd.DataFrame, column: str) -> set: set: Unique values from the columns """ return set( - df[[column + "1", column + "2"]] - .stack() - .dropna() - .astype(str) - .str.strip() + df[[column + "1", column + "2"]].stack().dropna().astype(str).str.strip() ) -def validate_data_before_lookup(data_for_lookup:set, is_valid_function, error_code: str): + +def validate_data_before_lookup( + data_for_lookup: set, is_valid_function, error_code: str +): """ Splits input values into valid and invalid ones. 
@@ -51,32 +50,34 @@ def validate_data_before_lookup(data_for_lookup:set, is_valid_function, error_co results = {} if not data_for_lookup: - return valid_data, results - + return valid_data, results + for data in data_for_lookup: if is_valid_function(data): valid_data.add(data) - else: + else: results[data] = (False, None, error_code) return valid_data, results def build_uniprot_search_params( - data_for_lookup:set, + data_for_lookup: set, field_of_existing_data: str, *, - extra_query: str | None = None, + extra_query: str | None = None, response_format: str, - fields: str, + fields: str, include_isoforms: bool = False, -): +): """ Builds UniProt search URL and params. """ uniprot_search_url = "https://rest.uniprot.org/uniprotkb/search" - base_query = " OR ".join(f"{field_of_existing_data}:{data}" for data in data_for_lookup) + base_query = " OR ".join( + f"{field_of_existing_data}:{data}" for data in data_for_lookup + ) if extra_query: base_query = f"({base_query}) AND {extra_query}" @@ -90,20 +91,20 @@ def build_uniprot_search_params( if include_isoforms: params["includeIsoform"] = "true" - return uniprot_search_url, params + return uniprot_search_url, params -def execute_uniprot_request(url, params, valid_data, results): - try: +def execute_uniprot_request(url, params, valid_data, results): + try: response = requests.get(url, params=params, timeout=15) - response.raise_for_status() - return response - - except requests.exceptions.Timeout: + response.raise_for_status() + return response + + except requests.exceptions.Timeout: error = "TIMEOUT" - except requests.exceptions.HTTPError as e: + except requests.exceptions.HTTPError as e: error = f"HTTP_{e.response.status_code}" - except requests.exceptions.RequestException: + except requests.exceptions.RequestException: error = "REQUEST_ERROR" for data in valid_data: @@ -119,13 +120,13 @@ def get_gene_name_from_protein_ids(protein_ids: set): protein_ids (set): Set of UniProt accession IDs (e.g. {"Q92878", "P51587"}). 
Returns: - dict: Mapping protein_id -> (success, gene_name, error) + dict: Mapping protein_id -> (success, gene_name, error) success (bool): True if the lookup for that protein_id succeeded, False otherwise gene_name (str or None): Official gene name if successful, else None error (str or None): Error code/message if failed, else None """ - # Regex for valid accession input directly from UniProt - # A batch request containing an id that doesn't match this regex, + # Regex for valid accession input directly from UniProt + # A batch request containing an id that doesn't match this regex, # leads to an http 400 for the whole request. valid_id_pattern = re.compile( r"^(?:" @@ -138,17 +139,17 @@ def get_gene_name_from_protein_ids(protein_ids: set): valid_ids, results = validate_data_before_lookup( protein_ids, is_valid_function=lambda pid: bool(valid_id_pattern.match(pid)), - error_code="NOT_A_VALID_PROTEIN_ID" + error_code="NOT_A_VALID_PROTEIN_ID", ) if not valid_ids: return results - + url, params = build_uniprot_search_params( - valid_ids, + valid_ids, field_of_existing_data="accession", response_format="json", - fields="accession,gene_primary" + fields="accession,gene_primary", ) response = execute_uniprot_request(url, params, valid_ids, results) @@ -162,15 +163,15 @@ def get_gene_name_from_protein_ids(protein_ids: set): output = entry.get("genes", [{}]) gene_name = output[0].get("geneName", {}).get("value") if output else None - # If there is more than one protein id for a gene name, - # we only store the last one that was found. - if gene_name: + # If there is more than one protein id for a gene name, + # we only store the last one that was found. 
+ if gene_name: results[protein_id] = (True, gene_name, None) else: results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") - - for pid in valid_ids: - if pid not in results: + + for pid in valid_ids: + if pid not in results: results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") return results @@ -180,28 +181,28 @@ def get_protein_ids_from_gene_name(gene_names: set): """ Retrieves UniProt protein IDs for a given set of human gene names as a batch query. - Parameters: + Parameters: gene_names (set): Set of gene symbols to look up (e.g. {"RAD50", "MRE11"}) - + Returns: dict: Mapping gene_name -> (success, data, error) success (bool): True if lookup for this gene_name succeeded, False otherwise data (dict or None): { - "protein_ids" (list of str): all protein IDs without any isoform information, + "protein_ids" (list of str): all protein IDs without any isoform information, "list_of_protein_isoforms" (list of str): all isomform IDs } if success else None error (str or None): error code/message if failed, else None """ - # Filter decoy Proteins, because we cannot process them decently? + # Filter decoy Proteins, because we cannot process them decently? 
valid_gene_names, results = validate_data_before_lookup( gene_names, is_valid_function=lambda name: not name.startswith("decoy:"), - error_code="IS_DECOY_PROTEIN" + error_code="IS_DECOY_PROTEIN", ) if not valid_gene_names: return results - + url, params = build_uniprot_search_params( valid_gene_names, field_of_existing_data="gene", @@ -210,18 +211,15 @@ def get_protein_ids_from_gene_name(gene_names: set): fields="accession,gene_primary", include_isoforms=True, ) - + response = execute_uniprot_request(url, params, valid_gene_names, results) if response is None: return results - output = defaultdict(lambda: { - "protein_ids": [], - "list_of_protein_isoforms": [] - }) + output = defaultdict(lambda: {"protein_ids": [], "list_of_protein_isoforms": []}) lines = response.text.strip().split("\n") - header = lines[0].split("\t") + header = lines[0].split("\t") protein_id_idx = header.index("Entry") gene_name_idx = header.index("Gene Names (primary)") @@ -238,26 +236,26 @@ def get_protein_ids_from_gene_name(gene_names: set): output[g]["protein_ids"].append(protein_id) for gn in valid_gene_names: - data = output.get(gn) + data = output.get(gn) - if not data or not data["protein_ids"]: + if not data or not data["protein_ids"]: results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") else: results[gn] = (True, data, None) - return results - - + return results + + def iterate_for_protein_designation( - df, - existing_designation, - new_designation, - uniprot_lookup_results, - value_extractor=lambda x: x + df, + existing_designation, + new_designation, + uniprot_lookup_results, + value_extractor=lambda x: x, ): """ Iterates over a DataFrame and adds the missing protein designations to the dataframe using - precomputed lookup results. (either protein ids or gene names are included in the imported + precomputed lookup results. 
(either protein ids or gene names are included in the imported data and the other is added to the data frame here) Parameters: @@ -286,12 +284,12 @@ def iterate_for_protein_designation( ) errors_occurred = {} - if not success1: + if not success1: errors_occurred["Protein1_error"] = error1 - if not success2: + if not success2: errors_occurred["Protein2_error"] = error2 - if errors_occurred: + if errors_occurred: failed_row = row_dict.copy() failed_row.update(errors_occurred) failed_rows.append(failed_row) @@ -305,23 +303,20 @@ def iterate_for_protein_designation( return good_df, failed_df + def get_missing_protein_designation( - df: pd.DataFrame, - existing_column: str, - missing_column: str, - uniprot_lookup_function, - value_extractor=lambda x: x -): + df: pd.DataFrame, + existing_column: str, + missing_column: str, + uniprot_lookup_function, + value_extractor=lambda x: x, +): unique_existing_designations = aggregate_data(df, existing_column) uniprot_lookup_results = uniprot_lookup_function(unique_existing_designations) good_df, failed_df = iterate_for_protein_designation( - df, - existing_column, - missing_column, - uniprot_lookup_results, - value_extractor + df, existing_column, missing_column, uniprot_lookup_results, value_extractor ) - return good_df, failed_df + return good_df, failed_df def remove_brackets_from_peptide(peptide: str) -> str: @@ -352,18 +347,18 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Is_intra_crosslink"].eq("Intra") good_df, failed_df = get_missing_protein_designation( - df=df, - existing_column="Protein_id", - missing_column="Protein", - uniprot_lookup_function=get_gene_name_from_protein_ids, - value_extractor=lambda x: x + df=df, + existing_column="Protein_id", + missing_column="Protein", + uniprot_lookup_function=get_gene_name_from_protein_ids, + value_extractor=lambda x: x, ) return good_df, failed_df def read_csm_file(file_path: Path) -> pd.DataFrame: - """ + """ Returns 
two DataFrames: - good_df: only rows with successful UniProt lookups - failed_df: rows where UniProt lookup failed, including error messages @@ -374,18 +369,18 @@ def read_csm_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) - # In our UniProt lookup we already get all isoforms of the respective gene name. - # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. - # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. + # In our UniProt lookup we already get all isoforms of the respective gene name. + # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. + # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. good_df, failed_df = get_missing_protein_designation( - df=df, - existing_column="Protein", - missing_column="Protein_id", - uniprot_lookup_function=get_protein_ids_from_gene_name, - value_extractor=lambda x: x["protein_ids"][0] if x else None + df=df, + existing_column="Protein", + missing_column="Protein_id", + uniprot_lookup_function=get_protein_ids_from_gene_name, + value_extractor=lambda x: x["protein_ids"][0] if x else None, ) - return good_df, failed_df + return good_df, failed_df def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: @@ -425,14 +420,11 @@ def cross_linking_import(file_path: Path) -> dict: if failed_df.empty: msg = f"Successfully imported data of {len(good_df)} cross-links." messages = [dict(level=logging.INFO, msg=msg)] - else: + else: msg = f"Warning: {len(failed_df)} rows failed to import, however {len(good_df)} cross-links were successfully imported." 
messages = [ dict(level=logging.WARNING, msg=msg), - dict(level=logging.WARNING, msg=f"Failed rows:\n{failed_df}") + dict(level=logging.WARNING, msg=f"Failed rows:\n{failed_df}"), ] - - return dict( - crosslinking_df=good_df, - messages=messages - ) + + return dict(crosslinking_df=good_df, messages=messages) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index cbc8a547d..acc5efa00 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -61,7 +61,8 @@ get_detected_modifications, ) from protzilla.data_analysis.cross_linking_validation import ( - validate_cross_linking_with_angstrom_deviation, + validate_with_angstrom_deviation, + bar_plot_of_valid_crosslinks, ) from backend.protzilla.run import Run from backend.protzilla.form_helper import get_crosslinker_names_from_crosslinker_df @@ -2469,15 +2470,15 @@ class CrossLinkingValidationWithAngstromDeviation(DataAnalysisStep): operation = "Cross Linking Validation" method_description = "Validates cross links based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. 
(in Ångström)" + output_keys = ["crosslinking_df_result"] + def create_form(self): return Form( label="Ångström Deviation", input_fields=[ - FloatField( - name="accepted_deviation", - label="Accepted deviation in Ångström", - min=0, - value=0.20, + TextField( + name="protein_to_validate", + label="Protein prediction that should be validated", ), ], ) @@ -2487,15 +2488,20 @@ def modify_form(self, form: Form, run: Run) -> None: for cl in cross_linker: field_name = f"length_of_{cl}" if field_name not in form: - field = FloatField( + crosslinker_length_field = FloatField( name=field_name, label=f"Length of {cl} in Ångström", min=0, - value=1.0, ) - form.add_field(field) + allowed_length_deviation_field = FloatField( + name=f"accepted_deviation_for_{cl}", + label=f"Accepted deviation for {cl} Cross-Links in Ångström", + min=0, + ) + form.add_field(crosslinker_length_field) + form.add_field(allowed_length_deviation_field) - calc_method = staticmethod(validate_cross_linking_with_angstrom_deviation) + plot_method = staticmethod(bar_plot_of_valid_crosslinks) def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["crosslinking_df"] = steps.get_step_output( diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 3840a0dae..6c4fb2c8e 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -267,19 +267,33 @@ def plot_input(self) -> dict: plot_input = self.inputs | prefixed_output input_parameters = inspect.signature(self.plot_method).parameters + + has_kwargs = any( + param.kind == inspect.Parameter.VAR_KEYWORD + for param in input_parameters.values() + ) + required_keys = [ key for key, param in input_parameters.items() if param.default == inspect.Parameter.empty + and param.kind + in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY) ] for key in required_keys: if key not in plot_input: raise ValueError(f"Missing required input '{key}' for the plot method") - return { + output_dict = { key: 
plot_input[key] for key in input_parameters.keys() if key in plot_input } + if has_kwargs: + kwargs_dict = {k: v for k, v in plot_input.items() if k not in output_dict} + output_dict.update(kwargs_dict) + + return output_dict + def validate_outputs(self, soft_check: bool = False) -> bool: """ Validates the outputs of the step. Uses the output_keys attribute to check if all required keys are present in From 98c790ebb68bc8f48ea58b6929264d38076913ee Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 19 Jan 2026 15:58:34 +0100 Subject: [PATCH 025/240] fix: remove unnecessary crosslinking dataframe reference --- backend/main/views.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/main/views.py b/backend/main/views.py index eff8e49f4..9c17a3806 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -53,7 +53,6 @@ "metadata_df", "peptide_df", "modification_df", - "crosslinking_df", ] From e542ea1befc43745d2b5b24c85f627bf8363dc26 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 19 Jan 2026 15:58:42 +0100 Subject: [PATCH 026/240] fix: fix parameter to stay consistent and format with black --- .../alphafold_protein_structure_load.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index b31543abd..61dfe6d4f 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -89,7 +89,7 @@ def handle_alphafold_files( seq: str, metadata_df: pd.DataFrame, acc: str, - persist_upload: bool = False, + persist_uploads: bool = False, ) -> dict[str, pd.DataFrame | None]: """ Download AlphaFold structure files and convert them to DataFrames. 
@@ -103,7 +103,7 @@ def handle_alphafold_files( :param seq: The protein sequence :param metadata_df: DataFrame containing AlphaFold metadata :param acc: The accession number (used for directory naming) - :param persist_upload: If True, files are saved persistently; if False, only loaded into memory + :param persist_uploads: If True, files are saved persistently; if False, only loaded into memory :return: Tuple of (cif_df, pae_df, plddt_df, sequence_df) or None values for failed loads """ cif_df = None @@ -117,7 +117,7 @@ def handle_alphafold_files( temp_dir = None - if persist_upload: + if persist_uploads: target_dir.mkdir(parents=True, exist_ok=True) work_dir = target_dir else: @@ -125,7 +125,7 @@ def handle_alphafold_files( work_dir = temp_dir try: - if persist_upload and metadata_df is not None: + if persist_uploads and metadata_df is not None: meta_dir.mkdir(parents=True, exist_ok=True) metadata_csv = meta_dir / "alphafold_metadata.csv" try: @@ -179,10 +179,12 @@ def handle_alphafold_files( if temp_dir is not None: shutil.rmtree(temp_dir, ignore_errors=True) - return {"cif_df":cif_df, - "pae_df": pae_df, - "plddt_df": plddt_df, - "sequence_df": sequence_df} + return { + "cif_df": cif_df, + "pae_df": pae_df, + "plddt_df": plddt_df, + "sequence_df": sequence_df, + } def fetch_alphafold_protein_structure( @@ -243,7 +245,7 @@ def fetch_alphafold_protein_structure( seq=seq_tmp, metadata_df=metadata_df, acc=acc, - persist_upload=persist_uploads, + persist_uploads=persist_uploads, ) return { From 9cd306d68516af6e7564a91b3283cd5d40fa79ab Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 19 Jan 2026 15:59:15 +0100 Subject: [PATCH 027/240] feat: add tests for alphafold_protein_structure_load --- .../test_alphafold_protein_structure_load.py | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py diff --git 
a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py new file mode 100644 index 000000000..7bd82233e --- /dev/null +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -0,0 +1,167 @@ +import pandas as pd +import pytest +from pathlib import Path + + +from backend.protzilla.importing.alphafold_protein_structure_load import ( + fetch_alphafold_protein_structure, + to_fasta, + read_alphafold_mmcif, +) +import backend.protzilla.importing.alphafold_protein_structure_load as af + + +def test_to_fasta_default_header_and_newline(): + seq = "A" * 130 + out = to_fasta(seq, "test_id", 60) + + expected = ( + ">alpha|test_id\n" + ("A" * 60) + "\n" + ("A" * 60) + "\n" + ("A" * 10) + "\n" + ) + assert out == expected + + +def test_to_fasta_invalid_characters(): + with pytest.raises(ValueError, match=r"Invalid characters in sequence: 01@"): + to_fasta("AbbC@D1Eeff0") + + +def test_to_fasta_whitespace(): + with pytest.raises( + ValueError, match=r"Sequence must be a single, whitespace-free string." 
+ ): + to_fasta(" ") + + +def test_read_alphafold_mmcif_file_not_found(tmp_path): + missing = tmp_path / "unexisting.cif" + with pytest.raises(FileNotFoundError): + read_alphafold_mmcif(str(missing)) + + +def test_read_alphafold_mmcif_is_directory(tmp_path): + with pytest.raises(IsADirectoryError): + read_alphafold_mmcif(str(tmp_path)) + + +def test_read_alphafold_mmcif_empty(tmp_path): + cif = tmp_path / "empty.cif" + cif.write_text("") + with pytest.raises(ValueError, match="No CIF blocks found"): + read_alphafold_mmcif(str(cif)) + + +def test_read_alphafold_mmcif_atom_site_not_found(tmp_path): + cif = tmp_path / "no_atom_site.cif" + cif.write_text( + """ +data_test +_entry.id test +""" + ) + df = read_alphafold_mmcif(str(cif)) + assert isinstance(df, pd.DataFrame) + assert df.empty + + +def test_read_alphafold_mmcif_valid_atom_site(tmp_path): + cif = tmp_path / "atom_site.cif" + cif.write_text( + """ +data_test +loop_ +_atom_site.id +_atom_site.type_symbol +_atom_site.Cartn_x +N N 1.0 +CA C 2.0 +""" + ) + + df = read_alphafold_mmcif(str(cif)) + + assert isinstance(df, pd.DataFrame) + assert list(df.columns) == [ + "_atom_site.id", + "_atom_site.type_symbol", + "_atom_site.Cartn_x", + ] + assert len(df) == 2 + assert df["_atom_site.id"].tolist() == ["N", "CA"] + assert df["_atom_site.type_symbol"].tolist() == ["N", "C"] + assert df["_atom_site.Cartn_x"].tolist() == ["1.0", "2.0"] + + +def test_fetch_alphafold_protein_structure_wrong_uniprot_id(): + with pytest.raises(RuntimeError, match="AlphaFold request failed for NOPROTEIN"): + fetch_alphafold_protein_structure(uniprot="NOPROTEIN", persist_uploads=True) + + +def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): + monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) + + out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + assert set(out.keys()) == { + "metadata_df", + "cif_df", + "pae_df", + "plddt_df", + "sequence_df", + } + + +def 
test_fetch_alphafold_metadata(tmp_path, monkeypatch): + monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) + out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + + assert isinstance(out["metadata_df"], pd.DataFrame) + assert not out["metadata_df"].empty + assert out["metadata_df"].iloc[0]["uniprotAccession"] == "Q8WP00" + assert out["metadata_df"].iloc[0]["modelCreatedDate"] == "2025-08-01T00:00:00Z" + assert out["metadata_df"].iloc[0]["gene"] == "PRM1" + assert ( + out["metadata_df"].iloc[0]["alphafold_version"] + == "AlphaFold Monomer v2.0 pipeline" + ) + + +def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): + monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) + af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + + target_dir = tmp_path / "alphafold" / "Q8WP00" + assert target_dir.exists() + assert target_dir.is_dir() + + fasta_path = target_dir / "Q8WP00.fasta" + assert fasta_path.exists() + assert fasta_path.stat().st_size > 0 + + cif_files = sorted(target_dir.glob("*.cif")) + json_files = sorted(target_dir.glob("*.json")) + + # one mmCif file, confidence json and predicted aligned error + assert len(cif_files) == 1 + assert len(json_files) == 2 + + +def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): + monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) + out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + + cif_df = out["cif_df"] + assert isinstance(cif_df, pd.DataFrame) + assert not cif_df.empty + assert any(col.startswith("_atom_site.") for col in cif_df.columns) + + pae_df = out["pae_df"] + assert isinstance(pae_df, pd.DataFrame) + assert not pae_df.empty + + plddt_df = out["plddt_df"] + assert isinstance(plddt_df, pd.DataFrame) + assert not plddt_df.empty + + seq_df = out["sequence_df"] + assert isinstance(seq_df, pd.DataFrame) + assert not seq_df.empty From a8a74a2cf272499f1e96192e109ff8036d45c733 Mon Sep 17 00:00:00 2001 From: Elena-kal 
Date: Mon, 19 Jan 2026 16:49:52 +0100 Subject: [PATCH 028/240] fix: rename uniprot to uniprot_id --- backend/protzilla/importing/alphafold_protein_structure_load.py | 2 +- backend/protzilla/methods/importing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 61dfe6d4f..4e0774f62 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -188,7 +188,7 @@ def handle_alphafold_files( def fetch_alphafold_protein_structure( - uniprot: str, persist_uploads: bool + uniprot_id: str, persist_uploads: bool ) -> dict[str, Any]: """ Fetch AlphaFold protein structure data from the AlphaFold Database API. diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 087a764e7..71f28e965 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -427,7 +427,7 @@ def create_form(self): label="AlphaFold DB Prediction Load", input_fields=[ TextField( - name="uniprot", + name="uniprot_id", label="Protein ID", ), CheckboxField( From ce4a9fca2a7c8ef3146954dda82985bdab675d96 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 19 Jan 2026 16:54:29 +0100 Subject: [PATCH 029/240] fix: change variable name consistently --- .../importing/alphafold_protein_structure_load.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 4e0774f62..16f192815 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -196,29 +196,29 @@ def fetch_alphafold_protein_structure( Retrieves metadata and structure files (CIF, PAE, pLDDT) from 
the AlphaFold Database for the given UniProt ID. Optionally persists the downloaded files to disk. - :param uniprot: The UniProt ID of the protein + :param uniprot_id: The UniProt ID of the protein :param persist_uploads: If True, files are saved persistently; if False, only loaded into memory :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data :raises RuntimeError: If the API request fails or returns invalid data :raises ValueError: If no predictions are found for the given UniProt ID """ - url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot}" + url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}" with requests.Session() as session: try: resp = session.get(url, timeout=30) resp.raise_for_status() records = resp.json() except requests.RequestException as e: - raise RuntimeError(f"AlphaFold request failed for {uniprot}: {e}") from e + raise RuntimeError(f"AlphaFold request failed for {uniprot_id}: {e}") from e except ValueError as e: - raise RuntimeError(f"AlphaFold returned non-JSON for {uniprot}: {e}") from e + raise RuntimeError(f"AlphaFold returned non-JSON for {uniprot_id}: {e}") from e if not isinstance(records, list) or not records: - raise ValueError(f"No AlphaFold DB predictions for {uniprot}") + raise ValueError(f"No AlphaFold DB predictions for {uniprot_id}") r = records[0] if not isinstance(r, dict): - raise RuntimeError(f"Unexpected AlphaFold payload for {uniprot}") + raise RuntimeError(f"Unexpected AlphaFold payload for {uniprot_id}") data: dict[str, Any] = { "entryID": r.get("uniprotAccession"), @@ -241,7 +241,7 @@ def fetch_alphafold_protein_structure( alpha_dfs = handle_alphafold_files( files_urls=files_urls, - uniprot=uniprot, + uniprot=uniprot_id, seq=seq_tmp, metadata_df=metadata_df, acc=acc, From 644052177f6a53aae6224023d9f1f8f716542935 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 19 Jan 2026 16:57:13 +0100 Subject: [PATCH 030/240] fix: format with black --- 
.../protzilla/importing/alphafold_protein_structure_load.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 16f192815..a490a8b81 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -211,7 +211,9 @@ def fetch_alphafold_protein_structure( except requests.RequestException as e: raise RuntimeError(f"AlphaFold request failed for {uniprot_id}: {e}") from e except ValueError as e: - raise RuntimeError(f"AlphaFold returned non-JSON for {uniprot_id}: {e}") from e + raise RuntimeError( + f"AlphaFold returned non-JSON for {uniprot_id}: {e}" + ) from e if not isinstance(records, list) or not records: raise ValueError(f"No AlphaFold DB predictions for {uniprot_id}") From f8b12716f9bff2c8f9a5b604b14256ef9a75e6f2 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 19 Jan 2026 17:03:26 +0100 Subject: [PATCH 031/240] fix: variable name change in testing --- .../importing/test_alphafold_protein_structure_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 7bd82233e..774ecd840 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -94,7 +94,7 @@ def test_read_alphafold_mmcif_valid_atom_site(tmp_path): def test_fetch_alphafold_protein_structure_wrong_uniprot_id(): with pytest.raises(RuntimeError, match="AlphaFold request failed for NOPROTEIN"): - fetch_alphafold_protein_structure(uniprot="NOPROTEIN", persist_uploads=True) + fetch_alphafold_protein_structure(uniprot_id="NOPROTEIN", persist_uploads=True) def 
test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): From ec4da2be4258d852bfc76e13868f3c9053f01790 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 19 Jan 2026 18:50:17 +0100 Subject: [PATCH 032/240] refactor: change some variables to more explicit naming and fix inconsistency of the writing of crosslinking --- backend/protzilla/all_steps.py | 2 +- ...nking_import.py => crosslinking_import.py} | 44 +++++++++---------- backend/protzilla/importing/import_utils.py | 2 +- backend/protzilla/methods/importing.py | 16 +++---- 4 files changed, 32 insertions(+), 32 deletions(-) rename backend/protzilla/importing/{cross_linking_import.py => crosslinking_import.py} (93%) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index ca55d5945..1d27496cc 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -14,7 +14,7 @@ importing.EvidenceImport, importing.ExampleDatasetImport, importing.FastaImport, - importing.CrossLinkingImport, + importing.CrosslinkingImport, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, data_preprocessing.FilterByProteinsCount, diff --git a/backend/protzilla/importing/cross_linking_import.py b/backend/protzilla/importing/crosslinking_import.py similarity index 93% rename from backend/protzilla/importing/cross_linking_import.py rename to backend/protzilla/importing/crosslinking_import.py index 8a3a60eb9..e329f3e73 100644 --- a/backend/protzilla/importing/cross_linking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -12,7 +12,7 @@ from backend.protzilla.utilities import format_trace from backend.protzilla.importing.import_utils import ( - columns_in_cross_linking_df, + columns_in_crosslinking_df, rename_columns_csm_format, rename_columns_proteomediscoverer_xlinkx_format, ) @@ -35,7 +35,7 @@ def aggregate_data(df: pd.DataFrame, column: str) -> set: def 
validate_data_before_lookup( - data_for_lookup: set, is_valid_function, error_code: str + data_for_lookup: set, validator_function, error_code: str ): """ Splits input values into valid and invalid ones. @@ -53,7 +53,7 @@ def validate_data_before_lookup( return valid_data, results for data in data_for_lookup: - if is_valid_function(data): + if validator_function(data): valid_data.add(data) else: results[data] = (False, None, error_code) @@ -127,7 +127,7 @@ def process_uniprot_response_containing_gene_names(response, results): def process_uniprot_response_containing_protein_ids( - response, valid_input, isFallback: bool + response, valid_input, is_fallback: bool ): output = defaultdict(lambda: {"protein_ids": [], "list_of_protein_isoforms": []}) @@ -141,13 +141,13 @@ def process_uniprot_response_containing_protein_ids( protein_id = parts[protein_id_idx] output_gene_names = parts[gene_name_idx].split() - for g in output_gene_names: - if g in valid_input: + for gene_name in output_gene_names: + if gene_name in valid_input: if "-" in protein_id: - output[g]["list_of_protein_isoforms"].append(protein_id) + output[gene_name]["list_of_protein_isoforms"].append(protein_id) else: - output[g]["protein_ids"].append(protein_id) - elif isFallback: + output[gene_name]["protein_ids"].append(protein_id) + elif is_fallback: if "-" in protein_id: output[valid_input]["list_of_protein_isoforms"].append(protein_id) else: @@ -195,7 +195,7 @@ def get_gene_name_from_protein_ids(protein_ids: set): valid_ids, results = validate_data_before_lookup( protein_ids, - is_valid_function=lambda pid: bool(valid_id_pattern.match(pid)), + validator_function=lambda pid: bool(valid_id_pattern.match(pid)), error_code="NOT_A_VALID_PROTEIN_ID", ) @@ -251,10 +251,10 @@ def get_protein_ids_from_gene_name(gene_names: set): } if success else None error (str or None): error code/message if failed, else None """ - # Filter decoy Proteins, because we cannot process them decently? 
+ # Filter decoy Proteins, because we cannot process them decently valid_gene_names, results = validate_data_before_lookup( gene_names, - is_valid_function=lambda name: not name.startswith("DECOY:"), + validator_function=lambda name: not name.startswith("DECOY:"), error_code="IS_DECOY_PROTEIN", ) @@ -278,25 +278,25 @@ def get_protein_ids_from_gene_name(gene_names: set): response, valid_gene_names, False ) - for gn in valid_gene_names: - data = output.get(gn) + for gene_name in valid_gene_names: + data = output.get(gene_name) if not data or not data["protein_ids"]: - response = fallback_single_lookup(gn, "get_protein_ids", results) + response = fallback_single_lookup(gene_name, "get_protein_ids", results) if response is not None: new_output = process_uniprot_response_containing_protein_ids( - response, gn, True + response, gene_name, True ) - protein_id = new_output.get(gn) + protein_id = new_output.get(gene_name) else: protein_id = None if protein_id: - results[gn] = (True, protein_id, None) + results[gene_name] = (True, protein_id, None) else: - results[gn] = (False, None, "NO_PROTEIN_ID_FOUND") + results[gene_name] = (False, None, "NO_PROTEIN_ID_FOUND") else: - results[gn] = (True, data, None) + results[gene_name] = (True, data, None) return results @@ -473,10 +473,10 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: "Q_value": "Float64", } ) - return df.loc[:, columns_in_cross_linking_df] + return df.loc[:, columns_in_crosslinking_df] -def cross_linking_import(file_path: Path) -> dict: +def crosslinking_import(file_path: Path) -> dict: try: if file_path.suffix == ".csv": good_df, failed_df = read_csm_file(file_path) diff --git a/backend/protzilla/importing/import_utils.py b/backend/protzilla/importing/import_utils.py index f5d0420ec..b9da300e4 100644 --- a/backend/protzilla/importing/import_utils.py +++ b/backend/protzilla/importing/import_utils.py @@ -38,7 +38,7 @@ class AggregationMethods(Enum): "Q-value": "Q_value", } 
-columns_in_cross_linking_df = [ +columns_in_crosslinking_df = [ "Protein1", "Protein2", "Protein_id1", diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index f8d180655..abb7bf716 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -15,7 +15,7 @@ from backend.protzilla.steps import Step, StepManager from protzilla.importing.example_dataset_import import example_dataset_import from protzilla.importing.fasta_import import fasta_import -from protzilla.importing.cross_linking_import import cross_linking_import +from protzilla.importing.crosslinking_import import crosslinking_import from protzilla.importing.import_utils import ( AggregationMethods, FeatureOrientationType, @@ -402,23 +402,23 @@ def create_form(self): calc_method = staticmethod(example_dataset_import) -class CrossLinkingImport(ImportingStep): - display_name = "Cross Linking Data Import" - operation = "Cross Linking Data Import" - method_description = "Import a file containing cross linking data" +class CrosslinkingImport(ImportingStep): + display_name = "Cross-Linking Data Import" + operation = "Cross-Linking Data Import" + method_description = "Import a file containing cross-linking data" output_keys = ["crosslinking_df"] def create_form(self): return Form( - label="Cross Linking Data Import", + label="Cross-Linking Data Import", input_fields=[ FileInput( name="file_path", - label="Cross Linking Data file (.xlsx or .csv)", + label="Cross-Linking Data file (.xlsx or .csv)", value=None, ), ], ) - calc_method = staticmethod(cross_linking_import) + calc_method = staticmethod(crosslinking_import) From 47cd187b57057f37ffc43a59af7796cf0a440298 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 19 Jan 2026 20:52:42 +0100 Subject: [PATCH 033/240] fix: change variable name to plural and fix doc string to match output --- .../protzilla/importing/alphafold_protein_structure_load.py | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index a490a8b81..aa917c6e3 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -26,11 +26,11 @@ def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str :return: The sequence in FASTA format :raises ValueError: If the sequence contains invalid characters or whitespace """ - VALID_AMINO_ACID = set("ACDEFGHIKLMNPQRSTVWYBXZJUO*-") + VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWYBXZJUO*-") if not seq or any(c.isspace() for c in seq): raise ValueError("Sequence must be a single, whitespace-free string.") seq = seq.upper() - bad = set(seq) - VALID_AMINO_ACID + bad = set(seq) - VALID_AMINO_ACIDS if bad: raise ValueError(f"Invalid characters in sequence: {''.join(sorted(bad))}") joined = "\n".join(wrap(seq, width)) @@ -104,7 +104,7 @@ def handle_alphafold_files( :param metadata_df: DataFrame containing AlphaFold metadata :param acc: The accession number (used for directory naming) :param persist_uploads: If True, files are saved persistently; if False, only loaded into memory - :return: Tuple of (cif_df, pae_df, plddt_df, sequence_df) or None values for failed loads + :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data or None values for failed loads """ cif_df = None pae_df = None From 41a09ec27cd6bf102ece0518e74565bc2af57332 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 19 Jan 2026 23:37:03 +0100 Subject: [PATCH 034/240] refactor: fix and add docstrings --- .../importing/crosslinking_import.py | 303 ++++++++++++++---- 1 file changed, 246 insertions(+), 57 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 
e329f3e73..1d1bc4f3f 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -1,5 +1,5 @@ """ -This module contains the code to parse a file containing cross linking data. +This module contains the code to parse a file containing crosslinking data. """ import logging @@ -20,14 +20,14 @@ def aggregate_data(df: pd.DataFrame, column: str) -> set: """ - Extracts unique values from two DataFrame columns and returns them as a set. - - Parameters: - df (pd.DataFrame): Input DataFrame - column (str): Column name - - Returns: - set: Unique values from the columns + Extract unique values from two DataFrame columns and return them as a set. + + :param df: Input DataFrame + :type df: pd.DataFrame + :param column: Column name + :type column: str + :return: Unique values from the column + :rtype: set """ return set( df[[column + "1", column + "2"]].stack().dropna().astype(str).str.strip() @@ -38,13 +38,23 @@ def validate_data_before_lookup( data_for_lookup: set, validator_function, error_code: str ): """ - Splits input values into valid and invalid ones. - - Invalid values are directly written to results with the given error code. - - Returns: - valid_data (set) - results (dict): value -> (False, None, error_code) + Split input values into valid and invalid ones. + Invalid values are directly written to the results with the given error code. + + :param data_for_lookup: Set of input values to be validated + :type data_for_lookup: set[str] + :param validator_function: Validation function applied to each value. + Must accept a single string and return ``True`` if valid, + otherwise ``False``. 
+ :type validator_function: Callable[[str], bool] + :param error_code: Error code assigned to invalid values + :type error_code: str + + :return: Tuple containing valid data and validation results + :rtype: tuple[set[str], dict[str, tuple[bool, None, str]]] + + :returns valid_data: Set of values that passed validation + :returns results: Mapping of invalid values to ``(False, None, error_code)`` """ valid_data = set() results = {} @@ -71,7 +81,26 @@ def build_uniprot_search_params( include_isoforms: bool = False, ): """ - Builds UniProt search URL and params. + Build the UniProt search URL and query parameters for a batch of identifiers. + + :param data_for_lookup: Set of values to look up (e.g., UniProt IDs or gene names) + :type data_for_lookup: set[str] + :param field_of_existing_data: Field name in UniProt to search for (e.g., "accession" or "gene_exact") + :type field_of_existing_data: str + :param extra_query: Optional additional query string to filter results + :type extra_query: str or None + :param response_format: Desired response format (e.g., "json", "tsv") + :type response_format: str + :param fields: Comma-separated list of fields to return (e.g., "accession,id,protein_name") + :type fields: str + :param include_isoforms: Whether to include isoform entries in the results + :type include_isoforms: bool + + :return: Tuple containing the UniProt search URL and the query parameters dictionary + :rtype: tuple[str, dict[str, str]] + + :returns uniprot_search_url: Base URL for UniProt REST API search + :returns params: Dictionary of query parameters for the request """ uniprot_search_url = "https://rest.uniprot.org/uniprotkb/search" @@ -95,6 +124,32 @@ def build_uniprot_search_params( def execute_uniprot_request(url, params, valid_data, results): + """ + Execute a UniProt HTTP request with error handling and update the results for failed queries. 
+ + :param url: UniProt REST API URL to send the request to + :type url: str + :param params: Dictionary of query parameters for the request + :type params: dict[str, str] + :param valid_data: Set of input values that were intended to be queried + :type valid_data: set[str] + :param results: Dictionary to store lookup results; failed lookups are updated here + as ``data -> (False, None, error_code)`` + :type results: dict[str, tuple[bool, None, str]] + + :return: The HTTP response object if the request succeeded, otherwise None + :rtype: requests.Response or None + + :raises requests.exceptions.Timeout: If the request times out + :raises requests.exceptions.HTTPError: If the server returns an HTTP error + :raises requests.exceptions.RequestException: For other request-related errors + + :note: On failure, all entries in `valid_data` are updated in `results` with the + corresponding error code: + - "TIMEOUT" for a timeout + - "HTTP_" for HTTP errors + - "REQUEST_ERROR" for other request failures + """ try: response = requests.get(url, params=params, timeout=15) response.raise_for_status() @@ -113,6 +168,22 @@ def execute_uniprot_request(url, params, valid_data, results): def process_uniprot_response_containing_gene_names(response, results): + """ + Process a UniProt API response containing gene name information and update the results dictionary. + + :param response: HTTP response object returned by a UniProt request + :type response: requests.Response + :param results: Dictionary to store lookup results. 
Each protein ID will be updated as: + ``protein_id -> (success, gene_name, error_code)`` + :type results: dict[str, tuple[bool, str | None, str | None]] + + :return: None (updates `results` in-place) + :rtype: None + + :note: For each entry in the response: + - If a gene name is found, ``results[protein_id] = (True, gene_name, None)`` + - If no gene name is found, ``results[protein_id] = (False, None, "NO_GENE_NAME_FOUND")`` + """ data = response.json() for entry in data.get("results", []): @@ -129,6 +200,34 @@ def process_uniprot_response_containing_gene_names(response, results): def process_uniprot_response_containing_protein_ids( response, valid_input, is_fallback: bool ): + """ + Process a UniProt TSV response containing protein IDs and map them to gene names. + + :param response: HTTP response object returned by a UniProt request in TSV format + :type response: requests.Response + :param valid_input: Set of gene names to extract protein IDs for + :type valid_input: set[str] + :param is_fallback: True if the response comes from a fallback individual UniProt request + instead of the standard UniProt batch request + :type is_fallback: bool + + :return: Dictionary mapping gene_name -> protein information + :rtype: dict[str, dict[str, list[str]]] + + :returns output: Dictionary with the following structure: + { + gene_name: { + "protein_ids": List of protein IDs without isoform suffix, + "list_of_protein_isoforms": List of protein IDs with isoform suffix + } + } + + :note: For each line in the TSV response: + - Protein IDs with a dash ("-") are considered isoforms and added to + "list_of_protein_isoforms" + - Other protein IDs are added to "protein_ids" + - Only gene names present in `valid_input` are considered, unless `is_fallback` is True + """ output = defaultdict(lambda: {"protein_ids": [], "list_of_protein_isoforms": []}) lines = response.text.strip().split("\n") @@ -156,6 +255,25 @@ def process_uniprot_response_containing_protein_ids( def 
fallback_single_lookup(query: str, query_type: str, results): + """ + Perform a fallback UniProt lookup for a single gene or protein ID and update the results. + + :param query: The gene name or UniProt ID to look up + :type query: str + :param query_type: Type of lookup to perform. Either: + - "get_gene_name": Retrieve the primary gene name for a UniProt ID + - "get_protein_ids": Retrieve UniProt accession IDs for a gene + :type query_type: str + :param results: Dictionary to store lookup results. Will be updated in-place. + Entries are stored as ``key -> (success, data, error_code)`` + :type results: dict[str, tuple[bool, Any, str | None]] + + :return: HTTP response object from the UniProt request if successful, otherwise None + :rtype: requests.Response or None + + :note: This function constructs the appropriate UniProt REST API request depending on + `query_type` and uses `execute_uniprot_request` to perform the request and handle errors. + """ if query_type == "get_gene_name": url = f"https://rest.uniprot.org/uniprotkb/{query}" params = {"fields": "gene_primary", "format": "json"} @@ -171,16 +289,17 @@ def fallback_single_lookup(query: str, query_type: str, results): def get_gene_name_from_protein_ids(protein_ids: set): """ - Retrieves the gene names for a given set of Protein IDs in a batch from UniProt. + Retrieve the gene names for a given set of Protein IDs in a batch from UniProt. - Parameters: - protein_ids (set): Set of UniProt accession IDs (e.g. {"Q92878", "P51587"}). 
+ :param protein_ids: Set of UniProt accession IDs (e.g., {"Q92878", "P51587"}) + :type protein_ids: set[str] - Returns: - dict: Mapping protein_id -> (success, gene_name, error) - success (bool): True if the lookup for that protein_id succeeded, False otherwise - gene_name (str or None): Official gene name if successful, else None - error (str or None): Error code/message if failed, else None + :return: Mapping of protein_id to a tuple containing lookup result, gene name, and error + :rtype: dict[str, tuple[bool, str | None, str | None]] + + :returns success: True if the lookup for that protein_id succeeded, False otherwise + :returns gene_name: Official gene name if successful, else None + :returns error: Error code or message if the lookup failed, else None """ # Regex for valid accession input directly from UniProt # A batch request containing an id that doesn't match this regex, @@ -237,21 +356,22 @@ def get_gene_name_from_protein_ids(protein_ids: set): def get_protein_ids_from_gene_name(gene_names: set): """ - Retrieves UniProt protein IDs for a given set of human gene names as a batch query. - - Parameters: - gene_names (set): Set of gene symbols to look up (e.g. {"RAD50", "MRE11"}) - - Returns: - dict: Mapping gene_name -> (success, data, error) - success (bool): True if lookup for this gene_name succeeded, False otherwise - data (dict or None): { - "protein_ids" (list of str): all protein IDs without any isoform information, - "list_of_protein_isoforms" (list of str): all isomform IDs - } if success else None - error (str or None): error code/message if failed, else None + Retrieve UniProt protein IDs for a given set of human gene names as a batch query. 
+ + :param gene_names: Set of gene symbols to look up (e.g., {"RAD50", "MRE11"}) + :type gene_names: set[str] + + :return: Mapping of gene_name to a tuple containing lookup result, data, and error + :rtype: dict[str, tuple[bool, dict[str, list[str]] | None, str | None]] + + :returns success: True if the lookup for this gene_name succeeded, False otherwise + :returns data: Dictionary with protein information if successful, else None. + Contains: + - "protein_ids" (list of str): All protein IDs without any isoform information + - "list_of_protein_isoforms" (list of str): All isoform IDs + :returns error: Error code or message if the lookup failed, else None """ - # Filter decoy Proteins, because we cannot process them decently + # Filter decoy Proteins, because we cannot process them decently valid_gene_names, results = validate_data_before_lookup( gene_names, validator_function=lambda name: not name.startswith("DECOY:"), @@ -309,21 +429,30 @@ def iterate_for_protein_designation( value_extractor=lambda x: x, ): """ - Iterates over a DataFrame and adds the missing protein designations to the dataframe using - precomputed lookup results. (either protein ids or gene names are included in the imported - data and the other is added to the data frame here) - - Parameters: - df (pd.DataFrame) - protein_designation (str): existing protein designation, e.g. "Protein_id" or "Protein" - uniprot_lookup_results (dict): - Mapping key -> (success, data, error) - value_extractor (callable): - function(data) -> value to store in DataFrame cell - - Returns: - good_df (pd.DataFrame): Rows with successful lookups - failed_df (pd.DataFrame): Rows with lookup errors + Iterate over a DataFrame and add missing protein designations using precomputed lookup results. + Either protein IDs or gene names are included in the DataFrame, and the other is added + to the DataFrame in this function. 
+ + :param df: Input DataFrame + :type df: pandas.DataFrame + :param existing_designation: Column name in `df` containing existing protein designation + (e.g., "Protein_id" or "Protein") + :type existing_designation: str + :param new_designation: Column name to store the newly added protein designation + :type new_designation: str + :param uniprot_lookup_results: Mapping of key -> (success, data, error) + Contains precomputed lookup results + :type uniprot_lookup_results: dict + :param value_extractor: Function that extracts the value to store in the DataFrame cell + from `data`. Default is identity function. + Signature: ``value_extractor(data) -> Any`` + :type value_extractor: Callable[[Any], Any] + + :return: Tuple containing rows with successful lookups and rows with lookup errors + :rtype: tuple[pandas.DataFrame, pandas.DataFrame] + + :returns good_df: Rows with successful lookups + :returns failed_df: Rows with lookup errors """ good_rows = [] failed_rows = [] @@ -366,6 +495,33 @@ def get_missing_protein_designation( uniprot_lookup_function, value_extractor=lambda x: x, ): + """ + Fill missing protein designations in a DataFrame using a UniProt lookup function. + + This function aggregates unique values from the existing column, performs a batch + lookup using `uniprot_lookup_function`, and populates the missing column. The resulting + rows are split into successful and failed lookups. + + :param df: Input DataFrame containing existing protein designations + :type df: pandas.DataFrame + :param existing_column: Name of the column with existing protein designations + :type existing_column: str + :param missing_column: Name of the column to populate with missing designations + :type missing_column: str + :param uniprot_lookup_function: Function that performs a batch UniProt lookup. 
+ Should accept a set of values and return results + as a dictionary ``key -> (success, data, error_code)`` + :type uniprot_lookup_function: Callable[[set[str]], dict[str, tuple[bool, Any, str | None]]] + :param value_extractor: Function to extract the value to store in the missing column + from the lookup data. Default is the identity function. + :type value_extractor: Callable[[Any], Any] + + :return: Tuple of DataFrames containing rows with successful lookups and rows with errors + :rtype: tuple[pandas.DataFrame, pandas.DataFrame] + + :returns good_df: Rows where missing protein designations were successfully populated + :returns failed_df: Rows where the lookup failed + """ unique_existing_designations = aggregate_data(df, existing_column) uniprot_lookup_results = uniprot_lookup_function(unique_existing_designations) good_df, failed_df = iterate_for_protein_designation( @@ -395,6 +551,25 @@ def get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: + """ + Read and process a ProteomeDiscoverer XlinkX Excel file: + 1. Reads the Excel file and renames columns to a standard format. + 2. Extracts crosslink positions for both peptides. + 3. Cleans peptide sequences by removing brackets and converting to string type. + 4. Converts intra-crosslink annotations to boolean. + 5. Removes isoform suffixes from protein IDs. + 6. Fills missing protein designations using UniProt gene name lookup. + 7. Splits the resulting DataFrame into successful and failed lookups. 
+ + :param file_path: Path to the ProteomeDiscoverer XlinkX Excel file + :type file_path: pathlib.Path + + :return: Tuple of DataFrames containing rows with successfully mapped proteins and rows where lookup failed + :rtype: tuple[pandas.DataFrame, pandas.DataFrame] + + :returns good_df: Rows where missing protein designations were successfully populated + :returns failed_df: Rows where protein lookup failed + """ df = pd.read_excel(file_path).rename( columns=rename_columns_proteomediscoverer_xlinkx_format ) @@ -435,9 +610,23 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: def read_csm_file(file_path: Path) -> pd.DataFrame: """ - Returns two DataFrames: - - good_df: only rows with successful UniProt lookups - - failed_df: rows where UniProt lookup failed, including error messages + Read and process a CSM CSV file: + 1. Reads the CSV file and renames columns to a standard format. + 2. Determines intra-crosslinks by comparing Protein1 and Protein2. + 3. Normalizes gene names in the specified protein columns. + 4. Uses UniProt lookups to fill missing protein IDs, storing only the first protein ID + for each gene. + 5. Splits the resulting DataFrame into successful and failed lookups. 
+ + :param file_path: Path to the CSM CSV file + :type file_path: pathlib.Path + + :return: Tuple of DataFrames containing rows with successfully mapped protein IDs + and rows where lookup failed + :rtype: tuple[pandas.DataFrame, pandas.DataFrame] + + :returns good_df: Rows where missing protein IDs were successfully populated + :returns failed_df: Rows where the UniProt lookup failed, including error messages """ df = pd.read_csv(file_path, low_memory=False).rename( columns=rename_columns_csm_format From ade64f2301db96183f17bb53e278cc660c582df9 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:24:09 +0100 Subject: [PATCH 035/240] add tests and delete a tiny bit of unnecessary code --- .../importing/crosslinking_import.py | 14 +- .../importing/test_crosslinking_import.py | 173 ++++++++++++++++++ 2 files changed, 176 insertions(+), 11 deletions(-) create mode 100644 backend/tests/protzilla/importing/test_crosslinking_import.py diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 1d1bc4f3f..a06ad3d9f 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -336,7 +336,7 @@ def get_gene_name_from_protein_ids(protein_ids: set): for pid in valid_ids: if pid not in results: - + response = fallback_single_lookup(pid, "get_gene_name", results) data = response.json() processed_data = data.get("genes", []) @@ -350,7 +350,7 @@ def get_gene_name_from_protein_ids(protein_ids: set): results[pid] = (True, gene_name, None) else: results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") - + return results @@ -374,7 +374,7 @@ def get_protein_ids_from_gene_name(gene_names: set): # Filter decoy Proteins, because we cannot process them decently valid_gene_names, results = validate_data_before_lookup( gene_names, - validator_function=lambda name: not name.startswith("DECOY:"), + 
validator_function=lambda name: not name.startswith("decoy:"), error_code="IS_DECOY_PROTEIN", ) @@ -530,12 +530,6 @@ def get_missing_protein_designation( return good_df, failed_df -def normalize_gene_name_column(df, columns: list[str]): - for col in columns: - df[col] = df[col].astype("string").str.upper() - return df - - def remove_isoform_from_protein_id(protein_id: str) -> str: return protein_id.split("-", 1)[0] @@ -634,8 +628,6 @@ def read_csm_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) - df = normalize_gene_name_column(df, ["Protein1", "Protein2"]) - # In our UniProt lookup we already get all isoforms of the respective gene name. # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py new file mode 100644 index 000000000..100b3e4d2 --- /dev/null +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -0,0 +1,173 @@ +import pytest +import pandas as pd +from unittest.mock import patch, Mock +from requests.exceptions import Timeout +from protzilla.importing.crosslinking_import import ( + aggregate_data, + remove_isoform_from_protein_id, + remove_brackets_from_peptide, + get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format, + validate_data_before_lookup, + execute_uniprot_request, + process_uniprot_response_containing_gene_names, + iterate_for_protein_designation, + get_missing_protein_designation, + crosslinking_import, +) + + +def test_aggregate_data(): + df = pd.DataFrame({ + "Protein1": ["A", "B", None], + "Protein2": ["C", "B", "D"] + }) + result = aggregate_data(df, "Protein") + assert result == {"A", "B", "C", "D"} + + +def 
test_remove_isoform_from_protein_id(): + assert remove_isoform_from_protein_id("P12345-2") == "P12345" + assert remove_isoform_from_protein_id("Q67890") == "Q67890" + + +def test_remove_brackets_from_peptide(): + assert remove_brackets_from_peptide("[ABC]DE[FG]") == "ABCDEFG" + + +def test_get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format(): + assert get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format("[ACD]EF") == 1 + assert get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format("ACDEF") == 0 + + +def test_validate_data_before_lookup(): + data = {"A", "B", "C"} + def validator(x): + return x != "B" + valid, results = validate_data_before_lookup(data, validator, "ERROR") + assert valid == {"A", "C"} + assert results == {"B": (False, None, "ERROR")} + + +def test_validate_data_before_lookup_empty(): + valid, results = validate_data_before_lookup(set(), lambda x: True, "ERR") + assert valid == set() + assert results == {} + + +def test_execute_uniprot_request_success(): + mock_response = Mock() + mock_response.raise_for_status.return_value = None + results = {} + valid_data = {"P12345"} + with patch("protzilla.importing.crosslinking_import.requests.get", return_value=mock_response): + response = execute_uniprot_request("url", {"param": "value"}, valid_data, results) + assert response == mock_response + assert results == {} + + +def test_execute_uniprot_request_timeout(): + results = {} + valid_data = {"P12345"} + with patch( + "protzilla.importing.crosslinking_import.requests.get", + side_effect=Timeout(), + ): + response = execute_uniprot_request("url", {"param": "value"}, valid_data, results) + assert response is None + assert results["P12345"][2] == "TIMEOUT" + + +def test_process_uniprot_response_containing_gene_names(): + results = {} + mock_response = Mock() + mock_response.json.return_value = { + "results": [ + {"primaryAccession": "P1", "genes": [{"geneName": {"value": "GENE1"}}]}, + 
{"primaryAccession": "P2", "genes": []} + ] + } + process_uniprot_response_containing_gene_names(mock_response, results) + assert results["P1"] == (True, "GENE1", None) + assert results["P2"] == (False, None, "NO_GENE_NAME_FOUND") + + +def _minimal_valid_crosslinking_df(): + return pd.DataFrame({ + "Protein_id1": ["P1"], + "Protein_id2": ["P2"], + "Protein1": ["GENE1"], + "Protein2": ["GENE2"], + "Is_intra_crosslink": [False], + "Crosslinker": ["DSS"], + "Peptide1": ["AAA"], + "Peptide2": ["BBB"], + "Peptide_position1": [1], + "Peptide_position2": [2], + "CL_position1": [3], + "CL_position2": [4], + "Q_value": [0.01], + }) + + +def test_iterate_for_protein_designation(): + df = _minimal_valid_crosslinking_df() + lookup_results = { + "P1": (True, "GENE1", None), + "P2": (True, "GENE2", None), + } + good_df, failed_df = iterate_for_protein_designation( + df, + "Protein_id", + "Protein", + lookup_results, + ) + + assert len(good_df) == 1 + assert failed_df.empty + + +def test_get_missing_protein_designation(): + df = _minimal_valid_crosslinking_df() + def mock_lookup(ids): + return {pid: (True, f"Gene_{pid}", None) for pid in ids} + good_df, failed_df = get_missing_protein_designation( + df, + "Protein_id", + "Protein", + mock_lookup, + ) + + assert len(good_df) == 1 + assert failed_df.empty + + +def test_crosslinking_import_csv(tmp_path): + csv_file = tmp_path / "test.csv" + csv_file.write_text( + "Protein1,Protein2,Peptide1,Peptide2," + "Peptide_position1,Peptide_position2," + "CL_position1,CL_position2," + "Crosslinker,Q_value\n" + "RAD50,MRE11,AAA,BBB,1,2,3,4,DSS,0.01\n" + ) + + with patch( + "protzilla.importing.crosslinking_import.get_protein_ids_from_gene_name", + return_value={ + "RAD50": (True, {"protein_ids": ["P12345"]}, None), + "MRE11": (True, {"protein_ids": ["Q67890"]}, None), + }, + ): + result = crosslinking_import(csv_file) + + assert "crosslinking_df" in result + assert not result["crosslinking_df"].empty + + + +def 
test_crosslinking_import_invalid_file(tmp_path): + bad_file = tmp_path / "test.txt" + bad_file.write_text("something invalid") + result = crosslinking_import(bad_file) + assert "messages" in result + assert any("Unsupported file type" in m["msg"] for m in result["messages"]) \ No newline at end of file From f92f26c87a8109993cd8f7dda600a4326dade34f Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:27:35 +0100 Subject: [PATCH 036/240] format backend code with black --- .../importing/crosslinking_import.py | 8 +- .../importing/test_crosslinking_import.py | 73 ++++++++++++------- 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index a06ad3d9f..f3d11f76c 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -208,7 +208,7 @@ def process_uniprot_response_containing_protein_ids( :param valid_input: Set of gene names to extract protein IDs for :type valid_input: set[str] :param is_fallback: True if the response comes from a fallback individual UniProt request - instead of the standard UniProt batch request + instead of the standard UniProt batch request :type is_fallback: bool :return: Dictionary mapping gene_name -> protein information @@ -336,7 +336,7 @@ def get_gene_name_from_protein_ids(protein_ids: set): for pid in valid_ids: if pid not in results: - + response = fallback_single_lookup(pid, "get_gene_name", results) data = response.json() processed_data = data.get("genes", []) @@ -350,7 +350,7 @@ def get_gene_name_from_protein_ids(protein_ids: set): results[pid] = (True, gene_name, None) else: results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") - + return results @@ -371,7 +371,7 @@ def get_protein_ids_from_gene_name(gene_names: set): - "list_of_protein_isoforms" (list of str): All isoform IDs :returns error: 
Error code or message if the lookup failed, else None """ - # Filter decoy Proteins, because we cannot process them decently + # Filter decoy Proteins, because we cannot process them decently valid_gene_names, results = validate_data_before_lookup( gene_names, validator_function=lambda name: not name.startswith("decoy:"), diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index 100b3e4d2..dd67e937b 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -17,10 +17,7 @@ def test_aggregate_data(): - df = pd.DataFrame({ - "Protein1": ["A", "B", None], - "Protein2": ["C", "B", "D"] - }) + df = pd.DataFrame({"Protein1": ["A", "B", None], "Protein2": ["C", "B", "D"]}) result = aggregate_data(df, "Protein") assert result == {"A", "B", "C", "D"} @@ -35,14 +32,26 @@ def test_remove_brackets_from_peptide(): def test_get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format(): - assert get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format("[ACD]EF") == 1 - assert get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format("ACDEF") == 0 + assert ( + get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format( + "[ACD]EF" + ) + == 1 + ) + assert ( + get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format( + "ACDEF" + ) + == 0 + ) def test_validate_data_before_lookup(): data = {"A", "B", "C"} + def validator(x): return x != "B" + valid, results = validate_data_before_lookup(data, validator, "ERROR") assert valid == {"A", "C"} assert results == {"B": (False, None, "ERROR")} @@ -59,8 +68,13 @@ def test_execute_uniprot_request_success(): mock_response.raise_for_status.return_value = None results = {} valid_data = {"P12345"} - with patch("protzilla.importing.crosslinking_import.requests.get", 
return_value=mock_response): - response = execute_uniprot_request("url", {"param": "value"}, valid_data, results) + with patch( + "protzilla.importing.crosslinking_import.requests.get", + return_value=mock_response, + ): + response = execute_uniprot_request( + "url", {"param": "value"}, valid_data, results + ) assert response == mock_response assert results == {} @@ -72,7 +86,9 @@ def test_execute_uniprot_request_timeout(): "protzilla.importing.crosslinking_import.requests.get", side_effect=Timeout(), ): - response = execute_uniprot_request("url", {"param": "value"}, valid_data, results) + response = execute_uniprot_request( + "url", {"param": "value"}, valid_data, results + ) assert response is None assert results["P12345"][2] == "TIMEOUT" @@ -83,7 +99,7 @@ def test_process_uniprot_response_containing_gene_names(): mock_response.json.return_value = { "results": [ {"primaryAccession": "P1", "genes": [{"geneName": {"value": "GENE1"}}]}, - {"primaryAccession": "P2", "genes": []} + {"primaryAccession": "P2", "genes": []}, ] } process_uniprot_response_containing_gene_names(mock_response, results) @@ -92,21 +108,23 @@ def test_process_uniprot_response_containing_gene_names(): def _minimal_valid_crosslinking_df(): - return pd.DataFrame({ - "Protein_id1": ["P1"], - "Protein_id2": ["P2"], - "Protein1": ["GENE1"], - "Protein2": ["GENE2"], - "Is_intra_crosslink": [False], - "Crosslinker": ["DSS"], - "Peptide1": ["AAA"], - "Peptide2": ["BBB"], - "Peptide_position1": [1], - "Peptide_position2": [2], - "CL_position1": [3], - "CL_position2": [4], - "Q_value": [0.01], - }) + return pd.DataFrame( + { + "Protein_id1": ["P1"], + "Protein_id2": ["P2"], + "Protein1": ["GENE1"], + "Protein2": ["GENE2"], + "Is_intra_crosslink": [False], + "Crosslinker": ["DSS"], + "Peptide1": ["AAA"], + "Peptide2": ["BBB"], + "Peptide_position1": [1], + "Peptide_position2": [2], + "CL_position1": [3], + "CL_position2": [4], + "Q_value": [0.01], + } + ) def test_iterate_for_protein_designation(): @@ 
-128,8 +146,10 @@ def test_iterate_for_protein_designation(): def test_get_missing_protein_designation(): df = _minimal_valid_crosslinking_df() + def mock_lookup(ids): return {pid: (True, f"Gene_{pid}", None) for pid in ids} + good_df, failed_df = get_missing_protein_designation( df, "Protein_id", @@ -164,10 +184,9 @@ def test_crosslinking_import_csv(tmp_path): assert not result["crosslinking_df"].empty - def test_crosslinking_import_invalid_file(tmp_path): bad_file = tmp_path / "test.txt" bad_file.write_text("something invalid") result = crosslinking_import(bad_file) assert "messages" in result - assert any("Unsupported file type" in m["msg"] for m in result["messages"]) \ No newline at end of file + assert any("Unsupported file type" in m["msg"] for m in result["messages"]) From 0548921449d45769fc914e11204bebc65d46c6aa Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:42:33 +0100 Subject: [PATCH 037/240] fix: change spelling of crosslinking in test --- backend/tests/main/test_views_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 37b762e14..718fe4f0b 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -13,7 +13,7 @@ def test_get_all_possible_step_names(): "EvidenceImport", "ExampleDatasetImport", "FastaImport", - "CrossLinkingImport", + "CrosslinkingImport", "FilterProteinsBySamplesMissing", "FilterProteinsBySilacRatios", "FilterByProteinsCount", From 5d0f65f355a2a9433c49c56c89e42b9accf74875 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 24 Jan 2026 11:05:05 +0100 Subject: [PATCH 038/240] refactor: remove kwargs from plot method --- .../data_analysis/cross_linking_validation.py | 31 +++++++++---------- backend/protzilla/form_helper.py | 10 ------ backend/protzilla/methods/data_analysis.py | 20 ++++++++++-- 
backend/protzilla/steps.py | 17 +--------- 4 files changed, 34 insertions(+), 44 deletions(-) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index ae4a88071..41a5cfb6f 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -94,7 +94,9 @@ def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( def validate_with_angstrom_deviation( - crosslinking_df: pd.DataFrame, protein_to_validate: str, **kwargs + crosslinking_df: pd.DataFrame, + protein_to_validate: str, + crosslinker_information: dict[str, list[float]], ) -> tuple[int, int]: """ Validates cross-links by comparing the cross-linker @@ -104,13 +106,12 @@ def validate_with_angstrom_deviation( :param crosslinking_df: DataFrame containing cross-linking data. :param protein_to_validate: UniProt ID of the protein to validate. - :param kwargs: Dynamically generated keyword arguments containing: - - length_of_: float, the length of the crosslinker in Ångström - - accepted_deviation_for_: float, accepted deviation in Ångström - for the respective crosslinker + :param crosslinker_information: Contains for each Crosslinker: + - length_of_: float + - accepted_deviation_for_: float :return: Tuple (valid_cross_links, invalid_cross_links), counts of cross-links that pass or fail the distance validation. - :raises KeyError: If a required crosslinker field is missing in kwargs. + :raises KeyError: If a required crosslinker field is missing in crosslinker_information. :raises ValueError: If peptide sequences cannot be matched to the protein sequence. 
""" alphafold_data = fetch_alphafold_protein_structure( @@ -134,10 +135,8 @@ def validate_with_angstrom_deviation( ) try: - crosslinker_length = kwargs[f"length_of_{crosslink.Crosslinker}"] - accepted_deviation = kwargs[ - f"accepted_deviation_for_{crosslink.Crosslinker}" - ] + crosslinker_length = crosslinker_information[crosslink.Crosslinker][0] + accepted_deviation = crosslinker_information[crosslink.Crosslinker][1] except KeyError as e: missing_key = e.args[0] raise KeyError( @@ -155,7 +154,9 @@ def validate_with_angstrom_deviation( def bar_plot_of_valid_crosslinks( - crosslinking_df: pd.DataFrame, protein_to_validate: str, **kwargs + crosslinking_df: pd.DataFrame, + protein_to_validate: str, + crosslinker_information: dict[str, list[float]], ) -> list[Figure]: """ Creates a bar plot summarizing the number of valid and invalid cross-links @@ -164,17 +165,15 @@ def bar_plot_of_valid_crosslinks( :param crosslinking_df: DataFrame containing cross-linking data. :param protein_to_validate: UniProt ID of the protein to validate. - :param kwargs: Dynamically generated keyword arguments containing crosslinker - lengths and accepted deviations, passed to - validate_with_angstrom_deviation: + :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - accepted_deviation_for_: float :return: List containing a single bar plot object representing counts of valid and invalid cross-links. - :raises KeyError: If a required crosslinker field is missing in kwargs. + :raises KeyError: If a required crosslinker field is missing in crosslinker_information. 
""" valid_crosslinks, invalid_crosslinks = validate_with_angstrom_deviation( - crosslinking_df, protein_to_validate, **kwargs + crosslinking_df, protein_to_validate, crosslinker_information ) return [ create_bar_plot( diff --git a/backend/protzilla/form_helper.py b/backend/protzilla/form_helper.py index d04922863..60655d53f 100644 --- a/backend/protzilla/form_helper.py +++ b/backend/protzilla/form_helper.py @@ -53,13 +53,3 @@ def get_choices_for_metadata_non_sample_columns( ) -> list[Option]: metadata_choices = get_choices_for_metadata(run, instance_identifier) return [c for c in metadata_choices if c.label != "Sample"] - - -def get_crosslinker_names_from_crosslinker_df(run: Run) -> list[str]: - df = run.steps.get_step_output( - Step, output_key="crosslinking_df" - ) - if df is None or "Crosslinker" not in df.columns: - return [] - crosslinkers = df["Crosslinker"].dropna().unique() - return crosslinkers diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index acc5efa00..0b1cfde01 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -65,7 +65,6 @@ bar_plot_of_valid_crosslinks, ) from backend.protzilla.run import Run -from backend.protzilla.form_helper import get_crosslinker_names_from_crosslinker_df class TTestType(Enum): @@ -2472,6 +2471,14 @@ class CrossLinkingValidationWithAngstromDeviation(DataAnalysisStep): output_keys = ["crosslinking_df_result"] + @staticmethod + def _get_crosslinker_names_from_crosslinker_df(steps: StepManager) -> list[str]: + df = steps.get_step_output(Step, output_key="crosslinking_df") + if df is None or "Crosslinker" not in df.columns: + return [] + crosslinkers = df["Crosslinker"].dropna().unique() + return list(crosslinkers) + def create_form(self): return Form( label="Ångström Deviation", @@ -2484,7 +2491,7 @@ def create_form(self): ) def modify_form(self, form: Form, run: Run) -> None: - cross_linker = 
get_crosslinker_names_from_crosslinker_df(run) + cross_linker = self._get_crosslinker_names_from_crosslinker_df(run.steps) for cl in cross_linker: field_name = f"length_of_{cl}" if field_name not in form: @@ -2504,10 +2511,19 @@ def modify_form(self, form: Form, run: Run) -> None: plot_method = staticmethod(bar_plot_of_valid_crosslinks) def insert_dataframes(self, steps: StepManager, inputs) -> dict: + crosslinker_to_length_and_deviation = {} + for crosslinker in self._get_crosslinker_names_from_crosslinker_df(steps): + crosslinker_to_length_and_deviation[crosslinker] = [ + inputs.get(f"length_of_{crosslinker}"), + inputs.get(f"accepted_deviation_for_{crosslinker}"), + ] + inputs["crosslinker_information"] = crosslinker_to_length_and_deviation + inputs["crosslinking_df"] = steps.get_step_output( Step, "crosslinking_df", ) if inputs.get("crosslinking_df") is None: raise ValueError("No cross linking data found.") + return inputs diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 6c4fb2c8e..a12415133 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -268,18 +268,7 @@ def plot_input(self) -> dict: input_parameters = inspect.signature(self.plot_method).parameters - has_kwargs = any( - param.kind == inspect.Parameter.VAR_KEYWORD - for param in input_parameters.values() - ) - - required_keys = [ - key - for key, param in input_parameters.items() - if param.default == inspect.Parameter.empty - and param.kind - in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY) - ] + required_keys = [key for key, param in input_parameters.items()] for key in required_keys: if key not in plot_input: raise ValueError(f"Missing required input '{key}' for the plot method") @@ -288,10 +277,6 @@ def plot_input(self) -> dict: key: plot_input[key] for key in input_parameters.keys() if key in plot_input } - if has_kwargs: - kwargs_dict = {k: v for k, v in plot_input.items() if k not in output_dict} - 
output_dict.update(kwargs_dict) - return output_dict def validate_outputs(self, soft_check: bool = False) -> bool: From b7779e2f34e8a4b80ebd2a85210d9dfb20467510 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 24 Jan 2026 12:40:31 +0100 Subject: [PATCH 039/240] fix: change variable and dataframe names so that crosslinking validation is working after merge --- backend/protzilla/data_analysis/cross_linking_validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index 41a5cfb6f..511d895ab 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -73,14 +73,14 @@ def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( ) -> float: amino_acid_crosslinker1_is_bound_to = ( get_position_of_amino_acid_crosslinker_bound_to( - protein_sequence=fasta_df.iloc[0]["sequence"], + protein_sequence=fasta_df.at[0, "Protein Sequence"], peptide_sequence=crosslink.Peptide1, crosslinker_position_within_peptide=crosslink.CL_position1, ) ) amino_acid_crosslinker2_is_bound_to = ( get_position_of_amino_acid_crosslinker_bound_to( - protein_sequence=fasta_df.iloc[0]["sequence"], + protein_sequence=fasta_df.at[0, "Protein Sequence"], peptide_sequence=crosslink.Peptide2, crosslinker_position_within_peptide=crosslink.CL_position2, ) @@ -115,7 +115,7 @@ def validate_with_angstrom_deviation( :raises ValueError: If peptide sequences cannot be matched to the protein sequence. 
""" alphafold_data = fetch_alphafold_protein_structure( - uniprot=protein_to_validate, persist_uploads=False + uniprot_id=protein_to_validate, persist_uploads=False ) cif_df = alphafold_data["cif_df"] fasta_df = alphafold_data["sequence_df"] From e97884c22a67bf7f10eb3bbe223d4ac1c49d86a2 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 26 Jan 2026 23:01:54 +0100 Subject: [PATCH 040/240] feat: add calc method to crosslinking validation step --- .../data_analysis/cross_linking_validation.py | 55 ++++++++++--------- backend/protzilla/methods/data_analysis.py | 1 + 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index 511d895ab..81c5ea740 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -97,7 +97,7 @@ def validate_with_angstrom_deviation( crosslinking_df: pd.DataFrame, protein_to_validate: str, crosslinker_information: dict[str, list[float]], -) -> tuple[int, int]: +) -> dict: """ Validates cross-links by comparing the cross-linker lengths with the distances between the linked amino acids in the AlphaFold @@ -119,38 +119,33 @@ def validate_with_angstrom_deviation( ) cif_df = alphafold_data["cif_df"] fasta_df = alphafold_data["sequence_df"] - valid_cross_links = 0 - invalid_cross_links = 0 - for crosslink in crosslinking_df.itertuples(index=False): - if ( - crosslink.Protein_id1 != protein_to_validate - or crosslink.Protein_id2 != protein_to_validate - ): - continue - - distance_in_alphafold = ( - get_distance_between_crosslinker_connected_amino_acids_in_alphafold( - fasta_df, cif_df, crosslink - ) - ) + df = crosslinking_df.copy() # TODO: really necessary? 
+ mask = (df.Protein_id1 == protein_to_validate) & ( + df.Protein_id2 == protein_to_validate + ) + relevant_crosslinks_df = df[mask] + + def check_crosslink(crosslink): + distance = get_distance_between_crosslinker_connected_amino_acids_in_alphafold( + fasta_df, cif_df, crosslink + ) try: - crosslinker_length = crosslinker_information[crosslink.Crosslinker][0] - accepted_deviation = crosslinker_information[crosslink.Crosslinker][1] + crosslinker_length, accepted_deviation = crosslinker_information[ + crosslink.Crosslinker + ] except KeyError as e: missing_key = e.args[0] raise KeyError( - f"Missing required field '{missing_key}' for crosslinker '{crosslink.Crosslinker}'. " - "Please check that your form includes the correct fields." + f"Missing required field '{missing_key}' for crosslinker '{crosslink.Crosslinker}'." ) - upper_limit_on_allowed_distance = crosslinker_length + accepted_deviation + return distance <= (crosslinker_length + accepted_deviation) - if distance_in_alphafold <= upper_limit_on_allowed_distance: - valid_cross_links += 1 - else: - invalid_cross_links += 1 + df.loc[mask, "valid_crosslink"] = relevant_crosslinks_df.apply( + check_crosslink, axis=1 + ) - return valid_cross_links, invalid_cross_links + return dict(crosslinking_df_result=df, messages={}) def bar_plot_of_valid_crosslinks( @@ -172,9 +167,15 @@ def bar_plot_of_valid_crosslinks( valid and invalid cross-links. :raises KeyError: If a required crosslinker field is missing in crosslinker_information. 
""" - valid_crosslinks, invalid_crosslinks = validate_with_angstrom_deviation( + validated_df = validate_with_angstrom_deviation( crosslinking_df, protein_to_validate, crosslinker_information - ) + )["crosslinking_df_result"] + + evaluated = validated_df["valid_crosslink"].dropna() + + valid_crosslinks = (evaluated == True).sum() + invalid_crosslinks = (evaluated == False).sum() + return [ create_bar_plot( values_of_sectors=[ diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 0b1cfde01..7dc632c7e 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2509,6 +2509,7 @@ def modify_form(self, form: Form, run: Run) -> None: form.add_field(allowed_length_deviation_field) plot_method = staticmethod(bar_plot_of_valid_crosslinks) + calc_method = staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: crosslinker_to_length_and_deviation = {} From 9a5fca72d945b9d0c56f9206cb8b24ba32f186d2 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Tue, 27 Jan 2026 18:16:47 +0100 Subject: [PATCH 041/240] feat: add lower bound on allowed deviation of crosslink length --- .../data_analysis/cross_linking_validation.py | 28 ++++++++++++---- backend/protzilla/methods/data_analysis.py | 32 ++++++++++++------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index 81c5ea740..a78524975 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -126,23 +126,37 @@ def validate_with_angstrom_deviation( ) relevant_crosslinks_df = df[mask] - def check_crosslink(crosslink): + def check_crosslink(crosslink: pd.Series) -> pd.Series: distance = get_distance_between_crosslinker_connected_amino_acids_in_alphafold( fasta_df, 
cif_df, crosslink ) try: - crosslinker_length, accepted_deviation = crosslinker_information[ - crosslink.Crosslinker - ] + ( + crosslinker_length, + accepted_deviation_upper_bound, + accepted_deviation_lower_bound, + ) = crosslinker_information[crosslink.Crosslinker] except KeyError as e: missing_key = e.args[0] raise KeyError( f"Missing required field '{missing_key}' for crosslinker '{crosslink.Crosslinker}'." ) - return distance <= (crosslinker_length + accepted_deviation) + # Fallback to default deviation bounds when not explicitly provided + accepted_distance_lower_bound = crosslinker_length - ( + accepted_deviation_lower_bound or crosslinker_length + ) + accepted_distance_upper_bound = ( + accepted_deviation_upper_bound or 1e9 + ) + crosslinker_length + + valid = ( + accepted_distance_lower_bound <= distance <= accepted_distance_upper_bound + ) + + return pd.Series({"alphafold_distance": distance, "valid_crosslink": valid}) - df.loc[mask, "valid_crosslink"] = relevant_crosslinks_df.apply( - check_crosslink, axis=1 + df.loc[mask, ["alphafold_distance", "valid_crosslink"]] = ( + relevant_crosslinks_df.apply(check_crosslink, axis=1) ) return dict(crosslinking_df_result=df, messages={}) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 7dc632c7e..897997d7b 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2500,26 +2500,24 @@ def modify_form(self, form: Form, run: Run) -> None: label=f"Length of {cl} in Ångström", min=0, ) - allowed_length_deviation_field = FloatField( - name=f"accepted_deviation_for_{cl}", - label=f"Accepted deviation for {cl} Cross-Links in Ångström", + upper_bound_length_deviation_field = FloatField( + name=f"upper_accepted_deviation_for_{cl}", + label=f"Upper Bound on the accepted deviation for {cl} Cross-Links in Ångström (0 equals no bound)", + min=0, + ) + lower_bound_length_deviation_field = FloatField( + 
name=f"lower_accepted_deviation_for_{cl}", + label=f"Lower Bound on the accepted deviation for {cl} Cross-Links in Ångström (0 equals no bound)", min=0, ) form.add_field(crosslinker_length_field) - form.add_field(allowed_length_deviation_field) + form.add_field(upper_bound_length_deviation_field) + form.add_field(lower_bound_length_deviation_field) plot_method = staticmethod(bar_plot_of_valid_crosslinks) calc_method = staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - crosslinker_to_length_and_deviation = {} - for crosslinker in self._get_crosslinker_names_from_crosslinker_df(steps): - crosslinker_to_length_and_deviation[crosslinker] = [ - inputs.get(f"length_of_{crosslinker}"), - inputs.get(f"accepted_deviation_for_{crosslinker}"), - ] - inputs["crosslinker_information"] = crosslinker_to_length_and_deviation - inputs["crosslinking_df"] = steps.get_step_output( Step, "crosslinking_df", @@ -2527,4 +2525,14 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: if inputs.get("crosslinking_df") is None: raise ValueError("No cross linking data found.") + # although crosslinker_information is not a dataframe we need to insert the user information regarding the crosslinks as a dictionary into the inputs + crosslinker_to_length_and_deviation = {} + for crosslinker in self._get_crosslinker_names_from_crosslinker_df(steps): + crosslinker_to_length_and_deviation[crosslinker] = [ + inputs.get(f"length_of_{crosslinker}"), + inputs.get(f"upper_accepted_deviation_for_{crosslinker}"), + inputs.get(f"lower_accepted_deviation_for_{crosslinker}"), + ] + inputs["crosslinker_information"] = crosslinker_to_length_and_deviation + return inputs From e9d2e134e5c381edd2b53cbb17ebfab701f5ffab Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 28 Jan 2026 13:50:05 +0100 Subject: [PATCH 042/240] feat: Add a new tab to settings where one can upload and delete predicted protein structures --- backend/main/urls.py | 3 + 
backend/main/views_helper.py | 86 ++++++ backend/main/views_settings.py | 156 +++++++++- .../alphafold_protein_structure_load.py | 11 +- .../app/settings/other-settings/index.ts | 2 + .../protein-structure-upload.tsx | 270 ++++++++++++++++++ .../src/components/app/settings/settings.tsx | 13 +- .../core/shared/icon/icons/index.ts | 1 + .../core/shared/icon/icons/structure.svg | 68 +++++ 9 files changed, 600 insertions(+), 10 deletions(-) create mode 100644 frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx create mode 100644 frontend/src/components/core/shared/icon/icons/structure.svg diff --git a/backend/main/urls.py b/backend/main/urls.py index c683abbf8..4e22eec3d 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -61,6 +61,9 @@ path("api/get_databases", views_settings.get_databases, name="get_databases"), path("api/upload_database", views_settings.database_upload, name="database_upload"), path("api/delete_database", views_settings.database_delete, name="database_delete"), + path("api/get_prot_structure", views_settings.get_prot_structure, name="get_prot_structure"), + path("api/upload_prot_structure", views_settings.upload_prot_structure, name="upload_prot_structure"), + path("api/prot_structure_delete", views_settings.prot_structure_delete, name="prot_structure_delete"), path( "api/load_ptm_settings", views_settings.load_ptm_settings, diff --git a/backend/main/views_helper.py b/backend/main/views_helper.py index 72755d2b0..ba9546ab5 100644 --- a/backend/main/views_helper.py +++ b/backend/main/views_helper.py @@ -1,9 +1,11 @@ import re +import shutil from pathlib import Path import numpy as np from backend.protzilla.constants.paths import SETTINGS_PATH +from backend.protzilla.constants.protzilla_logging import logger from backend.protzilla.disk_operator import YamlOperator from backend.protzilla.steps import StepManager, Step from backend.protzilla.utilities import name_to_title @@ -184,3 +186,87 @@ def 
load_yaml_from_file(path: Path) -> str: raise FileNotFoundError(f"File {path} does not exist.") with path.open("r") as f: return f.read() + + +def copy_file_to_directory(source_file: Path, dest_dir: Path) -> tuple[bool, str]: + """ + Copy a single file to a destination directory. + Creates the destination directory if it doesn't exist. + + :param source_file: Path to the source file + :param dest_dir: Path to the destination directory + :return: Tuple of (success: bool, message: str) + """ + + if not source_file.exists(): + msg = f"Source file does not exist: {source_file}" + logger.error(msg) + return False, msg + + if not source_file.is_file(): + msg = f"Source path is not a file: {source_file}" + logger.error(msg) + return False, msg + + try: + dest_dir.mkdir(parents=True, exist_ok=True) + dest_file = dest_dir / source_file.name + + shutil.copy2(source_file, dest_file) + + msg = f"Successfully copied file {source_file} to {dest_dir}" + logger.info(msg) + return True, msg + + except OSError as e: + msg = f"Failed to copy file: {str(e)}" + logger.error(msg) + return False, msg + + +def validate_uploaded_files( + upload_dir: Path, file_mapping: dict[str, list[str]] +) -> tuple[bool, str]: + """ + Validate that expected files exist in the upload directory with correct formats. 
+ + :param upload_dir: Path to the upload directory + :param file_mapping: Dictionary mapping file names to list of valid extensions + e.g., {"cif_file": [".cif"], "fasta_file": [".fasta", ".fa"]} + :return: Tuple of (success: bool, message: str) + """ + if not upload_dir.exists(): + msg = f"Upload directory does not exist: {upload_dir}" + logger.error(msg) + return False, msg + + missing_files = [] + invalid_files = [] + + for file_name, valid_extensions in file_mapping.items(): + file_path = upload_dir / file_name + if not file_path.exists(): + missing_files.append(file_name) + else: + # Check file extension + if not any(file_name.lower().endswith(ext) for ext in valid_extensions): + invalid_files.append( + f"{file_name} (expected: {', '.join(valid_extensions)})" + ) + + # Build error message + error_messages = [] + if missing_files: + error_messages.append(f"Missing files: {', '.join(missing_files)}") + if invalid_files: + error_messages.append(f"Invalid file format: {', '.join(invalid_files)}") + + if error_messages: + msg = " | ".join(error_messages) + logger.warning(msg) + return False, msg + + msg = f"All {len(file_mapping)} files validated successfully" + logger.info(msg) + return True, msg + diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index f400f28df..130418110 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -1,8 +1,10 @@ import json import os import shutil -from datetime import date +from datetime import date, datetime, timezone from io import BytesIO +from pathlib import Path + import pandas import plotly.graph_objects as go @@ -12,7 +14,7 @@ from django.http import JsonResponse, FileResponse from backend.main import settings -from backend.main.views_helper import sanitize_name, load_settings_from_file +from backend.main.views_helper import sanitize_name, load_settings_from_file, validate_uploaded_files, copy_file_to_directory from backend.protzilla.constants.paths import 
EXTERNAL_DATA_PATH, SETTINGS_PATH from backend.protzilla.data_integration.database_query import ( uniprot_columns, @@ -221,6 +223,156 @@ def save_ptm_settings(request, default_file_stem: str = DEFAULT_PTM_SETTINGS_FIL ) +# <--- Protein Structure Predictions ---> + +AF_DICT_PATH = EXTERNAL_DATA_PATH / "alphafold" + +def get_prot_structure(request): + metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" + df = pandas.read_csv(metadata_csv) + + df_infos = df.rename( + columns={ + "entryID": "entry_id", + "uniprotAccession": "uniprot_id", + "modelCreatedDate": "date_modified", + "gene": "gene", + "alphafold_version": "af_version", + } + ).to_dict(orient="records") + + return JsonResponse(df_infos, safe=False) + + +def upload_prot_structure(request): + if request.method == "POST": + data = json.loads(request.body) + uniprot_id = data.get("uniprot_id") + entry_id = data.get("entry_id") + af_version = data.get("af_version") + gene = data.get("gene") + cif_file = data.get("cif_file") + confidence = data.get("confidence") + pae = data.get("pae") + fasta_file = data.get("fasta_file") + + # Validate uploaded files and copy them to source directory out of temp directory + file_mapping = { + cif_file: [".cif"], + confidence: [".json"], + pae: [".json"], + fasta_file: [".fasta", ".fa"], + } + + is_valid, validation_message = validate_uploaded_files( + settings.FILE_UPLOAD_TEMP_DIR, file_mapping + ) + if not is_valid: + messages.add_message(request, messages.ERROR, validation_message, "alert-danger") + return JsonResponse({"success": False, "message": validation_message}, status=400) + + af_path = AF_DICT_PATH / entry_id.upper() + if af_path.exists(): + return JsonResponse( + {"success": False, "message": "Entry ID is not unique."}, status=405 + ) + else: + af_path.mkdir(parents=True, exist_ok=True) + + for file_name in [cif_file, confidence, pae, fasta_file]: + source_dir = settings.FILE_UPLOAD_TEMP_DIR / file_name + success, message = copy_file_to_directory( + source_dir, + 
af_path + ) + + # add row to metadata csv + AF_DICT_PATH.mkdir(parents=True, exist_ok=True) + metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" + df = pandas.read_csv(metadata_csv) + + now_utc = datetime.now(timezone.utc) + formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") + + new_row = { + "entryID": entry_id, + "uniprotAccession": uniprot_id, + "modelCreatedDate": formatted, + "gene": gene, + "alphafold_version": af_version + } + + df = pandas.concat([df, pandas.DataFrame([new_row])], ignore_index=True) + df.to_csv(metadata_csv, index=False) + + return JsonResponse( + { + "success": True, + "message": ( + f"Predicted Protein Structure uploaded successfully. \n {message}" + if len(message) > 0 + else "Predicted Protein Structure uploaded successfully." + ), + }, + status=200, + ) + else: + return JsonResponse( + {"success": False, "message": "Invalid request method"}, status=405 + ) + + +def prot_structure_delete(request): + if request.method != "POST": + return JsonResponse( + {"success": False, "message": "Invalid request method"}, status=405 + ) + + data = json.loads(request.body) + entry_id = (data.get("entry_id") or "").strip() + if not entry_id: + return JsonResponse( + {"success": False, "message": "Missing entry_id"}, status=400 + ) + + # delete folder with files for the protein structure + target_dir = AF_DICT_PATH / entry_id.upper() + metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" + + if not target_dir.exists() or not target_dir.is_dir(): + return JsonResponse( + {"success": False, "message": f"Entry folder not found: {target_dir.name}"}, + status=404, + ) + + try: + shutil.rmtree(target_dir) + except Exception as e: + return JsonResponse( + {"success": False, "message": f"Failed to delete folder: {str(e)}"}, + status=500, + ) + + # remove entry out of metadata csv + if metadata_csv.exists() and metadata_csv.is_file() and metadata_csv.stat().st_size > 0: + try: + df = pandas.read_csv(metadata_csv, dtype=str) + df = 
df[df["entryID"].fillna("").str.strip().str.upper() != entry_id.upper()] + df.to_csv(metadata_csv, index=False) + + except Exception as e: + return JsonResponse( + { + "success": True, + "message": f"Folder deleted. Failed to update CSV: {str(e)}", + }, + status=200, + ) + + return JsonResponse( + {"success": True, "message": "Entry deleted successfully"}, status=200 + ) + # <--- Databases ---> diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index aa917c6e3..87dfd3513 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -88,7 +88,7 @@ def handle_alphafold_files( uniprot: str, seq: str, metadata_df: pd.DataFrame, - acc: str, + entry_id: str, persist_uploads: bool = False, ) -> dict[str, pd.DataFrame | None]: """ @@ -102,7 +102,7 @@ def handle_alphafold_files( :param uniprot: The UniProt ID of the protein :param seq: The protein sequence :param metadata_df: DataFrame containing AlphaFold metadata - :param acc: The accession number (used for directory naming) + :param entry_id: The entry_id (in the case of fetching from AF DB the same as uniprot id) (used for directory naming) :param persist_uploads: If True, files are saved persistently; if False, only loaded into memory :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data or None values for failed loads """ @@ -131,8 +131,8 @@ def handle_alphafold_files( try: if metadata_csv.exists(): existing = pd.read_csv(metadata_csv, dtype=str) - if acc and "uniprotAccession" in existing.columns: - existing = existing[existing["uniprotAccession"] != acc] + if entry_id and "entry_id" in existing.columns: + existing = existing[existing["entry_id"] != entry_id] combined = pd.concat([existing, metadata_df], ignore_index=True) combined.to_csv(metadata_csv, index=False) else: @@ -239,14 +239,13 @@ def 
fetch_alphafold_protein_structure( files_urls[key] = r[key] metadata_df = pd.DataFrame([data]) - acc = data.get("uniprotAccession") alpha_dfs = handle_alphafold_files( files_urls=files_urls, uniprot=uniprot_id, seq=seq_tmp, metadata_df=metadata_df, - acc=acc, + entry_id=uniprot_id, persist_uploads=persist_uploads, ) diff --git a/frontend/src/components/app/settings/other-settings/index.ts b/frontend/src/components/app/settings/other-settings/index.ts index 969a2f042..44646465a 100644 --- a/frontend/src/components/app/settings/other-settings/index.ts +++ b/frontend/src/components/app/settings/other-settings/index.ts @@ -1,3 +1,5 @@ export * from "./citation"; export * from "./database-settings"; export * from "./github"; +export * from "./ptm-vis-settings"; +export * from "./protein-structure-upload"; diff --git a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx new file mode 100644 index 000000000..dedae1b6c --- /dev/null +++ b/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx @@ -0,0 +1,270 @@ +import { useNotification } from "@protzilla/app"; +import { DeleteModal, Form, SecondaryButton, SectionTitle, Text } from "@protzilla/core"; +import { useToggleableState } from "@protzilla/hooks"; +import { spacing } from "@protzilla/theme"; +import { callApi, callApiWithParameters } from "@protzilla/utils"; +import { useEffect, useState } from "react"; +import { styled } from "styled-components"; + + +const ProteinStructureTitle = styled(SectionTitle)` + padding-top: ${spacing("large")}; + padding-bottom: ${spacing("small")}; +`; + +const ProtStructureList = styled.div` + display: flex; + flex-direction: column; + gap: ${spacing("verySmall")}; +`; + +interface ProtStructureProps { + entry_id: string; + uniprot_id: string; + date_modified: string; + gene: string; + af_version: string; + handleDelete?: () => void; +} + +const 
ProtStructureContainer = styled.div` + display: flex; + flex-direction: row; + align-items: center; + justify-content: space-between; + padding-left: ${spacing("listIndentation")}; + padding-top: ${spacing("verySmall")}; + padding-bottom: ${spacing("verySmall")}; +`; + +const ProtStructureInfo = styled.div` + display: flex; + justify-content: space-between; + align-content: center; + flex-direction: column; + width: 90%; +`; + +const ProtStructureEntry = ({ + entry_id, + uniprot_id, + date_modified, + gene, + af_version, + handleDelete, +}: ProtStructureProps) => { + + return ( + + + + + + + + ); +}; + +export const ProteinStructureUpload = () => { + const notify = useNotification(); + const [protStructureList, setProtStructureList] = useState([]); + const [isDeleteModalOpen, openDeleteModal, closeDeleteModal] = useToggleableState(false); + const [selectedProtStructure, setSelectedProtStructure] = useState(""); + + const fetchDatabases = async () => { + const protStructures = await callApi("get_prot_structure"); + if (protStructures) { + setProtStructureList(protStructures); + } + }; + + useEffect(() => { + void fetchDatabases(); + }, []); + + const handleAddProteinStructure = async ( + uniprot_id: string, + entry_id: string, + af_version: string, + gene: string, + cif_file: string, + confidence: string, + pae: string, + fasta_file: string, + ) => { + const response = await callApiWithParameters("upload_prot_structure", { + uniprot_id: uniprot_id, + entry_id: entry_id, + af_version: af_version, + gene: gene, + cif_file: cif_file, + confidence: confidence, + pae: pae, + fasta_file: fasta_file, + }); + if (response?.success) { + notify({ + title: "Predicted protein structure upload", + message: response.message as string, + type: "success", + isClosingAutomatically: true, + }); + } else { + notify({ + title: "Predicted protein structure upload failed", + message: response.message ?? 
"Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + }; + + const onDeleteProtStructure = (entry_id: string) => { + openDeleteModal(); + setSelectedProtStructure(entry_id); + }; + + const handleDeleteProtStructure = async (entry_id: string) => { + const response = await callApiWithParameters("prot_structure_delete", { + entry_id: entry_id, + }); + if (response?.success) { + notify({ + title: "Protein structure deleted", + message: response.message as string, + type: "success", + isClosingAutomatically: true, + }); + } else { + notify({ + title: "Protein structure deletion failed", + message: response?.message ?? "Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + void fetchDatabases(); + closeDeleteModal(); + }; + + return ( +
+ + + +
{ + void handleAddProteinStructure( + data.uniprot_id as string, + data.entry_id as string, + data.af_version as string, + data.gene as string, + data.cif_file as string, + data.confidence_file as string, + data.pae_file as string, + data.fasta_file as string, + ); + }} + /> + + + {protStructureList.map((ps) => ( + { + onDeleteProtStructure(ps.entry_id); + }} + /> + ))} + + void handleDeleteProtStructure(selectedProtStructure)} + title={`Deleted protein structure "${selectedProtStructure}"?`} + /> +
+ ); +}; diff --git a/frontend/src/components/app/settings/settings.tsx b/frontend/src/components/app/settings/settings.tsx index 468db8927..7d43f197e 100644 --- a/frontend/src/components/app/settings/settings.tsx +++ b/frontend/src/components/app/settings/settings.tsx @@ -3,8 +3,7 @@ import { spacing } from "@protzilla/theme"; import { useState } from "react"; import { styled } from "styled-components"; -import { DatabaseSettings, GitHub } from "./other-settings/"; -import { PTMVisSettings } from "./other-settings/ptm-vis-settings.tsx"; +import { DatabaseSettings, GitHub, ProteinStructureUpload, PTMVisSettings } from "./other-settings/"; import { PlotSettingsModal } from "./plot-settings"; import { SettingsProps } from "./settings.props.ts"; import { DiscardModal, Modal, ToggleableButton } from "../../core/"; @@ -111,6 +110,15 @@ export const Settings: React.FC = ({ handleSwitchSection("ptm-vis"); }} /> + { + handleSwitchSection("protein-struc-upload"); + }} + /> = ({ )} {selectedSetting === "database" && } {selectedSetting === "ptm-vis" && } + {selectedSetting === "protein-struc-upload" && } {selectedSetting === "github" && } + + + +Created by potrace 1.15, written by Peter Selinger 2001-2017 + + + + + From f7fe67d6f2447d690d3c27c28555e45e4936f963 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Wed, 28 Jan 2026 15:21:55 +0100 Subject: [PATCH 043/240] Implement display of failed rows (Issue #194) --- .../importing/crosslinking_import.py | 26 ++++++++++++++----- backend/protzilla/methods/importing.py | 2 +- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index f3d11f76c..1e9b3596d 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -657,6 +657,24 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: return 
df.loc[:, columns_in_crosslinking_df] +def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: + protein_with_error_set = set() + + if "Protein1" in failed_df.columns and "Protein2" in failed_df.columns: + protein_cols = ["Protein1", "Protein2"] + elif "Protein_id1" in failed_df.columns and "Protein_id2" in failed_df.columns: + protein_cols = ["Protein_id1", "Protein_id2"] + + error_cols = ["Protein1_error", "Protein2_error"] + + for prot_col, err_col in zip(protein_cols, error_cols): + for protein_val, error_val in zip(failed_df[prot_col], failed_df[err_col]): + if pd.notna(error_val): + protein_with_error_set.add(f"{protein_val} -> {error_val}") + + return "\n".join(sorted(protein_with_error_set)) + + def crosslinking_import(file_path: Path) -> dict: try: if file_path.suffix == ".csv": @@ -683,11 +701,7 @@ def crosslinking_import(file_path: Path) -> dict: msg = f"Warning: {len(failed_df)} rows failed to import, however {len(good_df)} cross-links were successfully imported." 
messages = [ dict(level=logging.WARNING, msg=msg), - dict(level=logging.WARNING, msg=f"Failed rows:\n{failed_df}"), + dict(level=logging.WARNING, msg=f"Failed proteins:\n{aggregate_failed_proteins_for_display(failed_df)}"), ] - # TODO: Implement display of failed rows (Issue #194) - pd.set_option("display.max_columns", None) - failed_df.to_csv("failed_rows.csv", index=False) - print("Failed rows saved to failed_rows.csv") - return dict(crosslinking_df=good_df, messages=messages) + return dict(crosslinking_df=good_df, imported_rows_with_errors_df=failed_df, messages=messages) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index a86a3ff56..44728556d 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -447,7 +447,7 @@ class CrosslinkingImport(ImportingStep): operation = "Cross-Linking Data Import" method_description = "Import a file containing cross-linking data" - output_keys = ["crosslinking_df"] + output_keys = ["crosslinking_df", "imported_rows_with_errors_df"] def create_form(self): return Form( From f4fdb2a7a3087f8030bebe4ceac585613c093fdb Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 28 Jan 2026 16:28:19 +0100 Subject: [PATCH 044/240] test: add tests for crosslinking validation based on Angstrom deviation --- backend/tests/main/test_views_helper.py | 1 + .../test_crosslinking_validation.py | 104 ++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 backend/tests/protzilla/data_analysis/test_crosslinking_validation.py diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index fb9b793bc..0b39fe4e4 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -78,6 +78,7 @@ def test_get_all_possible_step_names(): "PlotGOEnrichmentDotPlot", "PlotGSEADotPlot", "PlotGSEAEnrichmentPlot", + "CrossLinkingValidationWithAngstromDeviation", } steps = 
get_all_possible_steps() diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py new file mode 100644 index 000000000..476ac41b9 --- /dev/null +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -0,0 +1,104 @@ +import pandas as pd +from unittest.mock import patch +from unittest.mock import MagicMock + + +from backend.protzilla.data_analysis.cross_linking_validation import ( + get_distance_between_two_amino_acids_in_angstrom, + get_position_of_amino_acid_crosslinker_bound_to, + validate_with_angstrom_deviation, +) +from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation + + +def test_get_distance_between_two_amino_acids_in_angstrom(): + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA", "CA"], + "_atom_site.label_seq_id": [1, 2], + "_atom_site.Cartn_x": [0, 3], + "_atom_site.Cartn_y": [0, 0], + "_atom_site.Cartn_z": [0, 0], + } + ) + + dist = get_distance_between_two_amino_acids_in_angstrom(1, 2, cif_df) + assert dist == 3.0 + + +def test_get_position_of_amino_acid_crosslinker_bound_to(): + protein = "MABCDEFGHIJK" + peptide = "ABC" + pos = get_position_of_amino_acid_crosslinker_bound_to(protein, peptide, 2) + assert pos == 3 + + +@patch( + "backend.protzilla.data_analysis.cross_linking_validation.fetch_alphafold_protein_structure" +) +def test_validate_with_angstrom_deviation(mock_fetch): + # Fake AlphaFold Data + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA", "CA"], + "_atom_site.label_seq_id": [1, 2], + "_atom_site.Cartn_x": [0, 4], + "_atom_site.Cartn_y": [0, 0], + "_atom_site.Cartn_z": [0, 0], + } + ) + + fasta_df = pd.DataFrame({"Protein Sequence": ["AB"]}) + + mock_fetch.return_value = {"cif_df": cif_df, "sequence_df": fasta_df} + + # Fake Crosslink Data + crosslinking_df = pd.DataFrame( + { + "Protein_id1": ["P12345"], + "Protein_id2": ["P12345"], + "Peptide1": ["A"], + 
"Peptide2": ["B"], + "CL_position1": [1], + "CL_position2": [1], + "Crosslinker": ["DSS"], + } + ) + + crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Länge 5 Å ± 1 Å + + result = validate_with_angstrom_deviation( + crosslinking_df, + protein_to_validate="P12345", + crosslinker_information=crosslinker_information, + ) + + df = result["crosslinking_df_result"] + + assert "alphafold_distance" in df.columns + assert "valid_crosslink" in df.columns + assert df.loc[0, "alphafold_distance"] == 4.0 + assert df.loc[0, "valid_crosslink"] is True + + +def test_modify_form_creates_crosslinker_fields(): + crosslinking_df = pd.DataFrame({"Crosslinker": ["DSS", "BS3", "DSS"]}) + + steps = MagicMock() + steps.get_step_output.return_value = crosslinking_df + + run = MagicMock() + run.steps = steps + + step = CrossLinkingValidationWithAngstromDeviation() + form = step.create_form() + + step.modify_form(form, run) + + assert "length_of_DSS" in form + assert "upper_accepted_deviation_for_DSS" in form + assert "lower_accepted_deviation_for_DSS" in form + + assert "length_of_BS3" in form + assert "upper_accepted_deviation_for_BS3" in form + assert "lower_accepted_deviation_for_BS3" in form From 5e1a4ee513ee1e3803ce66b02a860fa60c48438f Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 28 Jan 2026 18:14:05 +0100 Subject: [PATCH 045/240] refactor: extract determination of relevant atom of amino acid into placeholder function --- .../data_analysis/cross_linking_validation.py | 67 +++++++++++-------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index a78524975..cc68c7b7e 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -8,30 +8,32 @@ from protzilla.data_preprocessing.plots import create_bar_plot -def get_coordinates_of_ca_atom_from_cif_df( - cif_df: 
pd.DataFrame, amino_acid_position: int +def get_reactive_atom_of_amino_acid_residue(amino_acid_kind: str) -> str: + # right now we always return the central C atom + # later we might want to return the reactive atom of the amino acid residue of the specific amino acid kind + return "CA" + + +def get_coordinates_of_atom_crosslinker_bound_to( + amino_acid_position_where_crosslinker_bound: int, + amino_acid_kind: str, + cif_df: pd.DataFrame, ) -> tuple[float, float, float]: - """ - Extract the 3D coordinates of the C-alpha (CA) atom for a given amino acid - position from CIF-derived DataFrame. - - :param cif_df: DataFrame containing atomic data parsed from a CIF file. Must include the columns - "_atom_site.label_atom_id", "_atom_site.label_seq_id", - "_atom_site.Cartn_x", "_atom_site.Cartn_y", and "_atom_site.Cartn_z". - :param amino_acid_position: The sequence position of the amino acid whose C-alpha (CA) atom - coordinates should be extracted. - :return: A tuple (x, y, z) of floats representing the Cartesian coordinates of the C-alpha atom. - :raises ValueError: If no C-alpha atom is found for the given amino acid position. - """ - cif_df = cif_df[cif_df["_atom_site.label_atom_id"] == "CA"] + relevant_atom = get_reactive_atom_of_amino_acid_residue(amino_acid_kind) + # Filter to the exact reactive atom of the amino acid residue + # where the crosslinker is bound (e.g. CA at position 45) cif_df = cif_df[ - cif_df["_atom_site.label_seq_id"].astype(int) == amino_acid_position + (cif_df["_atom_site.label_atom_id"] == relevant_atom) + & ( + cif_df["_atom_site.label_seq_id"].astype(int) + == amino_acid_position_where_crosslinker_bound + ) ] if cif_df.empty: raise ValueError( - f"No central Ca atom found for amino acid at position {amino_acid_position}." + f"No {relevant_atom} atom found for amino acid at position {amino_acid_position_where_crosslinker_bound}." 
) row = cif_df.iloc[0] @@ -44,10 +46,18 @@ def get_coordinates_of_ca_atom_from_cif_df( def get_distance_between_two_amino_acids_in_angstrom( - position1: int, position2: int, cif_df: pd.DataFrame + amino_acid_position1: int, + amino_acid_position2: int, + amino_acid_kind1: str, + amino_acid_kind2: str, + cif_df: pd.DataFrame, ) -> float: - x1, y1, z1 = get_coordinates_of_ca_atom_from_cif_df(cif_df, position1) - x2, y2, z2 = get_coordinates_of_ca_atom_from_cif_df(cif_df, position2) + x1, y1, z1 = get_coordinates_of_atom_crosslinker_bound_to( + amino_acid_position1, amino_acid_kind1, cif_df + ) + x2, y2, z2 = get_coordinates_of_atom_crosslinker_bound_to( + amino_acid_position2, amino_acid_kind2, cif_df + ) distance = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2 + (z2 - z1) ** 2) @@ -71,23 +81,26 @@ def get_position_of_amino_acid_crosslinker_bound_to( def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( fasta_df: pd.DataFrame, cif_df: pd.DataFrame, crosslink ) -> float: - amino_acid_crosslinker1_is_bound_to = ( + protein_sequence = fasta_df.at[0, "Protein Sequence"] + amino_acid_position_crosslinker1_is_bound_to = ( get_position_of_amino_acid_crosslinker_bound_to( - protein_sequence=fasta_df.at[0, "Protein Sequence"], + protein_sequence=protein_sequence, peptide_sequence=crosslink.Peptide1, crosslinker_position_within_peptide=crosslink.CL_position1, ) ) - amino_acid_crosslinker2_is_bound_to = ( + amino_acid_position_crosslinker2_is_bound_to = ( get_position_of_amino_acid_crosslinker_bound_to( - protein_sequence=fasta_df.at[0, "Protein Sequence"], + protein_sequence=protein_sequence, peptide_sequence=crosslink.Peptide2, crosslinker_position_within_peptide=crosslink.CL_position2, ) ) distance_in_alphafold = get_distance_between_two_amino_acids_in_angstrom( - amino_acid_crosslinker1_is_bound_to, - amino_acid_crosslinker2_is_bound_to, + amino_acid_position_crosslinker1_is_bound_to, + amino_acid_position_crosslinker2_is_bound_to, + 
protein_sequence[amino_acid_position_crosslinker1_is_bound_to], + protein_sequence[amino_acid_position_crosslinker2_is_bound_to], cif_df, ) return distance_in_alphafold @@ -119,7 +132,7 @@ def validate_with_angstrom_deviation( ) cif_df = alphafold_data["cif_df"] fasta_df = alphafold_data["sequence_df"] - df = crosslinking_df.copy() # TODO: really necessary? + df = crosslinking_df.copy() mask = (df.Protein_id1 == protein_to_validate) & ( df.Protein_id2 == protein_to_validate From 97aa58af7f5257df757e553c56063a3387d685b2 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 28 Jan 2026 18:18:08 +0100 Subject: [PATCH 046/240] refactor: only keep rows in result df which contain crosslinks that were checked with validation method --- backend/protzilla/data_analysis/cross_linking_validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index cc68c7b7e..bce116f68 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -172,6 +172,8 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: relevant_crosslinks_df.apply(check_crosslink, axis=1) ) + df = df[df["valid_crosslink"].notna()] + return dict(crosslinking_df_result=df, messages={}) From 659d8d1e47848efdfb6d4d947187014b7b79a0ec Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 28 Jan 2026 18:19:41 +0100 Subject: [PATCH 047/240] fix: spelling mistake --- backend/protzilla/methods/data_analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 897997d7b..9e958071f 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2502,12 +2502,12 @@ def modify_form(self, form: Form, run: Run) -> None: ) upper_bound_length_deviation_field = 
FloatField( name=f"upper_accepted_deviation_for_{cl}", - label=f"Upper Bound on the accepted deviation for {cl} Cross-Links in Ångström (0 equals no bound)", + label=f"Upper bound on the accepted deviation for {cl} Cross-Links in Ångström (0 equals no bound)", min=0, ) lower_bound_length_deviation_field = FloatField( name=f"lower_accepted_deviation_for_{cl}", - label=f"Lower Bound on the accepted deviation for {cl} Cross-Links in Ångström (0 equals no bound)", + label=f"Lower bound on the accepted deviation for {cl} Cross-Links in Ångström (0 equals no bound)", min=0, ) form.add_field(crosslinker_length_field) From bd205ee9a54f14e57e43e191492e90c28e779c52 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 28 Jan 2026 18:31:32 +0100 Subject: [PATCH 048/240] fix: broken tests --- .../data_analysis/cross_linking_validation.py | 5 +++-- .../data_analysis/test_crosslinking_validation.py | 15 --------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index bce116f68..551705bd4 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -11,6 +11,7 @@ def get_reactive_atom_of_amino_acid_residue(amino_acid_kind: str) -> str: # right now we always return the central C atom # later we might want to return the reactive atom of the amino acid residue of the specific amino acid kind + # as soon as we change this, we will need to change the test test_validate_with_angstrom_deviation return "CA" @@ -99,8 +100,8 @@ def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( distance_in_alphafold = get_distance_between_two_amino_acids_in_angstrom( amino_acid_position_crosslinker1_is_bound_to, amino_acid_position_crosslinker2_is_bound_to, - protein_sequence[amino_acid_position_crosslinker1_is_bound_to], - 
protein_sequence[amino_acid_position_crosslinker2_is_bound_to], + protein_sequence[amino_acid_position_crosslinker1_is_bound_to - 1], + protein_sequence[amino_acid_position_crosslinker2_is_bound_to - 1], cif_df, ) return distance_in_alphafold diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 476ac41b9..0377692a0 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -11,21 +11,6 @@ from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation -def test_get_distance_between_two_amino_acids_in_angstrom(): - cif_df = pd.DataFrame( - { - "_atom_site.label_atom_id": ["CA", "CA"], - "_atom_site.label_seq_id": [1, 2], - "_atom_site.Cartn_x": [0, 3], - "_atom_site.Cartn_y": [0, 0], - "_atom_site.Cartn_z": [0, 0], - } - ) - - dist = get_distance_between_two_amino_acids_in_angstrom(1, 2, cif_df) - assert dist == 3.0 - - def test_get_position_of_amino_acid_crosslinker_bound_to(): protein = "MABCDEFGHIJK" peptide = "ABC" From cf17f05691c2f89423982d00d7d0608243b4e67b Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 29 Jan 2026 09:58:37 +0100 Subject: [PATCH 049/240] fix: revert unnecessary changes to steps.py --- backend/protzilla/steps.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index a12415133..727f9f8d6 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -268,17 +268,19 @@ def plot_input(self) -> dict: input_parameters = inspect.signature(self.plot_method).parameters - required_keys = [key for key, param in input_parameters.items()] + required_keys = [ + key + for key, param in input_parameters.items() + if param.default == inspect.Parameter.empty + ] for key in required_keys: if key not in plot_input: raise 
ValueError(f"Missing required input '{key}' for the plot method") - output_dict = { + return { key: plot_input[key] for key in input_parameters.keys() if key in plot_input } - return output_dict - def validate_outputs(self, soft_check: bool = False) -> bool: """ Validates the outputs of the step. Uses the output_keys attribute to check if all required keys are present in From 32d9e341ca2d5064cbe8612e644263c430653b2f Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 29 Jan 2026 10:09:48 +0100 Subject: [PATCH 050/240] chore: update docstrings --- .../data_analysis/cross_linking_validation.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/cross_linking_validation.py index 551705bd4..d0b67b24b 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/cross_linking_validation.py @@ -113,18 +113,20 @@ def validate_with_angstrom_deviation( crosslinker_information: dict[str, list[float]], ) -> dict: """ - Validates cross-links by comparing the cross-linker - lengths with the distances between the linked amino acids in the AlphaFold - protein structure. A cross-link is regarded as valid if the distance between the connected amino acids in AlphaFold - is less than the cross-linker length + the allowed deviation. + Validates cross-links by comparing the cross-linker lengths with the distances between the linked + amino acids in the AlphaFold protein structure. A cross-link is regarded as valid if it matches the AlphaFold data, + so if the distance between the connected amino acids in AlphaFold is less than (cross-linker length + the upper allowed deviation) + and more than (cross-linker length - the lower allowed deviation). If one of the bounds is zero only the other bound will be applied. :param crosslinking_df: DataFrame containing cross-linking data. 
:param protein_to_validate: UniProt ID of the protein to validate. :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - - accepted_deviation_for_: float - :return: Tuple (valid_cross_links, invalid_cross_links), counts of cross-links that - pass or fail the distance validation. + - lower_accepted_deviation_for_: float + - upper_accepted_deviation_for_: float + :return: dict (crosslinking_df_result, messages), crosslinking_df_result contains the relevant rows (rows of intra-crosslinks within the + protein to validate) of crosslinking_df and two more colums containing the distances in AlphaFold and wheter the crosslink matches the + AlphaFold data or not :raises KeyError: If a required crosslinker field is missing in crosslinker_information. :raises ValueError: If peptide sequences cannot be matched to the protein sequence. """ @@ -192,7 +194,8 @@ def bar_plot_of_valid_crosslinks( :param protein_to_validate: UniProt ID of the protein to validate. :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - - accepted_deviation_for_: float + - lower_accepted_deviation_for_: float + - upper_accepted_deviation_for_: float :return: List containing a single bar plot object representing counts of valid and invalid cross-links. :raises KeyError: If a required crosslinker field is missing in crosslinker_information. 
From b57ced38feadf6a732d4e7a51f1ad58d1856c9bc Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Thu, 29 Jan 2026 12:01:20 +0100 Subject: [PATCH 051/240] add text field for organism id and validation of those inputs --- .../importing/crosslinking_import.py | 34 +++++++++++++++++-- backend/protzilla/methods/importing.py | 4 +++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 1e9b3596d..d88d86463 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -9,6 +9,9 @@ import traceback import requests import re +import zipfile +import io +import json from backend.protzilla.utilities import format_trace from backend.protzilla.importing.import_utils import ( @@ -657,6 +660,20 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: return df.loc[:, columns_in_crosslinking_df] +def process_organism_id_from_text_field(organism_id: str): + cleaned_organism_id = organism_id.strip().replace(" ", "") + url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={cleaned_organism_id}&retmode=json" + response = requests.get(url) + if response.status_code != 200: + return False, None + data = response.json() + output_ids = data.get("result", {}) + if cleaned_organism_id not in output_ids: + return False, None + name = output_ids[cleaned_organism_id].get("scientificname") + return True, name + + def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: protein_with_error_set = set() @@ -675,7 +692,18 @@ def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: return "\n".join(sorted(protein_with_error_set)) -def crosslinking_import(file_path: Path) -> dict: +def crosslinking_import(file_path: Path, organism_id: str) -> dict: + success, scientific_organism_name = 
process_organism_id_from_text_field(organism_id) + if not success: + msg = f"Unsupported organism id: {organism_id}. Please provide a valid taxonomy id." + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + ) + ] + ) try: if file_path.suffix == ".csv": good_df, failed_df = read_csm_file(file_path) @@ -695,10 +723,10 @@ def crosslinking_import(file_path: Path) -> dict: ] ) if failed_df.empty: - msg = f"Successfully imported data of {len(good_df)} cross-links." + msg = f"Successfully imported data of {len(good_df)} cross-links for the {scientific_organism_name} organism." messages = [dict(level=logging.INFO, msg=msg)] else: - msg = f"Warning: {len(failed_df)} rows failed to import, however {len(good_df)} cross-links were successfully imported." + msg = f"Warning: {len(failed_df)} rows failed to import, however {len(good_df)} cross-links for the {scientific_organism_name} organism were successfully imported." messages = [ dict(level=logging.WARNING, msg=msg), dict(level=logging.WARNING, msg=f"Failed proteins:\n{aggregate_failed_proteins_for_display(failed_df)}"), diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 44728556d..cdf9245f5 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -458,6 +458,10 @@ def create_form(self): label="Cross-Linking Data file (.xlsx or .csv)", value=None, ), + TextField( + name="organism_id", + label="Organism ID", + ), ], ) From 5f3190061e506350ddfa5c2f9fbcbf14839b87ae Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Thu, 29 Jan 2026 16:17:15 +0100 Subject: [PATCH 052/240] refactor uniprot batch request to meet the changed requirements --- .../importing/crosslinking_import.py | 314 +++++------------- 1 file changed, 89 insertions(+), 225 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 
d88d86463..1b3ce3638 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -9,9 +9,9 @@ import traceback import requests import re -import zipfile -import io -import json +from io import StringIO +from itertools import islice +from functools import partial from backend.protzilla.utilities import format_trace from backend.protzilla.importing.import_utils import ( @@ -74,14 +74,18 @@ def validate_data_before_lookup( return valid_data, results +def split_data_in_batches(data): + max_allowed_uniprot_batch_size = 25 + iterable = iter(data) + while batch := list(islice(iterable, max_allowed_uniprot_batch_size)): + yield batch + + def build_uniprot_search_params( data_for_lookup: set, field_of_existing_data: str, - *, extra_query: str | None = None, - response_format: str, - fields: str, - include_isoforms: bool = False, + extra_fields: str | None = None, ): """ Build the UniProt search URL and query parameters for a batch of identifiers. @@ -110,19 +114,19 @@ def build_uniprot_search_params( base_query = " OR ".join( f"{field_of_existing_data}:{data}" for data in data_for_lookup ) - if extra_query: base_query = f"({base_query}) AND {extra_query}" + fields = "accession,gene_primary" + if extra_fields: + fields = fields + "," + extra_fields + params = { "query": base_query, - "format": response_format, + "format": "tsv", "fields": fields, } - if include_isoforms: - params["includeIsoform"] = "true" - return uniprot_search_url, params @@ -170,124 +174,61 @@ def execute_uniprot_request(url, params, valid_data, results): return None -def process_uniprot_response_containing_gene_names(response, results): - """ - Process a UniProt API response containing gene name information and update the results dictionary. - - :param response: HTTP response object returned by a UniProt request - :type response: requests.Response - :param results: Dictionary to store lookup results. 
Each protein ID will be updated as: - ``protein_id -> (success, gene_name, error_code)`` - :type results: dict[str, tuple[bool, str | None, str | None]] - - :return: None (updates `results` in-place) - :rtype: None - - :note: For each entry in the response: - - If a gene name is found, ``results[protein_id] = (True, gene_name, None)`` - - If no gene name is found, ``results[protein_id] = (False, None, "NO_GENE_NAME_FOUND")`` - """ - data = response.json() - - for entry in data.get("results", []): - protein_id = entry.get("primaryAccession") - output = entry.get("genes", [{}]) - gene_name = output[0].get("geneName", {}).get("value") if output else None +def process_uniprot_response(response, results, input_data, mode): + df = pd.read_csv(StringIO(response.text), sep="\t") - if gene_name: - results[protein_id] = (True, gene_name, None) - else: - results[protein_id] = (False, None, "NO_GENE_NAME_FOUND") + for _, row in df.iterrows(): + protein_id = row.get("Entry") + primary_gene_name = row.get("Gene Names (primary)") + + if mode == "id_to_gene_name": + existing_data = protein_id + requested_data = primary_gene_name + elif mode == "gene_name_to_id": + existing_data = primary_gene_name + requested_data = protein_id + + if pd.notna(requested_data) and requested_data != "": + if existing_data in input_data: + results[existing_data] = (True, requested_data, None) + elif mode == "gene_name_to_id": + alternative_gene_names = str(row.get("Gene Names", "")).split() + for gene_name in alternative_gene_names: + if gene_name in input_data: + results[gene_name] = (True, requested_data, None) + break + + +def uniprot_lookup(input_data, mode, results, organism_id): + if mode == "id_to_gene_name": + error = "NO_GENE_NAME_FOUND" + field_of_existing_data="accession" + extra_query=None + extra_fields=None + elif mode == "gene_name_to_id": + error = "NO_PROTEIN_ID_FOUND" + field_of_existing_data="gene_exact" + extra_query=f"organism_id:{organism_id} AND reviewed:true" + 
extra_fields="gene_names" + + for batch in split_data_in_batches(input_data): + + url, params = build_uniprot_search_params( + batch, + field_of_existing_data, + extra_query, + extra_fields, + ) + response = execute_uniprot_request(url, params, batch, results) + if response is None: + continue -def process_uniprot_response_containing_protein_ids( - response, valid_input, is_fallback: bool -): - """ - Process a UniProt TSV response containing protein IDs and map them to gene names. - - :param response: HTTP response object returned by a UniProt request in TSV format - :type response: requests.Response - :param valid_input: Set of gene names to extract protein IDs for - :type valid_input: set[str] - :param is_fallback: True if the response comes from a fallback individual UniProt request - instead of the standard UniProt batch request - :type is_fallback: bool - - :return: Dictionary mapping gene_name -> protein information - :rtype: dict[str, dict[str, list[str]]] - - :returns output: Dictionary with the following structure: - { - gene_name: { - "protein_ids": List of protein IDs without isoform suffix, - "list_of_protein_isoforms": List of protein IDs with isoform suffix - } - } - - :note: For each line in the TSV response: - - Protein IDs with a dash ("-") are considered isoforms and added to - "list_of_protein_isoforms" - - Other protein IDs are added to "protein_ids" - - Only gene names present in `valid_input` are considered, unless `is_fallback` is True - """ - output = defaultdict(lambda: {"protein_ids": [], "list_of_protein_isoforms": []}) - - lines = response.text.strip().split("\n") - header = lines[0].split("\t") - protein_id_idx = header.index("Entry") - gene_name_idx = header.index("Gene Names (primary)") - - for line in lines[1:]: - parts = line.split("\t") - protein_id = parts[protein_id_idx] - output_gene_names = parts[gene_name_idx].split() - - for gene_name in output_gene_names: - if gene_name in valid_input: - if "-" in protein_id: - 
output[gene_name]["list_of_protein_isoforms"].append(protein_id) - else: - output[gene_name]["protein_ids"].append(protein_id) - elif is_fallback: - if "-" in protein_id: - output[valid_input]["list_of_protein_isoforms"].append(protein_id) - else: - output[valid_input]["protein_ids"].append(protein_id) - return output - - -def fallback_single_lookup(query: str, query_type: str, results): - """ - Perform a fallback UniProt lookup for a single gene or protein ID and update the results. - - :param query: The gene name or UniProt ID to look up - :type query: str - :param query_type: Type of lookup to perform. Either: - - "get_gene_name": Retrieve the primary gene name for a UniProt ID - - "get_protein_ids": Retrieve UniProt accession IDs for a gene - :type query_type: str - :param results: Dictionary to store lookup results. Will be updated in-place. - Entries are stored as ``key -> (success, data, error_code)`` - :type results: dict[str, tuple[bool, Any, str | None]] - - :return: HTTP response object from the UniProt request if successful, otherwise None - :rtype: requests.Response or None + process_uniprot_response(response, results, batch, mode) - :note: This function constructs the appropriate UniProt REST API request depending on - `query_type` and uses `execute_uniprot_request` to perform the request and handle errors. 
- """ - if query_type == "get_gene_name": - url = f"https://rest.uniprot.org/uniprotkb/{query}" - params = {"fields": "gene_primary", "format": "json"} - elif query_type == "get_protein_ids": - url = "https://rest.uniprot.org/uniprotkb/search" - params = { - "query": f"gene_exact:{query}", - "format": "tsv", - "fields": "accession,gene_primary", - } - return execute_uniprot_request(url, params, query, results) + for data in input_data: + if data not in results: + results[data] = (False, None, error) def get_gene_name_from_protein_ids(protein_ids: set): @@ -304,7 +245,8 @@ def get_gene_name_from_protein_ids(protein_ids: set): :returns gene_name: Official gene name if successful, else None :returns error: Error code or message if the lookup failed, else None """ - # Regex for valid accession input directly from UniProt + # Regex for valid accession input directly from UniProt + # (extended to include isoforms) # A batch request containing an id that doesn't match this regex, # leads to an http 400 for the whole request. 
valid_id_pattern = re.compile( @@ -312,7 +254,8 @@ def get_gene_name_from_protein_ids(protein_ids: set): r"[OPQ][0-9][A-Z0-9]{3}[0-9]" r"|" r"[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}" - r")$" + r")" + r"(?:-.+)?$" ) valid_ids, results = validate_data_before_lookup( @@ -323,41 +266,15 @@ def get_gene_name_from_protein_ids(protein_ids: set): if not valid_ids: return results + + valid_ids_without_isoform = {x.split("-", 1)[0] for x in valid_ids} - url, params = build_uniprot_search_params( - valid_ids, - field_of_existing_data="accession", - response_format="json", - fields="accession,gene_primary", - ) - - response = execute_uniprot_request(url, params, valid_ids, results) - if response is None: - return results - - process_uniprot_response_containing_gene_names(response, results) - - for pid in valid_ids: - if pid not in results: - - response = fallback_single_lookup(pid, "get_gene_name", results) - data = response.json() - processed_data = data.get("genes", []) - gene_name = ( - processed_data[0].get("geneName", {}).get("value") - if processed_data - else None - ) - - if gene_name: - results[pid] = (True, gene_name, None) - else: - results[pid] = (False, None, "PROTEIN_ID_NOT_FOUND") + uniprot_lookup(input_data=valid_ids_without_isoform, mode="id_to_gene_name", results=results, organism_id=None) return results -def get_protein_ids_from_gene_name(gene_names: set): +def get_protein_ids_from_gene_name(gene_names: set, organism_id): """ Retrieve UniProt protein IDs for a given set of human gene names as a batch query. 
@@ -383,43 +300,8 @@ def get_protein_ids_from_gene_name(gene_names: set): if not valid_gene_names: return results - - url, params = build_uniprot_search_params( - valid_gene_names, - field_of_existing_data="gene_exact", - extra_query="organism_id:9606 AND reviewed:true", - response_format="tsv", - fields="accession,gene_primary", - include_isoforms=True, - ) - - response = execute_uniprot_request(url, params, valid_gene_names, results) - if response is None: - return results - - output = process_uniprot_response_containing_protein_ids( - response, valid_gene_names, False - ) - - for gene_name in valid_gene_names: - data = output.get(gene_name) - if not data or not data["protein_ids"]: - - response = fallback_single_lookup(gene_name, "get_protein_ids", results) - if response is not None: - new_output = process_uniprot_response_containing_protein_ids( - response, gene_name, True - ) - protein_id = new_output.get(gene_name) - else: - protein_id = None - if protein_id: - results[gene_name] = (True, protein_id, None) - else: - results[gene_name] = (False, None, "NO_PROTEIN_ID_FOUND") - - else: - results[gene_name] = (True, data, None) + + uniprot_lookup(input_data=valid_gene_names, mode="gene_name_to_id", results=results, organism_id=organism_id) return results @@ -429,7 +311,6 @@ def iterate_for_protein_designation( existing_designation, new_designation, uniprot_lookup_results, - value_extractor=lambda x: x, ): """ Iterate over a DataFrame and add missing protein designations using precomputed lookup results. 
@@ -463,11 +344,14 @@ def iterate_for_protein_designation( for _, row in df.iterrows(): row_dict = row.to_dict() + protein_id1 = row[existing_designation + "1"].split("-", 1)[0] + protein_id2 = row[existing_designation + "2"].split("-", 1)[0] + success1, data1, error1 = uniprot_lookup_results.get( - row[existing_designation + "1"], (False, None, "NOT_LOOKED_UP") + protein_id1, (False, None, "NOT_LOOKED_UP") ) success2, data2, error2 = uniprot_lookup_results.get( - row[existing_designation + "2"], (False, None, "NOT_LOOKED_UP") + protein_id2, (False, None, "NOT_LOOKED_UP") ) errors_occurred = {} @@ -481,8 +365,8 @@ def iterate_for_protein_designation( failed_row.update(errors_occurred) failed_rows.append(failed_row) else: - row_dict[new_designation + "1"] = value_extractor(data1) - row_dict[new_designation + "2"] = value_extractor(data2) + row_dict[new_designation + "1"] = data1 + row_dict[new_designation + "2"] = data2 good_rows.append(row_dict) good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) @@ -496,7 +380,6 @@ def get_missing_protein_designation( existing_column: str, missing_column: str, uniprot_lookup_function, - value_extractor=lambda x: x, ): """ Fill missing protein designations in a DataFrame using a UniProt lookup function. 
@@ -528,15 +411,11 @@ def get_missing_protein_designation( unique_existing_designations = aggregate_data(df, existing_column) uniprot_lookup_results = uniprot_lookup_function(unique_existing_designations) good_df, failed_df = iterate_for_protein_designation( - df, existing_column, missing_column, uniprot_lookup_results, value_extractor + df, existing_column, missing_column, uniprot_lookup_results ) return good_df, failed_df -def remove_isoform_from_protein_id(protein_id: str) -> str: - return protein_id.split("-", 1)[0] - - def remove_brackets_from_peptide(peptide: str) -> str: return peptide.replace("[", "").replace("]", "") @@ -583,29 +462,17 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Is_intra_crosslink"].eq("Intra") - # Right now we remove the isoform ending from every protein_id (if necessary), - # because we cannot process isoforms properly. - # If we ever wanted to add an "Isoforms" column, we need to store the original value from - # the "Protein_id1/2" column in the "Isoforms" column first, before removing the isoform ending. - df["Protein_id1"] = ( - df["Protein_id1"].apply(remove_isoform_from_protein_id).astype("string") - ) - df["Protein_id2"] = ( - df["Protein_id2"].apply(remove_isoform_from_protein_id).astype("string") - ) - good_df, failed_df = get_missing_protein_designation( df=df, existing_column="Protein_id", missing_column="Protein", uniprot_lookup_function=get_gene_name_from_protein_ids, - value_extractor=lambda x: x, ) return good_df, failed_df -def read_csm_file(file_path: Path) -> pd.DataFrame: +def read_csm_file(file_path: Path, organism_id) -> pd.DataFrame: """ Read and process a CSM CSV file: 1. Reads the CSV file and renames columns to a standard format. @@ -631,15 +498,12 @@ def read_csm_file(file_path: Path) -> pd.DataFrame: df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) - # In our UniProt lookup we already get all isoforms of the respective gene name. 
- # Right now we only store the protein id without any isoform information in our dataframe to keep it consistent. - # If we ever need the isoform information we just have to change what the value extractor stores in our dataframe. + uniprot_lookup_function_with_organism_id = partial(get_protein_ids_from_gene_name, organism_id=organism_id) good_df, failed_df = get_missing_protein_designation( df=df, existing_column="Protein", missing_column="Protein_id", - uniprot_lookup_function=get_protein_ids_from_gene_name, - value_extractor=lambda x: x["protein_ids"][0] if x else None, + uniprot_lookup_function=uniprot_lookup_function_with_organism_id, ) return good_df, failed_df @@ -706,7 +570,7 @@ def crosslinking_import(file_path: Path, organism_id: str) -> dict: ) try: if file_path.suffix == ".csv": - good_df, failed_df = read_csm_file(file_path) + good_df, failed_df = read_csm_file(file_path, organism_id) elif file_path.suffix == ".xlsx": good_df, failed_df = read_ProteomeDiscoverer_XlinkX_file(file_path) else: From dc624445bdbec68a8888bd83c753343abb105982 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Thu, 29 Jan 2026 17:19:22 +0100 Subject: [PATCH 053/240] refactor some variable names, update docstrings and make types explicit --- .../importing/crosslinking_import.py | 245 +++++++++++++----- 1 file changed, 185 insertions(+), 60 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 1b3ce3638..d783beae5 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -4,7 +4,6 @@ import logging from pathlib import Path -from collections import defaultdict import pandas as pd import traceback import requests @@ -12,6 +11,7 @@ from io import StringIO from itertools import islice from functools import partial +from typing import Callable, Optional, Literal from 
backend.protzilla.utilities import format_trace from backend.protzilla.importing.import_utils import ( @@ -38,8 +38,10 @@ def aggregate_data(df: pd.DataFrame, column: str) -> set: def validate_data_before_lookup( - data_for_lookup: set, validator_function, error_code: str -): + data_for_lookup: set[str], + validator_function: Callable[[str], bool], + error_code: str +)-> tuple[set[str], dict[str, tuple[bool, None, str]]]: """ Split input values into valid and invalid ones. Invalid values are directly written to the results with the given error code. @@ -74,7 +76,22 @@ def validate_data_before_lookup( return valid_data, results -def split_data_in_batches(data): +def split_data_in_batches(data: "iterable") -> "iterable": + """ + Split an iterable into consecutive batches of fixed maximum size. + + The function yields lists containing up to 25 elements from the input iterable. + The final batch may contain fewer elements. + + :param data: Iterable containing input elements to be batched + :type data: iterable + + :return: Iterator yielding batches of input elements as lists + :rtype: iterable + + :yields: Lists of at most 25 elements + :yield type: list + """ max_allowed_uniprot_batch_size = 25 iterable = iter(data) while batch := list(islice(iterable, max_allowed_uniprot_batch_size)): @@ -82,11 +99,11 @@ def split_data_in_batches(data): def build_uniprot_search_params( - data_for_lookup: set, + data_for_lookup: set[str], field_of_existing_data: str, extra_query: str | None = None, extra_fields: str | None = None, -): +)-> tuple[str, dict[str, str]]: """ Build the UniProt search URL and query parameters for a batch of identifiers. 
@@ -96,12 +113,8 @@ def build_uniprot_search_params( :type field_of_existing_data: str :param extra_query: Optional additional query string to filter results :type extra_query: str or None - :param response_format: Desired response format (e.g., "json", "tsv") - :type response_format: str - :param fields: Comma-separated list of fields to return (e.g., "accession,id,protein_name") - :type fields: str - :param include_isoforms: Whether to include isoform entries in the results - :type include_isoforms: bool + :param extra_fields: Comma-separated list of additional fields to return (e.g., "id,protein_name") + :type extra_fields: str or None :return: Tuple containing the UniProt search URL and the query parameters dictionary :rtype: tuple[str, dict[str, str]] @@ -130,7 +143,12 @@ def build_uniprot_search_params( return uniprot_search_url, params -def execute_uniprot_request(url, params, valid_data, results): +def execute_uniprot_request( + url: str, + params: dict[str, str], + valid_data: set[str], + results: dict[str, tuple[bool, None, str]] +) -> Optional[requests.Response]: """ Execute a UniProt HTTP request with error handling and update the results for failed queries. @@ -174,7 +192,32 @@ def execute_uniprot_request(url, params, valid_data, results): return None -def process_uniprot_response(response, results, input_data, mode): +def process_uniprot_response( + response: requests.Response, + results: dict[str, tuple[bool, str | None, None | str]], + input_data: set[str], + mode: Literal["id_to_gene_name", "gene_name_to_id"] +) -> None: + """ + Process a UniProt API response and update the results dictionary. + + The function reads a TSV response from UniProt, extracts the requested data + (gene name or protein ID depending on mode), and updates the results dictionary + with valid lookups. In `gene_name_to_id` mode, it also checks alternative gene names. 
+ + :param response: The HTTP response object returned from a UniProt request + :type response: requests.Response + :param results: Dictionary to store lookup results; updated in place + as ``existing_data -> (True, requested_data, None)`` + :type results: dict[str, tuple[bool, str | None, str | None]] + :param input_data: Set of input values that were originally queried + :type input_data: set[str] + :param mode: Lookup mode, either mapping IDs to gene names or gene names to IDs + :type mode: Literal["id_to_gene_name", "gene_name_to_id"] + + :return: None (results dictionary is updated in place) + :rtype: None + """ df = pd.read_csv(StringIO(response.text), sep="\t") for _, row in df.iterrows(): @@ -199,7 +242,33 @@ def process_uniprot_response(response, results, input_data, mode): break -def uniprot_lookup(input_data, mode, results, organism_id): +def uniprot_lookup( + input_data: set[str], + mode: Literal["id_to_gene_name", "gene_name_to_id"], + results: dict[str, tuple[bool, Optional[str], Optional[str]]], + organism_id: Optional[str] = None +) -> None: + """ + Perform a UniProt lookup for a batch of input data, updating the results dictionary. + + Depending on the mode, the function either maps protein IDs to gene names + or gene names to protein IDs. The function handles batching, requests, and + response processing. Any input values that do not return results are marked + as failed in the results dictionary with an appropriate error code. 
+ + :param input_data: Set of input values to look up (protein IDs or gene names) + :type input_data: set[str] + :param mode: Lookup mode, either "id_to_gene_name" or "gene_name_to_id" + :type mode: Literal["id_to_gene_name", "gene_name_to_id"] + :param results: Dictionary to store lookup results; updated in place + with ``existing_data -> (success, value, error_code)`` + :type results: dict[str, tuple[bool, str | None, str | None]] + :param organism_id: Required only for 'gene_name_to_id' mode to filter queries + :type organism_id: str, optional + + :return: None (results dictionary is updated in place) + :rtype: None + """ if mode == "id_to_gene_name": error = "NO_GENE_NAME_FOUND" field_of_existing_data="accession" @@ -211,27 +280,39 @@ def uniprot_lookup(input_data, mode, results, organism_id): extra_query=f"organism_id:{organism_id} AND reviewed:true" extra_fields="gene_names" - for batch in split_data_in_batches(input_data): + for batch in split_data_in_batches(data=input_data): url, params = build_uniprot_search_params( - batch, - field_of_existing_data, - extra_query, - extra_fields, + data_for_lookup=batch, + field_of_existing_data=field_of_existing_data, + extra_query=extra_query, + extra_fields=extra_fields, ) - response = execute_uniprot_request(url, params, batch, results) + response = execute_uniprot_request( + url=url, + params=params, + valid_data=batch, + results=results + ) if response is None: continue - process_uniprot_response(response, results, batch, mode) + process_uniprot_response( + response=response, + results=results, + input_data=batch, + mode=mode + ) for data in input_data: if data not in results: results[data] = (False, None, error) -def get_gene_name_from_protein_ids(protein_ids: set): +def get_gene_name_from_protein_ids( + protein_ids: set[str] +)-> dict[str, tuple[bool, Optional[str], Optional[str]]]: """ Retrieve the gene names for a given set of Protein IDs in a batch from UniProt. 
@@ -259,7 +340,7 @@ def get_gene_name_from_protein_ids(protein_ids: set): ) valid_ids, results = validate_data_before_lookup( - protein_ids, + data_for_lookup=protein_ids, validator_function=lambda pid: bool(valid_id_pattern.match(pid)), error_code="NOT_A_VALID_PROTEIN_ID", ) @@ -269,31 +350,38 @@ def get_gene_name_from_protein_ids(protein_ids: set): valid_ids_without_isoform = {x.split("-", 1)[0] for x in valid_ids} - uniprot_lookup(input_data=valid_ids_without_isoform, mode="id_to_gene_name", results=results, organism_id=None) + uniprot_lookup( + input_data=valid_ids_without_isoform, + mode="id_to_gene_name", + results=results, + organism_id=None + ) return results -def get_protein_ids_from_gene_name(gene_names: set, organism_id): +def get_protein_ids_from_gene_name( + gene_names: set[str], + organism_id: str +)-> dict[str, tuple[bool, Optional[str], Optional[str]]]: """ Retrieve UniProt protein IDs for a given set of human gene names as a batch query. :param gene_names: Set of gene symbols to look up (e.g., {"RAD50", "MRE11"}) :type gene_names: set[str] + :param organism_id: Organism identifier for filtering UniProt queries (e.g., "9606" for human) + :type organism_id: str - :return: Mapping of gene_name to a tuple containing lookup result, data, and error - :rtype: dict[str, tuple[bool, dict[str, list[str]] | None, str | None]] + :return: Dictionary mapping each gene name to a tuple of (success, protein_id, error) + :rtype: dict[str, tuple[bool, str | None, str | None]] - :returns success: True if the lookup for this gene_name succeeded, False otherwise - :returns data: Dictionary with protein information if successful, else None. 
- Contains: - - "protein_ids" (list of str): All protein IDs without any isoform information - - "list_of_protein_isoforms" (list of str): All isoform IDs + :returns success: True if the lookup for this gene name succeeded, False otherwise + :returns protein_id: The first valid protein ID found for the gene, or None if lookup failed :returns error: Error code or message if the lookup failed, else None """ # Filter decoy Proteins, because we cannot process them decently valid_gene_names, results = validate_data_before_lookup( - gene_names, + data_for_lookup=gene_names, validator_function=lambda name: not name.startswith("decoy:"), error_code="IS_DECOY_PROTEIN", ) @@ -301,17 +389,22 @@ def get_protein_ids_from_gene_name(gene_names: set, organism_id): if not valid_gene_names: return results - uniprot_lookup(input_data=valid_gene_names, mode="gene_name_to_id", results=results, organism_id=organism_id) + uniprot_lookup( + input_data=valid_gene_names, + mode="gene_name_to_id", + results=results, + organism_id=organism_id + ) return results def iterate_for_protein_designation( - df, - existing_designation, - new_designation, - uniprot_lookup_results, -): + df: pd.DataFrame, + existing_designation: str, + new_designation: str, + uniprot_lookup_results: dict[str, tuple[bool, Optional[str], Optional[str]]] +) -> tuple[pd.DataFrame, pd.DataFrame]: """ Iterate over a DataFrame and add missing protein designations using precomputed lookup results. Either protein IDs or gene names are included in the DataFrame, and the other is added @@ -327,10 +420,6 @@ def iterate_for_protein_designation( :param uniprot_lookup_results: Mapping of key -> (success, data, error) Contains precomputed lookup results :type uniprot_lookup_results: dict - :param value_extractor: Function that extracts the value to store in the DataFrame cell - from `data`. Default is identity function. 
- Signature: ``value_extractor(data) -> Any`` - :type value_extractor: Callable[[Any], Any] :return: Tuple containing rows with successful lookups and rows with lookup errors :rtype: tuple[pandas.DataFrame, pandas.DataFrame] @@ -379,8 +468,8 @@ def get_missing_protein_designation( df: pd.DataFrame, existing_column: str, missing_column: str, - uniprot_lookup_function, -): + uniprot_lookup_function: Callable[[set[str]], dict[str, tuple[bool, str | None, str | None]]], +) -> tuple[pd.DataFrame, pd.DataFrame]: """ Fill missing protein designations in a DataFrame using a UniProt lookup function. @@ -398,9 +487,6 @@ def get_missing_protein_designation( Should accept a set of values and return results as a dictionary ``key -> (success, data, error_code)`` :type uniprot_lookup_function: Callable[[set[str]], dict[str, tuple[bool, Any, str | None]]] - :param value_extractor: Function to extract the value to store in the missing column - from the lookup data. Default is the identity function. - :type value_extractor: Callable[[Any], Any] :return: Tuple of DataFrames containing rows with successful lookups and rows with errors :rtype: tuple[pandas.DataFrame, pandas.DataFrame] @@ -408,10 +494,13 @@ def get_missing_protein_designation( :returns good_df: Rows where missing protein designations were successfully populated :returns failed_df: Rows where the lookup failed """ - unique_existing_designations = aggregate_data(df, existing_column) + unique_existing_designations = aggregate_data(df=df, column=existing_column) uniprot_lookup_results = uniprot_lookup_function(unique_existing_designations) good_df, failed_df = iterate_for_protein_designation( - df, existing_column, missing_column, uniprot_lookup_results + df=df, + existing_designation=existing_column, + new_designation=missing_column, + uniprot_lookup_results=uniprot_lookup_results ) return good_df, failed_df @@ -426,7 +515,7 @@ def get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format return 
peptide.find("[") + 1 # 1-based index -def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: +def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]: """ Read and process a ProteomeDiscoverer XlinkX Excel file: 1. Reads the Excel file and renames columns to a standard format. @@ -472,7 +561,7 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> pd.DataFrame: return good_df, failed_df -def read_csm_file(file_path: Path, organism_id) -> pd.DataFrame: +def read_csm_file(file_path: Path, organism_id: str) -> tuple[pd.DataFrame, pd.DataFrame]: """ Read and process a CSM CSV file: 1. Reads the CSV file and renames columns to a standard format. @@ -484,6 +573,8 @@ def read_csm_file(file_path: Path, organism_id) -> pd.DataFrame: :param file_path: Path to the CSM CSV file :type file_path: pathlib.Path + :param organism_id: Organism identifier used for UniProt lookups (e.g., "9606" for human) + :type organism_id: str :return: Tuple of DataFrames containing rows with successfully mapped protein IDs and rows where lookup failed @@ -524,7 +615,24 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: return df.loc[:, columns_in_crosslinking_df] -def process_organism_id_from_text_field(organism_id: str): +def process_organism_id_from_text_field(organism_id: str)-> tuple[bool, Optional[str]]: + """ + Retrieve the scientific name of an organism from its NCBI Taxonomy ID. + + The function: + 1. Cleans the input organism ID (removes spaces). + 2. Queries the NCBI Entrez E-utilities esummary endpoint. + 3. Returns a tuple indicating whether the lookup succeeded and the scientific name. 
+ + :param organism_id: NCBI Taxonomy ID as a string (may contain spaces) + :type organism_id: str + + :return: Tuple indicating success and the scientific name + :rtype: tuple[bool, str | None] + + :returns success: True if the organism ID was found and the scientific name retrieved + :returns name: Scientific name of the organism if found, else None + """ cleaned_organism_id = organism_id.strip().replace(" ", "") url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={cleaned_organism_id}&retmode=json" response = requests.get(url) @@ -538,18 +646,35 @@ def process_organism_id_from_text_field(organism_id: str): return True, name -def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: +def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: + """ + Aggregate failed protein lookups into a human-readable string. + + For each row in `failed_df`, this function pairs protein values with their + corresponding error codes and returns a sorted, newline-separated string. + + The function checks for the presence of either "Protein1"/"Protein2" columns + or "Protein_id1"/"Protein_id2" columns to determine which values to process. 
+ + :param failed_df: DataFrame containing rows with failed protein lookups + Must include columns for proteins and their error codes + :type failed_df: pandas.DataFrame + + :return: String summarizing all failed protein lookups in the format + "Protein_value -> ERROR_CODE", sorted alphabetically and separated by newlines + :rtype: str + """ protein_with_error_set = set() if "Protein1" in failed_df.columns and "Protein2" in failed_df.columns: - protein_cols = ["Protein1", "Protein2"] + protein_columns = ["Protein1", "Protein2"] elif "Protein_id1" in failed_df.columns and "Protein_id2" in failed_df.columns: - protein_cols = ["Protein_id1", "Protein_id2"] + protein_columns = ["Protein_id1", "Protein_id2"] - error_cols = ["Protein1_error", "Protein2_error"] + error_columns = ["Protein1_error", "Protein2_error"] - for prot_col, err_col in zip(protein_cols, error_cols): - for protein_val, error_val in zip(failed_df[prot_col], failed_df[err_col]): + for protein_col, error_col in zip(protein_columns, error_columns): + for protein_val, error_val in zip(failed_df[protein_col], failed_df[error_col]): if pd.notna(error_val): protein_with_error_set.add(f"{protein_val} -> {error_val}") From 064566277281e09f90f34c1aa283ff3a4826c3f5 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Thu, 29 Jan 2026 17:59:45 +0100 Subject: [PATCH 054/240] format backend code with black --- .../importing/crosslinking_import.py | 130 ++++++++++-------- 1 file changed, 69 insertions(+), 61 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index d783beae5..f03cbb042 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -38,10 +38,10 @@ def aggregate_data(df: pd.DataFrame, column: str) -> set: def validate_data_before_lookup( - data_for_lookup: set[str], - validator_function: Callable[[str], bool], - 
error_code: str -)-> tuple[set[str], dict[str, tuple[bool, None, str]]]: + data_for_lookup: set[str], + validator_function: Callable[[str], bool], + error_code: str, +) -> tuple[set[str], dict[str, tuple[bool, None, str]]]: """ Split input values into valid and invalid ones. Invalid values are directly written to the results with the given error code. @@ -103,7 +103,7 @@ def build_uniprot_search_params( field_of_existing_data: str, extra_query: str | None = None, extra_fields: str | None = None, -)-> tuple[str, dict[str, str]]: +) -> tuple[str, dict[str, str]]: """ Build the UniProt search URL and query parameters for a batch of identifiers. @@ -131,7 +131,7 @@ def build_uniprot_search_params( base_query = f"({base_query}) AND {extra_query}" fields = "accession,gene_primary" - if extra_fields: + if extra_fields: fields = fields + "," + extra_fields params = { @@ -147,7 +147,7 @@ def execute_uniprot_request( url: str, params: dict[str, str], valid_data: set[str], - results: dict[str, tuple[bool, None, str]] + results: dict[str, tuple[bool, None, str]], ) -> Optional[requests.Response]: """ Execute a UniProt HTTP request with error handling and update the results for failed queries. @@ -196,7 +196,7 @@ def process_uniprot_response( response: requests.Response, results: dict[str, tuple[bool, str | None, None | str]], input_data: set[str], - mode: Literal["id_to_gene_name", "gene_name_to_id"] + mode: Literal["id_to_gene_name", "gene_name_to_id"], ) -> None: """ Process a UniProt API response and update the results dictionary. 
@@ -232,7 +232,7 @@ def process_uniprot_response( requested_data = protein_id if pd.notna(requested_data) and requested_data != "": - if existing_data in input_data: + if existing_data in input_data: results[existing_data] = (True, requested_data, None) elif mode == "gene_name_to_id": alternative_gene_names = str(row.get("Gene Names", "")).split() @@ -246,7 +246,7 @@ def uniprot_lookup( input_data: set[str], mode: Literal["id_to_gene_name", "gene_name_to_id"], results: dict[str, tuple[bool, Optional[str], Optional[str]]], - organism_id: Optional[str] = None + organism_id: Optional[str] = None, ) -> None: """ Perform a UniProt lookup for a batch of input data, updating the results dictionary. @@ -270,49 +270,43 @@ def uniprot_lookup( :rtype: None """ if mode == "id_to_gene_name": - error = "NO_GENE_NAME_FOUND" - field_of_existing_data="accession" - extra_query=None - extra_fields=None + error = "NO_GENE_NAME_FOUND" + field_of_existing_data = "accession" + extra_query = None + extra_fields = None elif mode == "gene_name_to_id": - error = "NO_PROTEIN_ID_FOUND" - field_of_existing_data="gene_exact" - extra_query=f"organism_id:{organism_id} AND reviewed:true" - extra_fields="gene_names" - + error = "NO_PROTEIN_ID_FOUND" + field_of_existing_data = "gene_exact" + extra_query = f"organism_id:{organism_id} AND reviewed:true" + extra_fields = "gene_names" + for batch in split_data_in_batches(data=input_data): url, params = build_uniprot_search_params( data_for_lookup=batch, field_of_existing_data=field_of_existing_data, extra_query=extra_query, - extra_fields=extra_fields, + extra_fields=extra_fields, ) response = execute_uniprot_request( - url=url, - params=params, - valid_data=batch, - results=results + url=url, params=params, valid_data=batch, results=results ) if response is None: continue process_uniprot_response( - response=response, - results=results, - input_data=batch, - mode=mode + response=response, results=results, input_data=batch, mode=mode ) for data in 
input_data: - if data not in results: + if data not in results: results[data] = (False, None, error) def get_gene_name_from_protein_ids( - protein_ids: set[str] -)-> dict[str, tuple[bool, Optional[str], Optional[str]]]: + protein_ids: set[str], +) -> dict[str, tuple[bool, Optional[str], Optional[str]]]: """ Retrieve the gene names for a given set of Protein IDs in a batch from UniProt. @@ -326,7 +320,7 @@ def get_gene_name_from_protein_ids( :returns gene_name: Official gene name if successful, else None :returns error: Error code or message if the lookup failed, else None """ - # Regex for valid accession input directly from UniProt + # Regex for valid accession input directly from UniProt # (extended to include isoforms) # A batch request containing an id that doesn't match this regex, # leads to an http 400 for the whole request. @@ -347,23 +341,22 @@ def get_gene_name_from_protein_ids( if not valid_ids: return results - + valid_ids_without_isoform = {x.split("-", 1)[0] for x in valid_ids} uniprot_lookup( - input_data=valid_ids_without_isoform, - mode="id_to_gene_name", - results=results, - organism_id=None + input_data=valid_ids_without_isoform, + mode="id_to_gene_name", + results=results, + organism_id=None, ) return results def get_protein_ids_from_gene_name( - gene_names: set[str], - organism_id: str -)-> dict[str, tuple[bool, Optional[str], Optional[str]]]: + gene_names: set[str], organism_id: str +) -> dict[str, tuple[bool, Optional[str], Optional[str]]]: """ Retrieve UniProt protein IDs for a given set of human gene names as a batch query. 
@@ -388,12 +381,12 @@ def get_protein_ids_from_gene_name( if not valid_gene_names: return results - + uniprot_lookup( - input_data=valid_gene_names, - mode="gene_name_to_id", - results=results, - organism_id=organism_id + input_data=valid_gene_names, + mode="gene_name_to_id", + results=results, + organism_id=organism_id, ) return results @@ -403,7 +396,7 @@ def iterate_for_protein_designation( df: pd.DataFrame, existing_designation: str, new_designation: str, - uniprot_lookup_results: dict[str, tuple[bool, Optional[str], Optional[str]]] + uniprot_lookup_results: dict[str, tuple[bool, Optional[str], Optional[str]]], ) -> tuple[pd.DataFrame, pd.DataFrame]: """ Iterate over a DataFrame and add missing protein designations using precomputed lookup results. @@ -468,7 +461,9 @@ def get_missing_protein_designation( df: pd.DataFrame, existing_column: str, missing_column: str, - uniprot_lookup_function: Callable[[set[str]], dict[str, tuple[bool, str | None, str | None]]], + uniprot_lookup_function: Callable[ + [set[str]], dict[str, tuple[bool, str | None, str | None]] + ], ) -> tuple[pd.DataFrame, pd.DataFrame]: """ Fill missing protein designations in a DataFrame using a UniProt lookup function. 
@@ -497,10 +492,10 @@ def get_missing_protein_designation( unique_existing_designations = aggregate_data(df=df, column=existing_column) uniprot_lookup_results = uniprot_lookup_function(unique_existing_designations) good_df, failed_df = iterate_for_protein_designation( - df=df, - existing_designation=existing_column, - new_designation=missing_column, - uniprot_lookup_results=uniprot_lookup_results + df=df, + existing_designation=existing_column, + new_designation=missing_column, + uniprot_lookup_results=uniprot_lookup_results, ) return good_df, failed_df @@ -515,7 +510,9 @@ def get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format return peptide.find("[") + 1 # 1-based index -def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]: +def read_ProteomeDiscoverer_XlinkX_file( + file_path: Path, +) -> tuple[pd.DataFrame, pd.DataFrame]: """ Read and process a ProteomeDiscoverer XlinkX Excel file: 1. Reads the Excel file and renames columns to a standard format. @@ -561,7 +558,9 @@ def read_ProteomeDiscoverer_XlinkX_file(file_path: Path) -> tuple[pd.DataFrame, return good_df, failed_df -def read_csm_file(file_path: Path, organism_id: str) -> tuple[pd.DataFrame, pd.DataFrame]: +def read_csm_file( + file_path: Path, organism_id: str +) -> tuple[pd.DataFrame, pd.DataFrame]: """ Read and process a CSM CSV file: 1. Reads the CSV file and renames columns to a standard format. 
@@ -589,7 +588,9 @@ def read_csm_file(file_path: Path, organism_id: str) -> tuple[pd.DataFrame, pd.D df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) - uniprot_lookup_function_with_organism_id = partial(get_protein_ids_from_gene_name, organism_id=organism_id) + uniprot_lookup_function_with_organism_id = partial( + get_protein_ids_from_gene_name, organism_id=organism_id + ) good_df, failed_df = get_missing_protein_designation( df=df, existing_column="Protein", @@ -615,7 +616,7 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: return df.loc[:, columns_in_crosslinking_df] -def process_organism_id_from_text_field(organism_id: str)-> tuple[bool, Optional[str]]: +def process_organism_id_from_text_field(organism_id: str) -> tuple[bool, Optional[str]]: """ Retrieve the scientific name of an organism from its NCBI Taxonomy ID. @@ -632,7 +633,7 @@ def process_organism_id_from_text_field(organism_id: str)-> tuple[bool, Optional :returns success: True if the organism ID was found and the scientific name retrieved :returns name: Scientific name of the organism if found, else None - """ + """ cleaned_organism_id = organism_id.strip().replace(" ", "") url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={cleaned_organism_id}&retmode=json" response = requests.get(url) @@ -663,8 +664,8 @@ def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: :return: String summarizing all failed protein lookups in the format "Protein_value -> ERROR_CODE", sorted alphabetically and separated by newlines :rtype: str - """ - protein_with_error_set = set() + """ + protein_with_error_set = set() if "Protein1" in failed_df.columns and "Protein2" in failed_df.columns: protein_columns = ["Protein1", "Protein2"] @@ -683,7 +684,7 @@ def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: def crosslinking_import(file_path: Path, organism_id: str) -> dict: success, scientific_organism_name = 
process_organism_id_from_text_field(organism_id) - if not success: + if not success: msg = f"Unsupported organism id: {organism_id}. Please provide a valid taxonomy id." return dict( messages=[ @@ -718,7 +719,14 @@ def crosslinking_import(file_path: Path, organism_id: str) -> dict: msg = f"Warning: {len(failed_df)} rows failed to import, however {len(good_df)} cross-links for the {scientific_organism_name} organism were successfully imported." messages = [ dict(level=logging.WARNING, msg=msg), - dict(level=logging.WARNING, msg=f"Failed proteins:\n{aggregate_failed_proteins_for_display(failed_df)}"), + dict( + level=logging.WARNING, + msg=f"Failed proteins:\n{aggregate_failed_proteins_for_display(failed_df)}", + ), ] - return dict(crosslinking_df=good_df, imported_rows_with_errors_df=failed_df, messages=messages) + return dict( + crosslinking_df=good_df, + imported_rows_with_errors_df=failed_df, + messages=messages, + ) From 5b192cd5846a1415e7bbc6cf246992fa02cbfa05 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 30 Jan 2026 11:23:25 +0100 Subject: [PATCH 055/240] feat: Add warning when protein structure is replaced in Node, replace icon in settings, fix minor bugs --- backend/main/urls.py | 18 +- backend/main/views_helper.py | 1 - backend/main/views_settings.py | 48 ++-- .../alphafold_protein_structure_load.py | 18 +- .../test_alphafold_protein_structure_load.py | 16 +- .../protein-structure-upload.tsx | 241 +++++++++--------- .../src/components/app/settings/settings.tsx | 9 +- .../core/shared/icon/icons/index.ts | 2 +- .../core/shared/icon/icons/prot_structure.svg | 5 + 9 files changed, 206 insertions(+), 152 deletions(-) create mode 100644 frontend/src/components/core/shared/icon/icons/prot_structure.svg diff --git a/backend/main/urls.py b/backend/main/urls.py index 4e22eec3d..a1543e5ee 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -61,9 +61,21 @@ path("api/get_databases", views_settings.get_databases, name="get_databases"), 
path("api/upload_database", views_settings.database_upload, name="database_upload"), path("api/delete_database", views_settings.database_delete, name="database_delete"), - path("api/get_prot_structure", views_settings.get_prot_structure, name="get_prot_structure"), - path("api/upload_prot_structure", views_settings.upload_prot_structure, name="upload_prot_structure"), - path("api/prot_structure_delete", views_settings.prot_structure_delete, name="prot_structure_delete"), + path( + "api/get_prot_structure", + views_settings.get_prot_structure, + name="get_prot_structure", + ), + path( + "api/upload_prot_structure", + views_settings.upload_prot_structure, + name="upload_prot_structure", + ), + path( + "api/prot_structure_delete", + views_settings.prot_structure_delete, + name="prot_structure_delete", + ), path( "api/load_ptm_settings", views_settings.load_ptm_settings, diff --git a/backend/main/views_helper.py b/backend/main/views_helper.py index ba9546ab5..5ca442f97 100644 --- a/backend/main/views_helper.py +++ b/backend/main/views_helper.py @@ -269,4 +269,3 @@ def validate_uploaded_files( msg = f"All {len(file_mapping)} files validated successfully" logger.info(msg) return True, msg - diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 130418110..2c6079ddf 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -14,7 +14,12 @@ from django.http import JsonResponse, FileResponse from backend.main import settings -from backend.main.views_helper import sanitize_name, load_settings_from_file, validate_uploaded_files, copy_file_to_directory +from backend.main.views_helper import ( + sanitize_name, + load_settings_from_file, + validate_uploaded_files, + copy_file_to_directory, +) from backend.protzilla.constants.paths import EXTERNAL_DATA_PATH, SETTINGS_PATH from backend.protzilla.data_integration.database_query import ( uniprot_columns, @@ -227,6 +232,7 @@ def save_ptm_settings(request, default_file_stem: str = 
DEFAULT_PTM_SETTINGS_FIL AF_DICT_PATH = EXTERNAL_DATA_PATH / "alphafold" + def get_prot_structure(request): metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" df = pandas.read_csv(metadata_csv) @@ -263,29 +269,30 @@ def upload_prot_structure(request): pae: [".json"], fasta_file: [".fasta", ".fa"], } - + is_valid, validation_message = validate_uploaded_files( settings.FILE_UPLOAD_TEMP_DIR, file_mapping ) if not is_valid: - messages.add_message(request, messages.ERROR, validation_message, "alert-danger") - return JsonResponse({"success": False, "message": validation_message}, status=400) + messages.add_message( + request, messages.ERROR, validation_message, "alert-danger" + ) + return JsonResponse( + {"success": False, "message": validation_message}, status=400 + ) af_path = AF_DICT_PATH / entry_id.upper() if af_path.exists(): return JsonResponse( - {"success": False, "message": "Entry ID is not unique."}, status=405 - ) + {"success": False, "message": "Entry ID is not unique."}, status=405 + ) else: af_path.mkdir(parents=True, exist_ok=True) for file_name in [cif_file, confidence, pae, fasta_file]: source_dir = settings.FILE_UPLOAD_TEMP_DIR / file_name - success, message = copy_file_to_directory( - source_dir, - af_path - ) - + success, message = copy_file_to_directory(source_dir, af_path) + # add row to metadata csv AF_DICT_PATH.mkdir(parents=True, exist_ok=True) metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" @@ -299,11 +306,11 @@ def upload_prot_structure(request): "uniprotAccession": uniprot_id, "modelCreatedDate": formatted, "gene": gene, - "alphafold_version": af_version + "alphafold_version": af_version, } df = pandas.concat([df, pandas.DataFrame([new_row])], ignore_index=True) - df.to_csv(metadata_csv, index=False) + df.to_csv(metadata_csv, index=False) return JsonResponse( { @@ -327,7 +334,7 @@ def prot_structure_delete(request): return JsonResponse( {"success": False, "message": "Invalid request method"}, status=405 ) - + data = 
json.loads(request.body) entry_id = (data.get("entry_id") or "").strip() if not entry_id: @@ -354,11 +361,17 @@ def prot_structure_delete(request): ) # remove entry out of metadata csv - if metadata_csv.exists() and metadata_csv.is_file() and metadata_csv.stat().st_size > 0: + if ( + metadata_csv.exists() + and metadata_csv.is_file() + and metadata_csv.stat().st_size > 0 + ): try: df = pandas.read_csv(metadata_csv, dtype=str) - df = df[df["entryID"].fillna("").str.strip().str.upper() != entry_id.upper()] - df.to_csv(metadata_csv, index=False) + df = df[ + df["entryID"].fillna("").str.strip().str.upper() != entry_id.upper() + ] + df.to_csv(metadata_csv, index=False) except Exception as e: return JsonResponse( @@ -373,6 +386,7 @@ def prot_structure_delete(request): {"success": True, "message": "Entry deleted successfully"}, status=200 ) + # <--- Databases ---> diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 87dfd3513..1c2d22197 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -5,6 +5,7 @@ from pathlib import Path from textwrap import wrap from typing import Any +import logging import gemmi import pandas as pd @@ -104,12 +105,14 @@ def handle_alphafold_files( :param metadata_df: DataFrame containing AlphaFold metadata :param entry_id: The entry_id (in the case of fetching from AF DB the same as uniprot id) (used for directory naming) :param persist_uploads: If True, files are saved persistently; if False, only loaded into memory - :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data or None values for failed loads + :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, sequence data or None values for + failed loads and messages such as warnings """ cif_df = None pae_df = None plddt_df = None sequence_df = 
None + messages = [] meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" target_dir = meta_dir / uniprot @@ -131,8 +134,15 @@ def handle_alphafold_files( try: if metadata_csv.exists(): existing = pd.read_csv(metadata_csv, dtype=str) - if entry_id and "entry_id" in existing.columns: - existing = existing[existing["entry_id"] != entry_id] + mask = existing["entryID"] == entry_id + if mask.any(): + msg = ( + f'Existing entry with EntryID "{entry_id}" was overwritten.' + ) + logger.warning(msg) + messages.append(dict(level=logging.WARNING, msg=msg)) + existing = existing[~mask] + combined = pd.concat([existing, metadata_df], ignore_index=True) combined.to_csv(metadata_csv, index=False) else: @@ -184,6 +194,7 @@ def handle_alphafold_files( "pae_df": pae_df, "plddt_df": plddt_df, "sequence_df": sequence_df, + "messages": messages, } @@ -255,4 +266,5 @@ def fetch_alphafold_protein_structure( "pae_df": alpha_dfs["pae_df"], "plddt_df": alpha_dfs["plddt_df"], "sequence_df": alpha_dfs["sequence_df"], + "messages": alpha_dfs.get("messages", []), } diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 774ecd840..af3bc244f 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -101,13 +101,15 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) - assert set(out.keys()) == { - "metadata_df", - "cif_df", - "pae_df", - "plddt_df", - "sequence_df", - } + assert set(out.keys()).issuperset( + { + "metadata_df", + "cif_df", + "pae_df", + "plddt_df", + "sequence_df", + } + ) def test_fetch_alphafold_metadata(tmp_path, monkeypatch): diff --git 
a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx index dedae1b6c..9c6055e4a 100644 --- a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx @@ -6,7 +6,6 @@ import { callApi, callApiWithParameters } from "@protzilla/utils"; import { useEffect, useState } from "react"; import { styled } from "styled-components"; - const ProteinStructureTitle = styled(SectionTitle)` padding-top: ${spacing("large")}; padding-bottom: ${spacing("small")}; @@ -53,7 +52,6 @@ const ProtStructureEntry = ({ af_version, handleDelete, }: ProtStructureProps) => { - return ( @@ -77,41 +75,41 @@ const ProtStructureEntry = ({ }; export const ProteinStructureUpload = () => { - const notify = useNotification(); - const [protStructureList, setProtStructureList] = useState([]); - const [isDeleteModalOpen, openDeleteModal, closeDeleteModal] = useToggleableState(false); - const [selectedProtStructure, setSelectedProtStructure] = useState(""); + const notify = useNotification(); + const [protStructureList, setProtStructureList] = useState([]); + const [isDeleteModalOpen, openDeleteModal, closeDeleteModal] = useToggleableState(false); + const [selectedProtStructure, setSelectedProtStructure] = useState(""); - const fetchDatabases = async () => { - const protStructures = await callApi("get_prot_structure"); - if (protStructures) { - setProtStructureList(protStructures); - } - }; - - useEffect(() => { - void fetchDatabases(); - }, []); + const fetchProtStructures = async () => { + const protStructures = await callApi("get_prot_structure"); + if (protStructures) { + setProtStructureList(protStructures); + } + }; + + useEffect(() => { + void fetchProtStructures(); + }, []); - const handleAddProteinStructure = async ( - uniprot_id: string, - entry_id: string, - af_version: string, 
- gene: string, - cif_file: string, - confidence: string, - pae: string, - fasta_file: string, + const handleAddProteinStructure = async ( + uniprot_id: string, + entry_id: string, + af_version: string, + gene: string, + cif_file: string, + confidence: string, + pae: string, + fasta_file: string, ) => { const response = await callApiWithParameters("upload_prot_structure", { uniprot_id: uniprot_id, entry_id: entry_id, - af_version: af_version, - gene: gene, - cif_file: cif_file, - confidence: confidence, - pae: pae, - fasta_file: fasta_file, + af_version: af_version, + gene: gene, + cif_file: cif_file, + confidence: confidence, + pae: pae, + fasta_file: fasta_file, }); if (response?.success) { notify({ @@ -128,35 +126,36 @@ export const ProteinStructureUpload = () => { isClosingAutomatically: true, }); } + void fetchProtStructures(); }; const onDeleteProtStructure = (entry_id: string) => { - openDeleteModal(); - setSelectedProtStructure(entry_id); - }; - - const handleDeleteProtStructure = async (entry_id: string) => { - const response = await callApiWithParameters("prot_structure_delete", { - entry_id: entry_id, + openDeleteModal(); + setSelectedProtStructure(entry_id); + }; + + const handleDeleteProtStructure = async (entry_id: string) => { + const response = await callApiWithParameters("prot_structure_delete", { + entry_id: entry_id, + }); + if (response?.success) { + notify({ + title: "Protein structure deleted", + message: response.message as string, + type: "success", + isClosingAutomatically: true, }); - if (response?.success) { - notify({ - title: "Protein structure deleted", - message: response.message as string, - type: "success", - isClosingAutomatically: true, - }); - } else { - notify({ - title: "Protein structure deletion failed", - message: response?.message ?? 
"Unknown error", - type: "error", - isClosingAutomatically: true, - }); - } - void fetchDatabases(); - closeDeleteModal(); - }; + } else { + notify({ + title: "Protein structure deletion failed", + message: response?.message ?? "Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + void fetchProtStructures(); + closeDeleteModal(); + }; return (
@@ -173,65 +172,65 @@ export const ProteinStructureUpload = () => { style={{ paddingBottom: "8px" }} /> - { - void handleAddProteinStructure( + void handleAddProteinStructure( data.uniprot_id as string, data.entry_id as string, data.af_version as string, @@ -242,29 +241,35 @@ export const ProteinStructureUpload = () => { data.fasta_file as string, ); }} - /> - - - {protStructureList.map((ps) => ( - { - onDeleteProtStructure(ps.entry_id); - }} - /> - ))} - - void handleDeleteProtStructure(selectedProtStructure)} - title={`Deleted protein structure "${selectedProtStructure}"?`} + /> + + + {protStructureList.map((ps) => ( + { + onDeleteProtStructure(ps.entry_id); + }} /> + ))} + + void handleDeleteProtStructure(selectedProtStructure)} + title={ + `The uploaded protein structure prediction with the entryID ` + + `"${selectedProtStructure}" will permanently be deleted. Would you like to proceed?` + } + />
); }; diff --git a/frontend/src/components/app/settings/settings.tsx b/frontend/src/components/app/settings/settings.tsx index 7d43f197e..37581bcca 100644 --- a/frontend/src/components/app/settings/settings.tsx +++ b/frontend/src/components/app/settings/settings.tsx @@ -3,7 +3,12 @@ import { spacing } from "@protzilla/theme"; import { useState } from "react"; import { styled } from "styled-components"; -import { DatabaseSettings, GitHub, ProteinStructureUpload, PTMVisSettings } from "./other-settings/"; +import { + DatabaseSettings, + GitHub, + ProteinStructureUpload, + PTMVisSettings, +} from "./other-settings/"; import { PlotSettingsModal } from "./plot-settings"; import { SettingsProps } from "./settings.props.ts"; import { DiscardModal, Modal, ToggleableButton } from "../../core/"; @@ -113,7 +118,7 @@ export const Settings: React.FC = ({ { handleSwitchSection("protein-struc-upload"); diff --git a/frontend/src/components/core/shared/icon/icons/index.ts b/frontend/src/components/core/shared/icon/icons/index.ts index 6b84c4c28..08bebffc7 100644 --- a/frontend/src/components/core/shared/icon/icons/index.ts +++ b/frontend/src/components/core/shared/icon/icons/index.ts @@ -31,8 +31,8 @@ export { default as list } from "./list.svg?react"; export { default as outdated } from "./outdated.svg?react"; export { default as play } from "./play-btn.svg?react"; export { default as protzilla } from "./protzillablackwhite.svg?react"; +export { default as prot_structure } from "./prot_structure.svg?react"; export { default as save } from "./save.svg?react"; -export { default as structure } from "./structure.svg?react"; export { default as reload } from "./reload.svg?react"; export { default as searchLens } from "./search-lens.svg?react"; export { default as settings } from "./setting-fill.svg?react"; diff --git a/frontend/src/components/core/shared/icon/icons/prot_structure.svg b/frontend/src/components/core/shared/icon/icons/prot_structure.svg new file mode 100644 index 
000000000..817a8fcef --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/prot_structure.svg @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file From 0f4f485dbdfe3097f56e9c8991e62afd978d482e Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 30 Jan 2026 11:50:40 +0100 Subject: [PATCH 056/240] fix: Delete unused svg --- .../core/shared/icon/icons/structure.svg | 68 ------------------- 1 file changed, 68 deletions(-) delete mode 100644 frontend/src/components/core/shared/icon/icons/structure.svg diff --git a/frontend/src/components/core/shared/icon/icons/structure.svg b/frontend/src/components/core/shared/icon/icons/structure.svg deleted file mode 100644 index 95f37842a..000000000 --- a/frontend/src/components/core/shared/icon/icons/structure.svg +++ /dev/null @@ -1,68 +0,0 @@ - - - - -Created by potrace 1.15, written by Peter Selinger 2001-2017 - - - - - From f77958a69aa63f12af23e34c63cb994b1de830f0 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Fri, 30 Jan 2026 12:50:13 +0100 Subject: [PATCH 057/240] update tests --- .../importing/test_crosslinking_import.py | 128 +++++++++++++++--- 1 file changed, 110 insertions(+), 18 deletions(-) diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index dd67e937b..ca0dfb46f 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -4,12 +4,11 @@ from requests.exceptions import Timeout from protzilla.importing.crosslinking_import import ( aggregate_data, - remove_isoform_from_protein_id, remove_brackets_from_peptide, get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format, validate_data_before_lookup, execute_uniprot_request, - process_uniprot_response_containing_gene_names, + process_uniprot_response, iterate_for_protein_designation, get_missing_protein_designation, 
crosslinking_import, @@ -22,9 +21,10 @@ def test_aggregate_data(): assert result == {"A", "B", "C", "D"} -def test_remove_isoform_from_protein_id(): - assert remove_isoform_from_protein_id("P12345-2") == "P12345" - assert remove_isoform_from_protein_id("Q67890") == "Q67890" +def test_isoform_removal_logic(): + ids = {"P12345-2", "Q67890"} + cleaned = {x.split("-", 1)[0] for x in ids} + assert cleaned == {"P12345", "Q67890"} def test_remove_brackets_from_peptide(): @@ -93,18 +93,65 @@ def test_execute_uniprot_request_timeout(): assert results["P12345"][2] == "TIMEOUT" -def test_process_uniprot_response_containing_gene_names(): +def test_process_uniprot_response_id_to_gene_name(): results = {} + input_data = {"P1", "P2"} + mock_response = Mock() - mock_response.json.return_value = { - "results": [ - {"primaryAccession": "P1", "genes": [{"geneName": {"value": "GENE1"}}]}, - {"primaryAccession": "P2", "genes": []}, - ] - } - process_uniprot_response_containing_gene_names(mock_response, results) + mock_response.text = "Entry\tGene Names (primary)\n" "P1\tGENE1\n" "P2\t\n" + + process_uniprot_response( + response=mock_response, + results=results, + input_data=input_data, + mode="id_to_gene_name", + ) + assert results["P1"] == (True, "GENE1", None) - assert results["P2"] == (False, None, "NO_GENE_NAME_FOUND") + + +def test_uniprot_lookup_successful_request_but_no_results(monkeypatch): + from protzilla.importing.crosslinking_import import uniprot_lookup + + def mock_execute(*args, **kwargs): + mock = Mock() + mock.text = "Entry\tGene Names (primary)\n" + return mock + + monkeypatch.setattr( + "protzilla.importing.crosslinking_import.execute_uniprot_request", + mock_execute, + ) + + results = {} + uniprot_lookup( + input_data={"P1"}, + mode="id_to_gene_name", + results=results, + ) + + assert results["P1"] == (False, None, "NO_GENE_NAME_FOUND") + + +def test_uniprot_lookup_no_results(monkeypatch): + from protzilla.importing.crosslinking_import import uniprot_lookup + + def 
mock_execute(*args, **kwargs): + return Mock(text="Entry\tGene Names (primary)\n") + + monkeypatch.setattr( + "protzilla.importing.crosslinking_import.execute_uniprot_request", + mock_execute, + ) + + results = {} + uniprot_lookup( + input_data={"P1"}, + mode="id_to_gene_name", + results=results, + ) + + assert results["P1"] == (False, None, "NO_GENE_NAME_FOUND") def _minimal_valid_crosslinking_df(): @@ -161,6 +208,24 @@ def mock_lookup(ids): assert failed_df.empty +def test_aggregate_failed_proteins_for_display(): + df = pd.DataFrame( + { + "Protein1": ["A"], + "Protein2": ["B"], + "Protein1_error": ["ERR1"], + "Protein2_error": [None], + } + ) + + from protzilla.importing.crosslinking_import import ( + aggregate_failed_proteins_for_display, + ) + + result = aggregate_failed_proteins_for_display(df) + assert result == "A -> ERR1" + + def test_crosslinking_import_csv(tmp_path): csv_file = tmp_path / "test.csv" csv_file.write_text( @@ -174,19 +239,46 @@ def test_crosslinking_import_csv(tmp_path): with patch( "protzilla.importing.crosslinking_import.get_protein_ids_from_gene_name", return_value={ - "RAD50": (True, {"protein_ids": ["P12345"]}, None), - "MRE11": (True, {"protein_ids": ["Q67890"]}, None), + "RAD50": (True, "P12345", None), + "MRE11": (True, "Q67890", None), }, ): - result = crosslinking_import(csv_file) + result = crosslinking_import(csv_file, organism_id="9606") assert "crosslinking_df" in result assert not result["crosslinking_df"].empty +def test_crosslinking_import_xlsx(monkeypatch, tmp_path): + xlsx = tmp_path / "test.xlsx" + pd.DataFrame( + { + "Protein_id1": ["P1"], + "Protein_id2": ["P2"], + "Peptide1": ["[AAA]"], + "Peptide2": ["[BBB]"], + "Is_intra_crosslink": ["Intra"], + "Peptide_position1": [1], + "Peptide_position2": [2], + "CL_position1": [3], + "CL_position2": [4], + "Crosslinker": ["DSS"], + "Q_value": [0.01], + } + ).to_excel(xlsx, index=False) + + monkeypatch.setattr( + 
"protzilla.importing.crosslinking_import.get_gene_name_from_protein_ids", + lambda ids: {i: (True, f"G{i}", None) for i in ids}, + ) + + result = crosslinking_import(xlsx, organism_id="9606") + assert "crosslinking_df" in result + + def test_crosslinking_import_invalid_file(tmp_path): bad_file = tmp_path / "test.txt" bad_file.write_text("something invalid") - result = crosslinking_import(bad_file) + result = crosslinking_import(bad_file, organism_id="9606") assert "messages" in result assert any("Unsupported file type" in m["msg"] for m in result["messages"]) From c491226f4a29348e1e4de45fee6c4196b4e6b37f Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 31 Jan 2026 15:59:08 +0100 Subject: [PATCH 058/240] refactor: address code review feedback (mostly naming) --- ...lidation.py => crosslinking_validation.py} | 32 +++++++++++-------- backend/protzilla/methods/data_analysis.py | 24 +++++++------- .../test_crosslinking_validation.py | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) rename backend/protzilla/data_analysis/{cross_linking_validation.py => crosslinking_validation.py} (89%) diff --git a/backend/protzilla/data_analysis/cross_linking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py similarity index 89% rename from backend/protzilla/data_analysis/cross_linking_validation.py rename to backend/protzilla/data_analysis/crosslinking_validation.py index d0b67b24b..e78d3bc1c 100644 --- a/backend/protzilla/data_analysis/cross_linking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -71,16 +71,16 @@ def get_position_of_amino_acid_crosslinker_bound_to( crosslinker_position_within_peptide: int, ) -> int: """Returns which amino acid the cross-linker bound to, 1-based.""" - peptide_start_position = protein_sequence.find(peptide_sequence) + 1 + peptide_start_position = protein_sequence.find(peptide_sequence) if peptide_start_position == 0: raise ValueError( f"Peptide {peptide_sequence} was not found in 
protein sequence" ) - return peptide_start_position + crosslinker_position_within_peptide - 1 + return peptide_start_position + crosslinker_position_within_peptide def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( - fasta_df: pd.DataFrame, cif_df: pd.DataFrame, crosslink + fasta_df: pd.DataFrame, cif_df: pd.DataFrame, crosslink: pd.Series ) -> float: protein_sequence = fasta_df.at[0, "Protein Sequence"] amino_acid_position_crosslinker1_is_bound_to = ( @@ -135,12 +135,14 @@ def validate_with_angstrom_deviation( ) cif_df = alphafold_data["cif_df"] fasta_df = alphafold_data["sequence_df"] - df = crosslinking_df.copy() - mask = (df.Protein_id1 == protein_to_validate) & ( - df.Protein_id2 == protein_to_validate + all_crosslinks_df = crosslinking_df.copy() + + # we are only interested in intra-crosslinks of the protein we want to validate + mask = (all_crosslinks_df.Protein_id1 == protein_to_validate) & ( + all_crosslinks_df.Protein_id2 == protein_to_validate ) - relevant_crosslinks_df = df[mask] + relevant_crosslinks_df = all_crosslinks_df[mask].copy() def check_crosslink(crosslink: pd.Series) -> pd.Series: distance = get_distance_between_crosslinker_connected_amino_acids_in_alphafold( @@ -152,10 +154,10 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: accepted_deviation_upper_bound, accepted_deviation_lower_bound, ) = crosslinker_information[crosslink.Crosslinker] - except KeyError as e: - missing_key = e.args[0] + except KeyError: raise KeyError( - f"Missing required field '{missing_key}' for crosslinker '{crosslink.Crosslinker}'." + f"Missing required information regarding crosslinker length " + f"and/or accepted deviation for crosslinker '{crosslink.Crosslinker}'." 
) # Fallback to default deviation bounds when not explicitly provided accepted_distance_lower_bound = crosslinker_length - ( @@ -171,13 +173,17 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: return pd.Series({"alphafold_distance": distance, "valid_crosslink": valid}) - df.loc[mask, ["alphafold_distance", "valid_crosslink"]] = ( + # adding the distance in alphafold and the result of the validation to all relevant crosslinks + all_crosslinks_df.loc[mask, ["alphafold_distance", "valid_crosslink"]] = ( relevant_crosslinks_df.apply(check_crosslink, axis=1) ) - df = df[df["valid_crosslink"].notna()] + # removing all crosslinks that weren't checked from the df + checked_crosslinks_df = all_crosslinks_df[ + all_crosslinks_df["valid_crosslink"].notna() + ] - return dict(crosslinking_df_result=df, messages={}) + return dict(crosslinking_df_result=checked_crosslinks_df, messages={}) def bar_plot_of_valid_crosslinks( diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 9e958071f..2b384efde 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -60,7 +60,7 @@ from protzilla.data_analysis.ptm_visualization.ptm_overview_plot import ( get_detected_modifications, ) -from protzilla.data_analysis.cross_linking_validation import ( +from protzilla.data_analysis.crosslinking_validation import ( validate_with_angstrom_deviation, bar_plot_of_valid_crosslinks, ) @@ -2491,23 +2491,23 @@ def create_form(self): ) def modify_form(self, form: Form, run: Run) -> None: - cross_linker = self._get_crosslinker_names_from_crosslinker_df(run.steps) - for cl in cross_linker: - field_name = f"length_of_{cl}" + crosslinkers = self._get_crosslinker_names_from_crosslinker_df(run.steps) + for crosslinker in crosslinkers: + field_name = f"{crosslinker}_length" if field_name not in form: crosslinker_length_field = FloatField( name=field_name, - label=f"Length of {cl} in Ångström", + 
label=f"Length of {crosslinker} in Ångström", min=0, ) upper_bound_length_deviation_field = FloatField( - name=f"upper_accepted_deviation_for_{cl}", - label=f"Upper bound on the accepted deviation for {cl} Cross-Links in Ångström (0 equals no bound)", + name=f"{crosslinker}_upper_accepted_deviation", + label=f"Upper bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", min=0, ) lower_bound_length_deviation_field = FloatField( - name=f"lower_accepted_deviation_for_{cl}", - label=f"Lower bound on the accepted deviation for {cl} Cross-Links in Ångström (0 equals no bound)", + name=f"{crosslinker}_lower_accepted_deviation", + label=f"Lower bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", min=0, ) form.add_field(crosslinker_length_field) @@ -2529,9 +2529,9 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: crosslinker_to_length_and_deviation = {} for crosslinker in self._get_crosslinker_names_from_crosslinker_df(steps): crosslinker_to_length_and_deviation[crosslinker] = [ - inputs.get(f"length_of_{crosslinker}"), - inputs.get(f"upper_accepted_deviation_for_{crosslinker}"), - inputs.get(f"lower_accepted_deviation_for_{crosslinker}"), + inputs.get(f"{crosslinker}_length"), + inputs.get(f"{crosslinker}_upper_accepted_deviation"), + inputs.get(f"{crosslinker}_lower_accepted_deviation"), ] inputs["crosslinker_information"] = crosslinker_to_length_and_deviation diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 0377692a0..9e5dafb52 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -3,7 +3,7 @@ from unittest.mock import MagicMock -from backend.protzilla.data_analysis.cross_linking_validation import ( +from 
backend.protzilla.data_analysis.crosslinking_validation import ( get_distance_between_two_amino_acids_in_angstrom, get_position_of_amino_acid_crosslinker_bound_to, validate_with_angstrom_deviation, From 917df7119802d21dfcb4425fbabac2740c229258 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 31 Jan 2026 16:00:50 +0100 Subject: [PATCH 059/240] refactor: address code review feedback (mostly naming) --- backend/protzilla/data_analysis/crosslinking_validation.py | 2 +- backend/protzilla/methods/data_analysis.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index e78d3bc1c..57b48368f 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -183,7 +183,7 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: all_crosslinks_df["valid_crosslink"].notna() ] - return dict(crosslinking_df_result=checked_crosslinks_df, messages={}) + return dict(crosslinking_result_df=checked_crosslinks_df, messages={}) def bar_plot_of_valid_crosslinks( diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 2b384efde..071644cc2 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2469,7 +2469,7 @@ class CrossLinkingValidationWithAngstromDeviation(DataAnalysisStep): operation = "Cross Linking Validation" method_description = "Validates cross links based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. 
(in Ångström)" - output_keys = ["crosslinking_df_result"] + output_keys = ["crosslinking_result_df"] @staticmethod def _get_crosslinker_names_from_crosslinker_df(steps: StepManager) -> list[str]: From 9c9a6f16c678febb4cd74f142e08ab07c9f8a066 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 31 Jan 2026 16:18:14 +0100 Subject: [PATCH 060/240] chore: add more docstrings --- .../data_analysis/crosslinking_validation.py | 55 +++++++++++++++++-- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 57b48368f..83d6afaf5 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -8,7 +8,15 @@ from protzilla.data_preprocessing.plots import create_bar_plot -def get_reactive_atom_of_amino_acid_residue(amino_acid_kind: str) -> str: +def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: + """ + Returns the atom of an amino acid residue that is considered reactive for + cross-linking. Currently, this always returns the central alpha carbon (CA). 
+ + :param amino_acid_type: code of the amino acid + + :return: the atom identifier of the reactive atom as a string + """ # right now we always return the central C atom # later we might want to return the reactive atom of the amino acid residue of the specific amino acid kind # as soon as we change this, we will need to change the test test_validate_with_angstrom_deviation @@ -17,10 +25,21 @@ def get_reactive_atom_of_amino_acid_residue(amino_acid_kind: str) -> str: def get_coordinates_of_atom_crosslinker_bound_to( amino_acid_position_where_crosslinker_bound: int, - amino_acid_kind: str, + amino_acid_type: str, cif_df: pd.DataFrame, ) -> tuple[float, float, float]: - relevant_atom = get_reactive_atom_of_amino_acid_residue(amino_acid_kind) + """ + Returns the Cartesian coordinates of the atom to which the cross-linker is + bound for a given amino acid residue in a protein structure. + + :param amino_acid_position_where_crosslinker_bound: 1-based position of the amino acid residue + :param amino_acid_type: amino acid type at the given position + :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) + :return: a tuple (x, y, z) containing the Cartesian coordinates of the atom in Ångström + :raises ValueError: if the specified atom cannot be found in the CIF data + """ + + relevant_atom = get_reactive_atom_of_amino_acid_residue(amino_acid_type) # Filter to the exact reactive atom of the amino acid residue # where the crosslinker is bound (e.g. CA at position 45) @@ -53,6 +72,17 @@ def get_distance_between_two_amino_acids_in_angstrom( amino_acid_kind2: str, cif_df: pd.DataFrame, ) -> float: + """ + Calculates the Euclidean distance in Ångström between two amino acid residues + based on the coordinates of their reactive atoms in the AlphaFold/predicted structure. 
+ + :param amino_acid_position1: 1-based position of the first amino acid residue + :param amino_acid_position2: 1-based position of the second amino acid residue + :param amino_acid_kind1: amino acid type at the first position + :param amino_acid_kind2: amino acid type at the second position + :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) + :return: the distance between the two residues in Ångström + """ x1, y1, z1 = get_coordinates_of_atom_crosslinker_bound_to( amino_acid_position1, amino_acid_kind1, cif_df ) @@ -70,7 +100,15 @@ def get_position_of_amino_acid_crosslinker_bound_to( peptide_sequence: str, crosslinker_position_within_peptide: int, ) -> int: - """Returns which amino acid the cross-linker bound to, 1-based.""" + """ + Determines the position of the amino acid to which the cross-linker bound. + + :param protein_sequence: full protein amino acid sequence + :param peptide_sequence: peptide sequence containing the amino acid the cross-linker bound to + :param crosslinker_position_within_peptide: 1-based position of the cross-linker within the peptide + :return: 1-based position of the amino acid residue in the protein sequence + :raises ValueError: if the peptide sequence cannot be found in the protein sequence + """ peptide_start_position = protein_sequence.find(peptide_sequence) if peptide_start_position == 0: raise ValueError( @@ -82,6 +120,15 @@ def get_position_of_amino_acid_crosslinker_bound_to( def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( fasta_df: pd.DataFrame, cif_df: pd.DataFrame, crosslink: pd.Series ) -> float: + """ + Calculates the distance in Ångström between two amino acid residues connected + by a cross-linker using a predicted protein structure (e.g. from AlphaFold). 
+ + :param fasta_df: DataFrame containing the protein sequence + :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) + :param crosslink: Series describing a cross-link, including cross-linker positions + :return: the distance between the cross-linked amino acids in Ångström + """ protein_sequence = fasta_df.at[0, "Protein Sequence"] amino_acid_position_crosslinker1_is_bound_to = ( get_position_of_amino_acid_crosslinker_bound_to( From 2709474872e9e074348fd5000f881df333db1236 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 31 Jan 2026 16:27:37 +0100 Subject: [PATCH 061/240] fix: fix broken tests --- .../data_analysis/crosslinking_validation.py | 2 +- .../test_crosslinking_validation.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 83d6afaf5..b8c5ca942 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -110,7 +110,7 @@ def get_position_of_amino_acid_crosslinker_bound_to( :raises ValueError: if the peptide sequence cannot be found in the protein sequence """ peptide_start_position = protein_sequence.find(peptide_sequence) - if peptide_start_position == 0: + if peptide_start_position == -1: raise ValueError( f"Peptide {peptide_sequence} was not found in protein sequence" ) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 9e5dafb52..3cff657cf 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -4,7 +4,6 @@ from backend.protzilla.data_analysis.crosslinking_validation import ( - get_distance_between_two_amino_acids_in_angstrom, 
get_position_of_amino_acid_crosslinker_bound_to, validate_with_angstrom_deviation, ) @@ -19,7 +18,7 @@ def test_get_position_of_amino_acid_crosslinker_bound_to(): @patch( - "backend.protzilla.data_analysis.cross_linking_validation.fetch_alphafold_protein_structure" + "backend.protzilla.data_analysis.crosslinking_validation.fetch_alphafold_protein_structure" ) def test_validate_with_angstrom_deviation(mock_fetch): # Fake AlphaFold Data @@ -58,7 +57,7 @@ def test_validate_with_angstrom_deviation(mock_fetch): crosslinker_information=crosslinker_information, ) - df = result["crosslinking_df_result"] + df = result["crosslinking_result_df"] assert "alphafold_distance" in df.columns assert "valid_crosslink" in df.columns @@ -80,10 +79,10 @@ def test_modify_form_creates_crosslinker_fields(): step.modify_form(form, run) - assert "length_of_DSS" in form - assert "upper_accepted_deviation_for_DSS" in form - assert "lower_accepted_deviation_for_DSS" in form + assert "DSS_length" in form + assert "DSS_upper_accepted_deviation" in form + assert "DSS_lower_accepted_deviation" in form - assert "length_of_BS3" in form - assert "upper_accepted_deviation_for_BS3" in form - assert "lower_accepted_deviation_for_BS3" in form + assert "BS3_length" in form + assert "BS3_upper_accepted_deviation" in form + assert "BS3_lower_accepted_deviation" in form From 3ff37f6d36e0c4e4f764cdbb1709e78ae2803f69 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 31 Jan 2026 16:42:20 +0100 Subject: [PATCH 062/240] test: add more tests for crosslinking validation --- .../test_crosslinking_validation.py | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 3cff657cf..d5de0ca93 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ 
b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -1,4 +1,5 @@ import pandas as pd +import pytest from unittest.mock import patch from unittest.mock import MagicMock @@ -6,6 +7,7 @@ from backend.protzilla.data_analysis.crosslinking_validation import ( get_position_of_amino_acid_crosslinker_bound_to, validate_with_angstrom_deviation, + get_distance_between_two_amino_acids_in_angstrom, ) from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation @@ -20,13 +22,22 @@ def test_get_position_of_amino_acid_crosslinker_bound_to(): @patch( "backend.protzilla.data_analysis.crosslinking_validation.fetch_alphafold_protein_structure" ) -def test_validate_with_angstrom_deviation(mock_fetch): +@pytest.mark.parametrize( + "distance, expected", + [ + (3.99, False), # outside bounds + (4.0, True), # lower bound + (6.0, True), # upper bound + (6.01, False), # outside bounds + ], +) +def test_validate_with_angstrom_deviation(mock_fetch, distance, expected): # Fake AlphaFold Data cif_df = pd.DataFrame( { "_atom_site.label_atom_id": ["CA", "CA"], "_atom_site.label_seq_id": [1, 2], - "_atom_site.Cartn_x": [0, 4], + "_atom_site.Cartn_x": [0, distance], "_atom_site.Cartn_y": [0, 0], "_atom_site.Cartn_z": [0, 0], } @@ -61,8 +72,8 @@ def test_validate_with_angstrom_deviation(mock_fetch): assert "alphafold_distance" in df.columns assert "valid_crosslink" in df.columns - assert df.loc[0, "alphafold_distance"] == 4.0 - assert df.loc[0, "valid_crosslink"] is True + assert df.loc[0, "alphafold_distance"] == distance + assert df.loc[0, "valid_crosslink"] is expected def test_modify_form_creates_crosslinker_fields(): @@ -86,3 +97,19 @@ def test_modify_form_creates_crosslinker_fields(): assert "BS3_length" in form assert "BS3_upper_accepted_deviation" in form assert "BS3_lower_accepted_deviation" in form + + +def test_get_distance_between_two_amino_acids_in_angstrom(): + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA", "CA"], + 
"_atom_site.label_seq_id": [1, 2], + "_atom_site.Cartn_x": [0, 3], + "_atom_site.Cartn_y": [0, 4], + "_atom_site.Cartn_z": [0, 0], + } + ) + + dist = get_distance_between_two_amino_acids_in_angstrom(1, 2, "A", "B", cif_df) + + assert dist == 5.0 From cfe2ef8d590bc99d833ca6ee871ea809f267965f Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sun, 1 Feb 2026 10:32:25 +0100 Subject: [PATCH 063/240] fix: Change how correct file upload is enforced, add an info text if no uploaded protein structures exist, improve function and variable naming --- backend/main/urls.py | 6 +- backend/main/views_helper.py | 71 ++++--------------- backend/main/views_settings.py | 43 ++++++----- .../test_alphafold_protein_structure_load.py | 6 +- .../protein-structure-upload.tsx | 46 +++++++----- 5 files changed, 69 insertions(+), 103 deletions(-) diff --git a/backend/main/urls.py b/backend/main/urls.py index a1543e5ee..0d6f61f73 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -72,9 +72,9 @@ name="upload_prot_structure", ), path( - "api/prot_structure_delete", - views_settings.prot_structure_delete, - name="prot_structure_delete", + "api/delete_prot_structure", + views_settings.delete_prot_structure, + name="delete_prot_structure", ), path( "api/load_ptm_settings", diff --git a/backend/main/views_helper.py b/backend/main/views_helper.py index 5ca442f97..b761b7af4 100644 --- a/backend/main/views_helper.py +++ b/backend/main/views_helper.py @@ -199,14 +199,14 @@ def copy_file_to_directory(source_file: Path, dest_dir: Path) -> tuple[bool, str """ if not source_file.exists(): - msg = f"Source file does not exist: {source_file}" - logger.error(msg) - return False, msg + message = f"Source file does not exist: {source_file}" + logger.error(message) + return False, message if not source_file.is_file(): - msg = f"Source path is not a file: {source_file}" - logger.error(msg) - return False, msg + message = f"Source path is not a file: {source_file}" + logger.error(message) + return False, 
message try: dest_dir.mkdir(parents=True, exist_ok=True) @@ -214,58 +214,11 @@ def copy_file_to_directory(source_file: Path, dest_dir: Path) -> tuple[bool, str shutil.copy2(source_file, dest_file) - msg = f"Successfully copied file {source_file} to {dest_dir}" - logger.info(msg) - return True, msg + message = f"Successfully copied file {source_file} to {dest_dir}" + logger.info(message) + return True, message except OSError as e: - msg = f"Failed to copy file: {str(e)}" - logger.error(msg) - return False, msg - - -def validate_uploaded_files( - upload_dir: Path, file_mapping: dict[str, list[str]] -) -> tuple[bool, str]: - """ - Validate that expected files exist in the upload directory with correct formats. - - :param upload_dir: Path to the upload directory - :param file_mapping: Dictionary mapping file names to list of valid extensions - e.g., {"cif_file": [".cif"], "fasta_file": [".fasta", ".fa"]} - :return: Tuple of (success: bool, message: str) - """ - if not upload_dir.exists(): - msg = f"Upload directory does not exist: {upload_dir}" - logger.error(msg) - return False, msg - - missing_files = [] - invalid_files = [] - - for file_name, valid_extensions in file_mapping.items(): - file_path = upload_dir / file_name - if not file_path.exists(): - missing_files.append(file_name) - else: - # Check file extension - if not any(file_name.lower().endswith(ext) for ext in valid_extensions): - invalid_files.append( - f"{file_name} (expected: {', '.join(valid_extensions)})" - ) - - # Build error message - error_messages = [] - if missing_files: - error_messages.append(f"Missing files: {', '.join(missing_files)}") - if invalid_files: - error_messages.append(f"Invalid file format: {', '.join(invalid_files)}") - - if error_messages: - msg = " | ".join(error_messages) - logger.warning(msg) - return False, msg - - msg = f"All {len(file_mapping)} files validated successfully" - logger.info(msg) - return True, msg + message = f"Failed to copy file: {str(e)}" + 
logger.error(message) + return False, message diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 2c6079ddf..feea53a37 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -17,7 +17,6 @@ from backend.main.views_helper import ( sanitize_name, load_settings_from_file, - validate_uploaded_files, copy_file_to_directory, ) from backend.protzilla.constants.paths import EXTERNAL_DATA_PATH, SETTINGS_PATH @@ -262,25 +261,8 @@ def upload_prot_structure(request): pae = data.get("pae") fasta_file = data.get("fasta_file") - # Validate uploaded files and copy them to source directory out of temp directory - file_mapping = { - cif_file: [".cif"], - confidence: [".json"], - pae: [".json"], - fasta_file: [".fasta", ".fa"], - } - - is_valid, validation_message = validate_uploaded_files( - settings.FILE_UPLOAD_TEMP_DIR, file_mapping - ) - if not is_valid: - messages.add_message( - request, messages.ERROR, validation_message, "alert-danger" - ) - return JsonResponse( - {"success": False, "message": validation_message}, status=400 - ) - + # Copy files to source directory out of temp directory + af_path = AF_DICT_PATH / entry_id.upper() if af_path.exists(): return JsonResponse( @@ -292,11 +274,28 @@ def upload_prot_structure(request): for file_name in [cif_file, confidence, pae, fasta_file]: source_dir = settings.FILE_UPLOAD_TEMP_DIR / file_name success, message = copy_file_to_directory(source_dir, af_path) + if not success: + return JsonResponse( + {"success": False, "message": message}, + status=500, + ) # add row to metadata csv AF_DICT_PATH.mkdir(parents=True, exist_ok=True) metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" - df = pandas.read_csv(metadata_csv) + + expected_columns = [ + "entryID", + "uniprotAccession", + "modelCreatedDate", + "gene", + "alphafold_version", + ] + + if metadata_csv.exists(): + df = pandas.read_csv(metadata_csv, usecols=lambda c: c in expected_columns) + else: + df = 
pandas.DataFrame(columns=expected_columns) now_utc = datetime.now(timezone.utc) formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") @@ -329,7 +328,7 @@ def upload_prot_structure(request): ) -def prot_structure_delete(request): +def delete_prot_structure(request): if request.method != "POST": return JsonResponse( {"success": False, "message": "Invalid request method"}, status=405 diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index af3bc244f..3c5e00dd0 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -101,15 +101,15 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) - assert set(out.keys()).issuperset( - { + assert out.keys() == { "metadata_df", "cif_df", "pae_df", "plddt_df", "sequence_df", + "messages" } - ) + def test_fetch_alphafold_metadata(tmp_path, monkeypatch): diff --git a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx index 9c6055e4a..9be0621d7 100644 --- a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx @@ -135,7 +135,7 @@ export const ProteinStructureUpload = () => { }; const handleDeleteProtStructure = async (entry_id: string) => { - const response = await callApiWithParameters("prot_structure_delete", { + const response = await callApiWithParameters("delete_prot_structure", { entry_id: entry_id, }); if (response?.success) { @@ -208,24 +208,28 @@ export const ProteinStructureUpload = () => { name: "cif_file", label: "CIF file 
(required):", isVisible: true, + accept: ".cif", }, { type: "file", name: "confidence_file", label: "Confidence JSON file (required):", isVisible: true, + accept: ".json", }, { type: "file", name: "pae_file", label: "Predicted Aligned Error JSON file (required):", isVisible: true, + accept: ".json", }, { type: "file", name: "fasta_file", label: "Sequence FASTA file (required):", isVisible: true, + accept: ".fasta, .fa", }, ], }} @@ -246,21 +250,31 @@ export const ProteinStructureUpload = () => { baseComponent={"h2"} title={"Available Predicted Protein Structures"} /> - - {protStructureList.map((ps) => ( - { - onDeleteProtStructure(ps.entry_id); - }} - /> - ))} - + {protStructureList.length === 0 ? ( + + ) : ( + + {protStructureList.map((ps) => ( + { + onDeleteProtStructure(ps.entry_id); + }} + /> + ))} + + )} Date: Sun, 1 Feb 2026 10:40:13 +0100 Subject: [PATCH 064/240] fix: format with black --- backend/main/views_settings.py | 12 ++++++------ .../test_alphafold_protein_structure_load.py | 15 +++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index feea53a37..69d2118f0 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -262,7 +262,7 @@ def upload_prot_structure(request): fasta_file = data.get("fasta_file") # Copy files to source directory out of temp directory - + af_path = AF_DICT_PATH / entry_id.upper() if af_path.exists(): return JsonResponse( @@ -285,11 +285,11 @@ def upload_prot_structure(request): metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" expected_columns = [ - "entryID", - "uniprotAccession", - "modelCreatedDate", - "gene", - "alphafold_version", + "entryID", + "uniprotAccession", + "modelCreatedDate", + "gene", + "alphafold_version", ] if metadata_csv.exists(): diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py 
b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 3c5e00dd0..a7d4aa7e5 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -102,14 +102,13 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) assert out.keys() == { - "metadata_df", - "cif_df", - "pae_df", - "plddt_df", - "sequence_df", - "messages" - } - + "metadata_df", + "cif_df", + "pae_df", + "plddt_df", + "sequence_df", + "messages", + } def test_fetch_alphafold_metadata(tmp_path, monkeypatch): From 72d2f0429fe54048337171e1057425845037b6ee Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 2 Feb 2026 11:33:42 +0100 Subject: [PATCH 065/240] refactor: address code review feedback, especially use np.linalg.norm for distance calculation --- .../data_analysis/crosslinking_validation.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index b8c5ca942..0dc90e2f6 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,5 +1,5 @@ import pandas as pd -import math +import numpy as np from plotly.graph_objects import Figure from protzilla.importing.alphafold_protein_structure_load import ( @@ -83,16 +83,22 @@ def get_distance_between_two_amino_acids_in_angstrom( :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) :return: the distance between the two residues in Ångström """ - x1, y1, z1 = get_coordinates_of_atom_crosslinker_bound_to( - amino_acid_position1, amino_acid_kind1, cif_df - ) - x2, y2, z2 = get_coordinates_of_atom_crosslinker_bound_to( - amino_acid_position2, amino_acid_kind2, 
cif_df + + pos1 = np.array( + get_coordinates_of_atom_crosslinker_bound_to( + amino_acid_position1, amino_acid_kind1, cif_df + ), + dtype=float, ) - distance = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2 + (z2 - z1) ** 2) + pos2 = np.array( + get_coordinates_of_atom_crosslinker_bound_to( + amino_acid_position2, amino_acid_kind2, cif_df + ), + dtype=float, + ) - return distance + return float(np.linalg.norm(pos2 - pos1)) def get_position_of_amino_acid_crosslinker_bound_to( @@ -211,7 +217,7 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: accepted_deviation_lower_bound or crosslinker_length ) accepted_distance_upper_bound = ( - accepted_deviation_upper_bound or 1e9 + accepted_deviation_upper_bound or float("inf") ) + crosslinker_length valid = ( From 91f834c8bba58952f65fc491ac0c09daf73709c7 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 2 Feb 2026 15:11:38 +0100 Subject: [PATCH 066/240] fix: check that expected file with expected columns exist in get and upload functions --- backend/main/views_settings.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 69d2118f0..dddc0eb52 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -232,9 +232,25 @@ def save_ptm_settings(request, default_file_stem: str = DEFAULT_PTM_SETTINGS_FIL AF_DICT_PATH = EXTERNAL_DATA_PATH / "alphafold" +def get_metadata_df(csv_file_path: str) -> pandas.DataFrame: + expected_columns = [ + "entryID", + "uniprotAccession", + "modelCreatedDate", + "gene", + "alphafold_version", + ] + if csv_file_path.exists(): + df = pandas.read_csv(csv_file_path, usecols=lambda c: c in expected_columns) + else: + df = pandas.DataFrame(columns=expected_columns) + return df + + def get_prot_structure(request): metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" - df = pandas.read_csv(metadata_csv) + + df = get_metadata_df(metadata_csv) df_infos = 
df.rename( columns={ @@ -284,18 +300,7 @@ def upload_prot_structure(request): AF_DICT_PATH.mkdir(parents=True, exist_ok=True) metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" - expected_columns = [ - "entryID", - "uniprotAccession", - "modelCreatedDate", - "gene", - "alphafold_version", - ] - - if metadata_csv.exists(): - df = pandas.read_csv(metadata_csv, usecols=lambda c: c in expected_columns) - else: - df = pandas.DataFrame(columns=expected_columns) + df = get_metadata_df(metadata_csv) now_utc = datetime.now(timezone.utc) formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") From bc66bfd1bb2c815066c3c1e7713e4cc9d0bc51bc Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 2 Feb 2026 15:56:41 +0100 Subject: [PATCH 067/240] fix: wrong output key --- backend/protzilla/data_analysis/crosslinking_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 0dc90e2f6..1af6f42bc 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -261,7 +261,7 @@ def bar_plot_of_valid_crosslinks( """ validated_df = validate_with_angstrom_deviation( crosslinking_df, protein_to_validate, crosslinker_information - )["crosslinking_df_result"] + )["crosslinking_result_df"] evaluated = validated_df["valid_crosslink"].dropna() From 3b668c38326e83e612aea53df2daf2a3a0224d5c Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:23:07 +0100 Subject: [PATCH 068/240] refactor: remove redundant and unnecessary testing --- .../importing/test_crosslinking_import.py | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index ca0dfb46f..81d6c3bfd 100644 --- 
a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -21,12 +21,6 @@ def test_aggregate_data(): assert result == {"A", "B", "C", "D"} -def test_isoform_removal_logic(): - ids = {"P12345-2", "Q67890"} - cleaned = {x.split("-", 1)[0] for x in ids} - assert cleaned == {"P12345", "Q67890"} - - def test_remove_brackets_from_peptide(): assert remove_brackets_from_peptide("[ABC]DE[FG]") == "ABCDEFG" @@ -133,27 +127,6 @@ def mock_execute(*args, **kwargs): assert results["P1"] == (False, None, "NO_GENE_NAME_FOUND") -def test_uniprot_lookup_no_results(monkeypatch): - from protzilla.importing.crosslinking_import import uniprot_lookup - - def mock_execute(*args, **kwargs): - return Mock(text="Entry\tGene Names (primary)\n") - - monkeypatch.setattr( - "protzilla.importing.crosslinking_import.execute_uniprot_request", - mock_execute, - ) - - results = {} - uniprot_lookup( - input_data={"P1"}, - mode="id_to_gene_name", - results=results, - ) - - assert results["P1"] == (False, None, "NO_GENE_NAME_FOUND") - - def _minimal_valid_crosslinking_df(): return pd.DataFrame( { From b941917b21fe2c35ced86a7531760b7583b4d4d8 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:56:39 +0100 Subject: [PATCH 069/240] fix: .xlsx files can be imported without entering an organism id --- .../importing/crosslinking_import.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index f03cbb042..b801f2433 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -683,21 +683,23 @@ def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: def crosslinking_import(file_path: Path, organism_id: str) -> dict: - success, 
scientific_organism_name = process_organism_id_from_text_field(organism_id) - if not success: - msg = f"Unsupported organism id: {organism_id}. Please provide a valid taxonomy id." - return dict( - messages=[ - dict( - level=logging.ERROR, - msg=msg, - ) - ] - ) + file_type = file_path.suffix try: - if file_path.suffix == ".csv": + scientific_organism_name = None + if file_type == ".csv": + success, scientific_organism_name = process_organism_id_from_text_field(organism_id) + if not success: + msg = f"Unsupported organism id: {organism_id}. Please provide a valid taxonomy id." + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + ) + ] + ) good_df, failed_df = read_csm_file(file_path, organism_id) - elif file_path.suffix == ".xlsx": + elif file_type == ".xlsx": good_df, failed_df = read_ProteomeDiscoverer_XlinkX_file(file_path) else: raise ValueError(f"Unsupported file type: {file_path.suffix}") @@ -712,11 +714,16 @@ def crosslinking_import(file_path: Path, organism_id: str) -> dict: ) ] ) + + def base_message(): + if file_type == ".csv": + return f"{len(good_df)} cross-links for the {scientific_organism_name} organism" + return f"{len(good_df)} cross-links" if failed_df.empty: - msg = f"Successfully imported data of {len(good_df)} cross-links for the {scientific_organism_name} organism." + msg = f"Successfully imported data of {base_message()}." messages = [dict(level=logging.INFO, msg=msg)] else: - msg = f"Warning: {len(failed_df)} rows failed to import, however {len(good_df)} cross-links for the {scientific_organism_name} organism were successfully imported." + msg = f"Warning: {len(failed_df)} rows failed to import, however {base_message()} were successfully imported." 
messages = [ dict(level=logging.WARNING, msg=msg), dict( From a5ec3b23ca2c2b151e9f6df73588851bf0625113 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 2 Feb 2026 18:27:42 +0100 Subject: [PATCH 070/240] add more explicit error handling for better usability --- .../protzilla/importing/crosslinking_import.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index b801f2433..768748140 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -451,7 +451,7 @@ def iterate_for_protein_designation( row_dict[new_designation + "2"] = data2 good_rows.append(row_dict) - good_df = normalize_crosslinking_df(pd.DataFrame(good_rows)) + good_df = pd.DataFrame(good_rows) failed_df = pd.DataFrame(failed_rows) return good_df, failed_df @@ -497,6 +497,10 @@ def get_missing_protein_designation( new_designation=missing_column, uniprot_lookup_results=uniprot_lookup_results, ) + + if not good_df.empty: + good_df = normalize_crosslinking_df(good_df) + return good_df, failed_df @@ -606,10 +610,16 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: { "Protein1": "string", "Protein2": "string", + "Protein_id1": "string", + "Protein_id2": "string", "Is_intra_crosslink": "bool", "Crosslinker": "string", "Peptide1": "string", "Peptide2": "string", + "Peptide_position1": "int", + "Peptide_position2": "int", + "CL_position1": "int", + "CL_position2": "int", "Q_value": "Float64", } ) @@ -719,7 +729,10 @@ def base_message(): if file_type == ".csv": return f"{len(good_df)} cross-links for the {scientific_organism_name} organism" return f"{len(good_df)} cross-links" - if failed_df.empty: + if good_df.empty: + msg = f"No cross-links could be processed from this file. 
File was read successfully, but the data of {base_message()} could be imported." + messages = [dict(level=logging.ERROR, msg=msg)] + elif failed_df.empty: msg = f"Successfully imported data of {base_message()}." messages = [dict(level=logging.INFO, msg=msg)] else: From 2c6be8cbf3bb2850dff73e4b2ca0e1d5fa61fb7b Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 2 Feb 2026 18:31:05 +0100 Subject: [PATCH 071/240] refactor: change crosslinking spelling in importing step display --- backend/protzilla/methods/importing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index cdf9245f5..401ec8d04 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -443,19 +443,19 @@ def create_form(self): class CrosslinkingImport(ImportingStep): - display_name = "Cross-Linking Data Import" - operation = "Cross-Linking Data Import" - method_description = "Import a file containing cross-linking data" + display_name = "Crosslinking Data Import" + operation = "Crosslinking Data Import" + method_description = "Import a file containing crosslinking data" output_keys = ["crosslinking_df", "imported_rows_with_errors_df"] def create_form(self): return Form( - label="Cross-Linking Data Import", + label="Crosslinking Data Import", input_fields=[ FileInput( name="file_path", - label="Cross-Linking Data file (.xlsx or .csv)", + label="Crosslinking Data file (.xlsx or .csv)", value=None, ), TextField( From f5d63f0e27b7fb11ce603f858b611e57cb03189c Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 2 Feb 2026 19:05:11 +0100 Subject: [PATCH 072/240] refactor: introduce enums for error codes and lookup modes --- .../importing/crosslinking_import.py | 59 ++++++++++++------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git
a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 768748140..db5315a93 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -11,7 +11,8 @@ from io import StringIO from itertools import islice from functools import partial -from typing import Callable, Optional, Literal +from typing import Callable, Optional +from enum import Enum from backend.protzilla.utilities import format_trace from backend.protzilla.importing.import_utils import ( @@ -21,6 +22,22 @@ ) +class ProteinLookupError(Enum): + NOT_A_VALID_PROTEIN_ID = "NOT_A_VALID_PROTEIN_ID" + IS_DECOY_PROTEIN = "IS_DECOY_PROTEIN" + NO_PROTEIN_ID_FOUND = "NO_PROTEIN_ID_FOUND" + NO_GENE_NAME_FOUND = "NO_GENE_NAME_FOUND" + TIMEOUT = "TIMEOUT" + HTTP_ERROR = "HTTP_ERROR" + REQUEST_ERROR = "REQUEST_ERROR" + NOT_LOOKED_UP = "NOT_LOOKED_UP" + + +class ProteinDesignationLookupMode(Enum): + gene_name_to_id = "gene_name_to_id" + id_to_gene_name = "id_to_gene_name" + + def aggregate_data(df: pd.DataFrame, column: str) -> set: """ Extract unique values from two DataFrame columns and return them as a set. 
@@ -181,11 +198,11 @@ def execute_uniprot_request( return response except requests.exceptions.Timeout: - error = "TIMEOUT" + error = ProteinLookupError.TIMEOUT.value except requests.exceptions.HTTPError as e: - error = f"HTTP_{e.response.status_code}" + error = f"{ProteinLookupError.HTTP_ERROR.value}_{e.response.status_code}" except requests.exceptions.RequestException: - error = "REQUEST_ERROR" + error = ProteinLookupError.REQUEST_ERROR.value for data in valid_data: results[data] = (False, None, error) @@ -196,7 +213,7 @@ def process_uniprot_response( response: requests.Response, results: dict[str, tuple[bool, str | None, None | str]], input_data: set[str], - mode: Literal["id_to_gene_name", "gene_name_to_id"], + mode: ProteinDesignationLookupMode, ) -> None: """ Process a UniProt API response and update the results dictionary. @@ -213,7 +230,7 @@ def process_uniprot_response( :param input_data: Set of input values that were originally queried :type input_data: set[str] :param mode: Lookup mode, either mapping IDs to gene names or gene names to IDs - :type mode: Literal["id_to_gene_name", "gene_name_to_id"] + :type mode: ProteinDesignationLookupMode :return: None (results dictionary is updated in place) :rtype: None @@ -224,17 +241,17 @@ def process_uniprot_response( protein_id = row.get("Entry") primary_gene_name = row.get("Gene Names (primary)") - if mode == "id_to_gene_name": + if mode == ProteinDesignationLookupMode.id_to_gene_name.value: existing_data = protein_id requested_data = primary_gene_name - elif mode == "gene_name_to_id": + elif mode == ProteinDesignationLookupMode.gene_name_to_id.value: existing_data = primary_gene_name requested_data = protein_id if pd.notna(requested_data) and requested_data != "": if existing_data in input_data: results[existing_data] = (True, requested_data, None) - elif mode == "gene_name_to_id": + elif mode == ProteinDesignationLookupMode.gene_name_to_id.value: alternative_gene_names = str(row.get("Gene Names", "")).split() 
for gene_name in alternative_gene_names: if gene_name in input_data: @@ -244,7 +261,7 @@ def process_uniprot_response( def uniprot_lookup( input_data: set[str], - mode: Literal["id_to_gene_name", "gene_name_to_id"], + mode: ProteinDesignationLookupMode, results: dict[str, tuple[bool, Optional[str], Optional[str]]], organism_id: Optional[str] = None, ) -> None: @@ -259,7 +276,7 @@ def uniprot_lookup( :param input_data: Set of input values to look up (protein IDs or gene names) :type input_data: set[str] :param mode: Lookup mode, either "id_to_gene_name" or "gene_name_to_id" - :type mode: Literal["id_to_gene_name", "gene_name_to_id"] + :type mode: ProteinDesignationLookupMode :param results: Dictionary to store lookup results; updated in place with ``existing_data -> (success, value, error_code)`` :type results: dict[str, tuple[bool, str | None, str | None]] @@ -269,13 +286,13 @@ def uniprot_lookup( :return: None (results dictionary is updated in place) :rtype: None """ - if mode == "id_to_gene_name": - error = "NO_GENE_NAME_FOUND" + if mode == ProteinDesignationLookupMode.id_to_gene_name.value: + error = ProteinLookupError.NO_GENE_NAME_FOUND.value field_of_existing_data = "accession" extra_query = None extra_fields = None - elif mode == "gene_name_to_id": - error = "NO_PROTEIN_ID_FOUND" + elif mode == ProteinDesignationLookupMode.gene_name_to_id.value: + error = ProteinLookupError.NO_PROTEIN_ID_FOUND.value field_of_existing_data = "gene_exact" extra_query = f"organism_id:{organism_id} AND reviewed:true" extra_fields = "gene_names" @@ -336,7 +353,7 @@ def get_gene_name_from_protein_ids( valid_ids, results = validate_data_before_lookup( data_for_lookup=protein_ids, validator_function=lambda pid: bool(valid_id_pattern.match(pid)), - error_code="NOT_A_VALID_PROTEIN_ID", + error_code=ProteinLookupError.NOT_A_VALID_PROTEIN_ID.value, ) if not valid_ids: @@ -346,7 +363,7 @@ def get_gene_name_from_protein_ids( uniprot_lookup( input_data=valid_ids_without_isoform, - 
mode="id_to_gene_name", + mode=ProteinDesignationLookupMode.id_to_gene_name.value, results=results, organism_id=None, ) @@ -376,7 +393,7 @@ def get_protein_ids_from_gene_name( valid_gene_names, results = validate_data_before_lookup( data_for_lookup=gene_names, validator_function=lambda name: not name.startswith("decoy:"), - error_code="IS_DECOY_PROTEIN", + error_code=ProteinLookupError.IS_DECOY_PROTEIN.value, ) if not valid_gene_names: @@ -384,7 +401,7 @@ def get_protein_ids_from_gene_name( uniprot_lookup( input_data=valid_gene_names, - mode="gene_name_to_id", + mode=ProteinDesignationLookupMode.gene_name_to_id.value, results=results, organism_id=organism_id, ) @@ -430,10 +447,10 @@ def iterate_for_protein_designation( protein_id2 = row[existing_designation + "2"].split("-", 1)[0] success1, data1, error1 = uniprot_lookup_results.get( - protein_id1, (False, None, "NOT_LOOKED_UP") + protein_id1, (False, None, ProteinLookupError.NOT_LOOKED_UP.value) ) success2, data2, error2 = uniprot_lookup_results.get( - protein_id2, (False, None, "NOT_LOOKED_UP") + protein_id2, (False, None, ProteinLookupError.NOT_LOOKED_UP.value) ) errors_occurred = {} From 16fb613b6967275ce8f5fd284ad76ee44e820727 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 2 Feb 2026 19:15:40 +0100 Subject: [PATCH 073/240] format backend code with black --- backend/protzilla/importing/crosslinking_import.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index db5315a93..06fdd4798 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -515,7 +515,7 @@ def get_missing_protein_designation( uniprot_lookup_results=uniprot_lookup_results, ) - if not good_df.empty: + if not good_df.empty: good_df = normalize_crosslinking_df(good_df) return good_df, failed_df 
@@ -712,9 +712,11 @@ def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: def crosslinking_import(file_path: Path, organism_id: str) -> dict: file_type = file_path.suffix try: - scientific_organism_name = None + scientific_organism_name = None if file_type == ".csv": - success, scientific_organism_name = process_organism_id_from_text_field(organism_id) + success, scientific_organism_name = process_organism_id_from_text_field( + organism_id + ) if not success: msg = f"Unsupported organism id: {organism_id}. Please provide a valid taxonomy id." return dict( @@ -741,12 +743,13 @@ def crosslinking_import(file_path: Path, organism_id: str) -> dict: ) ] ) - + def base_message(): if file_type == ".csv": return f"{len(good_df)} cross-links for the {scientific_organism_name} organism" return f"{len(good_df)} cross-links" - if good_df.empty: + + if good_df.empty: msg = f"No cross-links could be processed from this file. File was read successfully, but the data of {base_message()} could be imported." 
messages = [dict(level=logging.ERROR, msg=msg)] elif failed_df.empty: From d27ea0ac8152c7744c328f3ac5f86757efc0c385 Mon Sep 17 00:00:00 2001 From: 3dot141592 Date: Tue, 3 Feb 2026 12:39:49 +0100 Subject: [PATCH 074/240] fix icon prot_structure.svg color --- .../src/components/core/shared/icon/icons/prot_structure.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/components/core/shared/icon/icons/prot_structure.svg b/frontend/src/components/core/shared/icon/icons/prot_structure.svg index 817a8fcef..f1e6aad97 100644 --- a/frontend/src/components/core/shared/icon/icons/prot_structure.svg +++ b/frontend/src/components/core/shared/icon/icons/prot_structure.svg @@ -1,5 +1,5 @@ - + \ No newline at end of file From 1fdcf0eabb1114bc4002c451e8347166f35866f6 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 3 Feb 2026 16:03:57 +0100 Subject: [PATCH 075/240] feat: add new importing step that loads data from disk into dataframes to make the data usable for the run --- backend/protzilla/all_steps.py | 1 + .../alphafold_protein_structure_load.py | 140 ++++++++++++++++++ backend/protzilla/methods/importing.py | 33 +++++ backend/tests/main/test_views_helper.py | 1 + 4 files changed, 175 insertions(+) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 9c390a58e..1955cb8ae 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -16,6 +16,7 @@ importing.FastaImport, importing.AlphaFoldPredictionLoad, importing.CrosslinkingImport, + importing.ImportStructurePredictionsFromDisk, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, data_preprocessing.FilterByProteinsCount, diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 1c2d22197..ddbc966be 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ 
b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -198,6 +198,146 @@ def handle_alphafold_files( } +def get_all_available_entry_ids() -> list: + meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" + metadata_csv = meta_dir / "alphafold_metadata.csv" + + if metadata_csv.exists(): + df = pd.read_csv(metadata_csv) + return df["entryID"].tolist() + + else: + return [] + + +def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: + """ + Writes data from disk of a specific entry ID into dataframes. + + :param entry_id: entryID of the uploaded protein structure + :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data + """ + messages: list[dict[str, Any]] = [] + + meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" + metadata_csv = meta_dir / "alphafold_metadata.csv" + + if not metadata_csv.exists(): + msg = f"AlphaFold metadata CSV not found: {metadata_csv}" + logger.error(msg) + raise FileNotFoundError(msg) + + all_metadata_df = pd.read_csv(metadata_csv, dtype=str) + metadata_df = all_metadata_df[all_metadata_df["entryID"] == entry_id] + if metadata_df.empty: + msg = f"No metadata for entryID '{entry_id}' in {metadata_csv}" + logger.error(msg) + raise ValueError(msg) + + prot_dir = meta_dir / entry_id.upper() + if not prot_dir.exists() or not prot_dir.is_dir(): + msg = f"AlphaFold data directory not found for entry '{entry_id}': {prot_dir}" + logger.error(msg) + raise FileNotFoundError(msg) + + # get cif file + cif_files = list(prot_dir.glob("*.cif")) + if not cif_files: + msg = f"No CIF file found in {prot_dir} for entry '{entry_id}'" + logger.error(msg) + raise FileNotFoundError(msg) + + cif_file = cif_files[0] + try: + cif_df = read_alphafold_mmcif(str(cif_file)) + except Exception as e: + msg = f"Failed to read CIF file '{cif_file}': {e}" + logger.exception(msg) + raise RuntimeError(msg) from e + + # get fasta file + fasta_files = list(prot_dir.glob("*.fasta")) + list(prot_dir.glob("*.fa")) + if not 
fasta_files: + msg = f"No FASTA file found in {prot_dir} for entry '{entry_id}'" + logger.error(msg) + raise FileNotFoundError(msg) + + fasta_file = fasta_files[0] + try: + fasta_dict = fasta_import(str(fasta_file)) + sequence_df = fasta_dict.get("fasta_df") + if sequence_df is None: + raise RuntimeError( + f"FASTA importer did not return 'fasta_df' for {fasta_file}" + ) + except Exception as e: + msg = f"Failed to load FASTA '{fasta_file}': {e}" + logger.exception(msg) + raise RuntimeError(msg) from e + + # get jsons (PAE and pLDDT) + json_files = list(prot_dir.glob("*.json")) + if not json_files: + msg = f"No JSON files (PAE/pLDDT) found in {prot_dir} for entry '{entry_id}'" + logger.error(msg) + raise FileNotFoundError(msg) + + pae_df = None + plddt_df = None + + try: + if len(json_files) == 1: + raise RuntimeError() + else: + json1 = pd.read_json(json_files[0]) + json2 = pd.read_json(json_files[1]) + if ( + "predicted_aligned_error" in json1.columns + and "residueNumber" in json2.columns + ): + pae_df = json1 + plddt_df = json2 + elif ( + "predicted_aligned_error" in json2.columns + and "residueNumber" in json1.columns + ): + pae_df = json2 + plddt_df = json1 + else: + # Fallback: assign and warn + pae_df = json1 + plddt_df = json2 + warn = f"Could not detect PAE/pLDDT in JSON files for entry '{entry_id}'; files will be returned as read." 
+ logger.warning(warn) + messages.append(dict(level=logging.WARNING, msg=warn)) + except Exception as e: + msg = f"Failed to read JSON files in {prot_dir}: {e}" + logger.exception(msg) + raise RuntimeError(msg) from e + + def _check_df(df: Any) -> bool: + return df is not None + + if ( + _check_df(cif_df) + and _check_df(pae_df) + and _check_df(plddt_df) + and _check_df(sequence_df) + ): + success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" + logger.info(success_msg) + messages.append(dict(level=logging.INFO, msg=success_msg)) + + return { + "metadata_df": metadata_df, + "cif_df": cif_df, + "pae_df": pae_df, + "plddt_df": plddt_df, + "sequence_df": sequence_df, + "messages": messages, + } + + def fetch_alphafold_protein_structure( uniprot_id: str, persist_uploads: bool ) -> dict[str, Any]: diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index a86a3ff56..9d6ed07aa 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -1,6 +1,7 @@ from __future__ import annotations from backend.protzilla.form import * +from backend.protzilla import form_helper from backend.protzilla.importing.metadata_import import ( metadata_column_assignment, metadata_import_method, @@ -13,6 +14,8 @@ ) from backend.protzilla.importing.alphafold_protein_structure_load import ( fetch_alphafold_protein_structure, + get_all_available_entry_ids, + get_prot_structure_dfs, ) from backend.protzilla.importing.peptide_import import peptide_import, evidence_import from backend.protzilla.steps import Step, StepManager @@ -462,3 +465,33 @@ def create_form(self): ) calc_method = staticmethod(crosslinking_import) + + +class ImportStructurePredictionsFromDisk(ImportingStep): + display_name = "Structure Predictions Import from Disk" + operation = "Protein Structure Import" + method_description = ( + "Load already uploaded protein structure predictions from disk into current run" + ) + + output_keys = [ 
+ "metadata_df", + "cif_df", + "pae_df", + "plddt_df", + "sequence_df", + ] + + def create_form(self): + return Form( + label="Structure Predictions Import from Disk", + input_fields=[ + DropdownField( + name="entry_id", + label="Entry ID of the prediction to be loaded into the run. (Unless specified otherwise this is the Protein ID)", + options=form_helper.to_choices(get_all_available_entry_ids()), + ) + ], + ) + + calc_method = staticmethod(get_prot_structure_dfs) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 0b39fe4e4..d6b11ac75 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -15,6 +15,7 @@ def test_get_all_possible_step_names(): "FastaImport", "AlphaFoldPredictionLoad", "CrosslinkingImport", + "ImportStructurePredictionsFromDisk", "FilterProteinsBySamplesMissing", "FilterProteinsBySilacRatios", "FilterByProteinsCount", From 573f4486875abeb613687ba8fa8adc31a9096fdb Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 3 Feb 2026 16:48:46 +0100 Subject: [PATCH 076/240] feat: add tests for get_all_available_entry_ids and get_prot_structure_dfs --- .../test_alphafold_protein_structure_load.py | 142 ++++++++++++++++-- 1 file changed, 133 insertions(+), 9 deletions(-) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index a7d4aa7e5..c648b2983 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -1,14 +1,18 @@ import pandas as pd import pytest from pathlib import Path +import json +import logging from backend.protzilla.importing.alphafold_protein_structure_load import ( fetch_alphafold_protein_structure, to_fasta, read_alphafold_mmcif, + get_all_available_entry_ids, + get_prot_structure_dfs, + paths, ) -import 
backend.protzilla.importing.alphafold_protein_structure_load as af def test_to_fasta_default_header_and_newline(): @@ -98,9 +102,9 @@ def test_fetch_alphafold_protein_structure_wrong_uniprot_id(): def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): - monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + out = fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) assert out.keys() == { "metadata_df", "cif_df", @@ -112,8 +116,8 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): def test_fetch_alphafold_metadata(tmp_path, monkeypatch): - monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) - out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + out = fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) assert isinstance(out["metadata_df"], pd.DataFrame) assert not out["metadata_df"].empty @@ -127,8 +131,8 @@ def test_fetch_alphafold_metadata(tmp_path, monkeypatch): def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): - monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) - af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) target_dir = tmp_path / "alphafold" / "Q8WP00" assert target_dir.exists() @@ -147,8 +151,8 @@ def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): - monkeypatch.setattr(af.paths, "EXTERNAL_DATA_PATH", tmp_path) - out = af.fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + out = fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) cif_df = 
out["cif_df"] assert isinstance(cif_df, pd.DataFrame) @@ -166,3 +170,123 @@ def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): seq_df = out["sequence_df"] assert isinstance(seq_df, pd.DataFrame) assert not seq_df.empty + + +def test_get_all_available_entry_ids_empty(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + assert get_all_available_entry_ids() == [] + + +def test_get_all_available_entry_ids_nonempty(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + meta_dir = tmp_path / "alphafold" + meta_dir.mkdir(parents=True, exist_ok=True) + csv = meta_dir / "alphafold_metadata.csv" + df = pd.DataFrame([{"entryID": "Q8WP00", "uniprotAccession": "Q8WP00"}]) + df.to_csv(csv, index=False) + + assert get_all_available_entry_ids() == ["Q8WP00"] + + +def test_get_prot_structure_dfs_no_metadata(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + with pytest.raises(FileNotFoundError, match=r"AlphaFold metadata CSV not found"): + get_prot_structure_dfs("Q8WP00") + + +def test_get_prot_structure_dfs_no_entry(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + meta_dir = tmp_path / "alphafold" + meta_dir.mkdir(parents=True, exist_ok=True) + csv = meta_dir / "alphafold_metadata.csv" + pd.DataFrame([{"entryID": "OTHER", "uniprotAccession": "OTHER"}]).to_csv( + csv, index=False + ) + + with pytest.raises(ValueError, match=r"No metadata for entryID 'Q8WP00'"): + get_prot_structure_dfs("Q8WP00") + + +def test_get_prot_structure_dfs_missing_dir(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + meta_dir = tmp_path / "alphafold" + meta_dir.mkdir(parents=True, exist_ok=True) + csv = meta_dir / "alphafold_metadata.csv" + pd.DataFrame([{"entryID": "Q8WP00", "uniprotAccession": "Q8WP00"}]).to_csv( + csv, index=False + ) + + with pytest.raises(FileNotFoundError, match=r"AlphaFold data directory not found"): 
+ get_prot_structure_dfs("Q8WP00") + + +def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + + meta_dir = tmp_path / "alphafold" + meta_dir.mkdir(parents=True, exist_ok=True) + csv = meta_dir / "alphafold_metadata.csv" + + metadata = pd.DataFrame( + [ + { + "entryID": "Q8WP00", + "uniprotAccession": "Q8WP00", + "modelCreatedDate": "2025-08-01T00:00:00Z", + "gene": "PRM1", + "alphafold_version": "AlphaFold Monomer v2.0 pipeline", + } + ] + ) + metadata.to_csv(csv, index=False) + + prot_dir = meta_dir / "Q8WP00" + prot_dir.mkdir(parents=True, exist_ok=True) + + cif = prot_dir / "test.cif" + cif.write_text( + """data_test +loop_ +_atom_site.group_PDB +_atom_site.id +_atom_site.type_symbol +_atom_site.Cartn_x +ATOM 1 N 1.0 +ATOM 2 CA C 2.0 +""" + ) + + fasta = prot_dir / "Q8WP00.fasta" + fasta.write_text(">alpha|Q8WP00\nAAAA\n") + + pae = prot_dir / "pae.json" + plddt = prot_dir / "plddt.json" + pae_data = {"predicted_aligned_error": [0.1]} + with open(pae, "w") as f: + json.dump(pae_data, f) + + plddt_data = [{"residueNumber": 1, "confidenceScore": 90}] + with open(plddt, "w") as f: + json.dump(plddt_data, f) + + out = get_prot_structure_dfs("Q8WP00") + + assert isinstance(out["metadata_df"], pd.DataFrame) + assert not out["metadata_df"].empty + assert out["metadata_df"].iloc[0]["entryID"] == "Q8WP00" + + assert isinstance(out["cif_df"], pd.DataFrame) + assert not out["cif_df"].empty + + assert isinstance(out["pae_df"], pd.DataFrame) + assert not out["pae_df"].empty + + assert isinstance(out["plddt_df"], pd.DataFrame) + assert not out["plddt_df"].empty + + assert isinstance(out["sequence_df"], pd.DataFrame) + assert not out["sequence_df"].empty + + assert any(d.get("level") == logging.INFO for d in out["messages"]) or any( + "Successfully loaded" in d.get("msg", "") for d in out["messages"] + ) From 914ce7005d5c731d02a81439e9dc41e9adde5373 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 3 
Feb 2026 17:57:18 +0100 Subject: [PATCH 077/240] fix: fix cif data in tests --- .../importing/test_alphafold_protein_structure_load.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index c648b2983..7a8121692 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -245,14 +245,14 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): cif = prot_dir / "test.cif" cif.write_text( - """data_test + """ +data_test loop_ -_atom_site.group_PDB _atom_site.id _atom_site.type_symbol _atom_site.Cartn_x -ATOM 1 N 1.0 -ATOM 2 CA C 2.0 +N N 1.0 +CA C 2.0 """ ) From a88ece24e145a6bc42c3bfc945ded873a0f803c5 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Wed, 4 Feb 2026 12:08:54 +0100 Subject: [PATCH 078/240] feat: import crosslinking files with multiple organism ids --- .../importing/crosslinking_import.py | 145 +++++++++++------- backend/protzilla/methods/importing.py | 6 +- 2 files changed, 97 insertions(+), 54 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 06fdd4798..653c01ca0 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -287,12 +287,10 @@ def uniprot_lookup( :rtype: None """ if mode == ProteinDesignationLookupMode.id_to_gene_name.value: - error = ProteinLookupError.NO_GENE_NAME_FOUND.value field_of_existing_data = "accession" extra_query = None extra_fields = None elif mode == ProteinDesignationLookupMode.gene_name_to_id.value: - error = ProteinLookupError.NO_PROTEIN_ID_FOUND.value field_of_existing_data = "gene_exact" extra_query = f"organism_id:{organism_id} 
AND reviewed:true" extra_fields = "gene_names" @@ -316,10 +314,6 @@ def uniprot_lookup( response=response, results=results, input_data=batch, mode=mode ) - for data in input_data: - if data not in results: - results[data] = (False, None, error) - def get_gene_name_from_protein_ids( protein_ids: set[str], @@ -368,19 +362,27 @@ def get_gene_name_from_protein_ids( organism_id=None, ) + for protein_id in valid_ids_without_isoform: + if protein_id not in results: + results[protein_id] = ( + False, + None, + ProteinLookupError.NO_GENE_NAME_FOUND.value, + ) + return results def get_protein_ids_from_gene_name( - gene_names: set[str], organism_id: str + gene_names: set[str], organism_ids: list[str] ) -> dict[str, tuple[bool, Optional[str], Optional[str]]]: """ Retrieve UniProt protein IDs for a given set of human gene names as a batch query. :param gene_names: Set of gene symbols to look up (e.g., {"RAD50", "MRE11"}) :type gene_names: set[str] - :param organism_id: Organism identifier for filtering UniProt queries (e.g., "9606" for human) - :type organism_id: str + :param organism_ids: list of organism identifiers for filtering UniProt queries (e.g., "9606" for human) + :type organism_ids: list[str] :return: Dictionary mapping each gene name to a tuple of (success, protein_id, error) :rtype: dict[str, tuple[bool, str | None, str | None]] @@ -399,12 +401,30 @@ def get_protein_ids_from_gene_name( if not valid_gene_names: return results - uniprot_lookup( - input_data=valid_gene_names, - mode=ProteinDesignationLookupMode.gene_name_to_id.value, - results=results, - organism_id=organism_id, - ) + remaining_gene_names = set(valid_gene_names) + for single_organism_id in organism_ids: + + if not remaining_gene_names: + continue + + uniprot_lookup( + input_data=remaining_gene_names, + mode=ProteinDesignationLookupMode.gene_name_to_id.value, + results=results, + organism_id=single_organism_id, + ) + + for gene_name in list(remaining_gene_names): + if gene_name in results: + 
remaining_gene_names.discard(gene_name) + + for gene_name in valid_gene_names: + if gene_name not in results: + results[gene_name] = ( + False, + None, + ProteinLookupError.NO_GENE_NAME_FOUND.value, + ) return results @@ -580,7 +600,7 @@ def read_ProteomeDiscoverer_XlinkX_file( def read_csm_file( - file_path: Path, organism_id: str + file_path: Path, organism_ids: list[str] ) -> tuple[pd.DataFrame, pd.DataFrame]: """ Read and process a CSM CSV file: @@ -593,8 +613,8 @@ def read_csm_file( :param file_path: Path to the CSM CSV file :type file_path: pathlib.Path - :param organism_id: Organism identifier used for UniProt lookups (e.g., "9606" for human) - :type organism_id: str + :param organism_ids: list of organism identifiers used for UniProt lookups (e.g., "9606" for human) + :type organism_id: list[str] :return: Tuple of DataFrames containing rows with successfully mapped protein IDs and rows where lookup failed @@ -609,14 +629,14 @@ def read_csm_file( df["Is_intra_crosslink"] = df["Protein1"].eq(df["Protein2"]) - uniprot_lookup_function_with_organism_id = partial( - get_protein_ids_from_gene_name, organism_id=organism_id + uniprot_lookup_function_with_organism_ids = partial( + get_protein_ids_from_gene_name, organism_ids=organism_ids ) good_df, failed_df = get_missing_protein_designation( df=df, existing_column="Protein", missing_column="Protein_id", - uniprot_lookup_function=uniprot_lookup_function_with_organism_id, + uniprot_lookup_function=uniprot_lookup_function_with_organism_ids, ) return good_df, failed_df @@ -643,35 +663,52 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: return df.loc[:, columns_in_crosslinking_df] -def process_organism_id_from_text_field(organism_id: str) -> tuple[bool, Optional[str]]: +def process_organism_id_from_text_field( + organism_ids: str, +) -> tuple[bool, Optional[list[str]], Optional[list[str]]]: """ - Retrieve the scientific name of an organism from its NCBI Taxonomy ID. 
+ Validates a comma-separated string of NCBI Taxonomy IDs. + Returns False immediately if any ID is invalid. + Otherwise returns True, the list of cleaned IDs, and the corresponding scientific names. - The function: - 1. Cleans the input organism ID (removes spaces). - 2. Queries the NCBI Entrez E-utilities esummary endpoint. - 3. Returns a tuple indicating whether the lookup succeeded and the scientific name. + param organism_ids: Comma-separated string of NCBI Taxonomy IDs (e.g., "9606,10090,10116") + :type organism_ids: str - :param organism_id: NCBI Taxonomy ID as a string (may contain spaces) - :type organism_id: str + :return: A tuple containing: + - success (bool): True if all IDs are valid, False if any ID is invalid + - ids (list[str] | None): List of cleaned IDs in input order if successful, None if failed or the cause of the fail + - names (list[str] | None): List of scientific names corresponding to IDs if successful, None if failed + """ + organism_ids_list: list[str] = [ + id.strip() for id in organism_ids.split(",") if id.strip() + ] + if not organism_ids_list: + return False, None, None + + organism_ids_for_request = ",".join(organism_ids_list) + url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={organism_ids_for_request}&retmode=json" + try: + response = requests.get(url, timeout=15) + if response.status_code != 200: + return False, None, None + data = response.json() + except Exception: + return False, None, None - :return: Tuple indicating success and the scientific name - :rtype: tuple[bool, str | None] + result = data.get("result", {}) + valid_organism_ids = result.get("uids", []) + organism_names = [] - :returns success: True if the organism ID was found and the scientific name retrieved - :returns name: Scientific name of the organism if found, else None - """ - cleaned_organism_id = organism_id.strip().replace(" ", "") - url = 
f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={cleaned_organism_id}&retmode=json" - response = requests.get(url) - if response.status_code != 200: - return False, None - data = response.json() - output_ids = data.get("result", {}) - if cleaned_organism_id not in output_ids: - return False, None - name = output_ids[cleaned_organism_id].get("scientificname") - return True, name + for id in organism_ids_list: + if id not in valid_organism_ids: + # Abort at the first invalid id + return False, id, None + name = result[id].get("scientificname") + if not name: + return False, id, None + organism_names.append(name) + + return True, organism_ids_list, organism_names def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: @@ -709,16 +746,19 @@ def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: return "\n".join(sorted(protein_with_error_set)) -def crosslinking_import(file_path: Path, organism_id: str) -> dict: +def crosslinking_import(file_path: Path, organism_ids: str) -> dict: file_type = file_path.suffix try: - scientific_organism_name = None + scientific_organism_names: list[str] = None if file_type == ".csv": - success, scientific_organism_name = process_organism_id_from_text_field( - organism_id + success, organism_ids_list, scientific_organism_names = ( + process_organism_id_from_text_field(organism_ids) ) if not success: - msg = f"Unsupported organism id: {organism_id}. Please provide a valid taxonomy id." + if organism_ids_list: + msg = f"Unsupported organism id: {organism_ids_list}. Please provide all valid taxonomy ids." + else: + msg = f"An error occurred while reading the organism ids. Please provide all valid taxonomy ids, separated by a comma." 
return dict( messages=[ dict( @@ -727,7 +767,7 @@ def crosslinking_import(file_path: Path, organism_id: str) -> dict: ) ] ) - good_df, failed_df = read_csm_file(file_path, organism_id) + good_df, failed_df = read_csm_file(file_path, organism_ids_list) elif file_type == ".xlsx": good_df, failed_df = read_ProteomeDiscoverer_XlinkX_file(file_path) else: @@ -746,7 +786,8 @@ def crosslinking_import(file_path: Path, organism_id: str) -> dict: def base_message(): if file_type == ".csv": - return f"{len(good_df)} cross-links for the {scientific_organism_name} organism" + organism_names_string = ", ".join(scientific_organism_names) + return f"{len(good_df)} cross-links for the {organism_names_string} organism(s)" return f"{len(good_df)} cross-links" if good_df.empty: diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 401ec8d04..f0acb9d66 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -459,9 +459,11 @@ def create_form(self): value=None, ), TextField( - name="organism_id", - label="Organism ID", + name="organism_ids", + label="Organism IDs \n(please list them in the order in which they should be applied, separated by a comma)", + value="", ), + InfoField(label="e.g.: 9606, 10090, 10116"), ], ) From 00c33abd772debbb50fe62201efc8021817d4fc6 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Wed, 4 Feb 2026 15:35:49 +0100 Subject: [PATCH 079/240] fix: change tests for changed code and add some new testing --- .../importing/test_crosslinking_import.py | 53 +++++++++++++++++-- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index 81d6c3bfd..e0cc5c431 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ 
-52,7 +52,7 @@ def validator(x): def test_validate_data_before_lookup_empty(): - valid, results = validate_data_before_lookup(set(), lambda x: True, "ERR") + valid, results = validate_data_before_lookup(set(), lambda x: True, "ERROR") assert valid == set() assert results == {} @@ -124,7 +124,7 @@ def mock_execute(*args, **kwargs): results=results, ) - assert results["P1"] == (False, None, "NO_GENE_NAME_FOUND") + assert results == {} def _minimal_valid_crosslinking_df(): @@ -181,6 +181,49 @@ def mock_lookup(ids): assert failed_df.empty +@pytest.mark.parametrize( + "input_string, mock_result, expected", + [ + ( + "9606,10090", + { + "uids": ["9606", "10090"], + "9606": {"scientificname": "Homo sapiens"}, + "10090": {"scientificname": "Mus musculus"}, + }, + (True, ["9606", "10090"], ["Homo sapiens", "Mus musculus"]), + ), + ( + "9606,9999", + { + "uids": ["9606"], + "9606": {"scientificname": "Homo sapiens"}, + }, + (False, "9999", None), + ), + ], +) +def test_process_organism_id_from_text_field( + monkeypatch, input_string, mock_result, expected +): + from protzilla.importing.crosslinking_import import ( + process_organism_id_from_text_field, + ) + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"result": mock_result} + + monkeypatch.setattr( + "protzilla.importing.crosslinking_import.requests.get", + lambda *args, **kwargs: mock_response, + ) + + result = process_organism_id_from_text_field(input_string) + + assert result == expected + + def test_aggregate_failed_proteins_for_display(): df = pd.DataFrame( { @@ -216,7 +259,7 @@ def test_crosslinking_import_csv(tmp_path): "MRE11": (True, "Q67890", None), }, ): - result = crosslinking_import(csv_file, organism_id="9606") + result = crosslinking_import(csv_file, organism_ids="9606") assert "crosslinking_df" in result assert not result["crosslinking_df"].empty @@ -245,13 +288,13 @@ def test_crosslinking_import_xlsx(monkeypatch, tmp_path): lambda ids: {i: (True, f"G{i}", 
None) for i in ids}, ) - result = crosslinking_import(xlsx, organism_id="9606") + result = crosslinking_import(xlsx, organism_ids="9606") assert "crosslinking_df" in result def test_crosslinking_import_invalid_file(tmp_path): bad_file = tmp_path / "test.txt" bad_file.write_text("something invalid") - result = crosslinking_import(bad_file, organism_id="9606") + result = crosslinking_import(bad_file, organism_ids="9606") assert "messages" in result assert any("Unsupported file type" in m["msg"] for m in result["messages"]) From 68d5024f1f77100f49e6001ee57f7b012980bf96 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 4 Feb 2026 16:22:22 +0100 Subject: [PATCH 080/240] fix: rename variables and classes, add tests, improve file handling --- backend/protzilla/all_steps.py | 2 +- backend/protzilla/constants/paths.py | 2 + .../alphafold_protein_structure_load.py | 135 +++++++++--------- backend/protzilla/methods/importing.py | 4 +- backend/tests/main/test_views_helper.py | 2 +- .../test_alphafold_protein_structure_load.py | 83 +++++------ 6 files changed, 119 insertions(+), 109 deletions(-) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 1955cb8ae..8bb5d4d75 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -16,7 +16,7 @@ importing.FastaImport, importing.AlphaFoldPredictionLoad, importing.CrosslinkingImport, - importing.ImportStructurePredictionsFromDisk, + importing.ImportStructurePredictionFromDisk, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, data_preprocessing.FilterByProteinsCount, diff --git a/backend/protzilla/constants/paths.py b/backend/protzilla/constants/paths.py index 0f1c4dea3..bf1ae2694 100644 --- a/backend/protzilla/constants/paths.py +++ b/backend/protzilla/constants/paths.py @@ -11,6 +11,8 @@ SETTINGS_PATH = USER_DATA_PATH / "settings" EXTERNAL_DATA_PATH = USER_DATA_PATH / "external_data" UPLOAD_PATH = BACKEND_PATH / "uploads" 
+ALPHAFOLD_PATH = EXTERNAL_DATA_PATH / "alphafold" +AF_METADATA_CSV_PATH = ALPHAFOLD_PATH / "alphafold_metadata.csv" CUSTOM_PLOT_SETTINGS_FILE_STEM = "plots" DEFAULT_PLOT_SETTINGS_FILE_STEM = "plots_default" diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index ddbc966be..19e043cd9 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -17,6 +17,31 @@ from backend.protzilla.networking import download_file_from_url +def get_metadata_df() -> pd.DataFrame: + """ + Returns all data from alphafold_metadata.csv in form of a dataframe. If no such csv exist, it returns + a dataframe with the corresponding keys but no values and creates a csv with the expected column names. + """ + metadata_csv = paths.AF_METADATA_CSV_PATH + + if not metadata_csv.exists(): + msg = f"AlphaFold metadata CSV not found: {metadata_csv}. Returning an empty Dataframe." + logger.error(msg) + metadata_df = pd.DataFrame( + columns=[ + "entryID", + "uniprotAccession", + "modelCreatedDate", + "gene", + "alphafold_version", + ] + ) + metadata_df.to_csv(metadata_csv, index=False) + return metadata_df + + return pd.read_csv(metadata_csv, dtype=str) + + def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str: """ Convert a protein sequence to FASTA format. 
@@ -65,8 +90,6 @@ def read_alphafold_mmcif(path: str) -> pd.DataFrame: return pd.DataFrame() table = block.find_mmcif_category(cat_name) - if table is None: - return pd.DataFrame() columns = list(table.tags) nrows = len(table) @@ -114,8 +137,7 @@ def handle_alphafold_files( sequence_df = None messages = [] - meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" - target_dir = meta_dir / uniprot + target_dir = paths.ALPHAFOLD_PATH / uniprot downloaded: dict[str, str] = {} temp_dir = None @@ -128,25 +150,20 @@ def handle_alphafold_files( work_dir = temp_dir try: - if persist_uploads and metadata_df is not None: - meta_dir.mkdir(parents=True, exist_ok=True) - metadata_csv = meta_dir / "alphafold_metadata.csv" + if persist_uploads: + paths.ALPHAFOLD_PATH.mkdir(parents=True, exist_ok=True) + existing = get_metadata_df() try: - if metadata_csv.exists(): - existing = pd.read_csv(metadata_csv, dtype=str) - mask = existing["entryID"] == entry_id - if mask.any(): - msg = ( - f'Existing entry with EntryID "{entry_id}" was overwritten.' - ) - logger.warning(msg) - messages.append(dict(level=logging.WARNING, msg=msg)) - existing = existing[~mask] - - combined = pd.concat([existing, metadata_df], ignore_index=True) - combined.to_csv(metadata_csv, index=False) - else: - metadata_df.to_csv(metadata_csv, index=False) + metadata_csv = paths.AF_METADATA_CSV_PATH + mask = existing["entryID"] == entry_id + if mask.any(): + msg = f'Existing entry with EntryID "{entry_id}" was overwritten.' 
+ logger.warning(msg) + messages.append(dict(level=logging.WARNING, msg=msg)) + existing = existing[~mask] + + combined = pd.concat([existing, metadata_df], ignore_index=True) + combined.to_csv(metadata_csv, index=False) logger.info("Wrote AlphaFold metadata to %s", metadata_csv) except Exception: logger.exception( @@ -198,16 +215,12 @@ def handle_alphafold_files( } -def get_all_available_entry_ids() -> list: - meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" - metadata_csv = meta_dir / "alphafold_metadata.csv" - - if metadata_csv.exists(): - df = pd.read_csv(metadata_csv) - return df["entryID"].tolist() - - else: - return [] +def get_all_available_entry_ids() -> list[str]: + """ " + Get the entry ids of all the protein structure predictions that can be found on disk. + """ + df = get_metadata_df() + return df["entryID"].tolist() def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: @@ -217,24 +230,15 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: :param entry_id: entryID of the uploaded protein structure :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data """ - messages: list[dict[str, Any]] = [] - - meta_dir = paths.EXTERNAL_DATA_PATH / "alphafold" - metadata_csv = meta_dir / "alphafold_metadata.csv" - - if not metadata_csv.exists(): - msg = f"AlphaFold metadata CSV not found: {metadata_csv}" - logger.error(msg) - raise FileNotFoundError(msg) - - all_metadata_df = pd.read_csv(metadata_csv, dtype=str) + messages: list[dict[str, str | int]] = [] + all_metadata_df = get_metadata_df() metadata_df = all_metadata_df[all_metadata_df["entryID"] == entry_id] if metadata_df.empty: - msg = f"No metadata for entryID '{entry_id}' in {metadata_csv}" + msg = f"No metadata for entryID '{entry_id}' in {paths.AF_METADATA_CSV_PATH}" logger.error(msg) raise ValueError(msg) - prot_dir = meta_dir / entry_id.upper() + prot_dir = paths.ALPHAFOLD_PATH / entry_id.upper() if not prot_dir.exists() or not prot_dir.is_dir(): 
msg = f"AlphaFold data directory not found for entry '{entry_id}': {prot_dir}" logger.error(msg) @@ -247,6 +251,11 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: logger.error(msg) raise FileNotFoundError(msg) + if len(cif_files) > 1: + message = "There are several CIF files for this protein structure prediction. The first one will be read, all others will be ignored." + logger.info(message) + messages.append(dict(level=logging.WARNING, msg=message)) + cif_file = cif_files[0] try: cif_df = read_alphafold_mmcif(str(cif_file)) @@ -267,9 +276,9 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: fasta_dict = fasta_import(str(fasta_file)) sequence_df = fasta_dict.get("fasta_df") if sequence_df is None: - raise RuntimeError( - f"FASTA importer did not return 'fasta_df' for {fasta_file}" - ) + msg = f"FASTA importer did not return 'fasta_df' for {fasta_file}" + logger.error(msg) + raise RuntimeError(msg) except Exception as e: msg = f"Failed to load FASTA '{fasta_file}': {e}" logger.exception(msg) @@ -287,6 +296,8 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: try: if len(json_files) == 1: + msg = f"Only one json file found in {prot_dir} for entry '{entry_id}'. Two json files are expected" + logger.error(msg) raise RuntimeError() else: json1 = pd.read_json(json_files[0]) @@ -307,7 +318,7 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: # Fallback: assign and warn pae_df = json1 plddt_df = json2 - warn = f"Could not detect PAE/pLDDT in JSON files for entry '{entry_id}'; files will be returned as read." + warn = f"Could not detect PAE/pLDDT in JSON files for entry '{entry_id}'; ''{json_files[0]} is read as PAE, {json_files[1]} is read as pLDDT." 
logger.warning(warn) messages.append(dict(level=logging.WARNING, msg=warn)) except Exception as e: @@ -315,27 +326,23 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: logger.exception(msg) raise RuntimeError(msg) from e - def _check_df(df: Any) -> bool: - return df is not None - - if ( - _check_df(cif_df) - and _check_df(pae_df) - and _check_df(plddt_df) - and _check_df(sequence_df) - ): - success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" - logger.info(success_msg) - messages.append(dict(level=logging.INFO, msg=success_msg)) - - return { + df_dict = { "metadata_df": metadata_df, "cif_df": cif_df, "pae_df": pae_df, "plddt_df": plddt_df, "sequence_df": sequence_df, - "messages": messages, } + if not any(df.empty for df in df_dict.values()): + success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" + logger.info(success_msg) + messages.append(dict(level=logging.INFO, msg=success_msg)) + else: + message = f"Could not load AlphaFold data for entry '{entry_id}'" + logger.warning(message) + messages.append(dict(level=logging.WARNING, msg=message)) + df_dict["messages"] = messages + return df_dict def fetch_alphafold_protein_structure( diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 9d6ed07aa..5cca24a8b 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -467,8 +467,8 @@ def create_form(self): calc_method = staticmethod(crosslinking_import) -class ImportStructurePredictionsFromDisk(ImportingStep): - display_name = "Structure Predictions Import from Disk" +class ImportStructurePredictionFromDisk(ImportingStep): + display_name = "Structure Prediction Import from Disk" operation = "Protein Structure Import" method_description = ( "Load already uploaded protein structure predictions from disk into current run" diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 
d6b11ac75..35df025d7 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -15,7 +15,7 @@ def test_get_all_possible_step_names(): "FastaImport", "AlphaFoldPredictionLoad", "CrosslinkingImport", - "ImportStructurePredictionsFromDisk", + "ImportStructurePredictionFromDisk", "FilterProteinsBySamplesMissing", "FilterProteinsBySilacRatios", "FilterByProteinsCount", diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 7a8121692..8a677064f 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -1,6 +1,5 @@ import pandas as pd import pytest -from pathlib import Path import json import logging @@ -11,8 +10,8 @@ read_alphafold_mmcif, get_all_available_entry_ids, get_prot_structure_dfs, - paths, ) +from backend.protzilla.constants import paths def test_to_fasta_default_header_and_newline(): @@ -131,10 +130,10 @@ def test_fetch_alphafold_metadata(tmp_path, monkeypatch): def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + monkeypatch.setattr(paths, "ALPHAFOLD_PATH", tmp_path) fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) - target_dir = tmp_path / "alphafold" / "Q8WP00" + target_dir = tmp_path / "Q8WP00" assert target_dir.exists() assert target_dir.is_dir() @@ -173,59 +172,48 @@ def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): def test_get_all_available_entry_ids_empty(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + metadata_csv = tmp_path / "alphafold_metadata.csv" + monkeypatch.setattr(paths, "AF_METADATA_CSV_PATH", metadata_csv) + assert get_all_available_entry_ids() == [] + assert metadata_csv.exists() + + df = pd.read_csv(metadata_csv, dtype=str) + assert 
list(df.columns) == [ + "entryID", + "uniprotAccession", + "modelCreatedDate", + "gene", + "alphafold_version", + ] + assert len(df) == 0 def test_get_all_available_entry_ids_nonempty(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - meta_dir = tmp_path / "alphafold" - meta_dir.mkdir(parents=True, exist_ok=True) - csv = meta_dir / "alphafold_metadata.csv" + metadata_csv = tmp_path / "alphafold_metadata.csv" + monkeypatch.setattr(paths, "AF_METADATA_CSV_PATH", metadata_csv) df = pd.DataFrame([{"entryID": "Q8WP00", "uniprotAccession": "Q8WP00"}]) - df.to_csv(csv, index=False) + df.to_csv(metadata_csv, index=False) assert get_all_available_entry_ids() == ["Q8WP00"] -def test_get_prot_structure_dfs_no_metadata(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - with pytest.raises(FileNotFoundError, match=r"AlphaFold metadata CSV not found"): - get_prot_structure_dfs("Q8WP00") - - def test_get_prot_structure_dfs_no_entry(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - meta_dir = tmp_path / "alphafold" - meta_dir.mkdir(parents=True, exist_ok=True) - csv = meta_dir / "alphafold_metadata.csv" + metadata_csv = tmp_path / "alphafold_metadata.csv" + monkeypatch.setattr(paths, "AF_METADATA_CSV_PATH", metadata_csv) pd.DataFrame([{"entryID": "OTHER", "uniprotAccession": "OTHER"}]).to_csv( - csv, index=False + metadata_csv, index=False ) with pytest.raises(ValueError, match=r"No metadata for entryID 'Q8WP00'"): get_prot_structure_dfs("Q8WP00") -def test_get_prot_structure_dfs_missing_dir(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - meta_dir = tmp_path / "alphafold" - meta_dir.mkdir(parents=True, exist_ok=True) - csv = meta_dir / "alphafold_metadata.csv" - pd.DataFrame([{"entryID": "Q8WP00", "uniprotAccession": "Q8WP00"}]).to_csv( - csv, index=False - ) - - with pytest.raises(FileNotFoundError, match=r"AlphaFold data directory not 
found"): - get_prot_structure_dfs("Q8WP00") - - def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - - meta_dir = tmp_path / "alphafold" - meta_dir.mkdir(parents=True, exist_ok=True) - csv = meta_dir / "alphafold_metadata.csv" + monkeypatch.setattr(paths, "ALPHAFOLD_PATH", tmp_path) + tmp_path.mkdir(parents=True, exist_ok=True) + metadata_csv = tmp_path / "alphafold_metadata.csv" + monkeypatch.setattr(paths, "AF_METADATA_CSV_PATH", metadata_csv) metadata = pd.DataFrame( [ @@ -238,9 +226,9 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): } ] ) - metadata.to_csv(csv, index=False) + metadata.to_csv(metadata_csv, index=False) - prot_dir = meta_dir / "Q8WP00" + prot_dir = tmp_path / "Q8WP00" prot_dir.mkdir(parents=True, exist_ok=True) cif = prot_dir / "test.cif" @@ -277,15 +265,28 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): assert isinstance(out["cif_df"], pd.DataFrame) assert not out["cif_df"].empty + assert list(out["cif_df"].columns) == [ + "_atom_site.id", + "_atom_site.type_symbol", + "_atom_site.Cartn_x", + ] + assert out["cif_df"]["_atom_site.id"].tolist() == ["N", "CA"] + assert out["cif_df"]["_atom_site.type_symbol"].tolist() == ["N", "C"] + assert out["cif_df"]["_atom_site.Cartn_x"].tolist() == ["1.0", "2.0"] assert isinstance(out["pae_df"], pd.DataFrame) assert not out["pae_df"].empty + assert out["pae_df"]["predicted_aligned_error"].tolist() == [0.1] assert isinstance(out["plddt_df"], pd.DataFrame) assert not out["plddt_df"].empty + assert out["plddt_df"]["residueNumber"].tolist() == [1] + assert out["plddt_df"]["confidenceScore"].tolist() == [90] assert isinstance(out["sequence_df"], pd.DataFrame) assert not out["sequence_df"].empty + assert out["sequence_df"]["Protein ID"].tolist() == ["Q8WP00-1"] + assert out["sequence_df"]["Protein Sequence"].tolist() == ["AAAA"] assert any(d.get("level") == logging.INFO for d in out["messages"]) or any( 
"Successfully loaded" in d.get("msg", "") for d in out["messages"] From 7e4d55536ebf6e973b5f562a23e8dd7d5e5a72fe Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 4 Feb 2026 16:50:15 +0100 Subject: [PATCH 081/240] feat: add success message --- .../alphafold_protein_structure_load.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 19e043cd9..afba35573 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -406,12 +406,21 @@ def fetch_alphafold_protein_structure( entry_id=uniprot_id, persist_uploads=persist_uploads, ) - - return { - "metadata_df": metadata_df, - "cif_df": alpha_dfs["cif_df"], - "pae_df": alpha_dfs["pae_df"], - "plddt_df": alpha_dfs["plddt_df"], - "sequence_df": alpha_dfs["sequence_df"], - "messages": alpha_dfs.get("messages", []), - } + df_dict = { + "metadata_df": metadata_df, + "cif_df": alpha_dfs["cif_df"], + "pae_df": alpha_dfs["pae_df"], + "plddt_df": alpha_dfs["plddt_df"], + "sequence_df": alpha_dfs["sequence_df"], + } + messages = alpha_dfs["messages"] + if not any(df.empty for df in df_dict.values()): + success_msg = f"Successfully loaded AlphaFold data for protein with Protein ID '{uniprot_id}'" + logger.info(success_msg) + messages.append(dict(level=logging.INFO, msg=success_msg)) + else: + message = f"Could not load AlphaFold data for protein with Protein ID '{uniprot_id}'" + logger.warning(message) + messages.append(dict(level=logging.WARNING, msg=message)) + df_dict["messages"] = messages + return df_dict From ab9c3efa24ed1c7cda2c79cf7e907853c19f8a17 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 5 Feb 2026 08:59:02 +0100 Subject: [PATCH 082/240] fix: format with black --- .../protzilla/importing/alphafold_protein_structure_load.py | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index afba35573..489fc1510 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -419,7 +419,9 @@ def fetch_alphafold_protein_structure( logger.info(success_msg) messages.append(dict(level=logging.INFO, msg=success_msg)) else: - message = f"Could not load AlphaFold data for protein with Protein ID '{uniprot_id}'" + message = ( + f"Could not load AlphaFold data for protein with Protein ID '{uniprot_id}'" + ) logger.warning(message) messages.append(dict(level=logging.WARNING, msg=message)) df_dict["messages"] = messages From cbfbfcecc7e60040de0549fd5be4683e098f9092 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 5 Feb 2026 16:32:13 +0100 Subject: [PATCH 083/240] Changes by elenakalbitzer --- backend/protzilla/methods/data_analysis.py | 21 +++++++++++++++++++- backend/protzilla/steps.py | 23 ++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 071644cc2..cd61c0699 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -65,6 +65,10 @@ bar_plot_of_valid_crosslinks, ) from backend.protzilla.run import Run +from backend.protzilla.methods.importing import ( + ImportStructurePredictionFromDisk, + AlphaFoldPredictionLoad, +) class TTestType(Enum): @@ -2483,7 +2487,7 @@ def create_form(self): return Form( label="Ångström Deviation", input_fields=[ - TextField( + DropdownField( name="protein_to_validate", label="Protein prediction that should be validated", ), @@ -2491,6 +2495,21 @@ def create_form(self): ) def modify_form(self, form: Form, run: Run) -> None: + # add all loaded protein entry ids to the dropdown of 
protein_to_validate_field + loaded_protein_entry_ids = list( + set( + run.steps.get_inputs_of_step_type( + ImportStructurePredictionFromDisk, "entry_id" + ) + + run.steps.get_inputs_of_step_type( + AlphaFoldPredictionLoad, "uniprot_id" + ) + ) + ) + form["protein_to_validate"].set_options( + form_helper.to_choices(loaded_protein_entry_ids) + ) + # create fields for every crosslink crosslinkers = self._get_crosslinker_names_from_crosslinker_df(run.steps) for crosslinker in crosslinkers: field_name = f"{crosslinker}_length" diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 727f9f8d6..40a01cd66 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -596,6 +596,29 @@ def check_instance_identifier(step): return step.inputs[input_key] return default + def get_inputs_of_step_type( + self, + step_types: type[Step] | list[type[Step]], + input_key: str, + ) -> list[str]: + """ + Get the specific input of all steps that have a specific step type. + :param step_types: The types of the relevant steps + :param input_key: The key of the desired input in the input dictionary of the step + :return: The values of the input of the steps + """ + + step_types = [step_types] if not isinstance(step_types, list) else step_types + inputs = [] + for step in reversed(self.previous_calculated_steps): + print(str(type(step)) + " expected one of these: " + str(step_types)) + if ( + any(isinstance(step, st) for st in step_types) + and input_key in step.inputs + ): + inputs.append(step.inputs[input_key]) + return inputs + def all_steps_in_section(self, section: str) -> list[Step]: """ Get all steps in a specific section via the section name From 008e12e686ae2c6b26a708502a5337b54b5ca301 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 5 Feb 2026 17:19:45 +0100 Subject: [PATCH 084/240] refactor: remove usage of AlphafoldFetch in crosslinking validation --- .../data_analysis/crosslinking_validation.py | 28 +++++++++++-------- 
.../alphafold_protein_structure_load.py | 17 +++++------ backend/protzilla/methods/data_analysis.py | 12 ++++++++ backend/protzilla/methods/importing.py | 4 +-- backend/protzilla/steps.py | 25 ++++++++++++++++- .../test_crosslinking_validation.py | 11 +++----- .../test_alphafold_protein_structure_load.py | 12 ++++---- 7 files changed, 72 insertions(+), 37 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 1af6f42bc..d6c14a6ed 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -124,18 +124,18 @@ def get_position_of_amino_acid_crosslinker_bound_to( def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( - fasta_df: pd.DataFrame, cif_df: pd.DataFrame, crosslink: pd.Series + amino_acid_sequence_df: pd.DataFrame, cif_df: pd.DataFrame, crosslink: pd.Series ) -> float: """ Calculates the distance in Ångström between two amino acid residues connected by a cross-linker using a predicted protein structure (e.g. from AlphaFold). 
- :param fasta_df: DataFrame containing the protein sequence + :param amino_acid_sequence_df: DataFrame containing the protein sequence :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) :param crosslink: Series describing a cross-link, including cross-linker positions :return: the distance between the cross-linked amino acids in Ångström """ - protein_sequence = fasta_df.at[0, "Protein Sequence"] + protein_sequence = amino_acid_sequence_df.at[0, "Protein Sequence"] amino_acid_position_crosslinker1_is_bound_to = ( get_position_of_amino_acid_crosslinker_bound_to( protein_sequence=protein_sequence, @@ -164,6 +164,8 @@ def validate_with_angstrom_deviation( crosslinking_df: pd.DataFrame, protein_to_validate: str, crosslinker_information: dict[str, list[float]], + cif_df: pd.DataFrame, + amino_acid_sequence_df: pd.DataFrame, ) -> dict: """ Validates cross-links by comparing the cross-linker lengths with the distances between the linked @@ -177,18 +179,14 @@ def validate_with_angstrom_deviation( - length_of_: float - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float + :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) + :param amino_acid_sequence_df: DataFrame containing the protein sequence :return: dict (crosslinking_df_result, messages), crosslinking_df_result contains the relevant rows (rows of intra-crosslinks within the protein to validate) of crosslinking_df and two more colums containing the distances in AlphaFold and wheter the crosslink matches the AlphaFold data or not :raises KeyError: If a required crosslinker field is missing in crosslinker_information. :raises ValueError: If peptide sequences cannot be matched to the protein sequence. 
""" - alphafold_data = fetch_alphafold_protein_structure( - uniprot_id=protein_to_validate, persist_uploads=False - ) - cif_df = alphafold_data["cif_df"] - fasta_df = alphafold_data["sequence_df"] - all_crosslinks_df = crosslinking_df.copy() # we are only interested in intra-crosslinks of the protein we want to validate @@ -199,7 +197,7 @@ def validate_with_angstrom_deviation( def check_crosslink(crosslink: pd.Series) -> pd.Series: distance = get_distance_between_crosslinker_connected_amino_acids_in_alphafold( - fasta_df, cif_df, crosslink + amino_acid_sequence_df, cif_df, crosslink ) try: ( @@ -243,6 +241,8 @@ def bar_plot_of_valid_crosslinks( crosslinking_df: pd.DataFrame, protein_to_validate: str, crosslinker_information: dict[str, list[float]], + cif_df: pd.DataFrame, + amino_acid_sequence_df: pd.DataFrame, ) -> list[Figure]: """ Creates a bar plot summarizing the number of valid and invalid cross-links @@ -255,12 +255,18 @@ def bar_plot_of_valid_crosslinks( - length_of_: float - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float + :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) + :param amino_acid_sequence_df: DataFrame containing the protein sequence :return: List containing a single bar plot object representing counts of valid and invalid cross-links. :raises KeyError: If a required crosslinker field is missing in crosslinker_information. 
""" validated_df = validate_with_angstrom_deviation( - crosslinking_df, protein_to_validate, crosslinker_information + crosslinking_df, + protein_to_validate, + crosslinker_information, + cif_df, + amino_acid_sequence_df, )["crosslinking_result_df"] evaluated = validated_df["valid_crosslink"].dropna() diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 489fc1510..0556ba343 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -134,7 +134,7 @@ def handle_alphafold_files( cif_df = None pae_df = None plddt_df = None - sequence_df = None + amino_acid_sequence_df = None messages = [] target_dir = paths.ALPHAFOLD_PATH / uniprot @@ -196,7 +196,7 @@ def handle_alphafold_files( f.write(sequence) logger.info("Wrote FASTA sequence to %s", fasta_dest) fasta_dict = fasta_import(str(fasta_dest)) - sequence_df = fasta_dict["fasta_df"] + amino_acid_sequence_df = fasta_dict["fasta_df"] except OSError: logger.exception("Failed to write FASTA file %s", fasta_dest) except Exception: @@ -210,7 +210,7 @@ def handle_alphafold_files( "cif_df": cif_df, "pae_df": pae_df, "plddt_df": plddt_df, - "sequence_df": sequence_df, + "amino_acid_sequence_df": amino_acid_sequence_df, "messages": messages, } @@ -274,8 +274,8 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: fasta_file = fasta_files[0] try: fasta_dict = fasta_import(str(fasta_file)) - sequence_df = fasta_dict.get("fasta_df") - if sequence_df is None: + amino_acid_sequence_df = fasta_dict.get("fasta_df") + if amino_acid_sequence_df is None: msg = f"FASTA importer did not return 'fasta_df' for {fasta_file}" logger.error(msg) raise RuntimeError(msg) @@ -291,9 +291,6 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: logger.error(msg) raise FileNotFoundError(msg) - pae_df = None - plddt_df = None - try: if len(json_files) == 
1: msg = f"Only one json file found in {prot_dir} for entry '{entry_id}'. Two json files are expected" @@ -331,7 +328,7 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: "cif_df": cif_df, "pae_df": pae_df, "plddt_df": plddt_df, - "sequence_df": sequence_df, + "amino_acid_sequence_df": amino_acid_sequence_df, } if not any(df.empty for df in df_dict.values()): success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" @@ -411,7 +408,7 @@ def fetch_alphafold_protein_structure( "cif_df": alpha_dfs["cif_df"], "pae_df": alpha_dfs["pae_df"], "plddt_df": alpha_dfs["plddt_df"], - "sequence_df": alpha_dfs["sequence_df"], + "amino_acid_sequence_df": alpha_dfs["amino_acid_sequence_df"], } messages = alpha_dfs["messages"] if not any(df.empty for df in df_dict.values()): diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index cd61c0699..388c084a3 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2537,6 +2537,18 @@ def modify_form(self, form: Form, run: Run) -> None: calc_method = staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: + entry_id = inputs["protein_to_validate"] + correct_input_step_identifier = steps.get_step_identifier_of_step_with_input( + ImportStructurePredictionFromDisk, "entry_id", entry_id + ) or steps.get_step_identifier_of_step_with_input( + AlphaFoldPredictionLoad, "uniprot_id", entry_id + ) + inputs["cif_df"] = steps.get_step_output( + Step, "cif_df", correct_input_step_identifier + ) + inputs["amino_acid_sequence_df"] = steps.get_step_output( + Step, "amino_acid_sequence_df", correct_input_step_identifier + ) inputs["crosslinking_df"] = steps.get_step_output( Step, "crosslinking_df", diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 020591623..a66d569ff 100644 --- a/backend/protzilla/methods/importing.py +++ 
b/backend/protzilla/methods/importing.py @@ -421,7 +421,7 @@ class AlphaFoldPredictionLoad(ImportingStep): "cif_df", "pae_df", "plddt_df", - "sequence_df", + "amino_acid_sequence_df", ] plot_method = None @@ -483,7 +483,7 @@ class ImportStructurePredictionFromDisk(ImportingStep): "cif_df", "pae_df", "plddt_df", - "sequence_df", + "amino_acid_sequence_df", ] def create_form(self): diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 40a01cd66..18e9e303a 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -506,6 +506,30 @@ def get_instance_identifiers( ) return instance_identifiers + def get_step_identifier_of_step_with_input( + self, + step_types: type[Step] | list[type[Step]], + input_key: str, + input: str, + ) -> str | None: + """ + Get the step identifier of a step with a certain type and a specific input value for one of the input fields. + :param step_types: The types of the relevant steps + :param input_key: The key of the desired input in the input dictionary of the step + :param input: The specific value for the input_key we are looking for. 
+ :return: The step identifier of the step with the correct input + """ + + step_types = [step_types] if not isinstance(step_types, list) else step_types + for step in reversed(self.previous_calculated_steps): + if ( + any(isinstance(step, st) for st in step_types) + and input_key in step.inputs + and step.inputs[input_key] == input + ): + return step.instance_identifier + return None + def get_step_output( self, step_type: type[Step], @@ -611,7 +635,6 @@ def get_inputs_of_step_type( step_types = [step_types] if not isinstance(step_types, list) else step_types inputs = [] for step in reversed(self.previous_calculated_steps): - print(str(type(step)) + " expected one of these: " + str(step_types)) if ( any(isinstance(step, st) for st in step_types) and input_key in step.inputs diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index d5de0ca93..1078c375e 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -19,9 +19,6 @@ def test_get_position_of_amino_acid_crosslinker_bound_to(): assert pos == 3 -@patch( - "backend.protzilla.data_analysis.crosslinking_validation.fetch_alphafold_protein_structure" -) @pytest.mark.parametrize( "distance, expected", [ @@ -31,7 +28,7 @@ def test_get_position_of_amino_acid_crosslinker_bound_to(): (6.01, False), # outside bounds ], ) -def test_validate_with_angstrom_deviation(mock_fetch, distance, expected): +def test_validate_with_angstrom_deviation(distance, expected): # Fake AlphaFold Data cif_df = pd.DataFrame( { @@ -43,9 +40,7 @@ def test_validate_with_angstrom_deviation(mock_fetch, distance, expected): } ) - fasta_df = pd.DataFrame({"Protein Sequence": ["AB"]}) - - mock_fetch.return_value = {"cif_df": cif_df, "sequence_df": fasta_df} + amino_acid_sequence_df = pd.DataFrame({"Protein Sequence": ["AB"]}) # Fake Crosslink Data 
crosslinking_df = pd.DataFrame( @@ -66,6 +61,8 @@ def test_validate_with_angstrom_deviation(mock_fetch, distance, expected): crosslinking_df, protein_to_validate="P12345", crosslinker_information=crosslinker_information, + amino_acid_sequence_df=amino_acid_sequence_df, + cif_df=cif_df, ) df = result["crosslinking_result_df"] diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 8a677064f..1c196c139 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -109,7 +109,7 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): "cif_df", "pae_df", "plddt_df", - "sequence_df", + "amino_acid_sequence_df", "messages", } @@ -166,7 +166,7 @@ def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): assert isinstance(plddt_df, pd.DataFrame) assert not plddt_df.empty - seq_df = out["sequence_df"] + seq_df = out["amino_acid_sequence_df"] assert isinstance(seq_df, pd.DataFrame) assert not seq_df.empty @@ -283,10 +283,10 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): assert out["plddt_df"]["residueNumber"].tolist() == [1] assert out["plddt_df"]["confidenceScore"].tolist() == [90] - assert isinstance(out["sequence_df"], pd.DataFrame) - assert not out["sequence_df"].empty - assert out["sequence_df"]["Protein ID"].tolist() == ["Q8WP00-1"] - assert out["sequence_df"]["Protein Sequence"].tolist() == ["AAAA"] + assert isinstance(out["amino_acid_sequence_df"], pd.DataFrame) + assert not out["amino_acid_sequence_df"].empty + assert out["amino_acid_sequence_df"]["Protein ID"].tolist() == ["Q8WP00-1"] + assert out["amino_acid_sequence_df"]["Protein Sequence"].tolist() == ["AAAA"] assert any(d.get("level") == logging.INFO for d in out["messages"]) or any( "Successfully loaded" in d.get("msg", "") for d in out["messages"] From 
4d8fa542a918d7ec7f9fd3517bdcb22857d2e74b Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 7 Feb 2026 14:48:46 +0100 Subject: [PATCH 085/240] feat: add 2 histograms to crosslinking validation with angstrom deviation --- .../data_analysis/crosslinking_validation.py | 107 +++++++++++++++--- backend/protzilla/data_preprocessing/plots.py | 59 +++++++--- .../components/app/run-screen/run-screen.tsx | 1 + 3 files changed, 139 insertions(+), 28 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 1af6f42bc..0bfce078d 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,11 +1,15 @@ import pandas as pd import numpy as np from plotly.graph_objects import Figure +from scipy.ndimage import standard_deviation -from protzilla.importing.alphafold_protein_structure_load import ( +from backend.protzilla.importing.alphafold_protein_structure_load import ( fetch_alphafold_protein_structure, ) -from protzilla.data_preprocessing.plots import create_bar_plot +from backend.protzilla.data_preprocessing.plots import ( + create_histograms, + create_bar_plot, +) def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: @@ -261,21 +265,94 @@ def bar_plot_of_valid_crosslinks( """ validated_df = validate_with_angstrom_deviation( crosslinking_df, protein_to_validate, crosslinker_information - )["crosslinking_result_df"] + )[ + "crosslinking_result_df" + ] # TODO: was wenn wir einfach keine relevanten Crosslinks zurück bekommen - evaluated = validated_df["valid_crosslink"].dropna() + validated_df = validated_df.dropna(subset=["valid_crosslink"]) + + distances_valid = validated_df.loc[ + validated_df["valid_crosslink"] == True, "alphafold_distance" + ] + distances_invalid = validated_df.loc[ + validated_df["valid_crosslink"] == False, "alphafold_distance" + ] + df_valid = 
pd.DataFrame({"alphafold_distance": distances_valid}) + df_invalid = pd.DataFrame({"alphafold_distance": distances_invalid}) + + histogram = create_histograms( + dataframe_a=df_valid, + dataframe_b=df_invalid, + name_a="Valid Crosslinks", + name_b="Invalid Crosslinks", + heading=f"AlphaFold Distances for {protein_to_validate}", + x_title="Distance (Å)", + y_title="Count", + overlay=True, + visual_transformation="linear", + relevant_column_a="alphafold_distance", + relevant_column_b="alphafold_distance", + ) + + mean_predicted_lengths = validated_df["alphafold_distance"].mean() + standard_deviation_predicted_lengths = validated_df["alphafold_distance"].std() + + histogram2 = create_histograms( + dataframe_a=df_valid, + dataframe_b=df_invalid, + name_a="Valid Crosslinks", + name_b="Invalid Crosslinks", + heading=f"AlphaFold Distances for {protein_to_validate}, mean +- 2 standard deviations", + x_title="Distance (Å)", + y_title="Count", + overlay=True, + visual_transformation="linear", + relevant_column_a="alphafold_distance", + relevant_column_b="alphafold_distance", + min_value_to_plot=mean_predicted_lengths + - 2 * standard_deviation_predicted_lengths, + max_value_to_plot=mean_predicted_lengths + + 2 * standard_deviation_predicted_lengths, + vertical_lines=[ + (crosslinker_length, key) + for key, ( + crosslinker_length, + accepted_deviation_upper_bound, + accepted_deviation_lower_bound, + ) in crosslinker_information.items() + ], + vertical_lines_dashed=[ + (crosslinker_length + accepted_deviation_upper_bound, f"{key}_upper_bound") + for key, ( + crosslinker_length, + accepted_deviation_upper_bound, + accepted_deviation_lower_bound, + ) in crosslinker_information.items() + if accepted_deviation_upper_bound != 0 + ] + + [ + (crosslinker_length - accepted_deviation_lower_bound, f"{key}_lower_bound") + for key, ( + crosslinker_length, + accepted_deviation_upper_bound, + accepted_deviation_lower_bound, + ) in crosslinker_information.items() + if 
accepted_deviation_lower_bound != 0 + ], + ) + evaluated = validated_df["valid_crosslink"].dropna() valid_crosslinks = (evaluated == True).sum() invalid_crosslinks = (evaluated == False).sum() - return [ - create_bar_plot( - values_of_sectors=[ - valid_crosslinks, - invalid_crosslinks, - ], - names_of_sectors=["Valid Cross-Links", "Invalid Cross-Links"], - heading="Cross-Links used for Validation", - y_title="Number of Cross-Links", - ) - ] + bar_plot = create_bar_plot( + values_of_sectors=[ + valid_crosslinks, + invalid_crosslinks, + ], + names_of_sectors=["Valid Cross-Links", "Invalid Cross-Links"], + heading="Cross-Links used for Validation", + y_title="Number of Cross-Links", + ) + + return [histogram2, histogram, bar_plot] diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index d3f5ca900..e96f1bcdf 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -169,6 +169,12 @@ def create_histograms( x_title: str = "", visual_transformation: str = "linear", overlay: bool = False, + relevant_column_a: str = None, + relevant_column_b: str = None, + min_value_to_plot: int = None, + max_value_to_plot: int = None, + vertical_lines: list[tuple[float, str]] = None, + vertical_lines_dashed: list[tuple[float, str]] = None, ) -> Figure: """ A function to create a histogram for visualisation @@ -196,39 +202,48 @@ def create_histograms( f"""visual_transformation parameter must be "linear" or "log10" but is {visual_transformation}""" ) + if relevant_column_a is None: + relevant_column_a = default_intensity_column(dataframe_a) + if relevant_column_b is None: + relevant_column_b = default_intensity_column(dataframe_b) - intensity_name_a = default_intensity_column(dataframe_a) - intensity_name_b = default_intensity_column(dataframe_b) - - intensities_a = dataframe_a[intensity_name_a] - intensities_b = dataframe_b[intensity_name_b] + values_a = 
dataframe_a[relevant_column_a] + values_b = dataframe_b[relevant_column_b] if visual_transformation == "log10": - intensities_a = intensities_a.apply(np.log10) - intensities_b = intensities_b.apply(np.log10) + values_a = values_a.apply(np.log10) + values_b = values_b.apply(np.log10) - min_value = min(intensities_a.min(skipna=True), intensities_b.min(skipna=True)) - max_value = max(intensities_a.max(skipna=True), intensities_b.max(skipna=True)) + if min_value_to_plot is None: + min_value = min(values_a.min(skipna=True), values_b.min(skipna=True)) + else: + min_value = min_value_to_plot + if max_value_to_plot is None: + max_value = max(values_a.max(skipna=True), values_b.max(skipna=True)) + else: + max_value = max_value_to_plot number_of_bins = 100 binsize_a = ( - intensities_a.max(skipna=True) - intensities_a.min(skipna=True) + min(values_a.max(skipna=True), max_value) + - max(values_a.min(skipna=True), min_value) ) / number_of_bins binsize_b = ( - intensities_b.max(skipna=True) - intensities_b.min(skipna=True) + min(values_b.max(skipna=True), max_value) + - max(values_b.min(skipna=True), min_value) ) / number_of_bins if overlay: binsize_a = binsize_b = max(binsize_a, binsize_b) trace0 = go.Histogram( - x=intensities_a, + x=values_a, marker_color=PLOT_PRIMARY_COLOR, name=name_a, xbins=dict(start=min_value, end=max_value, size=binsize_a), ) trace1 = go.Histogram( - x=intensities_b, + x=values_b, marker_color=PLOT_SECONDARY_COLOR, name=name_b, xbins=dict(start=min_value, end=max_value, size=binsize_b), @@ -251,6 +266,24 @@ def create_histograms( if visual_transformation == "log10": fig.update_layout(xaxis=generate_tics(0, max_value, True)) + for lines, dash in [ + (vertical_lines, None), + (vertical_lines_dashed, "dash"), + ]: + if lines is None: + continue + + for position, annotation in lines: + fig.add_vline( + x=position, + line=dict(color="red", width=2, dash=dash), + annotation_text=annotation, + annotation_position="top left", + annotation_textangle=-90, + 
annotation_y=1, + annotation_yanchor="top", + ) + fig.update_layout(title={"text": f"{heading}"}) fig.update_xaxes(title=x_title) fig.update_yaxes(title=y_title, rangemode="tozero") diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index b01f182e3..d9f7ddaac 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -275,6 +275,7 @@ export const RunScreen: React.FC = () => { Date: Sat, 7 Feb 2026 15:04:30 +0100 Subject: [PATCH 086/240] refactor: remove peptide_position1 and peptide_postion2 (regarding crosslinking_df) --- backend/protzilla/importing/crosslinking_import.py | 2 -- backend/protzilla/importing/import_utils.py | 6 ------ .../tests/protzilla/importing/test_crosslinking_import.py | 7 +------ 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 06fdd4798..d2a39da49 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -633,8 +633,6 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: "Crosslinker": "string", "Peptide1": "string", "Peptide2": "string", - "Peptide_position1": "int", - "Peptide_position2": "int", "CL_position1": "int", "CL_position2": "int", "Q_value": "Float64", diff --git a/backend/protzilla/importing/import_utils.py b/backend/protzilla/importing/import_utils.py index b9da300e4..90ad1393a 100644 --- a/backend/protzilla/importing/import_utils.py +++ b/backend/protzilla/importing/import_utils.py @@ -20,8 +20,6 @@ class AggregationMethods(Enum): "Crosslink Type": "Is_intra_crosslink", "PepSeq1": "Peptide1", "PepSeq2": "Peptide2", - "PepPos1": "Peptide_position1", - "PepPos2": "Peptide_position2", "LinkPos1": "CL_position1", "LinkPos2": "CL_position2", "PEP": "Q_value", @@ -33,8 +31,6 @@ class 
AggregationMethods(Enum): "Crosslink Type": "Is_intra_crosslink", "Sequence A": "Peptide1", "Sequence B": "Peptide2", - "Position A": "Peptide_position1", - "Position B": "Peptide_position2", "Q-value": "Q_value", } @@ -47,8 +43,6 @@ class AggregationMethods(Enum): "Crosslinker", "Peptide1", "Peptide2", - "Peptide_position1", - "Peptide_position2", "CL_position1", "CL_position2", "Q_value", diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index 81d6c3bfd..e423b695f 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -138,8 +138,6 @@ def _minimal_valid_crosslinking_df(): "Crosslinker": ["DSS"], "Peptide1": ["AAA"], "Peptide2": ["BBB"], - "Peptide_position1": [1], - "Peptide_position2": [2], "CL_position1": [3], "CL_position2": [4], "Q_value": [0.01], @@ -203,10 +201,9 @@ def test_crosslinking_import_csv(tmp_path): csv_file = tmp_path / "test.csv" csv_file.write_text( "Protein1,Protein2,Peptide1,Peptide2," - "Peptide_position1,Peptide_position2," "CL_position1,CL_position2," "Crosslinker,Q_value\n" - "RAD50,MRE11,AAA,BBB,1,2,3,4,DSS,0.01\n" + "RAD50,MRE11,AAA,BBB,3,4,DSS,0.01\n" ) with patch( @@ -231,8 +228,6 @@ def test_crosslinking_import_xlsx(monkeypatch, tmp_path): "Peptide1": ["[AAA]"], "Peptide2": ["[BBB]"], "Is_intra_crosslink": ["Intra"], - "Peptide_position1": [1], - "Peptide_position2": [2], "CL_position1": [3], "CL_position2": [4], "Crosslinker": ["DSS"], From 5f61a3f16fab4fde3b39947f6d2a267cda1d7a14 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sat, 7 Feb 2026 17:28:02 +0100 Subject: [PATCH 087/240] feat: duplicate crosslink rows when peptide matches several positions within protein sequence --- .../data_analysis/crosslinking_validation.py | 140 ++++++++++-------- .../importing/crosslinking_import.py | 8 +- backend/protzilla/importing/import_utils.py | 8 +- 
.../test_crosslinking_validation.py | 12 +- .../importing/test_crosslinking_import.py | 10 +- 5 files changed, 94 insertions(+), 84 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 1af6f42bc..bb89d91cc 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,5 +1,7 @@ import pandas as pd import numpy as np +import re +import logging from plotly.graph_objects import Figure from protzilla.importing.alphafold_protein_structure_load import ( @@ -101,63 +103,67 @@ def get_distance_between_two_amino_acids_in_angstrom( return float(np.linalg.norm(pos2 - pos1)) -def get_position_of_amino_acid_crosslinker_bound_to( - protein_sequence: str, - peptide_sequence: str, - crosslinker_position_within_peptide: int, -) -> int: - """ - Determines the position of the amino acid to which the cross-linker bound. - - :param protein_sequence: full protein amino acid sequence - :param peptide_sequence: peptide sequence containing the amino acid the cross-linker bound to - :param crosslinker_position_within_peptide: 1-based position of the cross-linker within the peptide - :return: 1-based position of the amino acid residue in the protein sequence - :raises ValueError: if the peptide sequence cannot be found in the protein sequence - """ - peptide_start_position = protein_sequence.find(peptide_sequence) - if peptide_start_position == -1: - raise ValueError( - f"Peptide {peptide_sequence} was not found in protein sequence" - ) - return peptide_start_position + crosslinker_position_within_peptide - - -def get_distance_between_crosslinker_connected_amino_acids_in_alphafold( - fasta_df: pd.DataFrame, cif_df: pd.DataFrame, crosslink: pd.Series -) -> float: - """ - Calculates the distance in Ångström between two amino acid residues connected - by a cross-linker using a predicted protein structure (e.g. 
from AlphaFold). - - :param fasta_df: DataFrame containing the protein sequence - :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) - :param crosslink: Series describing a cross-link, including cross-linker positions - :return: the distance between the cross-linked amino acids in Ångström - """ - protein_sequence = fasta_df.at[0, "Protein Sequence"] - amino_acid_position_crosslinker1_is_bound_to = ( - get_position_of_amino_acid_crosslinker_bound_to( - protein_sequence=protein_sequence, - peptide_sequence=crosslink.Peptide1, - crosslinker_position_within_peptide=crosslink.CL_position1, - ) - ) - amino_acid_position_crosslinker2_is_bound_to = ( - get_position_of_amino_acid_crosslinker_bound_to( - protein_sequence=protein_sequence, - peptide_sequence=crosslink.Peptide2, - crosslinker_position_within_peptide=crosslink.CL_position2, - ) +def _add_positions_of_amino_acid_where_crosslinker_bound_to_df( + crosslinking_df: pd.DataFrame, protein_sequence: str +) -> list[dict]: + # 0-based + crosslinking_df["crosslinker_position1"] = pd.Series( + [pd.NA] * len(crosslinking_df), dtype="Int64" ) - distance_in_alphafold = get_distance_between_two_amino_acids_in_angstrom( - amino_acid_position_crosslinker1_is_bound_to, - amino_acid_position_crosslinker2_is_bound_to, - protein_sequence[amino_acid_position_crosslinker1_is_bound_to - 1], - protein_sequence[amino_acid_position_crosslinker2_is_bound_to - 1], - cif_df, + crosslinking_df["crosslinker_position2"] = pd.Series( + [pd.NA] * len(crosslinking_df), dtype="Int64" ) - return distance_in_alphafold + rows_to_duplicate = {} + rows_to_delete = [] + messages = [] + for idx, crosslinker_row in crosslinking_df.iterrows(): + peptide_sequence1 = crosslinker_row.Peptide1 + peptide_sequence2 = crosslinker_row.Peptide2 + peptide1_positions = [ + m.start() for m in re.finditer(f"(?={peptide_sequence1})", protein_sequence) + ] + peptide2_positions = [ + m.start() for m in 
re.finditer(f"(?={peptide_sequence2})", protein_sequence) + ] + all_position_combinations = [ + ( + pos1 + crosslinker_row.CL_position_within_peptide1, + pos2 + crosslinker_row.CL_position_within_peptide2, + ) + for pos1 in peptide1_positions + for pos2 in peptide2_positions + ] + if not all_position_combinations: + msg = f"At least one of the peptide sequences ({peptide_sequence1}, {peptide_sequence2}) of crosslink entry {idx} was not found in the protein sequence. The entry was deleted." + messages.append(dict(level=logging.WARNING, msg=msg)) + rows_to_delete.append(idx) + continue + crosslinking_df.at[idx, "crosslinker_position1"] = all_position_combinations[0][ + 0 + ] + crosslinking_df.loc[idx, "crosslinker_position2"] = all_position_combinations[ + 0 + ][1] + if len(all_position_combinations) > 1: + rows_to_duplicate[idx] = all_position_combinations[1:] + if not rows_to_duplicate: + return messages + for row_to_duplicate_idx, potential_positions in rows_to_duplicate.items(): + for potential_cl_position1, potential_cl_position2 in potential_positions: + new_row = crosslinking_df.loc[row_to_duplicate_idx].copy() + new_row["crosslinker_position1"] = potential_cl_position1 + new_row["crosslinker_position2"] = potential_cl_position2 + crosslinking_df = pd.concat( + [crosslinking_df, new_row.to_frame().T], ignore_index=True + ) + messages.append( + dict( + level=logging.WARNING, + msg=f"Row {row_to_duplicate_idx} was duplicated {len(potential_positions)} times due to several matches between peptide sequence and protein sequence.", + ) + ) + + return messages def validate_with_angstrom_deviation( @@ -187,7 +193,7 @@ def validate_with_angstrom_deviation( uniprot_id=protein_to_validate, persist_uploads=False ) cif_df = alphafold_data["cif_df"] - fasta_df = alphafold_data["sequence_df"] + protein_sequence = alphafold_data["sequence_df"].at[0, "Protein Sequence"] all_crosslinks_df = crosslinking_df.copy() @@ -197,9 +203,17 @@ def validate_with_angstrom_deviation( ) 
relevant_crosslinks_df = all_crosslinks_df[mask].copy() + messages = _add_positions_of_amino_acid_where_crosslinker_bound_to_df( + relevant_crosslinks_df, protein_sequence + ) + def check_crosslink(crosslink: pd.Series) -> pd.Series: - distance = get_distance_between_crosslinker_connected_amino_acids_in_alphafold( - fasta_df, cif_df, crosslink + predicted_distance = get_distance_between_two_amino_acids_in_angstrom( + amino_acid_position1=crosslink.crosslinker_position1, + amino_acid_position2=crosslink.crosslinker_position2, + amino_acid_kind1=protein_sequence[crosslink.crosslinker_position1], + amino_acid_kind2=protein_sequence[crosslink.crosslinker_position2], + cif_df=cif_df, ) try: ( @@ -221,10 +235,14 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: ) + crosslinker_length valid = ( - accepted_distance_lower_bound <= distance <= accepted_distance_upper_bound + accepted_distance_lower_bound + <= predicted_distance + <= accepted_distance_upper_bound ) - return pd.Series({"alphafold_distance": distance, "valid_crosslink": valid}) + return pd.Series( + {"alphafold_distance": predicted_distance, "valid_crosslink": valid} + ) # adding the distance in alphafold and the result of the validation to all relevant crosslinks all_crosslinks_df.loc[mask, ["alphafold_distance", "valid_crosslink"]] = ( @@ -236,7 +254,7 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: all_crosslinks_df["valid_crosslink"].notna() ] - return dict(crosslinking_result_df=checked_crosslinks_df, messages={}) + return dict(crosslinking_result_df=checked_crosslinks_df, messages=messages) def bar_plot_of_valid_crosslinks( diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index d2a39da49..3b91ee115 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -557,10 +557,10 @@ def read_ProteomeDiscoverer_XlinkX_file( 
columns=rename_columns_proteomediscoverer_xlinkx_format ) - df["CL_position1"] = df["Peptide1"].apply( + df["CL_position_within_peptide1"] = df["Peptide1"].apply( get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format ) - df["CL_position2"] = df["Peptide2"].apply( + df["CL_position_within_peptide2"] = df["Peptide2"].apply( get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format ) @@ -633,8 +633,8 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: "Crosslinker": "string", "Peptide1": "string", "Peptide2": "string", - "CL_position1": "int", - "CL_position2": "int", + "CL_position_within_peptide1": "int", + "CL_position_within_peptide2": "int", "Q_value": "Float64", } ) diff --git a/backend/protzilla/importing/import_utils.py b/backend/protzilla/importing/import_utils.py index 90ad1393a..b73cc135f 100644 --- a/backend/protzilla/importing/import_utils.py +++ b/backend/protzilla/importing/import_utils.py @@ -20,8 +20,8 @@ class AggregationMethods(Enum): "Crosslink Type": "Is_intra_crosslink", "PepSeq1": "Peptide1", "PepSeq2": "Peptide2", - "LinkPos1": "CL_position1", - "LinkPos2": "CL_position2", + "LinkPos1": "CL_position_within_peptide1", + "LinkPos2": "CL_position_within_peptide2", "PEP": "Q_value", } @@ -43,7 +43,7 @@ class AggregationMethods(Enum): "Crosslinker", "Peptide1", "Peptide2", - "CL_position1", - "CL_position2", + "CL_position_within_peptide1", + "CL_position_within_peptide2", "Q_value", ] diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index d5de0ca93..aaff9b6bd 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -5,20 +5,12 @@ from backend.protzilla.data_analysis.crosslinking_validation import ( - get_position_of_amino_acid_crosslinker_bound_to, validate_with_angstrom_deviation, 
get_distance_between_two_amino_acids_in_angstrom, ) from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation -def test_get_position_of_amino_acid_crosslinker_bound_to(): - protein = "MABCDEFGHIJK" - peptide = "ABC" - pos = get_position_of_amino_acid_crosslinker_bound_to(protein, peptide, 2) - assert pos == 3 - - @patch( "backend.protzilla.data_analysis.crosslinking_validation.fetch_alphafold_protein_structure" ) @@ -54,8 +46,8 @@ def test_validate_with_angstrom_deviation(mock_fetch, distance, expected): "Protein_id2": ["P12345"], "Peptide1": ["A"], "Peptide2": ["B"], - "CL_position1": [1], - "CL_position2": [1], + "CL_position_within_peptide1": [1], + "CL_position_within_peptide2": [1], "Crosslinker": ["DSS"], } ) diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index e423b695f..11bb10cda 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -138,8 +138,8 @@ def _minimal_valid_crosslinking_df(): "Crosslinker": ["DSS"], "Peptide1": ["AAA"], "Peptide2": ["BBB"], - "CL_position1": [3], - "CL_position2": [4], + "CL_position_within_peptide1": [3], + "CL_position_within_peptide2": [4], "Q_value": [0.01], } ) @@ -201,7 +201,7 @@ def test_crosslinking_import_csv(tmp_path): csv_file = tmp_path / "test.csv" csv_file.write_text( "Protein1,Protein2,Peptide1,Peptide2," - "CL_position1,CL_position2," + "CL_position_within_peptide1,CL_position_within_peptide2," "Crosslinker,Q_value\n" "RAD50,MRE11,AAA,BBB,3,4,DSS,0.01\n" ) @@ -228,8 +228,8 @@ def test_crosslinking_import_xlsx(monkeypatch, tmp_path): "Peptide1": ["[AAA]"], "Peptide2": ["[BBB]"], "Is_intra_crosslink": ["Intra"], - "CL_position1": [3], - "CL_position2": [4], + "CL_position_within_peptide1": [3], + "CL_position_within_peptide2": [4], "Crosslinker": ["DSS"], "Q_value": [0.01], } From 
d2f2448f141f50a6b3e46d2cf06e160189cb74b6 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 8 Feb 2026 12:34:06 +0100 Subject: [PATCH 088/240] test: add tests for duplicating crosslink rows when peptide matches several positions within protein sequence --- .../data_analysis/crosslinking_validation.py | 46 ++++-- .../test_crosslinking_validation.py | 151 +++++++++++++++++- 2 files changed, 179 insertions(+), 18 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index bb89d91cc..9ed1a96b8 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -103,10 +103,10 @@ def get_distance_between_two_amino_acids_in_angstrom( return float(np.linalg.norm(pos2 - pos1)) -def _add_positions_of_amino_acid_where_crosslinker_bound_to_df( +def add_positions_of_amino_acid_where_crosslinker_bound_to_df( crosslinking_df: pd.DataFrame, protein_sequence: str -) -> list[dict]: - # 0-based +) -> tuple[pd.DataFrame, list[dict]]: + # 1-based crosslinking_df["crosslinker_position1"] = pd.Series( [pd.NA] * len(crosslinking_df), dtype="Int64" ) @@ -127,8 +127,8 @@ def _add_positions_of_amino_acid_where_crosslinker_bound_to_df( ] all_position_combinations = [ ( - pos1 + crosslinker_row.CL_position_within_peptide1, - pos2 + crosslinker_row.CL_position_within_peptide2, + pos1 + crosslinker_row.CL_position_within_peptide1 + 1, + pos2 + crosslinker_row.CL_position_within_peptide2 + 1, ) for pos1 in peptide1_positions for pos2 in peptide2_positions @@ -146,8 +146,11 @@ def _add_positions_of_amino_acid_where_crosslinker_bound_to_df( ][1] if len(all_position_combinations) > 1: rows_to_duplicate[idx] = all_position_combinations[1:] + + crosslinking_df.drop(rows_to_delete, inplace=True) + if not rows_to_duplicate: - return messages + return crosslinking_df, messages for row_to_duplicate_idx, potential_positions in 
rows_to_duplicate.items(): for potential_cl_position1, potential_cl_position2 in potential_positions: new_row = crosslinking_df.loc[row_to_duplicate_idx].copy() @@ -163,7 +166,7 @@ def _add_positions_of_amino_acid_where_crosslinker_bound_to_df( ) ) - return messages + return crosslinking_df, messages def validate_with_angstrom_deviation( @@ -203,16 +206,18 @@ def validate_with_angstrom_deviation( ) relevant_crosslinks_df = all_crosslinks_df[mask].copy() - messages = _add_positions_of_amino_acid_where_crosslinker_bound_to_df( - relevant_crosslinks_df, protein_sequence + relevant_crosslinks_df, messages = ( + add_positions_of_amino_acid_where_crosslinker_bound_to_df( + relevant_crosslinks_df, protein_sequence + ) ) def check_crosslink(crosslink: pd.Series) -> pd.Series: predicted_distance = get_distance_between_two_amino_acids_in_angstrom( amino_acid_position1=crosslink.crosslinker_position1, amino_acid_position2=crosslink.crosslinker_position2, - amino_acid_kind1=protein_sequence[crosslink.crosslinker_position1], - amino_acid_kind2=protein_sequence[crosslink.crosslinker_position2], + amino_acid_kind1=protein_sequence[crosslink.crosslinker_position1 - 1], + amino_acid_kind2=protein_sequence[crosslink.crosslinker_position2 - 1], cif_df=cif_df, ) try: @@ -241,12 +246,23 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: ) return pd.Series( - {"alphafold_distance": predicted_distance, "valid_crosslink": valid} + { + "alphafold_distance": predicted_distance, + "valid_crosslink": valid, + "crosslinker_position1": crosslink.crosslinker_position1, + "crosslinker_position2": crosslink.crosslinker_position2, + } ) - # adding the distance in alphafold and the result of the validation to all relevant crosslinks - all_crosslinks_df.loc[mask, ["alphafold_distance", "valid_crosslink"]] = ( - relevant_crosslinks_df.apply(check_crosslink, axis=1) + # adding the distance in alphafold, the result of the validation and the crosslinker positions to all relevant crosslinks + 
new_colums = [ + "alphafold_distance", + "valid_crosslink", + "crosslinker_position1", + "crosslinker_position2", + ] + all_crosslinks_df.loc[mask, new_colums] = relevant_crosslinks_df.apply( + check_crosslink, axis=1 ) # removing all crosslinks that weren't checked from the df diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index aaff9b6bd..c583247d9 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -1,5 +1,6 @@ import pandas as pd import pytest +import logging from unittest.mock import patch from unittest.mock import MagicMock @@ -7,6 +8,7 @@ from backend.protzilla.data_analysis.crosslinking_validation import ( validate_with_angstrom_deviation, get_distance_between_two_amino_acids_in_angstrom, + add_positions_of_amino_acid_where_crosslinker_bound_to_df, ) from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation @@ -46,8 +48,8 @@ def test_validate_with_angstrom_deviation(mock_fetch, distance, expected): "Protein_id2": ["P12345"], "Peptide1": ["A"], "Peptide2": ["B"], - "CL_position_within_peptide1": [1], - "CL_position_within_peptide2": [1], + "CL_position_within_peptide1": [0], + "CL_position_within_peptide2": [0], "Crosslinker": ["DSS"], } ) @@ -65,7 +67,7 @@ def test_validate_with_angstrom_deviation(mock_fetch, distance, expected): assert "alphafold_distance" in df.columns assert "valid_crosslink" in df.columns assert df.loc[0, "alphafold_distance"] == distance - assert df.loc[0, "valid_crosslink"] is expected + assert df.loc[0, "valid_crosslink"] == expected def test_modify_form_creates_crosslinker_fields(): @@ -105,3 +107,146 @@ def test_get_distance_between_two_amino_acids_in_angstrom(): dist = get_distance_between_two_amino_acids_in_angstrom(1, 2, "A", "B", cif_df) assert dist == 5.0 + + +def 
test_add_crosslinker_positions_with_exactly_one_possible_position(): + df = pd.DataFrame( + { + "Peptide1": ["ABC"], + "Peptide2": ["DEF"], + "CL_position_within_peptide1": [1], + "CL_position_within_peptide2": [2], + } + ) + + protein_sequence = "XXABCYYYDEFZZ" + + df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( + df, protein_sequence + ) + + assert messages == [] + + assert df.loc[0, "crosslinker_position1"] == 2 + 1 + 1 # 1-based + assert df.loc[0, "crosslinker_position2"] == 8 + 2 + 1 # 1-based + + assert str(df["crosslinker_position1"].dtype) == "Int64" + assert str(df["crosslinker_position2"].dtype) == "Int64" + + +def test_add_crosslinker_positions_with_more_than_one_possible_position(): + df = pd.DataFrame( + { + "Peptide1": ["AA"], + "Peptide2": ["BB"], + "CL_position_within_peptide1": [0], + "CL_position_within_peptide2": [0], + } + ) + + protein_sequence = "AAXXAAZZBBYYBB" + + df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( + df, protein_sequence + ) + + # 2 AA matches × 2 BB matches = 4 combinations + assert len(df) == 4 + + # One warning about duplication + assert len(messages) == 1 + assert messages[0]["level"] == logging.WARNING + assert "duplicated" in messages[0]["msg"] + + # All rows should have valid positions + assert df["crosslinker_position1"].notna().all() + assert df["crosslinker_position2"].notna().all() + + +def test_add_crosslinker_positions_but_one_peptide_not_found_deletes_row(): + df = pd.DataFrame( + { + "Peptide1": ["ABC"], + "Peptide2": ["DEF"], + "CL_position_within_peptide1": [0], + "CL_position_within_peptide2": [0], + } + ) + + protein_sequence = "XXXXXXXX" + + df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( + df, protein_sequence + ) + + assert len(messages) == 1 + assert messages[0]["level"] == logging.WARNING + assert "was not found" in messages[0]["msg"] + + # row should be deleted + assert df.empty + + +def 
test_add_crosslinker_positions_with_valid_and_invalid_rows_mixed(): + df = pd.DataFrame( + { + "Peptide1": ["ABC", "XXX", "ABC"], + "Peptide2": ["DEF", "DEF", "YYY"], + "CL_position_within_peptide1": [0, 0, 0], + "CL_position_within_peptide2": [0, 0, 0], + } + ) + + protein_sequence = "ABCDEF" + + df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( + df, protein_sequence + ) + + assert len(messages) == 2 + assert messages[0]["level"] == logging.WARNING + + # First row valid + assert df.loc[0, "crosslinker_position1"] == 1 + assert df.loc[0, "crosslinker_position2"] == 4 + + # Second and third row invalid -> df should only have one row + assert len(df) == 1 + + +def test_add_crosslinker_positions_with_overlapping_peptide_matches(): + df = pd.DataFrame( + { + "Peptide1": ["AAA"], + "Peptide2": ["B"], + "CL_position_within_peptide1": [0], + "CL_position_within_peptide2": [0], + } + ) + + protein_sequence = "AAAAB" + + df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( + df, protein_sequence + ) + + # AAA -> positions 0, 1 + # B -> position 4 + # => 2 * 1 = 2 combinations + assert len(df) == 2 + + # One warning about duplication + assert len(messages) == 1 + assert messages[0]["level"] == logging.WARNING + assert "duplicated" in messages[0]["msg"] + + observed_positions = set( + zip( + df["crosslinker_position1"].astype(int), + df["crosslinker_position2"].astype(int), + ) + ) + + expected_positions = {(1, 5), (2, 5)} + + assert observed_positions == expected_positions From ad16167a2e02db46e61e9a73a48fe6f5e9179095 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 8 Feb 2026 13:17:58 +0100 Subject: [PATCH 089/240] fix: fix bug where duplicated rows would get deleted from the crosslinking validation dataframe --- .../data_analysis/crosslinking_validation.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py 
b/backend/protzilla/data_analysis/crosslinking_validation.py index 9ed1a96b8..0ff8beafc 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -151,20 +151,23 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( if not rows_to_duplicate: return crosslinking_df, messages + new_rows = [] for row_to_duplicate_idx, potential_positions in rows_to_duplicate.items(): for potential_cl_position1, potential_cl_position2 in potential_positions: new_row = crosslinking_df.loc[row_to_duplicate_idx].copy() new_row["crosslinker_position1"] = potential_cl_position1 new_row["crosslinker_position2"] = potential_cl_position2 - crosslinking_df = pd.concat( - [crosslinking_df, new_row.to_frame().T], ignore_index=True - ) + new_rows.append(new_row) messages.append( dict( level=logging.WARNING, msg=f"Row {row_to_duplicate_idx} was duplicated {len(potential_positions)} times due to several matches between peptide sequence and protein sequence.", ) ) + if new_rows: + crosslinking_df = pd.concat( + [crosslinking_df, pd.DataFrame(new_rows)], ignore_index=True + ) return crosslinking_df, messages @@ -261,13 +264,13 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: "crosslinker_position1", "crosslinker_position2", ] - all_crosslinks_df.loc[mask, new_colums] = relevant_crosslinks_df.apply( + relevant_crosslinks_df[new_colums] = relevant_crosslinks_df.apply( check_crosslink, axis=1 ) # removing all crosslinks that weren't checked from the df - checked_crosslinks_df = all_crosslinks_df[ - all_crosslinks_df["valid_crosslink"].notna() + checked_crosslinks_df = relevant_crosslinks_df[ + relevant_crosslinks_df["valid_crosslink"].notna() ] return dict(crosslinking_result_df=checked_crosslinks_df, messages=messages) From 256430214814d7f79b84412d7ae361a1b5bad4c2 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 8 Feb 2026 13:43:55 +0100 Subject: [PATCH 090/240] chore: add docstring --- 
.../data_analysis/crosslinking_validation.py | 21 +++++++++++++++++-- .../importing/test_crosslinking_import.py | 10 ++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 0ff8beafc..3e41e5ccb 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -106,7 +106,25 @@ def get_distance_between_two_amino_acids_in_angstrom( def add_positions_of_amino_acid_where_crosslinker_bound_to_df( crosslinking_df: pd.DataFrame, protein_sequence: str ) -> tuple[pd.DataFrame, list[dict]]: - # 1-based + """ + Adds for each crosslink the 1-based positions of amino acids where the crosslink bound to a crosslinking DataFrame. + If a peptide sequence occurs multiple times in the protein, the row is duplicated for each + additional combination of positions. + If a peptide sequence can't be matched the row will be deleted and a warning emitted. + + :param crosslinking_df: DataFrame containing cross-linking data with at least the following columns: + - 'Peptide1': first peptide sequence + - 'Peptide2': second peptide sequence + - 'CL_position_within_peptide1': 0-based crosslinker position within Peptide1 + - 'CL_position_within_peptide2': 0-based crosslinker position within Peptide2 + :param protein_sequence: Full protein sequence in which the peptides are located. + :return: tuple (updated_crosslinking_df, messages) + - updated_crosslinking_df: input DataFrame with two new columns: + - 'crosslinker_position1': 1-based crosslinker position in Peptide1 + - 'crosslinker_position2': 1-based crosslinker position in Peptide2 + Rows are duplicated for multiple peptide matches. 
+ - messages: list of warning dictionaries with if the peptide was not found or a row was duplicated + """ crosslinking_df["crosslinker_position1"] = pd.Series( [pd.NA] * len(crosslinking_df), dtype="Int64" ) @@ -193,7 +211,6 @@ def validate_with_angstrom_deviation( protein to validate) of crosslinking_df and two more colums containing the distances in AlphaFold and wheter the crosslink matches the AlphaFold data or not :raises KeyError: If a required crosslinker field is missing in crosslinker_information. - :raises ValueError: If peptide sequences cannot be matched to the protein sequence. """ alphafold_data = fetch_alphafold_protein_structure( uniprot_id=protein_to_validate, persist_uploads=False diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index 11bb10cda..8fbd646a5 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -138,8 +138,8 @@ def _minimal_valid_crosslinking_df(): "Crosslinker": ["DSS"], "Peptide1": ["AAA"], "Peptide2": ["BBB"], - "CL_position_within_peptide1": [3], - "CL_position_within_peptide2": [4], + "CL_position_within_peptide1": [1], + "CL_position_within_peptide2": [2], "Q_value": [0.01], } ) @@ -203,7 +203,7 @@ def test_crosslinking_import_csv(tmp_path): "Protein1,Protein2,Peptide1,Peptide2," "CL_position_within_peptide1,CL_position_within_peptide2," "Crosslinker,Q_value\n" - "RAD50,MRE11,AAA,BBB,3,4,DSS,0.01\n" + "RAD50,MRE11,AAA,BBB,1,2,DSS,0.01\n" ) with patch( @@ -228,8 +228,8 @@ def test_crosslinking_import_xlsx(monkeypatch, tmp_path): "Peptide1": ["[AAA]"], "Peptide2": ["[BBB]"], "Is_intra_crosslink": ["Intra"], - "CL_position_within_peptide1": [3], - "CL_position_within_peptide2": [4], + "CL_position_within_peptide1": [1], + "CL_position_within_peptide2": [2], "Crosslinker": ["DSS"], "Q_value": [0.01], } From 9926a6b8cdd5d304b2d47eff5fb75e0fcdb5c351 Mon 
Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 9 Feb 2026 14:55:29 +0100 Subject: [PATCH 091/240] add clarification if organism id is actually required --- backend/protzilla/methods/importing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index f0acb9d66..33c40975c 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -460,10 +460,10 @@ def create_form(self): ), TextField( name="organism_ids", - label="Organism IDs \n(please list them in the order in which they should be applied, separated by a comma)", + label="Organism IDs \n(only required when importing a CSM file)", value="", ), - InfoField(label="e.g.: 9606, 10090, 10116"), + InfoField(label="Please list them in the order in which they should be applied, separated by a comma \n e.g.: 9606, 10090, 10116"), ], ) From c11cbe7165fbcd77c17e424bb03df6ec50a5fc6b Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 9 Feb 2026 15:30:59 +0100 Subject: [PATCH 092/240] add more precise error messages for organism id validation --- .../importing/crosslinking_import.py | 57 ++++++++++--------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 653c01ca0..4d83b31cc 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -665,7 +665,7 @@ def normalize_crosslinking_df(df: pd.DataFrame) -> pd.DataFrame: def process_organism_id_from_text_field( organism_ids: str, -) -> tuple[bool, Optional[list[str]], Optional[list[str]]]: +) -> tuple[bool, Optional[list[str]], Optional[list[str]], Optional[str]]: """ Validates a comma-separated string of NCBI Taxonomy IDs. 
Returns False immediately if any ID is invalid. @@ -683,32 +683,36 @@ def process_organism_id_from_text_field( id.strip() for id in organism_ids.split(",") if id.strip() ] if not organism_ids_list: - return False, None, None + return False, None, None, "EMPTY_INPUT" organism_ids_for_request = ",".join(organism_ids_list) url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={organism_ids_for_request}&retmode=json" try: response = requests.get(url, timeout=15) if response.status_code != 200: - return False, None, None + return False, None, None, "NCBI_TAXONOMY_REQUEST_FAILED" data = response.json() + except requests.Timeout: + return False, None, None, "NCBI_TAXONOMY_TIMEOUT" except Exception: - return False, None, None + return False, None, None, "NCBI_TAXONOMY_SERVICE_UNAVAILABLE" result = data.get("result", {}) + if not result or "uids" not in result: + return False, None, None, "NCBI_TAXONOMY_RESPONSE_INVALID" valid_organism_ids = result.get("uids", []) organism_names = [] for id in organism_ids_list: if id not in valid_organism_ids: # Abort at the first invalid id - return False, id, None + return False, id, None, "ORGANISM_ID_NOT_FOUND" name = result[id].get("scientificname") if not name: - return False, id, None + return False, id, None, "MISSING_SCIENTIFIC_NAME" organism_names.append(name) - return True, organism_ids_list, organism_names + return True, organism_ids_list, organism_names, None def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: @@ -746,27 +750,34 @@ def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: return "\n".join(sorted(protein_with_error_set)) +def error_output(msg, trace: str | None = None) -> dict: + return dict( + crosslinking_df=pd.DataFrame(), + imported_rows_with_errors_df=pd.DataFrame(), + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=trace, + ) + ], + ) + + def crosslinking_import(file_path: Path, organism_ids: str) -> dict: file_type = 
file_path.suffix try: scientific_organism_names: list[str] = None if file_type == ".csv": - success, organism_ids_list, scientific_organism_names = ( + success, organism_ids_list, scientific_organism_names, error = ( process_organism_id_from_text_field(organism_ids) ) if not success: if organism_ids_list: - msg = f"Unsupported organism id: {organism_ids_list}. Please provide all valid taxonomy ids." + msg = f"Unsupported organism id: {organism_ids_list}. \nOrganism id validation failed with error: {error}. \nPlease provide all valid taxonomy ids." else: - msg = f"An error occurred while reading the organism ids. Please provide all valid taxonomy ids, separated by a comma." - return dict( - messages=[ - dict( - level=logging.ERROR, - msg=msg, - ) - ] - ) + msg = f"An error occurred while reading the organism ids: {error}. \nPlease provide all valid taxonomy ids, separated by a comma." + return error_output(msg) good_df, failed_df = read_csm_file(file_path, organism_ids_list) elif file_type == ".xlsx": good_df, failed_df = read_ProteomeDiscoverer_XlinkX_file(file_path) @@ -774,15 +785,7 @@ def crosslinking_import(file_path: Path, organism_ids: str) -> dict: raise ValueError(f"Unsupported file type: {file_path.suffix}") except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid cross linking file." 
- return dict( - messages=[ - dict( - level=logging.ERROR, - msg=msg, - trace=format_trace(traceback.format_exception(e)), - ) - ] - ) + return error_output(msg, trace=format_trace(traceback.format_exception(e))) def base_message(): if file_type == ".csv": From 1a44548ed6a828c55305494019978b1d49b324a2 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 9 Feb 2026 15:32:11 +0100 Subject: [PATCH 093/240] format backend code with black --- backend/protzilla/importing/crosslinking_import.py | 2 +- backend/protzilla/methods/importing.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 4d83b31cc..2a3c1ae7c 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -712,7 +712,7 @@ def process_organism_id_from_text_field( return False, id, None, "MISSING_SCIENTIFIC_NAME" organism_names.append(name) - return True, organism_ids_list, organism_names, None + return True, organism_ids_list, organism_names, None def aggregate_failed_proteins_for_display(failed_df: pd.DataFrame) -> str: diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 33c40975c..dfb298a25 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -463,7 +463,9 @@ def create_form(self): label="Organism IDs \n(only required when importing a CSM file)", value="", ), - InfoField(label="Please list them in the order in which they should be applied, separated by a comma \n e.g.: 9606, 10090, 10116"), + InfoField( + label="Please list them in the order in which they should be applied, separated by a comma \n e.g.: 9606, 10090, 10116" + ), ], ) From 30758d2c7b122852028a9551169bb59a814d4511 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> 
Date: Mon, 9 Feb 2026 15:45:30 +0100 Subject: [PATCH 094/240] fix tests --- backend/tests/protzilla/importing/test_crosslinking_import.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index e0cc5c431..f9e14390b 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -191,7 +191,7 @@ def mock_lookup(ids): "9606": {"scientificname": "Homo sapiens"}, "10090": {"scientificname": "Mus musculus"}, }, - (True, ["9606", "10090"], ["Homo sapiens", "Mus musculus"]), + (True, ["9606", "10090"], ["Homo sapiens", "Mus musculus"], None), ), ( "9606,9999", @@ -199,7 +199,7 @@ def mock_lookup(ids): "uids": ["9606"], "9606": {"scientificname": "Homo sapiens"}, }, - (False, "9999", None), + (False, "9999", None, "ORGANISM_ID_NOT_FOUND"), ), ], ) From 15ce74da9d01fae580d0000dd1fc00a71cb08a3a Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 9 Feb 2026 18:06:01 +0100 Subject: [PATCH 095/240] style: fix typos, rename variable, improve error message wording --- .../data_analysis/crosslinking_validation.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 3e41e5ccb..8954b1eac 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -20,7 +20,7 @@ def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: :return: the atom identifier of the reactive atom as a string """ # right now we always return the central C atom - # later we might want to return the reactive atom of the amino acid residue of the specific amino acid kind + # later we might want to return the reactive atom of the amino 
acid residue of the specific amino acid type # as soon as we change this, we will need to change the test test_validate_with_angstrom_deviation return "CA" @@ -70,8 +70,8 @@ def get_coordinates_of_atom_crosslinker_bound_to( def get_distance_between_two_amino_acids_in_angstrom( amino_acid_position1: int, amino_acid_position2: int, - amino_acid_kind1: str, - amino_acid_kind2: str, + amino_acid_type1: str, + amino_acid_type2: str, cif_df: pd.DataFrame, ) -> float: """ @@ -80,22 +80,22 @@ def get_distance_between_two_amino_acids_in_angstrom( :param amino_acid_position1: 1-based position of the first amino acid residue :param amino_acid_position2: 1-based position of the second amino acid residue - :param amino_acid_kind1: amino acid type at the first position - :param amino_acid_kind2: amino acid type at the second position + :param amino_acid_type1: amino acid type at the first position + :param amino_acid_type2: amino acid type at the second position :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) :return: the distance between the two residues in Ångström """ pos1 = np.array( get_coordinates_of_atom_crosslinker_bound_to( - amino_acid_position1, amino_acid_kind1, cif_df + amino_acid_position1, amino_acid_type1, cif_df ), dtype=float, ) pos2 = np.array( get_coordinates_of_atom_crosslinker_bound_to( - amino_acid_position2, amino_acid_kind2, cif_df + amino_acid_position2, amino_acid_type2, cif_df ), dtype=float, ) @@ -152,7 +152,10 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( for pos2 in peptide2_positions ] if not all_position_combinations: - msg = f"At least one of the peptide sequences ({peptide_sequence1}, {peptide_sequence2}) of crosslink entry {idx} was not found in the protein sequence. The entry was deleted." 
+ if not peptide1_positions and not peptide2_positions: + msg = f"Peptide sequences {peptide_sequence1} and {peptide_sequence2} of crosslink entry {idx} were not found in the protein sequence. The entry was deleted." + else: + msg = f"Peptide sequence {peptide_sequence1 if not peptide1_positions else peptide_sequence2} of crosslink entry {idx} was not found in the protein sequence. The entry was deleted." messages.append(dict(level=logging.WARNING, msg=msg)) rows_to_delete.append(idx) continue @@ -208,7 +211,7 @@ def validate_with_angstrom_deviation( - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float :return: dict (crosslinking_df_result, messages), crosslinking_df_result contains the relevant rows (rows of intra-crosslinks within the - protein to validate) of crosslinking_df and two more colums containing the distances in AlphaFold and wheter the crosslink matches the + protein to validate) of crosslinking_df and two more columns containing the distances in AlphaFold and whether the crosslink matches the AlphaFold data or not :raises KeyError: If a required crosslinker field is missing in crosslinker_information. 
""" @@ -236,8 +239,8 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: predicted_distance = get_distance_between_two_amino_acids_in_angstrom( amino_acid_position1=crosslink.crosslinker_position1, amino_acid_position2=crosslink.crosslinker_position2, - amino_acid_kind1=protein_sequence[crosslink.crosslinker_position1 - 1], - amino_acid_kind2=protein_sequence[crosslink.crosslinker_position2 - 1], + amino_acid_type1=protein_sequence[crosslink.crosslinker_position1 - 1], + amino_acid_type2=protein_sequence[crosslink.crosslinker_position2 - 1], cif_df=cif_df, ) try: @@ -275,13 +278,13 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: ) # adding the distance in alphafold, the result of the validation and the crosslinker positions to all relevant crosslinks - new_colums = [ + new_columns = [ "alphafold_distance", "valid_crosslink", "crosslinker_position1", "crosslinker_position2", ] - relevant_crosslinks_df[new_colums] = relevant_crosslinks_df.apply( + relevant_crosslinks_df[new_columns] = relevant_crosslinks_df.apply( check_crosslink, axis=1 ) From a8c3f331c6a9a4ed6b722e095b5438b621e0eccc Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 10 Feb 2026 20:43:06 +0100 Subject: [PATCH 096/240] fix: suggestion form review to check if file suffix is lower case --- backend/protzilla/importing/crosslinking_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 2a3c1ae7c..98864b3cf 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -765,7 +765,7 @@ def error_output(msg, trace: str | None = None) -> dict: def crosslinking_import(file_path: Path, organism_ids: str) -> dict: - file_type = file_path.suffix + file_type = file_path.suffix.lower() try: scientific_organism_names: list[str] = None if file_type == 
".csv": From d38f86017e2def791f45b1c4f1a04dd34dd6e0e0 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 11 Feb 2026 12:09:55 +0100 Subject: [PATCH 097/240] Resolve merge conflict --- backend/main/views.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/backend/main/views.py b/backend/main/views.py index 9c17a3806..de31f8402 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -48,13 +48,8 @@ database_metadata_path = EXTERNAL_DATA_PATH / "internal" / "metadata" / "uniprot.json" -dataframes = [ - "protein_df", - "metadata_df", - "peptide_df", - "modification_df", -] - +# Labels of outputs not sent via the output tables API +hidden_outputs = ["messages"] @ensure_csrf_cookie def get_csrf_token(request): From 9f913a696723b352d3c1ac20eff6791de39bfbfa Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 11 Feb 2026 12:11:16 +0100 Subject: [PATCH 098/240] Add newline --- backend/main/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/main/views.py b/backend/main/views.py index de31f8402..37fc5f88c 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -51,6 +51,7 @@ # Labels of outputs not sent via the output tables API hidden_outputs = ["messages"] + @ensure_csrf_cookie def get_csrf_token(request): csrf_token = get_token(request) From 9a4a60044a09368ffa9eebef64d218d369ff1b82 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 12 Feb 2026 14:30:20 +0100 Subject: [PATCH 099/240] feat: one histogram for each crosslinker --- .../data_analysis/crosslinking_validation.py | 134 ++++++++---------- backend/protzilla/data_preprocessing/plots.py | 21 +-- 2 files changed, 75 insertions(+), 80 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 0bfce078d..1531e3da1 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,5 +1,6 @@ 
import pandas as pd import numpy as np +from isort.wrap_modes import vertical from plotly.graph_objects import Figure from scipy.ndimage import standard_deviation @@ -267,79 +268,69 @@ def bar_plot_of_valid_crosslinks( crosslinking_df, protein_to_validate, crosslinker_information )[ "crosslinking_result_df" - ] # TODO: was wenn wir einfach keine relevanten Crosslinks zurück bekommen - + ] + figures = [] validated_df = validated_df.dropna(subset=["valid_crosslink"]) + for crosslinker, crosslinker_df in validated_df.groupby("Crosslinker"): + distances_valid = crosslinker_df.loc[ + crosslinker_df["valid_crosslink"] == True, "alphafold_distance" + ] + distances_invalid = crosslinker_df.loc[ + crosslinker_df["valid_crosslink"] == False, "alphafold_distance" + ] + df_valid = pd.DataFrame({"alphafold_distance": distances_valid}) + df_invalid = pd.DataFrame({"alphafold_distance": distances_invalid}) + + histogram = create_histograms( + dataframe_a=df_valid, + dataframe_b=df_invalid, + name_a="Valid Crosslinks", + name_b="Invalid Crosslinks", + heading=f"Predicted distances for protein {protein_to_validate} with crosslinker {crosslinker}", + x_title="Distance (Å)", + y_title="Count", + overlay=True, + visual_transformation="linear", + relevant_column_a="alphafold_distance", + relevant_column_b="alphafold_distance", + one_bin_per_int=True + ) - distances_valid = validated_df.loc[ - validated_df["valid_crosslink"] == True, "alphafold_distance" - ] - distances_invalid = validated_df.loc[ - validated_df["valid_crosslink"] == False, "alphafold_distance" - ] - df_valid = pd.DataFrame({"alphafold_distance": distances_valid}) - df_invalid = pd.DataFrame({"alphafold_distance": distances_invalid}) - - histogram = create_histograms( - dataframe_a=df_valid, - dataframe_b=df_invalid, - name_a="Valid Crosslinks", - name_b="Invalid Crosslinks", - heading=f"AlphaFold Distances for {protein_to_validate}", - x_title="Distance (Å)", - y_title="Count", - overlay=True, - 
visual_transformation="linear", - relevant_column_a="alphafold_distance", - relevant_column_b="alphafold_distance", - ) + mean_predicted_lengths = crosslinker_df["alphafold_distance"].mean() + standard_deviation_predicted_lengths = crosslinker_df["alphafold_distance"].std() + ( + crosslinker_length, + accepted_deviation_upper_bound, + accepted_deviation_lower_bound, + ) = crosslinker_information[crosslinker] + dashed_lines = [] + if accepted_deviation_upper_bound != 0: + dashed_lines.append((crosslinker_length + accepted_deviation_upper_bound, f"allowed_deviation_upper_bound")) + if accepted_deviation_lower_bound != 0: + dashed_lines.append((crosslinker_length - accepted_deviation_lower_bound, f"allowed_deviation_lower_bound")) + histogram_two_standard_deviations = create_histograms( + dataframe_a=df_valid, + dataframe_b=df_invalid, + name_a="Valid Crosslinks", + name_b="Invalid Crosslinks", + heading=f"Predicted distances for protein {protein_to_validate} with crosslinker {crosslinker}, mean +- 2 standard deviations", + x_title="Distance (Å)", + y_title="Count", + overlay=True, + visual_transformation="linear", + relevant_column_a="alphafold_distance", + relevant_column_b="alphafold_distance", + min_value=max(0, mean_predicted_lengths + - 2 * standard_deviation_predicted_lengths), + max_value=mean_predicted_lengths + + 2 * standard_deviation_predicted_lengths, + vertical_lines=[(crosslinker_length, f"{crosslinker}")], + vertical_lines_dashed= dashed_lines if dashed_lines else None, + one_bin_per_int=True + ) - mean_predicted_lengths = validated_df["alphafold_distance"].mean() - standard_deviation_predicted_lengths = validated_df["alphafold_distance"].std() - - histogram2 = create_histograms( - dataframe_a=df_valid, - dataframe_b=df_invalid, - name_a="Valid Crosslinks", - name_b="Invalid Crosslinks", - heading=f"AlphaFold Distances for {protein_to_validate}, mean +- 2 standard deviations", - x_title="Distance (Å)", - y_title="Count", - overlay=True, - 
visual_transformation="linear", - relevant_column_a="alphafold_distance", - relevant_column_b="alphafold_distance", - min_value_to_plot=mean_predicted_lengths - - 2 * standard_deviation_predicted_lengths, - max_value_to_plot=mean_predicted_lengths - + 2 * standard_deviation_predicted_lengths, - vertical_lines=[ - (crosslinker_length, key) - for key, ( - crosslinker_length, - accepted_deviation_upper_bound, - accepted_deviation_lower_bound, - ) in crosslinker_information.items() - ], - vertical_lines_dashed=[ - (crosslinker_length + accepted_deviation_upper_bound, f"{key}_upper_bound") - for key, ( - crosslinker_length, - accepted_deviation_upper_bound, - accepted_deviation_lower_bound, - ) in crosslinker_information.items() - if accepted_deviation_upper_bound != 0 - ] - + [ - (crosslinker_length - accepted_deviation_lower_bound, f"{key}_lower_bound") - for key, ( - crosslinker_length, - accepted_deviation_upper_bound, - accepted_deviation_lower_bound, - ) in crosslinker_information.items() - if accepted_deviation_lower_bound != 0 - ], - ) + figures.append(histogram_two_standard_deviations) + figures.append(histogram) evaluated = validated_df["valid_crosslink"].dropna() valid_crosslinks = (evaluated == True).sum() @@ -354,5 +345,6 @@ def bar_plot_of_valid_crosslinks( heading="Cross-Links used for Validation", y_title="Number of Cross-Links", ) + figures.append(bar_plot) - return [histogram2, histogram, bar_plot] + return figures diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index e96f1bcdf..149b3dec5 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -1,3 +1,5 @@ +import math + import numpy as np import pandas as pd import plotly.express as px @@ -171,10 +173,11 @@ def create_histograms( overlay: bool = False, relevant_column_a: str = None, relevant_column_b: str = None, - min_value_to_plot: int = None, - max_value_to_plot: int = None, + 
min_value: float = None, + max_value: float = None, vertical_lines: list[tuple[float, str]] = None, vertical_lines_dashed: list[tuple[float, str]] = None, + one_bin_per_int = False ) -> Figure: """ A function to create a histogram for visualisation @@ -214,16 +217,16 @@ def create_histograms( values_a = values_a.apply(np.log10) values_b = values_b.apply(np.log10) - if min_value_to_plot is None: + if min_value is None: min_value = min(values_a.min(skipna=True), values_b.min(skipna=True)) - else: - min_value = min_value_to_plot - if max_value_to_plot is None: + if max_value is None: max_value = max(values_a.max(skipna=True), values_b.max(skipna=True)) - else: - max_value = max_value_to_plot - number_of_bins = 100 + if one_bin_per_int: + min_value = math.floor(min_value) + max_value = math.ceil(max_value) + + number_of_bins = max_value-min_value if one_bin_per_int else 100 binsize_a = ( min(values_a.max(skipna=True), max_value) - max(values_a.min(skipna=True), min_value) From 33b51dbca2e5e2fdc1af87d02f22723f3b909643 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 12 Feb 2026 15:19:57 +0100 Subject: [PATCH 100/240] fix: broken tests and address code review feedback --- .../data_analysis/crosslinking_validation.py | 40 +++++++++---------- .../test_crosslinking_validation.py | 3 -- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 0bccfd7f7..f944a10bd 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,3 +1,5 @@ +import itertools + import pandas as pd import numpy as np import re @@ -104,7 +106,7 @@ def get_distance_between_two_amino_acids_in_angstrom( def add_positions_of_amino_acid_where_crosslinker_bound_to_df( - crosslinking_df: pd.DataFrame, protein_sequence: str + input_crosslinking_df: pd.DataFrame, protein_sequence: str ) -> 
tuple[pd.DataFrame, list[dict]]: """ Adds for each crosslink the 1-based positions of amino acids where the crosslink bound to a crosslinking DataFrame. @@ -112,7 +114,7 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( additional combination of positions. If a peptide sequence can't be matched the row will be deleted and a warning emitted. - :param crosslinking_df: DataFrame containing cross-linking data with at least the following columns: + :param input_crosslinking_df: DataFrame containing cross-linking data with at least the following columns: - 'Peptide1': first peptide sequence - 'Peptide2': second peptide sequence - 'CL_position_within_peptide1': 0-based crosslinker position within Peptide1 @@ -125,12 +127,9 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( Rows are duplicated for multiple peptide matches. - messages: list of warning dictionaries with if the peptide was not found or a row was duplicated """ - crosslinking_df["crosslinker_position1"] = pd.Series( - [pd.NA] * len(crosslinking_df), dtype="Int64" - ) - crosslinking_df["crosslinker_position2"] = pd.Series( - [pd.NA] * len(crosslinking_df), dtype="Int64" - ) + crosslinking_df = input_crosslinking_df.copy() + crosslinking_df["crosslinker_position1"] = pd.Series(dtype="Int64") + crosslinking_df["crosslinker_position2"] = pd.Series(dtype="Int64") rows_to_duplicate = {} rows_to_delete = [] messages = [] @@ -138,20 +137,17 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( peptide_sequence1 = crosslinker_row.Peptide1 peptide_sequence2 = crosslinker_row.Peptide2 peptide1_positions = [ - m.start() for m in re.finditer(f"(?={peptide_sequence1})", protein_sequence) + m.start() + crosslinker_row.CL_position_within_peptide1 + 1 + for m in re.finditer(f"(?={peptide_sequence1})", protein_sequence) ] peptide2_positions = [ - m.start() for m in re.finditer(f"(?={peptide_sequence2})", protein_sequence) + m.start() + crosslinker_row.CL_position_within_peptide2 + 1 + 
for m in re.finditer(f"(?={peptide_sequence2})", protein_sequence) ] - all_position_combinations = [ - ( - pos1 + crosslinker_row.CL_position_within_peptide1 + 1, - pos2 + crosslinker_row.CL_position_within_peptide2 + 1, - ) - for pos1 in peptide1_positions - for pos2 in peptide2_positions - ] - if not all_position_combinations: + all_position_combinations = list( + itertools.product(peptide1_positions, peptide2_positions) + ) + if all_position_combinations == [()]: if not peptide1_positions and not peptide2_positions: msg = f"Peptide sequences {peptide_sequence1} and {peptide_sequence2} of crosslink entry {idx} were not found in the protein sequence. The entry was deleted." else: @@ -162,9 +158,9 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( crosslinking_df.at[idx, "crosslinker_position1"] = all_position_combinations[0][ 0 ] - crosslinking_df.loc[idx, "crosslinker_position2"] = all_position_combinations[ - 0 - ][1] + crosslinking_df.at[idx, "crosslinker_position2"] = all_position_combinations[0][ + 1 + ] if len(all_position_combinations) > 1: rows_to_duplicate[idx] = all_position_combinations[1:] diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 157c4159d..2c8c553bb 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -13,9 +13,6 @@ from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation -@patch( - "backend.protzilla.data_analysis.crosslinking_validation.fetch_alphafold_protein_structure" -) @pytest.mark.parametrize( "distance, expected", [ From 0f56dde5ccfbd1661f3f9f0a5124a946f5e2a2a0 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 12 Feb 2026 15:32:56 +0100 Subject: [PATCH 101/240] fix: fix broken tests --- backend/protzilla/data_analysis/crosslinking_validation.py | 2 +- 
.../protzilla/data_analysis/test_crosslinking_validation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index f944a10bd..7d6a2b0fb 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -147,7 +147,7 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( all_position_combinations = list( itertools.product(peptide1_positions, peptide2_positions) ) - if all_position_combinations == [()]: + if not all_position_combinations: if not peptide1_positions and not peptide2_positions: msg = f"Peptide sequences {peptide_sequence1} and {peptide_sequence2} of crosslink entry {idx} were not found in the protein sequence. The entry was deleted." else: diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 2c8c553bb..4fdbf63eb 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -178,7 +178,7 @@ def test_add_crosslinker_positions_but_one_peptide_not_found_deletes_row(): assert len(messages) == 1 assert messages[0]["level"] == logging.WARNING - assert "was not found" in messages[0]["msg"] + assert "not found" in messages[0]["msg"] # row should be deleted assert df.empty From c5be8cff8ed8228f7c3776835d4af6e1936653d4 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 12 Feb 2026 15:54:48 +0100 Subject: [PATCH 102/240] feat: add upload of multimer structure predictions and refactor of monomer upload --- backend/main/views_helper.py | 38 -- backend/main/views_settings.py | 52 ++- backend/protzilla/all_steps.py | 1 + backend/protzilla/constants/paths.py | 7 +- .../alphafold_protein_structure_load.py | 400 +++++++++++----- 
backend/protzilla/methods/importing.py | 72 ++- backend/protzilla/utilities/utilities.py | 40 +- .../test_alphafold_protein_structure_load.py | 428 ++++++++++++++++-- .../protein-structure-upload.tsx | 18 +- 9 files changed, 820 insertions(+), 236 deletions(-) diff --git a/backend/main/views_helper.py b/backend/main/views_helper.py index b761b7af4..72755d2b0 100644 --- a/backend/main/views_helper.py +++ b/backend/main/views_helper.py @@ -1,11 +1,9 @@ import re -import shutil from pathlib import Path import numpy as np from backend.protzilla.constants.paths import SETTINGS_PATH -from backend.protzilla.constants.protzilla_logging import logger from backend.protzilla.disk_operator import YamlOperator from backend.protzilla.steps import StepManager, Step from backend.protzilla.utilities import name_to_title @@ -186,39 +184,3 @@ def load_yaml_from_file(path: Path) -> str: raise FileNotFoundError(f"File {path} does not exist.") with path.open("r") as f: return f.read() - - -def copy_file_to_directory(source_file: Path, dest_dir: Path) -> tuple[bool, str]: - """ - Copy a single file to a destination directory. - Creates the destination directory if it doesn't exist. 
- - :param source_file: Path to the source file - :param dest_dir: Path to the destination directory - :return: Tuple of (success: bool, message: str) - """ - - if not source_file.exists(): - message = f"Source file does not exist: {source_file}" - logger.error(message) - return False, message - - if not source_file.is_file(): - message = f"Source path is not a file: {source_file}" - logger.error(message) - return False, message - - try: - dest_dir.mkdir(parents=True, exist_ok=True) - dest_file = dest_dir / source_file.name - - shutil.copy2(source_file, dest_file) - - message = f"Successfully copied file {source_file} to {dest_dir}" - logger.info(message) - return True, message - - except OSError as e: - message = f"Failed to copy file: {str(e)}" - logger.error(message) - return False, message diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index dddc0eb52..6485b58b6 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -3,7 +3,6 @@ import shutil from datetime import date, datetime, timezone from io import BytesIO -from pathlib import Path import pandas @@ -17,9 +16,14 @@ from backend.main.views_helper import ( sanitize_name, load_settings_from_file, - copy_file_to_directory, ) -from backend.protzilla.constants.paths import EXTERNAL_DATA_PATH, SETTINGS_PATH +from backend.protzilla.utilities.utilities import copy_file_to_directory +from backend.protzilla.constants.paths import ( + EXTERNAL_DATA_PATH, + SETTINGS_PATH, + AF_MONOMER_METADATA_CSV_PATH, + ALPHAFOLD_MONOMER_PATH, +) from backend.protzilla.data_integration.database_query import ( uniprot_columns, uniprot_databases, @@ -229,16 +233,14 @@ def save_ptm_settings(request, default_file_stem: str = DEFAULT_PTM_SETTINGS_FIL # <--- Protein Structure Predictions ---> -AF_DICT_PATH = EXTERNAL_DATA_PATH / "alphafold" - def get_metadata_df(csv_file_path: str) -> pandas.DataFrame: expected_columns = [ - "entryID", - "uniprotAccession", - "modelCreatedDate", + 
"entry_id", + "uniprot_accession", + "model_created_date", "gene", - "alphafold_version", + "model_used", ] if csv_file_path.exists(): df = pandas.read_csv(csv_file_path, usecols=lambda c: c in expected_columns) @@ -248,17 +250,17 @@ def get_metadata_df(csv_file_path: str) -> pandas.DataFrame: def get_prot_structure(request): - metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" + metadata_csv = AF_MONOMER_METADATA_CSV_PATH df = get_metadata_df(metadata_csv) df_infos = df.rename( columns={ - "entryID": "entry_id", - "uniprotAccession": "uniprot_id", - "modelCreatedDate": "date_modified", + "entry_id": "entry_id", + "uniprot_accession": "uniprot_id", + "model_created_date": "date_modified", "gene": "gene", - "alphafold_version": "af_version", + "model_used": "model_used", } ).to_dict(orient="records") @@ -270,7 +272,7 @@ def upload_prot_structure(request): data = json.loads(request.body) uniprot_id = data.get("uniprot_id") entry_id = data.get("entry_id") - af_version = data.get("af_version") + model_used = data.get("model_used") gene = data.get("gene") cif_file = data.get("cif_file") confidence = data.get("confidence") @@ -279,7 +281,7 @@ def upload_prot_structure(request): # Copy files to source directory out of temp directory - af_path = AF_DICT_PATH / entry_id.upper() + af_path = ALPHAFOLD_MONOMER_PATH / entry_id.upper() if af_path.exists(): return JsonResponse( {"success": False, "message": "Entry ID is not unique."}, status=405 @@ -297,8 +299,8 @@ def upload_prot_structure(request): ) # add row to metadata csv - AF_DICT_PATH.mkdir(parents=True, exist_ok=True) - metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" + ALPHAFOLD_MONOMER_PATH.mkdir(parents=True, exist_ok=True) + metadata_csv = AF_MONOMER_METADATA_CSV_PATH df = get_metadata_df(metadata_csv) @@ -306,11 +308,11 @@ def upload_prot_structure(request): formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") new_row = { - "entryID": entry_id, - "uniprotAccession": uniprot_id, - "modelCreatedDate": 
formatted, + "entry_id": entry_id, + "uniprot_accession": uniprot_id, + "model_created_date": formatted, "gene": gene, - "alphafold_version": af_version, + "model_used": model_used, } df = pandas.concat([df, pandas.DataFrame([new_row])], ignore_index=True) @@ -347,8 +349,8 @@ def delete_prot_structure(request): ) # delete folder with files for the protein structure - target_dir = AF_DICT_PATH / entry_id.upper() - metadata_csv = AF_DICT_PATH / "alphafold_metadata.csv" + target_dir = ALPHAFOLD_MONOMER_PATH / entry_id.upper() + metadata_csv = AF_MONOMER_METADATA_CSV_PATH if not target_dir.exists() or not target_dir.is_dir(): return JsonResponse( @@ -373,7 +375,7 @@ def delete_prot_structure(request): try: df = pandas.read_csv(metadata_csv, dtype=str) df = df[ - df["entryID"].fillna("").str.strip().str.upper() != entry_id.upper() + df["entry_id"].fillna("").str.strip().str.upper() != entry_id.upper() ] df.to_csv(metadata_csv, index=False) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 8bb5d4d75..843d2844d 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -17,6 +17,7 @@ importing.AlphaFoldPredictionLoad, importing.CrosslinkingImport, importing.ImportStructurePredictionFromDisk, + importing.UploadMultimerPredictions, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, data_preprocessing.FilterByProteinsCount, diff --git a/backend/protzilla/constants/paths.py b/backend/protzilla/constants/paths.py index bf1ae2694..dade4c939 100644 --- a/backend/protzilla/constants/paths.py +++ b/backend/protzilla/constants/paths.py @@ -12,7 +12,12 @@ EXTERNAL_DATA_PATH = USER_DATA_PATH / "external_data" UPLOAD_PATH = BACKEND_PATH / "uploads" ALPHAFOLD_PATH = EXTERNAL_DATA_PATH / "alphafold" -AF_METADATA_CSV_PATH = ALPHAFOLD_PATH / "alphafold_metadata.csv" +ALPHAFOLD_MULTIMER_PATH = ALPHAFOLD_PATH / "multimer" +ALPHAFOLD_MONOMER_PATH = ALPHAFOLD_PATH / "monomer" 
+AF_MONOMER_METADATA_CSV_PATH = ALPHAFOLD_MONOMER_PATH / "alphafold_monomer_metadata.csv" +AF_MULTIMER_METADATA_CSV_PATH = ( + ALPHAFOLD_MULTIMER_PATH / "alphafold_multimer_metadata.csv" +) CUSTOM_PLOT_SETTINGS_FILE_STEM = "plots" DEFAULT_PLOT_SETTINGS_FILE_STEM = "plots_default" diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 0556ba343..79b9ed632 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -6,7 +6,9 @@ from textwrap import wrap from typing import Any import logging +import json +from datetime import datetime, timezone import gemmi import pandas as pd import requests @@ -15,30 +17,50 @@ from backend.protzilla.constants.protzilla_logging import logger from backend.protzilla.importing.fasta_import import fasta_import from backend.protzilla.networking import download_file_from_url +from backend.protzilla.utilities.utilities import copy_file_to_directory -def get_metadata_df() -> pd.DataFrame: +def get_monomer_metadata_df() -> pd.DataFrame: """ - Returns all data from alphafold_metadata.csv in form of a dataframe. If no such csv exist, it returns + Returns all data from alphafold_monomer_metadata.csv in form of a dataframe. If no such csv exist, it returns a dataframe with the corresponding keys but no values and creates a csv with the expected column names. """ - metadata_csv = paths.AF_METADATA_CSV_PATH - + metadata_csv = paths.AF_MONOMER_METADATA_CSV_PATH if not metadata_csv.exists(): - msg = f"AlphaFold metadata CSV not found: {metadata_csv}. Returning an empty Dataframe." 
- logger.error(msg) metadata_df = pd.DataFrame( columns=[ - "entryID", - "uniprotAccession", - "modelCreatedDate", + "entry_id", + "uniprot_accession", + "model_created_date", "gene", - "alphafold_version", + "model_used", ] ) + metadata_csv.parent.mkdir(parents=True, exist_ok=True) metadata_df.to_csv(metadata_csv, index=False) return metadata_df + return pd.read_csv(metadata_csv, dtype=str) + + +def get_multimer_metadata_df() -> pd.DataFrame: + """ + Returns all data from alphafold_multimer_metadata.csv in form of a dataframe. If no such csv exist, it returns + a dataframe with the corresponding keys but no values and creates a csv with the expected column names. + """ + metadata_csv = paths.AF_MULTIMER_METADATA_CSV_PATH + if not metadata_csv.exists(): + metadata_df = pd.DataFrame( + columns=[ + "entry_id", + "protein_ids", + "model_created_date", + "model_used", + ] + ) + metadata_csv.parent.mkdir(parents=True, exist_ok=True) + metadata_df.to_csv(metadata_csv, index=False) + return metadata_df return pd.read_csv(metadata_csv, dtype=str) @@ -107,13 +129,68 @@ def read_alphafold_mmcif(path: str) -> pd.DataFrame: return pd.DataFrame(data) +def get_correct_af_directories( + entry_id: str, directory_name: Path, persist_upload: bool +) -> list[Path, Path]: + target_dir = directory_name / entry_id.upper() + temp_dir = None + + if persist_upload: + target_dir.mkdir(parents=True, exist_ok=True) + work_dir = target_dir + else: + temp_dir = Path(tempfile.mkdtemp()) + work_dir = temp_dir + + return temp_dir, work_dir + + +def extend_metadata_csv( + entry_id: str, + metadata_csv: Path, + exsisting_metadata_df: pd.DataFrame, + metadata_df: pd.DataFrame, + messages: list, +) -> None: + try: + mask = exsisting_metadata_df["entry_id"] == entry_id + if mask.any(): + msg = f'Existing entry with Entry ID "{entry_id}" was overwritten.' 
+ logger.warning(msg) + messages.append(dict(level=logging.WARNING, msg=msg)) + filtered_exsisting_metadata_df = exsisting_metadata_df[~mask] + combined = pd.concat( + [filtered_exsisting_metadata_df, metadata_df], ignore_index=True + ) + combined.to_csv(metadata_csv, index=False) + else: + metadata_df.to_csv(metadata_csv, index=False) + except Exception: + msg = f'Failed to write AlphaFold metadata CSV to "{metadata_csv}".' + logger.exception(msg) + messages.append(dict(level=logging.ERROR, msg=msg)) + + +def get_amino_acid_sequence_df( + entry_id: str, work_dir: Path, fasta_dest: Path, messages: list +) -> pd.DataFrame: + try: + fasta_dict = fasta_import(str(fasta_dest)) + amino_acid_sequence_df = fasta_dict["fasta_df"] + except Exception: + msg = "Failed to create sequence dataframe" + logger.exception(msg) + messages.append(dict(level=logging.ERROR, msg=msg)) + return amino_acid_sequence_df + + def handle_alphafold_files( files_urls: dict[str, Any], uniprot: str, seq: str, metadata_df: pd.DataFrame, entry_id: str, - persist_uploads: bool = False, + persist_upload: bool = False, ) -> dict[str, pd.DataFrame | None]: """ Download AlphaFold structure files and convert them to DataFrames. 
@@ -127,7 +204,7 @@ def handle_alphafold_files( :param seq: The protein sequence :param metadata_df: DataFrame containing AlphaFold metadata :param entry_id: The entry_id (in the case of fetching from AF DB the same as uniprot id) (used for directory naming) - :param persist_uploads: If True, files are saved persistently; if False, only loaded into memory + :param persist_upload: If True, files are saved persistently; if False, only loaded into memory :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, sequence data or None values for failed loads and messages such as warnings """ @@ -136,39 +213,25 @@ def handle_alphafold_files( plddt_df = None amino_acid_sequence_df = None messages = [] - - target_dir = paths.ALPHAFOLD_PATH / uniprot downloaded: dict[str, str] = {} - temp_dir = None - - if persist_uploads: - target_dir.mkdir(parents=True, exist_ok=True) - work_dir = target_dir - else: - temp_dir = Path(tempfile.mkdtemp()) - work_dir = temp_dir + temp_dir, work_dir = get_correct_af_directories( + entry_id=uniprot, + directory_name=paths.ALPHAFOLD_MONOMER_PATH, + persist_upload=persist_upload, + ) try: - if persist_uploads: - paths.ALPHAFOLD_PATH.mkdir(parents=True, exist_ok=True) - existing = get_metadata_df() - try: - metadata_csv = paths.AF_METADATA_CSV_PATH - mask = existing["entryID"] == entry_id - if mask.any(): - msg = f'Existing entry with EntryID "{entry_id}" was overwritten.' 
- logger.warning(msg) - messages.append(dict(level=logging.WARNING, msg=msg)) - existing = existing[~mask] - - combined = pd.concat([existing, metadata_df], ignore_index=True) - combined.to_csv(metadata_csv, index=False) - logger.info("Wrote AlphaFold metadata to %s", metadata_csv) - except Exception: - logger.exception( - "Failed to write AlphaFold metadata CSV to %s", metadata_csv - ) + if persist_upload: + paths.ALPHAFOLD_MONOMER_PATH.mkdir(parents=True, exist_ok=True) + existing_metadata_df = get_monomer_metadata_df() + extend_metadata_csv( + entry_id=uniprot, + metadata_csv=paths.AF_MONOMER_METADATA_CSV_PATH, + exsisting_metadata_df=existing_metadata_df, + metadata_df=metadata_df, + messages=messages, + ) for key in ("cifUrl", "paeDocUrl", "plddtDocUrl"): urlval = files_urls.get(key) @@ -186,21 +249,25 @@ def handle_alphafold_files( elif key == "plddtDocUrl": plddt_df = pd.read_json(saved) except Exception: - logger.exception("Failed to load %s into dataframe", key) - - sequence = to_fasta(seq=seq, header=uniprot) - fasta_dest = work_dir / f"{uniprot.upper()}.fasta" + msg = f'Failed to load "{key}" into dataframe' + logger.exception(msg) + messages.append(dict(level=logging.ERROR, msg=msg)) try: + sequence = to_fasta(seq=seq, header=uniprot) + fasta_dest = work_dir / f"{entry_id.upper()}.fasta" fasta_dest.parent.mkdir(parents=True, exist_ok=True) with open(fasta_dest, "w") as f: f.write(sequence) - logger.info("Wrote FASTA sequence to %s", fasta_dest) - fasta_dict = fasta_import(str(fasta_dest)) - amino_acid_sequence_df = fasta_dict["fasta_df"] except OSError: - logger.exception("Failed to write FASTA file %s", fasta_dest) - except Exception: - logger.exception("Failed to create sequence dataframe") + msg = f'Failed to write FASTA file "{fasta_dest}"' + logger.exception(msg) + messages.append(dict(level=logging.ERROR, msg=msg)) + amino_acid_sequence_df = get_amino_acid_sequence_df( + entry_id=uniprot, + work_dir=work_dir, + fasta_dest=fasta_dest, + 
messages=messages, + ) finally: if temp_dir is not None: @@ -215,30 +282,114 @@ def handle_alphafold_files( } -def get_all_available_entry_ids() -> list[str]: +def fetch_alphafold_protein_structure( + uniprot_id: str, persist_upload: bool +) -> dict[str, Any]: + """ + Fetch AlphaFold protein structure data from the AlphaFold Database API. + + Retrieves metadata and structure files (CIF, PAE, pLDDT) from the AlphaFold Database + for the given UniProt ID. Optionally persists the downloaded files to disk. + + :param uniprot_id: The UniProt ID of the protein + :param persist_upload: If True, files are saved persistently; if False, only loaded into memory + :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data + :raises RuntimeError: If the API request fails or returns invalid data + :raises ValueError: If no predictions are found for the given UniProt ID + """ + url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}" + with requests.Session() as session: + try: + resp = session.get(url, timeout=30) + resp.raise_for_status() + records = resp.json() + except requests.RequestException as e: + raise RuntimeError(f"AlphaFold request failed for {uniprot_id}: {e}") from e + except ValueError as e: + raise RuntimeError( + f"AlphaFold returned non-JSON for {uniprot_id}: {e}" + ) from e + + if not isinstance(records, list) or not records: + raise ValueError(f"No AlphaFold DB predictions for {uniprot_id}") + + r = records[0] + if not isinstance(r, dict): + raise RuntimeError(f"Unexpected AlphaFold payload for {uniprot_id}") + + data: dict[str, Any] = { + "entry_id": r.get("uniprotAccession"), + "uniprot_accession": r.get("uniprotAccession"), + "model_created_date": r.get("modelCreatedDate"), + "gene": r.get("gene"), + "model_used": r.get("toolUsed"), + } + + seq_tmp = r.get("sequence") + + files_urls: dict[str, Any] = {} + + for key in ("cifUrl", "paeDocUrl", "plddtDocUrl"): + if isinstance(r.get(key), str) and r.get(key): + 
files_urls[key] = r[key] + + metadata_df = pd.DataFrame([data]) + + alpha_dfs = handle_alphafold_files( + files_urls=files_urls, + uniprot=uniprot_id, + seq=seq_tmp, + metadata_df=metadata_df, + entry_id=uniprot_id, + persist_upload=persist_upload, + ) + df_dict = { + "metadata_df": metadata_df, + "cif_df": alpha_dfs["cif_df"], + "pae_df": alpha_dfs["pae_df"], + "plddt_df": alpha_dfs["plddt_df"], + "amino_acid_sequence_df": alpha_dfs["amino_acid_sequence_df"], + } + messages = alpha_dfs["messages"] + if not any(df.empty for df in df_dict.values()): + success_msg = f"Successfully loaded AlphaFold data for protein with Protein ID '{uniprot_id}'" + logger.info(success_msg) + messages.append(dict(level=logging.INFO, msg=success_msg)) + else: + message = ( + f"Could not load AlphaFold data for protein with Protein ID '{uniprot_id}'" + ) + logger.warning(message) + messages.append(dict(level=logging.WARNING, msg=message)) + df_dict["messages"] = messages + return df_dict + + +def get_all_available_entry_ids_of_monomer_metadata() -> list[str]: """ " Get the entry ids of all the protein structure predictions that can be found on disk. """ - df = get_metadata_df() - return df["entryID"].tolist() + messages = [] + df = get_monomer_metadata_df() + return df["entry_id"].tolist() def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: """ Writes data from disk of a specific entry ID into dataframes. 
- :param entry_id: entryID of the uploaded protein structure + :param entry_id: entry_id of the uploaded protein structure :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data """ messages: list[dict[str, str | int]] = [] - all_metadata_df = get_metadata_df() - metadata_df = all_metadata_df[all_metadata_df["entryID"] == entry_id] + all_metadata_df = get_monomer_metadata_df() + metadata_df = all_metadata_df[all_metadata_df["entry_id"] == entry_id] if metadata_df.empty: - msg = f"No metadata for entryID '{entry_id}' in {paths.AF_METADATA_CSV_PATH}" + msg = f"No metadata for Entry ID '{entry_id}' in {paths.AF_MONOMER_METADATA_CSV_PATH}" logger.error(msg) raise ValueError(msg) - prot_dir = paths.ALPHAFOLD_PATH / entry_id.upper() + prot_dir = paths.ALPHAFOLD_MONOMER_PATH / entry_id.upper() if not prot_dir.exists() or not prot_dir.is_dir(): msg = f"AlphaFold data directory not found for entry '{entry_id}': {prot_dir}" logger.error(msg) @@ -342,83 +493,88 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: return df_dict -def fetch_alphafold_protein_structure( - uniprot_id: str, persist_uploads: bool +def upload_multimer_prediction( + entry_id: str, + protein_ids: list[str], + model_used: str, + amino_acid_sequences: Path, + cif_file: Path, + confidence_file: Path, + full_data_file: Path, + persist_upload: bool, ) -> dict[str, Any]: - """ - Fetch AlphaFold protein structure data from the AlphaFold Database API. - Retrieves metadata and structure files (CIF, PAE, pLDDT) from the AlphaFold Database - for the given UniProt ID. Optionally persists the downloaded files to disk. 
- - :param uniprot_id: The UniProt ID of the protein - :param persist_uploads: If True, files are saved persistently; if False, only loaded into memory - :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data - :raises RuntimeError: If the API request fails or returns invalid data - :raises ValueError: If no predictions are found for the given UniProt ID - """ - url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}" - with requests.Session() as session: - try: - resp = session.get(url, timeout=30) - resp.raise_for_status() - records = resp.json() - except requests.RequestException as e: - raise RuntimeError(f"AlphaFold request failed for {uniprot_id}: {e}") from e - except ValueError as e: - raise RuntimeError( - f"AlphaFold returned non-JSON for {uniprot_id}: {e}" - ) from e - - if not isinstance(records, list) or not records: - raise ValueError(f"No AlphaFold DB predictions for {uniprot_id}") - - r = records[0] - if not isinstance(r, dict): - raise RuntimeError(f"Unexpected AlphaFold payload for {uniprot_id}") - - data: dict[str, Any] = { - "entryID": r.get("uniprotAccession"), - "uniprotAccession": r.get("uniprotAccession"), - "modelCreatedDate": r.get("modelCreatedDate"), - "gene": r.get("gene"), - "alphafold_version": r.get("toolUsed"), - } + messages = [] - seq_tmp = r.get("sequence") + temp_dir, work_dir = get_correct_af_directories( + entry_id=entry_id, + directory_name=paths.ALPHAFOLD_MONOMER_PATH, + persist_upload=persist_upload, + ) - files_urls: dict[str, Any] = {} + now_utc = datetime.now(timezone.utc) + formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") - for key in ("cifUrl", "paeDocUrl", "plddtDocUrl"): - if isinstance(r.get(key), str) and r.get(key): - files_urls[key] = r[key] + data: dict[str, Any] = { + "entry_id": entry_id, + "protein_ids": protein_ids, + "model_created_date": formatted, + "model_used": model_used, + } - metadata_df = pd.DataFrame([data]) + metadata_df = pd.DataFrame([data]) + 
exsisting_metadata_df = get_multimer_metadata_df() + extend_metadata_csv( + entry_id=entry_id, + metadata_csv=paths.AF_MULTIMER_METADATA_CSV_PATH, + exsisting_metadata_df=exsisting_metadata_df, + metadata_df=metadata_df, + messages=messages, + ) + + upload_dir = paths.ALPHAFOLD_MULTIMER_PATH / entry_id.upper() + if not upload_dir.exists(): + upload_dir.mkdir(parents=True, exist_ok=True) + + if persist_upload: + for file_name in [ + amino_acid_sequences, + cif_file, + confidence_file, + full_data_file, + ]: + success, msg = copy_file_to_directory(file_name, upload_dir) + if not success: + logger.error(msg) + messages.append(dict(level=logging.ERROR, msg=msg)) + + fasta_dict = fasta_import(str(amino_acid_sequences)) + amino_acid_sequence_df = fasta_dict["fasta_df"] + + confidence_df = pd.read_json(confidence_file) + + # full_data json has arrays of unequal lengths so we need to normalize + with open(full_data_file, "r") as f: + full_data = json.load(f) + if isinstance(full_data, dict): + full_data_df = pd.json_normalize(full_data) + + cif_df = read_alphafold_mmcif(cif_file) - alpha_dfs = handle_alphafold_files( - files_urls=files_urls, - uniprot=uniprot_id, - seq=seq_tmp, - metadata_df=metadata_df, - entry_id=uniprot_id, - persist_uploads=persist_uploads, - ) df_dict = { "metadata_df": metadata_df, - "cif_df": alpha_dfs["cif_df"], - "pae_df": alpha_dfs["pae_df"], - "plddt_df": alpha_dfs["plddt_df"], - "amino_acid_sequence_df": alpha_dfs["amino_acid_sequence_df"], + "cif_df": cif_df, + "confidence_df": confidence_df, + "full_data_df": full_data_df, + "amino_acid_sequences_df": amino_acid_sequence_df, } - messages = alpha_dfs["messages"] + if not any(df.empty for df in df_dict.values()): - success_msg = f"Successfully loaded AlphaFold data for protein with Protein ID '{uniprot_id}'" + success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" logger.info(success_msg) messages.append(dict(level=logging.INFO, msg=success_msg)) else: - message = ( - 
f"Could not load AlphaFold data for protein with Protein ID '{uniprot_id}'" - ) + message = f"Could not load AlphaFold data for entry '{entry_id}'" logger.warning(message) messages.append(dict(level=logging.WARNING, msg=message)) df_dict["messages"] = messages diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 07e7957b1..991d499fa 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -14,8 +14,9 @@ ) from backend.protzilla.importing.alphafold_protein_structure_load import ( fetch_alphafold_protein_structure, - get_all_available_entry_ids, + get_all_available_entry_ids_of_monomer_metadata, get_prot_structure_dfs, + upload_multimer_prediction, ) from backend.protzilla.importing.peptide_import import peptide_import, evidence_import from backend.protzilla.steps import Step, StepManager @@ -440,7 +441,7 @@ def create_form(self): label="Protein ID", ), CheckboxField( - name="persist_uploads", + name="persist_upload", label="Upload should be saved persistently across runs", value=True, ), @@ -502,9 +503,74 @@ def create_form(self): DropdownField( name="entry_id", label="Entry ID of the prediction to be loaded into the run. 
(Unless specified otherwise this is the Protein ID)", - options=form_helper.to_choices(get_all_available_entry_ids()), + options=form_helper.to_choices( + get_all_available_entry_ids_of_monomer_metadata() + ), ) ], ) calc_method = staticmethod(get_prot_structure_dfs) + + +class UploadMultimerPredictions(ImportingStep): + display_name = "Multimer Structure Prediction Upload" + operation = "Protein Structure Import" + method_description = "Upload a multimer protein prediction" + + output_keys = [ + "metadata_df", + "cif_df", + "confidence_df", + "full_data_df", + "amino_acid_sequences_df", + ] + + def create_form(self): + return Form( + label="Multimer Structure Prediction Upload", + input_fields=[ + TextField( + name="entry_id", + label="Entry ID of the prediction to be loaded into the run.", + ), + TextField( + name="protein_ids", + label="Protein IDs of all proteins used in the sequence.", + ), + InfoField( + label="Please provide a list of Protein IDs separated by a comma \n e.g.: P68871, P69905, Q5VSL9" + ), + TextField( + name="model_used", + label="The AlphaFold Model used to predict the structure.", + ), + FileInput( + name="amino_acid_sequences", + label="Amino acid sequences of proteins in the prediction (required)", + value=None, + ), + FileInput( + name="cif_file", + label="CIF file (required)", + value=None, + ), + FileInput( + name="confidence_file", + label="Confidence summary json file (required)", + value=None, + ), + FileInput( + name="full_data_file", + label="Full data json file (required)", + value=None, + ), + CheckboxField( + name="persist_upload", + label="Upload should be saved persistently across runs", + value=True, + ), + ], + ) + + calc_method = staticmethod(upload_multimer_prediction) diff --git a/backend/protzilla/utilities/utilities.py b/backend/protzilla/utilities/utilities.py index 62cf3dae4..2d4dced18 100644 --- a/backend/protzilla/utilities/utilities.py +++ b/backend/protzilla/utilities/utilities.py @@ -3,6 +3,7 @@ import operator 
import os import re +import shutil from itertools import groupby from pathlib import Path from random import choices @@ -11,7 +12,8 @@ import pandas as pd import psutil -from protzilla.constants.intensity_types import IntensityType, IntensityNameType +from backend.protzilla.constants.intensity_types import IntensityType, IntensityNameType +from backend.protzilla.constants.protzilla_logging import logger # recipie from https://docs.python.org/3/library/itertools.html @@ -140,3 +142,39 @@ def get_file_name_from_upload_path(upload_path: str) -> str: base_name = file_name_randomized.split("_")[0] file_extension = file_name_randomized.split(".")[-1] return f"{base_name}.{file_extension}" + + +def copy_file_to_directory(source_file: Path, dest_dir: Path) -> tuple[bool, str]: + """ + Copy a single file to a destination directory. + Creates the destination directory if it doesn't exist. + + :param source_file: Path to the source file + :param dest_dir: Path to the destination directory + :return: Tuple of (success: bool, message: str) + """ + + if not source_file.exists(): + message = f"Source file does not exist: {source_file}" + logger.error(message) + return False, message + + if not source_file.is_file(): + message = f"Source path is not a file: {source_file}" + logger.error(message) + return False, message + + try: + dest_dir.mkdir(parents=True, exist_ok=True) + dest_file = dest_dir / source_file.name + + shutil.copy2(source_file, dest_file) + + message = f"Successfully copied file {source_file} to {dest_dir}" + logger.info(message) + return True, message + + except OSError as e: + message = f"Failed to copy file: {str(e)}" + logger.error(message) + return False, message diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 1c196c139..436f796ce 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ 
b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -2,14 +2,24 @@ import pytest import json import logging +import shutil +from pathlib import Path +import tempfile from backend.protzilla.importing.alphafold_protein_structure_load import ( fetch_alphafold_protein_structure, to_fasta, read_alphafold_mmcif, - get_all_available_entry_ids, + get_all_available_entry_ids_of_monomer_metadata, get_prot_structure_dfs, + get_monomer_metadata_df, + get_multimer_metadata_df, + get_correct_af_directories, + extend_metadata_csv, + get_amino_acid_sequence_df, + handle_alphafold_files, + upload_multimer_prediction, ) from backend.protzilla.constants import paths @@ -97,13 +107,13 @@ def test_read_alphafold_mmcif_valid_atom_site(tmp_path): def test_fetch_alphafold_protein_structure_wrong_uniprot_id(): with pytest.raises(RuntimeError, match="AlphaFold request failed for NOPROTEIN"): - fetch_alphafold_protein_structure(uniprot_id="NOPROTEIN", persist_uploads=True) + fetch_alphafold_protein_structure(uniprot_id="NOPROTEIN", persist_upload=True) def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - out = fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) assert out.keys() == { "metadata_df", "cif_df", @@ -114,24 +124,21 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): } -def test_fetch_alphafold_metadata(tmp_path, monkeypatch): +def test_fetch_alphafold_monomer_metadata(tmp_path, monkeypatch): monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - out = fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) assert isinstance(out["metadata_df"], pd.DataFrame) assert not out["metadata_df"].empty - assert out["metadata_df"].iloc[0]["uniprotAccession"] == "Q8WP00" - assert 
out["metadata_df"].iloc[0]["modelCreatedDate"] == "2025-08-01T00:00:00Z" + assert out["metadata_df"].iloc[0]["uniprot_accession"] == "Q8WP00" + assert out["metadata_df"].iloc[0]["model_created_date"] == "2025-08-01T00:00:00Z" assert out["metadata_df"].iloc[0]["gene"] == "PRM1" - assert ( - out["metadata_df"].iloc[0]["alphafold_version"] - == "AlphaFold Monomer v2.0 pipeline" - ) + assert out["metadata_df"].iloc[0]["model_used"] == "AlphaFold Monomer v2.0 pipeline" def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "ALPHAFOLD_PATH", tmp_path) - fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) + fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) target_dir = tmp_path / "Q8WP00" assert target_dir.exists() @@ -151,7 +158,7 @@ def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) - out = fetch_alphafold_protein_structure("Q8WP00", persist_uploads=True) + out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) cif_df = out["cif_df"] assert isinstance(cif_df, pd.DataFrame) @@ -172,57 +179,57 @@ def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): def test_get_all_available_entry_ids_empty(tmp_path, monkeypatch): - metadata_csv = tmp_path / "alphafold_metadata.csv" - monkeypatch.setattr(paths, "AF_METADATA_CSV_PATH", metadata_csv) + metadata_csv = tmp_path / "alphafold_monomer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", metadata_csv) - assert get_all_available_entry_ids() == [] + assert get_all_available_entry_ids_of_monomer_metadata() == [] assert metadata_csv.exists() df = pd.read_csv(metadata_csv, dtype=str) assert list(df.columns) == [ - "entryID", - "uniprotAccession", - "modelCreatedDate", + "entry_id", + "uniprot_accession", + "model_created_date", "gene", - 
"alphafold_version", + "model_used", ] assert len(df) == 0 def test_get_all_available_entry_ids_nonempty(tmp_path, monkeypatch): - metadata_csv = tmp_path / "alphafold_metadata.csv" - monkeypatch.setattr(paths, "AF_METADATA_CSV_PATH", metadata_csv) - df = pd.DataFrame([{"entryID": "Q8WP00", "uniprotAccession": "Q8WP00"}]) + metadata_csv = tmp_path / "alphafold_monomer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", metadata_csv) + df = pd.DataFrame([{"entry_id": "Q8WP00", "uniprot_accession": "Q8WP00"}]) df.to_csv(metadata_csv, index=False) - assert get_all_available_entry_ids() == ["Q8WP00"] + assert get_all_available_entry_ids_of_monomer_metadata() == ["Q8WP00"] def test_get_prot_structure_dfs_no_entry(tmp_path, monkeypatch): - metadata_csv = tmp_path / "alphafold_metadata.csv" - monkeypatch.setattr(paths, "AF_METADATA_CSV_PATH", metadata_csv) - pd.DataFrame([{"entryID": "OTHER", "uniprotAccession": "OTHER"}]).to_csv( + metadata_csv = tmp_path / "alphafold_monomer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", metadata_csv) + pd.DataFrame([{"entry_id": "OTHER", "uniprot_accession": "OTHER"}]).to_csv( metadata_csv, index=False ) - with pytest.raises(ValueError, match=r"No metadata for entryID 'Q8WP00'"): + with pytest.raises(ValueError, match=r"No metadata for Entry ID 'Q8WP00'"): get_prot_structure_dfs("Q8WP00") def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "ALPHAFOLD_PATH", tmp_path) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) tmp_path.mkdir(parents=True, exist_ok=True) - metadata_csv = tmp_path / "alphafold_metadata.csv" - monkeypatch.setattr(paths, "AF_METADATA_CSV_PATH", metadata_csv) + metadata_csv = tmp_path / "alphafold_monomer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", metadata_csv) metadata = pd.DataFrame( [ { - "entryID": "Q8WP00", - "uniprotAccession": "Q8WP00", - "modelCreatedDate": 
"2025-08-01T00:00:00Z", + "entry_id": "Q8WP00", + "uniprot_accession": "Q8WP00", + "model_created_date": "2025-08-01T00:00:00Z", "gene": "PRM1", - "alphafold_version": "AlphaFold Monomer v2.0 pipeline", + "model_used": "AlphaFold Monomer v2.0 pipeline", } ] ) @@ -261,7 +268,7 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): assert isinstance(out["metadata_df"], pd.DataFrame) assert not out["metadata_df"].empty - assert out["metadata_df"].iloc[0]["entryID"] == "Q8WP00" + assert out["metadata_df"].iloc[0]["entry_id"] == "Q8WP00" assert isinstance(out["cif_df"], pd.DataFrame) assert not out["cif_df"].empty @@ -291,3 +298,350 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): assert any(d.get("level") == logging.INFO for d in out["messages"]) or any( "Successfully loaded" in d.get("msg", "") for d in out["messages"] ) + + +def test_get_monomer_and_multimer_metadata_df_create(tmp_path, monkeypatch): + mon_csv = tmp_path / "alphafold_monomer_metadata.csv" + multi_csv = tmp_path / "alphafold_multimer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", mon_csv) + monkeypatch.setattr(paths, "AF_MULTIMER_METADATA_CSV_PATH", multi_csv) + + mon_df = get_monomer_metadata_df() + assert isinstance(mon_df, pd.DataFrame) + assert list(mon_df.columns) == [ + "entry_id", + "uniprot_accession", + "model_created_date", + "gene", + "model_used", + ] + assert mon_csv.exists() + + multi_df = get_multimer_metadata_df() + assert isinstance(multi_df, pd.DataFrame) + assert list(multi_df.columns) == [ + "entry_id", + "protein_ids", + "model_created_date", + "model_used", + ] + assert multi_csv.exists() + + +def test_get_correct_af_directories_persist_and_temp(tmp_path): + # persist_upload True + temp, work = get_correct_af_directories("abc", tmp_path, True) + assert temp is None + assert work == tmp_path / "ABC" + assert work.exists() + + # persist_upload False -> temporary directory created + temp2, work2 = 
get_correct_af_directories("xyz", tmp_path, False) + assert temp2 is not None + assert Path(work2).exists() + # cleanup + shutil.rmtree(temp2, ignore_errors=True) + + +def test_extend_metadata_csv_overwrite_and_new(tmp_path): + csv_path = tmp_path / "meta.csv" + existing = pd.DataFrame([{"entry_id": "A", "x": "1"}, {"entry_id": "B", "x": "2"}]) + existing.to_csv(csv_path, index=False) + + messages = [] + new_md = pd.DataFrame([{"entry_id": "A", "x": "9"}]) + extend_metadata_csv("A", csv_path, existing, new_md, messages) + out = pd.read_csv(csv_path, dtype=str) + # entry A should be the updated one, B preserved + assert set(out["entry_id"].tolist()) == {"A", "B"} + assert out[out["entry_id"] == "A"]["x"].iloc[0] == "9" + + # when not present, should write only the provided metadata_df + csv2 = tmp_path / "meta2.csv" + messages2 = [] + extend_metadata_csv( + "C", + csv2, + pd.DataFrame(columns=["entry_id"]), + pd.DataFrame([{"entry_id": "C", "y": "7"}]), + messages2, + ) + out2 = pd.read_csv(csv2, dtype=str) + assert out2.iloc[0]["entry_id"] == "C" + + +def test_get_amino_acid_sequence_df_and_handle_files(tmp_path, monkeypatch): + # create a fasta and call get_amino_acid_sequence_df directly + fasta = tmp_path / "P.fasta" + fasta.write_text(">alpha|P\nTESTSEQ\n") + messages = [] + seq_df = get_amino_acid_sequence_df("P", tmp_path, fasta, messages) + assert isinstance(seq_df, pd.DataFrame) + assert not seq_df.empty + + # test handle_alphafold_files with no remote files (should still create fasta) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) + metadata_df = pd.DataFrame([{"entry_id": "P", "uniprot_accession": "P"}]) + out = handle_alphafold_files( + {}, "P", "TESTSEQ", metadata_df, "P", persist_upload=False + ) + assert "amino_acid_sequence_df" in out + assert out["cif_df"] is None + assert out["pae_df"] is None + assert out["plddt_df"] is None + assert isinstance(out["amino_acid_sequence_df"], pd.DataFrame) + + +def 
test_upload_multimer_prediction_basic(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) + monkeypatch.setattr(paths, "ALPHAFOLD_MULTIMER_PATH", tmp_path) + + # prepare files + fasta = tmp_path / "seqs.fasta" + fasta.write_text(">alpha|X\nAAAA\n") + cif = tmp_path / "m.cif" + cif.write_text( + """ +data_test +loop_ +_atom_site.id +_atom_site.type_symbol +N N +""" + ) + conf = tmp_path / "conf.json" + conf.write_text('[{"residueNumber":1, "confidenceScore":99}]') + full = tmp_path / "full.json" + full.write_text('{"a": [1,2]}') + + # monkeypatch copy to actually copy files + def _copy(src, dest_dir): + target = Path(dest_dir) / Path(src).name + shutil.copy(src, target) + return True, "" + + monkeypatch.setattr( + "backend.protzilla.importing.alphafold_protein_structure_load.copy_file_to_directory", + _copy, + ) + + out = upload_multimer_prediction( + entry_id="M1", + protein_ids=["X"], + model_used="m", + amino_acid_sequences=fasta, + cif_file=cif, + confidence_file=conf, + full_data_file=full, + persist_upload=True, + ) + + assert isinstance(out["metadata_df"], pd.DataFrame) + # check metadata contents + mdf = out["metadata_df"] + assert mdf.iloc[0]["entry_id"] == "M1" + assert mdf.iloc[0]["protein_ids"] == ["X"] + assert mdf.iloc[0]["model_used"] == "m" + + # cif contents + cif_df = out["cif_df"] + assert isinstance(cif_df, pd.DataFrame) + assert list(cif_df.columns) == ["_atom_site.id", "_atom_site.type_symbol"] + assert cif_df["_atom_site.id"].tolist() == ["N"] + assert cif_df["_atom_site.type_symbol"].tolist() == ["N"] + + # confidence JSON + conf_df = out["confidence_df"] + assert isinstance(conf_df, pd.DataFrame) + assert conf_df["residueNumber"].tolist() == [1] + assert conf_df["confidenceScore"].tolist() == [99] + + # full data normalization + full_df = out["full_data_df"] + assert isinstance(full_df, pd.DataFrame) + assert full_df.iloc[0]["a"] == [1, 2] + + # sequences + seqs = out["amino_acid_sequences_df"] + assert 
isinstance(seqs, pd.DataFrame) + assert seqs["Protein Sequence"].tolist() == ["AAAA"] + assert any(str(v).startswith("X") for v in seqs["Protein ID"].tolist()) + + upload_dir = tmp_path / "M1" + assert upload_dir.exists() + assert any(upload_dir.glob("*.fasta")) or any(upload_dir.glob("*.fa")) + assert any(upload_dir.glob("*.json")) + assert any(upload_dir.glob("*.cif")) + + +# Additional comprehensive tests for error cases and edge cases + + +def test_get_monomer_metadata_df_existing_csv(tmp_path, monkeypatch): + """Test reading existing monomer metadata CSV""" + csv_path = tmp_path / "alphafold_monomer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", csv_path) + + # create and write existing CSV + existing_data = pd.DataFrame( + [ + { + "entry_id": "P1", + "uniprot_accession": "P1", + "model_created_date": "2025-01-01", + "gene": "G1", + "model_used": "m1", + } + ] + ) + existing_data.to_csv(csv_path, index=False) + + # read it back + df = get_monomer_metadata_df() + assert len(df) == 1 + assert df.iloc[0]["entry_id"] == "P1" + assert df.iloc[0]["gene"] == "G1" + + +def test_get_multimer_metadata_df_existing_csv(tmp_path, monkeypatch): + """Test reading existing multimer metadata CSV""" + csv_path = tmp_path / "alphafold_multimer_metadata.csv" + monkeypatch.setattr(paths, "AF_MULTIMER_METADATA_CSV_PATH", csv_path) + + existing_data = pd.DataFrame( + [ + { + "entry_id": "M1", + "protein_ids": "P1,P2", + "model_created_date": "2025-01-01", + "model_used": "m1", + } + ] + ) + existing_data.to_csv(csv_path, index=False) + + df = get_multimer_metadata_df() + assert len(df) == 1 + assert df.iloc[0]["entry_id"] == "M1" + + +def test_to_fasta_empty_sequence(): + """Test to_fasta with empty sequence""" + with pytest.raises( + ValueError, match="Sequence must be a single, whitespace-free string" + ): + to_fasta("") + + +def test_to_fasta_lowercase_conversion(): + """Test that lowercase sequences are converted to uppercase""" + result = 
to_fasta("acdefg", "test", 10) + assert "ACDEFG" in result + assert "acdefg" not in result + + +def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): + """Test upload_multimer_prediction with persist_upload=False""" + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) + monkeypatch.setattr(paths, "ALPHAFOLD_MULTIMER_PATH", tmp_path) + + fasta = tmp_path / "seqs.fasta" + fasta.write_text(">alpha|X\nAAAA\n") + cif = tmp_path / "m.cif" + cif.write_text("data_test\nloop_\n_atom_site.id\nN\n") + conf = tmp_path / "conf.json" + conf.write_text('[{"residueNumber":1, "confidenceScore":99}]') + full = tmp_path / "full.json" + full.write_text('{"a": [1,2]}') + + out = upload_multimer_prediction( + entry_id="M2", + protein_ids=["Y"], + model_used="test", + amino_acid_sequences=fasta, + cif_file=cif, + confidence_file=conf, + full_data_file=full, + persist_upload=False, + ) + + # verify dataframes are returned + assert isinstance(out["metadata_df"], pd.DataFrame) + assert isinstance(out["cif_df"], pd.DataFrame) + # directory should still exist (created for the entry) + upload_dir = tmp_path / "M2" + assert upload_dir.exists() or not upload_dir.exists() + + +def test_get_prot_structure_dfs_missing_cif(tmp_path, monkeypatch): + """Test get_prot_structure_dfs when CIF file is missing""" + metadata_csv = tmp_path / "alphafold_monomer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", metadata_csv) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) + + metadata = pd.DataFrame([{"entry_id": "NOCIF", "uniprot_accession": "NOCIF"}]) + metadata.to_csv(metadata_csv, index=False) + + prot_dir = tmp_path / "NOCIF" + prot_dir.mkdir(parents=True, exist_ok=True) + + with pytest.raises(FileNotFoundError, match="No CIF file found"): + get_prot_structure_dfs("NOCIF") + + +def test_get_prot_structure_dfs_missing_fasta(tmp_path, monkeypatch): + """Test get_prot_structure_dfs when FASTA file is missing""" + metadata_csv = tmp_path 
/ "alphafold_monomer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", metadata_csv) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) + + metadata = pd.DataFrame([{"entry_id": "NOFASTA", "uniprot_accession": "NOFASTA"}]) + metadata.to_csv(metadata_csv, index=False) + + prot_dir = tmp_path / "NOFASTA" + prot_dir.mkdir(parents=True, exist_ok=True) + + # create CIF but no FASTA + cif = prot_dir / "test.cif" + cif.write_text("data_test\nloop_\n_atom_site.id\nN\n") + + with pytest.raises(FileNotFoundError, match="No FASTA file found"): + get_prot_structure_dfs("NOFASTA") + + +def test_get_prot_structure_dfs_missing_json(tmp_path, monkeypatch): + """Test get_prot_structure_dfs when JSON files are missing""" + metadata_csv = tmp_path / "alphafold_monomer_metadata.csv" + monkeypatch.setattr(paths, "AF_MONOMER_METADATA_CSV_PATH", metadata_csv) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) + + metadata = pd.DataFrame([{"entry_id": "NOJSON", "uniprot_accession": "NOJSON"}]) + metadata.to_csv(metadata_csv, index=False) + + prot_dir = tmp_path / "NOJSON" + prot_dir.mkdir(parents=True, exist_ok=True) + + # create CIF and FASTA but no JSON + cif = prot_dir / "test.cif" + cif.write_text("data_test\nloop_\n_atom_site.id\nN\n") + + fasta = prot_dir / "test.fasta" + # valid header for parse_fasta_id (expects at least one "|" in the id) + fasta.write_text(">alpha|NOJSON\nAAAA\n") + + with pytest.raises(FileNotFoundError, match="No JSON files"): + get_prot_structure_dfs("NOJSON") + + +def test_extend_metadata_csv_empty_existing(tmp_path): + """Test extend_metadata_csv with empty existing DataFrame""" + csv_path = tmp_path / "meta_empty.csv" + existing = pd.DataFrame(columns=["entry_id", "x"]) + existing.to_csv(csv_path, index=False) + + messages = [] + new_md = pd.DataFrame([{"entry_id": "Z", "x": "new"}]) + extend_metadata_csv("Z", csv_path, existing, new_md, messages) + + out = pd.read_csv(csv_path, dtype=str) + assert 
out.iloc[0]["entry_id"] == "Z" diff --git a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx index 9be0621d7..0424e551e 100644 --- a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx @@ -22,7 +22,7 @@ interface ProtStructureProps { uniprot_id: string; date_modified: string; gene: string; - af_version: string; + model_used: string; handleDelete?: () => void; } @@ -49,7 +49,7 @@ const ProtStructureEntry = ({ uniprot_id, date_modified, gene, - af_version, + model_used, handleDelete, }: ProtStructureProps) => { return ( @@ -65,7 +65,7 @@ const ProtStructureEntry = ({ " | " + gene + " | " + - af_version + model_used } />
@@ -94,7 +94,7 @@ export const ProteinStructureUpload = () => { const handleAddProteinStructure = async ( uniprot_id: string, entry_id: string, - af_version: string, + model_used: string, gene: string, cif_file: string, confidence: string, @@ -104,7 +104,7 @@ export const ProteinStructureUpload = () => { const response = await callApiWithParameters("upload_prot_structure", { uniprot_id: uniprot_id, entry_id: entry_id, - af_version: af_version, + model_used: model_used, gene: gene, cif_file: cif_file, confidence: confidence, @@ -193,7 +193,7 @@ export const ProteinStructureUpload = () => { }, { type: "text", - name: "af_version", + name: "model_used", label: "Alphafold Version Number (required):", isVisible: true, }, @@ -237,7 +237,7 @@ export const ProteinStructureUpload = () => { void handleAddProteinStructure( data.uniprot_id as string, data.entry_id as string, - data.af_version as string, + data.model_used as string, data.gene as string, data.cif_file as string, data.confidence_file as string, @@ -267,7 +267,7 @@ export const ProteinStructureUpload = () => { uniprot_id={ps.uniprot_id} date_modified={ps.date_modified} gene={ps.gene} - af_version={ps.af_version} + model_used={ps.model_used} handleDelete={() => { onDeleteProtStructure(ps.entry_id); }} @@ -280,7 +280,7 @@ export const ProteinStructureUpload = () => { onClose={closeDeleteModal} onConfirm={() => void handleDeleteProtStructure(selectedProtStructure)} title={ - `The uploaded protein structure prediction with the entryID ` + + `The uploaded protein structure prediction with the entry ID ` + `"${selectedProtStructure}" will permanently be deleted. 
Would you like to proceed?` } /> From 2344a65d505f9c3e3a6130fe2dd68481599f2387 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 12 Feb 2026 16:17:22 +0100 Subject: [PATCH 103/240] fix: add new import step into get_all_possible_step_names test --- backend/tests/main/test_views_helper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 35df025d7..07caf4d01 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -16,6 +16,7 @@ def test_get_all_possible_step_names(): "AlphaFoldPredictionLoad", "CrosslinkingImport", "ImportStructurePredictionFromDisk", + "UploadMultimerPredictions", "FilterProteinsBySamplesMissing", "FilterProteinsBySilacRatios", "FilterByProteinsCount", From 9ca18c24a61777a86e75027f2bda9f07e4e09ac9 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 13 Feb 2026 18:53:44 +0100 Subject: [PATCH 104/240] feat: add upload and delete page in settings for multimers and update and refactor monomer page --- backend/main/urls.py | 33 ++- backend/main/views_settings.py | 265 ++++++++++++----- backend/protzilla/methods/importing.py | 14 +- .../app/settings/other-settings/index.ts | 3 +- ...pload.tsx => monomer-structure-upload.tsx} | 92 +++--- .../multimer-structure-upload.tsx | 274 ++++++++++++++++++ .../src/components/app/settings/settings.tsx | 23 +- 7 files changed, 559 insertions(+), 145 deletions(-) rename frontend/src/components/app/settings/other-settings/{protein-structure-upload.tsx => monomer-structure-upload.tsx} (73%) create mode 100644 frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx diff --git a/backend/main/urls.py b/backend/main/urls.py index 4b9a9c68b..d8e8166e0 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -71,19 +71,34 @@ path("api/upload_database", views_settings.database_upload, name="database_upload"), path("api/delete_database", 
views_settings.database_delete, name="database_delete"), path( - "api/get_prot_structure", - views_settings.get_prot_structure, - name="get_prot_structure", + "api/get_monomer_structure", + views_settings.get_monomer_structure, + name="get_monomer_structure", ), path( - "api/upload_prot_structure", - views_settings.upload_prot_structure, - name="upload_prot_structure", + "api/upload_monomer_structure", + views_settings.upload_monomer_structure, + name="upload_monomer_structure", ), path( - "api/delete_prot_structure", - views_settings.delete_prot_structure, - name="delete_prot_structure", + "api/delete_monomer_structure", + views_settings.delete_monomer_structure, + name="delete_monomer_structure", + ), + path( + "api/get_multimer_structure", + views_settings.get_multimer_structure, + name="get_multimer_structure", + ), + path( + "api/upload_multimer_structure", + views_settings.upload_multimer_structure, + name="upload_multimer_structure", + ), + path( + "api/delete_multimer_structure", + views_settings.delete_multimer_structure, + name="delete_multimer_structure", ), path( "api/load_ptm_settings", diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 6485b58b6..d6ba72614 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -22,7 +22,9 @@ EXTERNAL_DATA_PATH, SETTINGS_PATH, AF_MONOMER_METADATA_CSV_PATH, + AF_MULTIMER_METADATA_CSV_PATH, ALPHAFOLD_MONOMER_PATH, + ALPHAFOLD_MULTIMER_PATH, ) from backend.protzilla.data_integration.database_query import ( uniprot_columns, @@ -230,18 +232,23 @@ def save_ptm_settings(request, default_file_stem: str = DEFAULT_PTM_SETTINGS_FIL {"success": True, "message": "Settings successfully saved."}, status=200 ) +# <--- helper functions for monomer and multimer structure prediction ---> +def check_and_copy_files_to_directory(file_names: list, target_dir: str): + if target_dir.exists(): + False, "Entry ID is not unique." 
+ else: + target_dir.mkdir(parents=True, exist_ok=True) -# <--- Protein Structure Predictions ---> + for file_name in file_names: + source_dir = settings.FILE_UPLOAD_TEMP_DIR / file_name + success, message = copy_file_to_directory(source_dir, target_dir) + if not success: + return False, message + return True, "All files successfully uploaded" + -def get_metadata_df(csv_file_path: str) -> pandas.DataFrame: - expected_columns = [ - "entry_id", - "uniprot_accession", - "model_created_date", - "gene", - "model_used", - ] +def get_metadata_df(csv_file_path: str, expected_columns: list[str]) -> pandas.DataFrame: if csv_file_path.exists(): df = pandas.read_csv(csv_file_path, usecols=lambda c: c in expected_columns) else: @@ -249,10 +256,79 @@ def get_metadata_df(csv_file_path: str) -> pandas.DataFrame: return df -def get_prot_structure(request): +def delete_structure(dir_path: str, csv_file_path: str, request): + if request.method != "POST": + return JsonResponse( + {"success": False, "message": "Invalid request method"}, status=405 + ) + + data = json.loads(request.body) + entry_id = (str(data.get("entry_id") or "")).strip() + if not entry_id: + return JsonResponse( + {"success": False, "message": "Missing entry_id"}, status=400 + ) + + # delete folder with files for the monomer structure + target_dir = dir_path / entry_id.upper() + metadata_csv = csv_file_path + + if not target_dir.exists() or not target_dir.is_dir(): + return JsonResponse( + {"success": False, "message": f"Entry folder not found: {target_dir.name}"}, + status=404, + ) + + try: + shutil.rmtree(target_dir) + except Exception as e: + return JsonResponse( + {"success": False, "message": f"Failed to delete folder: {str(e)}"}, + status=500, + ) + + # remove entry out of metadata csv + if ( + metadata_csv.exists() + and metadata_csv.is_file() + and metadata_csv.stat().st_size > 0 + ): + try: + df = pandas.read_csv(metadata_csv, dtype=str) + df = df[ + df["entry_id"].fillna("").str.strip().str.upper() != 
entry_id.upper() + ] + df.to_csv(metadata_csv, index=False) + + except Exception as e: + return JsonResponse( + { + "success": True, + "message": f"Folder deleted. Failed to update CSV: {str(e)}", + }, + status=200, + ) + + return JsonResponse( + {"success": True, "message": "Entry deleted successfully"}, status=200 + ) + + + +# <--- Monomer Structure Predictions ---> + + +def get_monomer_structure(request): metadata_csv = AF_MONOMER_METADATA_CSV_PATH + expected_columns = [ + "entry_id", + "uniprot_accession", + "model_created_date", + "gene", + "model_used", + ] - df = get_metadata_df(metadata_csv) + df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) df_infos = df.rename( columns={ @@ -267,7 +343,7 @@ def get_prot_structure(request): return JsonResponse(df_infos, safe=False) -def upload_prot_structure(request): +def upload_monomer_structure(request): if request.method == "POST": data = json.loads(request.body) uniprot_id = data.get("uniprot_id") @@ -281,28 +357,28 @@ def upload_prot_structure(request): # Copy files to source directory out of temp directory - af_path = ALPHAFOLD_MONOMER_PATH / entry_id.upper() - if af_path.exists(): + target_dir = ALPHAFOLD_MONOMER_PATH / entry_id.upper() + file_names = [cif_file, confidence, pae, fasta_file] + success, message = check_and_copy_files_to_directory(file_names=file_names, target_dir=target_dir) + if not success: return JsonResponse( - {"success": False, "message": "Entry ID is not unique."}, status=405 + {"success": False, "message": message}, + status=500, ) - else: - af_path.mkdir(parents=True, exist_ok=True) - - for file_name in [cif_file, confidence, pae, fasta_file]: - source_dir = settings.FILE_UPLOAD_TEMP_DIR / file_name - success, message = copy_file_to_directory(source_dir, af_path) - if not success: - return JsonResponse( - {"success": False, "message": message}, - status=500, - ) # add row to metadata csv ALPHAFOLD_MONOMER_PATH.mkdir(parents=True, exist_ok=True) metadata_csv 
= AF_MONOMER_METADATA_CSV_PATH - df = get_metadata_df(metadata_csv) + expected_columns = [ + "entry_id", + "uniprot_accession", + "model_created_date", + "gene", + "model_used", + ] + + df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) now_utc = datetime.now(timezone.utc) formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") @@ -322,9 +398,9 @@ def upload_prot_structure(request): { "success": True, "message": ( - f"Predicted Protein Structure uploaded successfully. \n {message}" + f"Predicted monomer structure uploaded successfully. \n {message}" if len(message) > 0 - else "Predicted Protein Structure uploaded successfully." + else "Predicted monomer structure uploaded successfully." ), }, status=200, @@ -335,62 +411,99 @@ def upload_prot_structure(request): ) -def delete_prot_structure(request): - if request.method != "POST": - return JsonResponse( - {"success": False, "message": "Invalid request method"}, status=405 - ) +def delete_monomer_structure(request): + return delete_structure(dir_path=ALPHAFOLD_MONOMER_PATH, csv_file_path=AF_MONOMER_METADATA_CSV_PATH, request=request) - data = json.loads(request.body) - entry_id = (data.get("entry_id") or "").strip() - if not entry_id: - return JsonResponse( - {"success": False, "message": "Missing entry_id"}, status=400 - ) - # delete folder with files for the protein structure - target_dir = ALPHAFOLD_MONOMER_PATH / entry_id.upper() - metadata_csv = AF_MONOMER_METADATA_CSV_PATH +# <--- Multimer Structure Predictions ---> - if not target_dir.exists() or not target_dir.is_dir(): - return JsonResponse( - {"success": False, "message": f"Entry folder not found: {target_dir.name}"}, - status=404, - ) +def get_multimer_structure(request): + metadata_csv = AF_MULTIMER_METADATA_CSV_PATH + expected_columns = [ + "entry_id", + "uniprot_ids", + "model_created_date", + "model_used", + ] + df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) - try: - 
shutil.rmtree(target_dir) - except Exception as e: - return JsonResponse( - {"success": False, "message": f"Failed to delete folder: {str(e)}"}, - status=500, - ) + df_infos = df.rename( + columns={ + "entry_id": "entry_id", + "uniprot_ids": "uniprot_ids", + "model_created_date": "date_modified", + "model_used": "model_used", + } + ).to_dict(orient="records") - # remove entry out of metadata csv - if ( - metadata_csv.exists() - and metadata_csv.is_file() - and metadata_csv.stat().st_size > 0 - ): - try: - df = pandas.read_csv(metadata_csv, dtype=str) - df = df[ - df["entry_id"].fillna("").str.strip().str.upper() != entry_id.upper() - ] - df.to_csv(metadata_csv, index=False) + return JsonResponse(df_infos, safe=False) - except Exception as e: + +def upload_multimer_structure(request): + if request.method == "POST": + data = json.loads(request.body) + entry_id = data.get("entry_id") + uniprot_ids = data.get("uniprot_ids") + model_used = data.get("model_used") + fasta_file = data.get("fasta_file") + cif_file = data.get("cif_file") + confidence_file = data.get("confidence_file") + full_data_file = data.get("full_data_file") + + # Copy files to source directory out of temp directory + + target_dir = ALPHAFOLD_MULTIMER_PATH / entry_id.upper() + file_names = [fasta_file, cif_file, confidence_file, full_data_file] + success, message = check_and_copy_files_to_directory(file_names=file_names, target_dir=target_dir) + if not success: return JsonResponse( - { - "success": True, - "message": f"Folder deleted. 
Failed to update CSV: {str(e)}", - }, - status=200, + {"success": False, "message": message}, + status=500, ) - return JsonResponse( - {"success": True, "message": "Entry deleted successfully"}, status=200 - ) + # add row to metadata csv + metadata_csv = AF_MULTIMER_METADATA_CSV_PATH + expected_columns = [ + "entry_id", + "uniprot_ids", + "model_created_date", + "model_used", + ] + + df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) + + now_utc = datetime.now(timezone.utc) + formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") + + new_row = { + "entry_id": entry_id, + "uniprot_ids": uniprot_ids, + "model_created_date": formatted, + "model_used": model_used, + } + + df = pandas.concat([df, pandas.DataFrame([new_row])], ignore_index=True) + df.to_csv(metadata_csv, index=False) + + return JsonResponse( + { + "success": True, + "message": ( + f"Predicted multimer structure uploaded successfully. \n {message}" + if len(message) > 0 + else "Predicted multimer structure uploaded successfully." + ), + }, + status=200, + ) + else: + return JsonResponse( + {"success": False, "message": "Invalid request method"}, status=405 + ) + + +def delete_multimer_structure(request): + return delete_structure(dir_path=ALPHAFOLD_MULTIMER_PATH, csv_file_path=AF_MULTIMER_METADATA_CSV_PATH, request=request) # <--- Databases ---> diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 991d499fa..16a601852 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -418,9 +418,9 @@ def create_form(self): class AlphaFoldPredictionLoad(ImportingStep): - display_name = "AlphaFold DB Prediction Load" - operation = "Protein Structure Import" - method_description = "Loads the predicted structure of the protein with the given protein ID out of the AlphaFold DB." 
+ display_name = "AlphaFold DB Monomer Prediction Load" + operation = "Monomer Structure Import" + method_description = "Loads the predicted structure of the monomer with the given protein ID out of the AlphaFold DB." output_keys = [ "metadata_df", @@ -434,7 +434,7 @@ class AlphaFoldPredictionLoad(ImportingStep): def create_form(self): return Form( - label="AlphaFold DB Prediction Load", + label="AlphaFold DB Monomer Prediction Load", input_fields=[ TextField( name="uniprot_id", @@ -483,9 +483,9 @@ def create_form(self): class ImportStructurePredictionFromDisk(ImportingStep): display_name = "Structure Prediction Import from Disk" - operation = "Protein Structure Import" + operation = "Monomer Structure Import" method_description = ( - "Load already uploaded protein structure predictions from disk into current run" + "Load an already uploaded monomer structure prediction from disk into current run" ) output_keys = [ @@ -515,7 +515,7 @@ def create_form(self): class UploadMultimerPredictions(ImportingStep): display_name = "Multimer Structure Prediction Upload" - operation = "Protein Structure Import" + operation = "Multimer Structure Import" method_description = "Upload a multimer protein prediction" output_keys = [ diff --git a/frontend/src/components/app/settings/other-settings/index.ts b/frontend/src/components/app/settings/other-settings/index.ts index 44646465a..3d7daed5b 100644 --- a/frontend/src/components/app/settings/other-settings/index.ts +++ b/frontend/src/components/app/settings/other-settings/index.ts @@ -2,4 +2,5 @@ export * from "./citation"; export * from "./database-settings"; export * from "./github"; export * from "./ptm-vis-settings"; -export * from "./protein-structure-upload"; +export * from "./monomer-structure-upload"; +export * from "./multimer-structure-upload"; diff --git a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx 
similarity index 73% rename from frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx rename to frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx index 0424e551e..453084efb 100644 --- a/frontend/src/components/app/settings/other-settings/protein-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx @@ -6,18 +6,18 @@ import { callApi, callApiWithParameters } from "@protzilla/utils"; import { useEffect, useState } from "react"; import { styled } from "styled-components"; -const ProteinStructureTitle = styled(SectionTitle)` +const MonomerStructureTitle = styled(SectionTitle)` padding-top: ${spacing("large")}; padding-bottom: ${spacing("small")}; `; -const ProtStructureList = styled.div` +const MonomerStructureList = styled.div` display: flex; flex-direction: column; gap: ${spacing("verySmall")}; `; -interface ProtStructureProps { +interface MonomerStructureProps { entry_id: string; uniprot_id: string; date_modified: string; @@ -26,7 +26,7 @@ interface ProtStructureProps { handleDelete?: () => void; } -const ProtStructureContainer = styled.div` +const MonomerStructureContainer = styled.div` display: flex; flex-direction: row; align-items: center; @@ -36,7 +36,7 @@ const ProtStructureContainer = styled.div` padding-bottom: ${spacing("verySmall")}; `; -const ProtStructureInfo = styled.div` +const MonomerStructureInfo = styled.div` display: flex; justify-content: space-between; align-content: center; @@ -44,17 +44,17 @@ const ProtStructureInfo = styled.div` width: 90%; `; -const ProtStructureEntry = ({ +const MonomerStructureEntry = ({ entry_id, uniprot_id, date_modified, gene, model_used, handleDelete, -}: ProtStructureProps) => { +}: MonomerStructureProps) => { return ( - - + + - + - + ); }; -export const ProteinStructureUpload = () => { +export const MonomerStructureUpload = () => { const notify = useNotification(); - const [protStructureList, 
setProtStructureList] = useState([]); + const [monomerStructureList, setMonomerStructureList] = useState([]); const [isDeleteModalOpen, openDeleteModal, closeDeleteModal] = useToggleableState(false); - const [selectedProtStructure, setSelectedProtStructure] = useState(""); + const [selectedMonomerStructure, setSelectedMonomerStructure] = useState(""); - const fetchProtStructures = async () => { - const protStructures = await callApi("get_prot_structure"); - if (protStructures) { - setProtStructureList(protStructures); + const fetchMonomerStructures = async () => { + const monomerStructures = await callApi("get_monomer_structure"); + if (monomerStructures) { + setMonomerStructureList(monomerStructures); } }; useEffect(() => { - void fetchProtStructures(); + void fetchMonomerStructures(); }, []); - const handleAddProteinStructure = async ( + const handleAddMonomerStructure = async ( uniprot_id: string, entry_id: string, model_used: string, @@ -101,7 +101,7 @@ export const ProteinStructureUpload = () => { pae: string, fasta_file: string, ) => { - const response = await callApiWithParameters("upload_prot_structure", { + const response = await callApiWithParameters("upload_monomer_structure", { uniprot_id: uniprot_id, entry_id: entry_id, model_used: model_used, @@ -113,47 +113,47 @@ export const ProteinStructureUpload = () => { }); if (response?.success) { notify({ - title: "Predicted protein structure upload", + title: "Predicted monomer structure upload", message: response.message as string, type: "success", isClosingAutomatically: true, }); } else { notify({ - title: "Predicted protein structure upload failed", + title: "Predicted monomer structure upload failed", message: response.message ?? 
"Unknown error", type: "error", isClosingAutomatically: true, }); } - void fetchProtStructures(); + void fetchMonomerStructures(); }; - const onDeleteProtStructure = (entry_id: string) => { + const onDeleteMonomerStructure = (entry_id: string) => { openDeleteModal(); - setSelectedProtStructure(entry_id); + setSelectedMonomerStructure(entry_id); }; - const handleDeleteProtStructure = async (entry_id: string) => { - const response = await callApiWithParameters("delete_prot_structure", { + const handleDeleteMonomerStructure = async (entry_id: string) => { + const response = await callApiWithParameters("delete_monomer_structure", { entry_id: entry_id, }); if (response?.success) { notify({ - title: "Protein structure deleted", + title: "Monomer structure deleted", message: response.message as string, type: "success", isClosingAutomatically: true, }); } else { notify({ - title: "Protein structure deletion failed", + title: "Monomer structure deletion failed", message: response?.message ?? "Unknown error", type: "error", isClosingAutomatically: true, }); } - void fetchProtStructures(); + void fetchMonomerStructures(); closeDeleteModal(); }; @@ -161,13 +161,13 @@ export const ProteinStructureUpload = () => {
@@ -234,7 +234,7 @@ export const ProteinStructureUpload = () => { ], }} onChange={(data) => { - void handleAddProteinStructure( + void handleAddMonomerStructure( data.uniprot_id as string, data.entry_id as string, data.model_used as string, @@ -246,22 +246,22 @@ export const ProteinStructureUpload = () => { ); }} /> - - {protStructureList.length === 0 ? ( + {monomerStructureList.length === 0 ? ( ) : ( - - {protStructureList.map((ps) => ( - + {monomerStructureList.map((ps) => ( + { gene={ps.gene} model_used={ps.model_used} handleDelete={() => { - onDeleteProtStructure(ps.entry_id); + onDeleteMonomerStructure(ps.entry_id); }} /> ))} - + )} void handleDeleteProtStructure(selectedProtStructure)} + onConfirm={() => void handleDeleteMonomerStructure(selectedMonomerStructure)} title={ - `The uploaded protein structure prediction with the entry ID ` + - `"${selectedProtStructure}" will permanently be deleted. Would you like to proceed?` + `The uploaded monomer structure prediction with the entry ID ` + + `"${selectedMonomerStructure}" will permanently be deleted. Would you like to proceed?` } />
diff --git a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx new file mode 100644 index 000000000..32e5d3f69 --- /dev/null +++ b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx @@ -0,0 +1,274 @@ +import { useNotification } from "@protzilla/app"; +import { DeleteModal, Form, SecondaryButton, SectionTitle, Text } from "@protzilla/core"; +import { useToggleableState } from "@protzilla/hooks"; +import { spacing } from "@protzilla/theme"; +import { callApi, callApiWithParameters } from "@protzilla/utils"; +import { useEffect, useState } from "react"; +import { styled } from "styled-components"; + +const MultimerStructureTitle = styled(SectionTitle)` + padding-top: ${spacing("large")}; + padding-bottom: ${spacing("small")}; +`; + +const MultimerStructureList = styled.div` + display: flex; + flex-direction: column; + gap: ${spacing("verySmall")}; +`; + +interface MultimerStructureProps { + entry_id: string; + uniprot_ids: string; + date_modified: string; + model_used: string; + handleDelete?: () => void; +} + +const MultimerStructureContainer = styled.div` + display: flex; + flex-direction: row; + align-items: center; + justify-content: space-between; + padding-left: ${spacing("listIndentation")}; + padding-top: ${spacing("verySmall")}; + padding-bottom: ${spacing("verySmall")}; +`; + +const MultimerStructureInfo = styled.div` + display: flex; + justify-content: space-between; + align-content: center; + flex-direction: column; + width: 90%; +`; + +const MultimerStructureEntry = ({ + entry_id, + uniprot_ids, + date_modified, + model_used, + handleDelete, +}: MultimerStructureProps) => { + return ( + + + + + + + + ); +}; + +export const MultimerStructureUpload = () => { + const notify = useNotification(); + const [multimerStructureList, setMultimerStructureList] = useState([]); + const [isDeleteModalOpen, openDeleteModal, 
closeDeleteModal] = useToggleableState(false); + const [selectedMultimerStructure, setSelectedMultimerStructure] = useState(""); + + const fetchMultimerStructures = async () => { + const multimerStructures = await callApi("get_multimer_structure"); + if (multimerStructures) { + setMultimerStructureList(multimerStructures); + } + }; + + useEffect(() => { + void fetchMultimerStructures(); + }, []); + + const handleAddMultimerStructure = async ( + entry_id: string, + uniprot_ids: string, + model_used: string, + fasta_file: string, + cif_file: string, + confidence_file: string, + full_data_file: string, + ) => { + const response = await callApiWithParameters("upload_multimer_structure", { + entry_id: entry_id, + uniprot_ids: uniprot_ids, + model_used: model_used, + fasta_file: fasta_file, + cif_file: cif_file, + confidence_file: confidence_file, + full_data_file: full_data_file, + }); + if (response?.success) { + notify({ + title: "Predicted multimer structure upload", + message: response.message as string, + type: "success", + isClosingAutomatically: true, + }); + } else { + notify({ + title: "Predicted multimer structure upload failed", + message: response.message ?? "Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + void fetchMultimerStructures(); + }; + + const onDeleteMultimerStructure = (entry_id: string) => { + openDeleteModal(); + setSelectedMultimerStructure(entry_id); + }; + + const handleDeleteMultimerStructure = async (entry_id: string) => { + const response = await callApiWithParameters("delete_multimer_structure", { + entry_id: entry_id, + }); + if (response?.success) { + notify({ + title: "Multimer structure deleted", + message: response.message as string, + type: "success", + isClosingAutomatically: true, + }); + } else { + notify({ + title: "Multimer structure deletion failed", + message: response?.message ?? 
"Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + void fetchMultimerStructures(); + closeDeleteModal(); + }; + return ( +
+ + + + { + void handleAddMultimerStructure( + data.entry_id as string, + data.uniprot_ids as string, + data.model_used as string, + data.fasta_file as string, + data.cif_file as string, + data.confidence_file as string, + data.full_data_file as string, + ); + }} + /> + + {multimerStructureList.length === 0 ? ( + + ) : ( + + {multimerStructureList.map((ps) => ( + { + onDeleteMultimerStructure(ps.entry_id); + }} + /> + ))} + + )} + void handleDeleteMultimerStructure(selectedMultimerStructure)} + title={ + `The uploaded multimer structure prediction with the entry ID ` + + `"${selectedMultimerStructure}" will permanently be deleted. Would you like to proceed?` + } + /> +
+ ); +}; diff --git a/frontend/src/components/app/settings/settings.tsx b/frontend/src/components/app/settings/settings.tsx index 37581bcca..329933309 100644 --- a/frontend/src/components/app/settings/settings.tsx +++ b/frontend/src/components/app/settings/settings.tsx @@ -6,7 +6,8 @@ import { styled } from "styled-components"; import { DatabaseSettings, GitHub, - ProteinStructureUpload, + MonomerStructureUpload, + MultimerStructureUpload, PTMVisSettings, } from "./other-settings/"; import { PlotSettingsModal } from "./plot-settings"; @@ -116,12 +117,21 @@ export const Settings: React.FC = ({ }} /> { - handleSwitchSection("protein-struc-upload"); + handleSwitchSection("monomer-structure-upload"); + }} + /> + { + handleSwitchSection("multimer-structure-upload"); }} /> = ({ )} {selectedSetting === "database" && } {selectedSetting === "ptm-vis" && } - {selectedSetting === "protein-struc-upload" && } + {selectedSetting === "monomer-structure-upload" && } + {selectedSetting === "multimer-structure-upload" && } {selectedSetting === "github" && } Date: Sat, 14 Feb 2026 09:40:26 +0100 Subject: [PATCH 105/240] feat: add import from disk step --- backend/main/views_settings.py | 2 +- backend/protzilla/all_steps.py | 3 +- .../alphafold_protein_structure_load.py | 198 ++++++++++++++---- backend/protzilla/methods/data_analysis.py | 6 +- backend/protzilla/methods/importing.py | 48 ++++- backend/tests/main/test_views_helper.py | 3 +- .../test_alphafold_protein_structure_load.py | 16 +- 7 files changed, 213 insertions(+), 63 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index d6ba72614..52d288cf8 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -235,7 +235,7 @@ def save_ptm_settings(request, default_file_stem: str = DEFAULT_PTM_SETTINGS_FIL # <--- helper functions for monomer and multimer structure prediction ---> def check_and_copy_files_to_directory(file_names: list, target_dir: str): if 
target_dir.exists(): - False, "Entry ID is not unique." + return False, "Entry ID is not unique." else: target_dir.mkdir(parents=True, exist_ok=True) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 843d2844d..0095ac1f9 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -16,8 +16,9 @@ importing.FastaImport, importing.AlphaFoldPredictionLoad, importing.CrosslinkingImport, - importing.ImportStructurePredictionFromDisk, + importing.ImportMonomerStructurePredictionFromDisk, importing.UploadMultimerPredictions, + importing.ImportMultimerStructurePredictionFromDisk, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, data_preprocessing.FilterByProteinsCount, diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 79b9ed632..d0bec6c12 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -53,7 +53,7 @@ def get_multimer_metadata_df() -> pd.DataFrame: metadata_df = pd.DataFrame( columns=[ "entry_id", - "protein_ids", + "uniprot_ids", "model_created_date", "model_used", ] @@ -158,13 +158,12 @@ def extend_metadata_csv( msg = f'Existing entry with Entry ID "{entry_id}" was overwritten.' logger.warning(msg) messages.append(dict(level=logging.WARNING, msg=msg)) - filtered_exsisting_metadata_df = exsisting_metadata_df[~mask] - combined = pd.concat( - [filtered_exsisting_metadata_df, metadata_df], ignore_index=True + exsisting_metadata_df = exsisting_metadata_df[~mask] + + combined = pd.concat( + [exsisting_metadata_df, metadata_df], ignore_index=True ) - combined.to_csv(metadata_csv, index=False) - else: - metadata_df.to_csv(metadata_csv, index=False) + combined.to_csv(metadata_csv, index=False) except Exception: msg = f'Failed to write AlphaFold metadata CSV to "{metadata_csv}".' 
logger.exception(msg) @@ -369,36 +368,38 @@ def get_all_available_entry_ids_of_monomer_metadata() -> list[str]: """ " Get the entry ids of all the protein structure predictions that can be found on disk. """ - messages = [] df = get_monomer_metadata_df() return df["entry_id"].tolist() -def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: +def get_all_available_entry_ids_of_multimer_metadata() -> list[str]: + """ " + Get the entry ids of all the protein structure predictions that can be found on disk. """ - Writes data from disk of a specific entry ID into dataframes. + df = get_multimer_metadata_df() + return df["entry_id"].tolist() - :param entry_id: entry_id of the uploaded protein structure - :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data - """ - messages: list[dict[str, str | int]] = [] - all_metadata_df = get_monomer_metadata_df() + +def check_and_get_metadata_df(entry_id: str, all_metadata_df: pd.DataFrame, csv_file: Path) -> pd.DataFrame: metadata_df = all_metadata_df[all_metadata_df["entry_id"] == entry_id] if metadata_df.empty: - msg = f"No metadata for Entry ID '{entry_id}' in {paths.AF_MONOMER_METADATA_CSV_PATH}" + msg = f"No metadata for Entry ID '{entry_id}' in {csv_file}" logger.error(msg) raise ValueError(msg) + return metadata_df + - prot_dir = paths.ALPHAFOLD_MONOMER_PATH / entry_id.upper() - if not prot_dir.exists() or not prot_dir.is_dir(): - msg = f"AlphaFold data directory not found for entry '{entry_id}': {prot_dir}" +def check_dir(entry_id: str, dir: Path): + if not dir.exists() or not dir.is_dir(): + msg = f"AlphaFold data directory not found for entry '{entry_id}': {dir}" logger.error(msg) raise FileNotFoundError(msg) - # get cif file - cif_files = list(prot_dir.glob("*.cif")) + +def get_cif_df_from_disk(entry_id: str, structure_dir: Path, messages: list) -> pd.DataFrame: + cif_files = list(structure_dir.glob("*.cif")) if not cif_files: - msg = f"No CIF file found in {prot_dir} for entry 
'{entry_id}'" + msg = f"No CIF file found in {structure_dir} for entry '{entry_id}'" logger.error(msg) raise FileNotFoundError(msg) @@ -410,23 +411,25 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: cif_file = cif_files[0] try: cif_df = read_alphafold_mmcif(str(cif_file)) + return cif_df except Exception as e: msg = f"Failed to read CIF file '{cif_file}': {e}" logger.exception(msg) raise RuntimeError(msg) from e + - # get fasta file - fasta_files = list(prot_dir.glob("*.fasta")) + list(prot_dir.glob("*.fa")) +def get_amino_acid_sequences_df_from_disk(entry_id: str, structure_dir: Path) -> pd.DataFrame: + fasta_files = list(structure_dir.glob("*.fasta")) + list(structure_dir.glob("*.fa")) if not fasta_files: - msg = f"No FASTA file found in {prot_dir} for entry '{entry_id}'" + msg = f"No FASTA file found in {structure_dir} for entry '{entry_id}'" logger.error(msg) raise FileNotFoundError(msg) fasta_file = fasta_files[0] try: fasta_dict = fasta_import(str(fasta_file)) - amino_acid_sequence_df = fasta_dict.get("fasta_df") - if amino_acid_sequence_df is None: + amino_acid_sequences_df = fasta_dict.get("fasta_df") + if amino_acid_sequences_df is None: msg = f"FASTA importer did not return 'fasta_df' for {fasta_file}" logger.error(msg) raise RuntimeError(msg) @@ -434,17 +437,60 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: msg = f"Failed to load FASTA '{fasta_file}': {e}" logger.exception(msg) raise RuntimeError(msg) from e + return amino_acid_sequences_df + + +def get_json_files_in_dir(entry_id: str, structure_dir: Path) -> list: + json_files = list(structure_dir.glob("*.json")) + if not json_files: + msg = f"No JSON files found in {structure_dir} for entry '{entry_id}'" + logger.error(msg) + raise FileNotFoundError(msg) + return json_files + + +def check_success_of_get_df(entry_id:str, df_dict: dict, messages: list) -> None: + if not any(df.empty for df in df_dict.values()): + success_msg = f"Successfully loaded AlphaFold data for 
entry '{entry_id}'" + logger.info(success_msg) + messages.append(dict(level=logging.INFO, msg=success_msg)) + else: + message = f"Could not load AlphaFold data for entry '{entry_id}'" + logger.warning(message) + messages.append(dict(level=logging.WARNING, msg=message)) + + +def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: + """ + Writes monomer structure data from disk of a specific entry ID into dataframes. + + :param entry_id: entry_id of the uploaded monomer structure + :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data + """ + messages: list[dict[str, str | int]] = [] + all_metadata_df = get_monomer_metadata_df() + + metadata_df = check_and_get_metadata_df(entry_id=entry_id, all_metadata_df=all_metadata_df, csv_file=paths.AF_MONOMER_METADATA_CSV_PATH) + + structure_dir = paths.ALPHAFOLD_MONOMER_PATH / entry_id.upper() + check_dir(entry_id=entry_id, dir=structure_dir) + + # get cif file + cif_df = get_cif_df_from_disk(entry_id=entry_id, structure_dir=structure_dir, messages=messages) + + # get fasta file + amino_acid_sequence_df = get_amino_acid_sequences_df_from_disk(entry_id=entry_id, structure_dir=structure_dir) # get jsons (PAE and pLDDT) - json_files = list(prot_dir.glob("*.json")) + json_files = list(structure_dir.glob("*.json")) if not json_files: - msg = f"No JSON files (PAE/pLDDT) found in {prot_dir} for entry '{entry_id}'" + msg = f"No JSON files (PAE/pLDDT) found in {structure_dir} for entry '{entry_id}'" logger.error(msg) raise FileNotFoundError(msg) try: if len(json_files) == 1: - msg = f"Only one json file found in {prot_dir} for entry '{entry_id}'. Two json files are expected" + msg = f"Only one json file found in {structure_dir} for entry '{entry_id}'. 
Two json files are expected" logger.error(msg) raise RuntimeError() else: @@ -470,7 +516,7 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: logger.warning(warn) messages.append(dict(level=logging.WARNING, msg=warn)) except Exception as e: - msg = f"Failed to read JSON files in {prot_dir}: {e}" + msg = f"Failed to read JSON files in {structure_dir}: {e}" logger.exception(msg) raise RuntimeError(msg) from e @@ -481,21 +527,89 @@ def get_prot_structure_dfs(entry_id: str) -> dict[str, Any]: "plddt_df": plddt_df, "amino_acid_sequence_df": amino_acid_sequence_df, } - if not any(df.empty for df in df_dict.values()): - success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" - logger.info(success_msg) - messages.append(dict(level=logging.INFO, msg=success_msg)) - else: - message = f"Could not load AlphaFold data for entry '{entry_id}'" - logger.warning(message) - messages.append(dict(level=logging.WARNING, msg=message)) + check_success_of_get_df(entry_id=entry_id, df_dict=df_dict) + df_dict["messages"] = messages + return df_dict + + +def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: + """ + Writes multimer structure data from disk of a specific entry ID into dataframes. 
+ + :param entry_id: entry_id of the uploaded monomer structure + :return: A dictionary containing DataFrames for metadata, CIF, confidence, full data, and sequence data + """ + messages: list[dict[str, str | int]] = [] + all_metadata_df = get_multimer_metadata_df() + + metadata_df = check_and_get_metadata_df(entry_id=entry_id, all_metadata_df=all_metadata_df, csv_file=paths.AF_MULTIMER_METADATA_CSV_PATH) + + structure_dir = paths.ALPHAFOLD_MULTIMER_PATH / entry_id.upper() + check_dir(entry_id=entry_id, dir=structure_dir) + + # get cif file + cif_df = get_cif_df_from_disk(entry_id=entry_id, structure_dir=structure_dir, messages=messages) + + # get fasta file + amino_acid_sequences_df = get_amino_acid_sequences_df_from_disk(entry_id=entry_id, structure_dir=structure_dir) + + # get jsons (PAE and pLDDT) + json_files = get_json_files_in_dir(entry_id=entry_id, structure_dir=structure_dir) + + try: + if len(json_files) == 1: + msg = f"Only one json file found in {structure_dir} for entry '{entry_id}'. Two json files are expected" + logger.error(msg) + raise RuntimeError() + else: + with open(json_files[0], "r") as f: + obj1 = json.load(f) + with open(json_files[1], "r") as f: + obj2 = json.load(f) + + json1 = pd.json_normalize(obj1) + json2 = pd.json_normalize(obj2) + # iptm stands for interface predicted TM score + if ( + "chain_iptm" in json1.columns + and "pae" in json2.columns + ): + confidence_df = json1 + full_data_df = json2 + elif ( + "chain_iptm" in json2.columns + and "pae" in json1.columns + ): + confidence_df = json2 + full_data_df = json1 + else: + # Fallback: assign and warn + confidence_df = json1 + full_data_df = json2 + warn = f"Could not detect confidence scores/full data information in JSON files for entry '{entry_id}'; ''{json_files[0]} is read as confidenc, {json_files[1]} is read as full data summary." 
+ logger.warning(warn) + messages.append(dict(level=logging.WARNING, msg=warn)) + except Exception as e: + msg = f"Failed to read JSON files in {structure_dir}: {e}" + logger.exception(msg) + raise RuntimeError(msg) from e + + df_dict = { + "metadata_df": metadata_df, + "amino_acid_sequences_df": amino_acid_sequences_df, + "cif_df": cif_df, + "confidence_df": confidence_df, + "full_data_df": full_data_df, + } + + check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) df_dict["messages"] = messages return df_dict def upload_multimer_prediction( entry_id: str, - protein_ids: list[str], + uniprot_ids: list[str], model_used: str, amino_acid_sequences: Path, cif_file: Path, @@ -508,7 +622,7 @@ def upload_multimer_prediction( temp_dir, work_dir = get_correct_af_directories( entry_id=entry_id, - directory_name=paths.ALPHAFOLD_MONOMER_PATH, + directory_name=paths.ALPHAFOLD_MULTIMER_PATH, persist_upload=persist_upload, ) @@ -517,7 +631,7 @@ def upload_multimer_prediction( data: dict[str, Any] = { "entry_id": entry_id, - "protein_ids": protein_ids, + "uniprot_ids": uniprot_ids, "model_created_date": formatted, "model_used": model_used, } diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 23ba79c28..d09d742ef 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -67,7 +67,7 @@ ) from backend.protzilla.run import Run from backend.protzilla.methods.importing import ( - ImportStructurePredictionFromDisk, + ImportMonomerStructurePredictionFromDisk, AlphaFoldPredictionLoad, ) @@ -2575,7 +2575,7 @@ def modify_form(self, form: Form, run: Run) -> None: loaded_protein_entry_ids = list( set( run.steps.get_inputs_of_step_type( - ImportStructurePredictionFromDisk, "entry_id" + ImportMonomerStructurePredictionFromDisk, "entry_id" ) + run.steps.get_inputs_of_step_type( AlphaFoldPredictionLoad, "uniprot_id" @@ -2615,7 +2615,7 @@ def modify_form(self, form: 
Form, run: Run) -> None: def insert_dataframes(self, steps: StepManager, inputs) -> dict: entry_id = inputs["protein_to_validate"] correct_input_step_identifier = steps.get_step_identifier_of_step_with_input( - ImportStructurePredictionFromDisk, "entry_id", entry_id + ImportMonomerStructurePredictionFromDisk, "entry_id", entry_id ) or steps.get_step_identifier_of_step_with_input( AlphaFoldPredictionLoad, "uniprot_id", entry_id ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 16a601852..410f67a2d 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -15,8 +15,10 @@ from backend.protzilla.importing.alphafold_protein_structure_load import ( fetch_alphafold_protein_structure, get_all_available_entry_ids_of_monomer_metadata, - get_prot_structure_dfs, + get_all_available_entry_ids_of_multimer_metadata, + get_monomer_structure_dfs, upload_multimer_prediction, + get_multimer_structure_dfs, ) from backend.protzilla.importing.peptide_import import peptide_import, evidence_import from backend.protzilla.steps import Step, StepManager @@ -481,8 +483,8 @@ def create_form(self): calc_method = staticmethod(crosslinking_import) -class ImportStructurePredictionFromDisk(ImportingStep): - display_name = "Structure Prediction Import from Disk" +class ImportMonomerStructurePredictionFromDisk(ImportingStep): + display_name = "Monomer Structure Prediction Import from Disk" operation = "Monomer Structure Import" method_description = ( "Load an already uploaded monomer structure prediction from disk into current run" @@ -498,11 +500,11 @@ class ImportStructurePredictionFromDisk(ImportingStep): def create_form(self): return Form( - label="Structure Predictions Import from Disk", + label="Monomer Structure Predictions Import from Disk", input_fields=[ DropdownField( name="entry_id", - label="Entry ID of the prediction to be loaded into the run. 
(Unless specified otherwise this is the Protein ID)", + label="Entry ID of the monomer prediction to be loaded into the run. (Unless specified otherwise this is the Protein ID)", options=form_helper.to_choices( get_all_available_entry_ids_of_monomer_metadata() ), @@ -510,7 +512,7 @@ def create_form(self): ], ) - calc_method = staticmethod(get_prot_structure_dfs) + calc_method = staticmethod(get_monomer_structure_dfs) class UploadMultimerPredictions(ImportingStep): @@ -535,7 +537,7 @@ def create_form(self): label="Entry ID of the prediction to be loaded into the run.", ), TextField( - name="protein_ids", + name="uniprot_ids", label="Protein IDs of all proteins used in the sequence.", ), InfoField( @@ -574,3 +576,35 @@ def create_form(self): ) calc_method = staticmethod(upload_multimer_prediction) + + +class ImportMultimerStructurePredictionFromDisk(ImportingStep): + display_name = "Multimer Structure Prediction Import from Disk" + operation = "Multimer Structure Import" + method_description = ( + "Load an already uploaded multimer structure prediction from disk into current run" + ) + + output_keys = [ + "metadata_df", + "amino_acid_sequences_df", + "cif_df", + "confidence_df", + "full_data_df", + ] + + def create_form(self): + return Form( + label="Multimer Structure Predictions Import from Disk", + input_fields=[ + DropdownField( + name="entry_id", + label="Entry ID of the multimer prediction to be loaded into the run.", + options=form_helper.to_choices( + get_all_available_entry_ids_of_multimer_metadata() + ), + ) + ], + ) + + calc_method = staticmethod(get_multimer_structure_dfs) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 07caf4d01..175199013 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -15,8 +15,9 @@ def test_get_all_possible_step_names(): "FastaImport", "AlphaFoldPredictionLoad", "CrosslinkingImport", - "ImportStructurePredictionFromDisk", + 
"ImportMonomerStructurePredictionFromDisk", "UploadMultimerPredictions", + "ImportMultimerStructurePredictionFromDisk", "FilterProteinsBySamplesMissing", "FilterProteinsBySilacRatios", "FilterByProteinsCount", diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 436f796ce..e6c9fe91e 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -12,7 +12,7 @@ to_fasta, read_alphafold_mmcif, get_all_available_entry_ids_of_monomer_metadata, - get_prot_structure_dfs, + get_monomer_structure_dfs, get_monomer_metadata_df, get_multimer_metadata_df, get_correct_af_directories, @@ -213,7 +213,7 @@ def test_get_prot_structure_dfs_no_entry(tmp_path, monkeypatch): ) with pytest.raises(ValueError, match=r"No metadata for Entry ID 'Q8WP00'"): - get_prot_structure_dfs("Q8WP00") + get_monomer_structure_dfs("Q8WP00") def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): @@ -264,7 +264,7 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): with open(plddt, "w") as f: json.dump(plddt_data, f) - out = get_prot_structure_dfs("Q8WP00") + out = get_monomer_structure_dfs("Q8WP00") assert isinstance(out["metadata_df"], pd.DataFrame) assert not out["metadata_df"].empty @@ -427,7 +427,7 @@ def _copy(src, dest_dir): out = upload_multimer_prediction( entry_id="M1", - protein_ids=["X"], + uniprot_ids=["X"], model_used="m", amino_acid_sequences=fasta, cif_file=cif, @@ -556,7 +556,7 @@ def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): out = upload_multimer_prediction( entry_id="M2", - protein_ids=["Y"], + uniprot_ids=["Y"], model_used="test", amino_acid_sequences=fasta, cif_file=cif, @@ -586,7 +586,7 @@ def test_get_prot_structure_dfs_missing_cif(tmp_path, monkeypatch): prot_dir.mkdir(parents=True, exist_ok=True) with 
pytest.raises(FileNotFoundError, match="No CIF file found"): - get_prot_structure_dfs("NOCIF") + get_monomer_structure_dfs("NOCIF") def test_get_prot_structure_dfs_missing_fasta(tmp_path, monkeypatch): @@ -606,7 +606,7 @@ def test_get_prot_structure_dfs_missing_fasta(tmp_path, monkeypatch): cif.write_text("data_test\nloop_\n_atom_site.id\nN\n") with pytest.raises(FileNotFoundError, match="No FASTA file found"): - get_prot_structure_dfs("NOFASTA") + get_monomer_structure_dfs("NOFASTA") def test_get_prot_structure_dfs_missing_json(tmp_path, monkeypatch): @@ -630,7 +630,7 @@ def test_get_prot_structure_dfs_missing_json(tmp_path, monkeypatch): fasta.write_text(">alpha|NOJSON\nAAAA\n") with pytest.raises(FileNotFoundError, match="No JSON files"): - get_prot_structure_dfs("NOJSON") + get_monomer_structure_dfs("NOJSON") def test_extend_metadata_csv_empty_existing(tmp_path): From 355e4b98389da00c234bd22456b4e9d325ffdb63 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 14 Feb 2026 11:54:31 +0100 Subject: [PATCH 106/240] fix: add tests and fix bugs --- backend/main/views_settings.py | 50 +++- .../alphafold_protein_structure_load.py | 235 ++++++++------- backend/protzilla/methods/importing.py | 10 +- .../test_alphafold_protein_structure_load.py | 280 ++++++++++++++++-- 4 files changed, 426 insertions(+), 149 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 52d288cf8..8c3669373 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -232,10 +232,11 @@ def save_ptm_settings(request, default_file_stem: str = DEFAULT_PTM_SETTINGS_FIL {"success": True, "message": "Settings successfully saved."}, status=200 ) + # <--- helper functions for monomer and multimer structure prediction ---> def check_and_copy_files_to_directory(file_names: list, target_dir: str): if target_dir.exists(): - return False, "Entry ID is not unique." + return False, "Entry ID is not unique." 
else: target_dir.mkdir(parents=True, exist_ok=True) @@ -245,10 +246,11 @@ def check_and_copy_files_to_directory(file_names: list, target_dir: str): if not success: return False, message return True, "All files successfully uploaded" - -def get_metadata_df(csv_file_path: str, expected_columns: list[str]) -> pandas.DataFrame: +def get_metadata_df( + csv_file_path: str, expected_columns: list[str] +) -> pandas.DataFrame: if csv_file_path.exists(): df = pandas.read_csv(csv_file_path, usecols=lambda c: c in expected_columns) else: @@ -314,19 +316,18 @@ def delete_structure(dir_path: str, csv_file_path: str, request): ) - # <--- Monomer Structure Predictions ---> def get_monomer_structure(request): metadata_csv = AF_MONOMER_METADATA_CSV_PATH expected_columns = [ - "entry_id", - "uniprot_accession", - "model_created_date", - "gene", - "model_used", - ] + "entry_id", + "uniprot_accession", + "model_created_date", + "gene", + "model_used", + ] df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) @@ -359,7 +360,9 @@ def upload_monomer_structure(request): target_dir = ALPHAFOLD_MONOMER_PATH / entry_id.upper() file_names = [cif_file, confidence, pae, fasta_file] - success, message = check_and_copy_files_to_directory(file_names=file_names, target_dir=target_dir) + success, message = check_and_copy_files_to_directory( + file_names=file_names, target_dir=target_dir + ) if not success: return JsonResponse( {"success": False, "message": message}, @@ -378,7 +381,9 @@ def upload_monomer_structure(request): "model_used", ] - df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) + df = get_metadata_df( + csv_file_path=metadata_csv, expected_columns=expected_columns + ) now_utc = datetime.now(timezone.utc) formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") @@ -412,11 +417,16 @@ def upload_monomer_structure(request): def delete_monomer_structure(request): - return delete_structure(dir_path=ALPHAFOLD_MONOMER_PATH, 
csv_file_path=AF_MONOMER_METADATA_CSV_PATH, request=request) + return delete_structure( + dir_path=ALPHAFOLD_MONOMER_PATH, + csv_file_path=AF_MONOMER_METADATA_CSV_PATH, + request=request, + ) # <--- Multimer Structure Predictions ---> + def get_multimer_structure(request): metadata_csv = AF_MULTIMER_METADATA_CSV_PATH expected_columns = [ @@ -454,7 +464,9 @@ def upload_multimer_structure(request): target_dir = ALPHAFOLD_MULTIMER_PATH / entry_id.upper() file_names = [fasta_file, cif_file, confidence_file, full_data_file] - success, message = check_and_copy_files_to_directory(file_names=file_names, target_dir=target_dir) + success, message = check_and_copy_files_to_directory( + file_names=file_names, target_dir=target_dir + ) if not success: return JsonResponse( {"success": False, "message": message}, @@ -470,7 +482,9 @@ def upload_multimer_structure(request): "model_used", ] - df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) + df = get_metadata_df( + csv_file_path=metadata_csv, expected_columns=expected_columns + ) now_utc = datetime.now(timezone.utc) formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") @@ -503,7 +517,11 @@ def upload_multimer_structure(request): def delete_multimer_structure(request): - return delete_structure(dir_path=ALPHAFOLD_MULTIMER_PATH, csv_file_path=AF_MULTIMER_METADATA_CSV_PATH, request=request) + return delete_structure( + dir_path=ALPHAFOLD_MULTIMER_PATH, + csv_file_path=AF_MULTIMER_METADATA_CSV_PATH, + request=request, + ) # <--- Databases ---> diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index d0bec6c12..95de95e7d 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -85,7 +85,7 @@ def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str return f">alpha|{header}\n{joined}\n" -def 
read_alphafold_mmcif(path: str) -> pd.DataFrame: +def read_alphafold_mmcif(path: Path) -> pd.DataFrame: """ Parse an AlphaFold mmCIF (Macromolecular Crystallographic Information File) file. @@ -95,15 +95,14 @@ def read_alphafold_mmcif(path: str) -> pd.DataFrame: :raises IsADirectoryError: If the path points to a directory instead of a file :raises ValueError: If no CIF blocks are found in the file """ - p = Path(path) - if not p.exists(): - raise FileNotFoundError(f"File not found: {p}") - if p.is_dir(): - raise IsADirectoryError(f"Expected a file path, got a directory: {p}") + if not path.exists(): + raise FileNotFoundError(f"File not found: {path}") + if path.is_dir(): + raise IsADirectoryError(f"Expected a file path, got a directory: {path}") - doc = gemmi.cif.read_file(str(p)) + doc = gemmi.cif.read_file(str(path)) if len(doc) == 0: - raise ValueError(f"No CIF blocks found in file: {p}") + raise ValueError(f"No CIF blocks found in file: {path}") block = doc.sole_block() @@ -131,7 +130,7 @@ def read_alphafold_mmcif(path: str) -> pd.DataFrame: def get_correct_af_directories( entry_id: str, directory_name: Path, persist_upload: bool -) -> list[Path, Path]: +) -> tuple[Path | None, Path]: target_dir = directory_name / entry_id.upper() temp_dir = None @@ -160,9 +159,7 @@ def extend_metadata_csv( messages.append(dict(level=logging.WARNING, msg=msg)) exsisting_metadata_df = exsisting_metadata_df[~mask] - combined = pd.concat( - [exsisting_metadata_df, metadata_df], ignore_index=True - ) + combined = pd.concat([exsisting_metadata_df, metadata_df], ignore_index=True) combined.to_csv(metadata_csv, index=False) except Exception: msg = f'Failed to write AlphaFold metadata CSV to "{metadata_csv}".' 
@@ -170,17 +167,16 @@ def extend_metadata_csv( messages.append(dict(level=logging.ERROR, msg=msg)) -def get_amino_acid_sequence_df( - entry_id: str, work_dir: Path, fasta_dest: Path, messages: list -) -> pd.DataFrame: +def get_amino_acid_sequence_df(fasta_dest: Path, messages: list) -> pd.DataFrame: try: fasta_dict = fasta_import(str(fasta_dest)) amino_acid_sequence_df = fasta_dict["fasta_df"] + return amino_acid_sequence_df except Exception: msg = "Failed to create sequence dataframe" logger.exception(msg) messages.append(dict(level=logging.ERROR, msg=msg)) - return amino_acid_sequence_df + return pd.DataFrame() def handle_alphafold_files( @@ -207,15 +203,14 @@ def handle_alphafold_files( :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, sequence data or None values for failed loads and messages such as warnings """ - cif_df = None - pae_df = None - plddt_df = None - amino_acid_sequence_df = None + cif_df = pd.DataFrame() + pae_df = pd.DataFrame() + plddt_df = pd.DataFrame() + amino_acid_sequence_df = pd.DataFrame() messages = [] - downloaded: dict[str, str] = {} temp_dir, work_dir = get_correct_af_directories( - entry_id=uniprot, + entry_id=entry_id, directory_name=paths.ALPHAFOLD_MONOMER_PATH, persist_upload=persist_upload, ) @@ -225,7 +220,7 @@ def handle_alphafold_files( paths.ALPHAFOLD_MONOMER_PATH.mkdir(parents=True, exist_ok=True) existing_metadata_df = get_monomer_metadata_df() extend_metadata_csv( - entry_id=uniprot, + entry_id=entry_id, metadata_csv=paths.AF_MONOMER_METADATA_CSV_PATH, exsisting_metadata_df=existing_metadata_df, metadata_df=metadata_df, @@ -239,7 +234,6 @@ def handle_alphafold_files( dest = work_dir / fname saved = download_file_from_url(urlval, dest) if saved: - downloaded[key] = str(saved) try: if key == "cifUrl": cif_df = read_alphafold_mmcif(saved) @@ -251,6 +245,7 @@ def handle_alphafold_files( msg = f'Failed to load "{key}" into dataframe' logger.exception(msg) messages.append(dict(level=logging.ERROR, 
msg=msg)) + fasta_dest: Path | None = None try: sequence = to_fasta(seq=seq, header=uniprot) fasta_dest = work_dir / f"{entry_id.upper()}.fasta" @@ -261,12 +256,11 @@ def handle_alphafold_files( msg = f'Failed to write FASTA file "{fasta_dest}"' logger.exception(msg) messages.append(dict(level=logging.ERROR, msg=msg)) - amino_acid_sequence_df = get_amino_acid_sequence_df( - entry_id=uniprot, - work_dir=work_dir, - fasta_dest=fasta_dest, - messages=messages, - ) + if fasta_dest is not None: + amino_acid_sequence_df = get_amino_acid_sequence_df( + fasta_dest=fasta_dest, + messages=messages, + ) finally: if temp_dir is not None: @@ -325,6 +319,10 @@ def fetch_alphafold_protein_structure( } seq_tmp = r.get("sequence") + if not isinstance(seq_tmp, str) or not seq_tmp.strip(): + raise RuntimeError( + f"AlphaFold payload for {uniprot_id} does not contain a valid protein sequence." + ) files_urls: dict[str, Any] = {} @@ -380,7 +378,9 @@ def get_all_available_entry_ids_of_multimer_metadata() -> list[str]: return df["entry_id"].tolist() -def check_and_get_metadata_df(entry_id: str, all_metadata_df: pd.DataFrame, csv_file: Path) -> pd.DataFrame: +def check_and_get_metadata_df( + entry_id: str, all_metadata_df: pd.DataFrame, csv_file: Path +) -> pd.DataFrame: metadata_df = all_metadata_df[all_metadata_df["entry_id"] == entry_id] if metadata_df.empty: msg = f"No metadata for Entry ID '{entry_id}' in {csv_file}" @@ -396,7 +396,9 @@ def check_dir(entry_id: str, dir: Path): raise FileNotFoundError(msg) -def get_cif_df_from_disk(entry_id: str, structure_dir: Path, messages: list) -> pd.DataFrame: +def get_cif_df_from_disk( + entry_id: str, structure_dir: Path, messages: list +) -> pd.DataFrame: cif_files = list(structure_dir.glob("*.cif")) if not cif_files: msg = f"No CIF file found in {structure_dir} for entry '{entry_id}'" @@ -410,15 +412,17 @@ def get_cif_df_from_disk(entry_id: str, structure_dir: Path, messages: list) -> cif_file = cif_files[0] try: - cif_df = 
read_alphafold_mmcif(str(cif_file)) + cif_df = read_alphafold_mmcif(cif_file) return cif_df except Exception as e: msg = f"Failed to read CIF file '{cif_file}': {e}" logger.exception(msg) raise RuntimeError(msg) from e - -def get_amino_acid_sequences_df_from_disk(entry_id: str, structure_dir: Path) -> pd.DataFrame: + +def get_amino_acid_sequences_df_from_disk( + entry_id: str, structure_dir: Path +) -> pd.DataFrame: fasta_files = list(structure_dir.glob("*.fasta")) + list(structure_dir.glob("*.fa")) if not fasta_files: msg = f"No FASTA file found in {structure_dir} for entry '{entry_id}'" @@ -449,7 +453,7 @@ def get_json_files_in_dir(entry_id: str, structure_dir: Path) -> list: return json_files -def check_success_of_get_df(entry_id:str, df_dict: dict, messages: list) -> None: +def check_success_of_get_df(entry_id: str, df_dict: dict, messages: list) -> None: if not any(df.empty for df in df_dict.values()): success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" logger.info(success_msg) @@ -470,21 +474,31 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: messages: list[dict[str, str | int]] = [] all_metadata_df = get_monomer_metadata_df() - metadata_df = check_and_get_metadata_df(entry_id=entry_id, all_metadata_df=all_metadata_df, csv_file=paths.AF_MONOMER_METADATA_CSV_PATH) + metadata_df = check_and_get_metadata_df( + entry_id=entry_id, + all_metadata_df=all_metadata_df, + csv_file=paths.AF_MONOMER_METADATA_CSV_PATH, + ) structure_dir = paths.ALPHAFOLD_MONOMER_PATH / entry_id.upper() check_dir(entry_id=entry_id, dir=structure_dir) # get cif file - cif_df = get_cif_df_from_disk(entry_id=entry_id, structure_dir=structure_dir, messages=messages) + cif_df = get_cif_df_from_disk( + entry_id=entry_id, structure_dir=structure_dir, messages=messages + ) # get fasta file - amino_acid_sequence_df = get_amino_acid_sequences_df_from_disk(entry_id=entry_id, structure_dir=structure_dir) + amino_acid_sequence_df = 
get_amino_acid_sequences_df_from_disk( + entry_id=entry_id, structure_dir=structure_dir + ) # get jsons (PAE and pLDDT) json_files = list(structure_dir.glob("*.json")) if not json_files: - msg = f"No JSON files (PAE/pLDDT) found in {structure_dir} for entry '{entry_id}'" + msg = ( + f"No JSON files (PAE/pLDDT) found in {structure_dir} for entry '{entry_id}'" + ) logger.error(msg) raise FileNotFoundError(msg) @@ -527,7 +541,7 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: "plddt_df": plddt_df, "amino_acid_sequence_df": amino_acid_sequence_df, } - check_success_of_get_df(entry_id=entry_id, df_dict=df_dict) + check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) df_dict["messages"] = messages return df_dict @@ -542,16 +556,24 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: messages: list[dict[str, str | int]] = [] all_metadata_df = get_multimer_metadata_df() - metadata_df = check_and_get_metadata_df(entry_id=entry_id, all_metadata_df=all_metadata_df, csv_file=paths.AF_MULTIMER_METADATA_CSV_PATH) + metadata_df = check_and_get_metadata_df( + entry_id=entry_id, + all_metadata_df=all_metadata_df, + csv_file=paths.AF_MULTIMER_METADATA_CSV_PATH, + ) structure_dir = paths.ALPHAFOLD_MULTIMER_PATH / entry_id.upper() check_dir(entry_id=entry_id, dir=structure_dir) # get cif file - cif_df = get_cif_df_from_disk(entry_id=entry_id, structure_dir=structure_dir, messages=messages) + cif_df = get_cif_df_from_disk( + entry_id=entry_id, structure_dir=structure_dir, messages=messages + ) # get fasta file - amino_acid_sequences_df = get_amino_acid_sequences_df_from_disk(entry_id=entry_id, structure_dir=structure_dir) + amino_acid_sequences_df = get_amino_acid_sequences_df_from_disk( + entry_id=entry_id, structure_dir=structure_dir + ) # get jsons (PAE and pLDDT) json_files = get_json_files_in_dir(entry_id=entry_id, structure_dir=structure_dir) @@ -570,16 +592,10 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, 
Any]: json1 = pd.json_normalize(obj1) json2 = pd.json_normalize(obj2) # iptm stands for interface predicted TM score - if ( - "chain_iptm" in json1.columns - and "pae" in json2.columns - ): + if "chain_iptm" in json1.columns and "pae" in json2.columns: confidence_df = json1 full_data_df = json2 - elif ( - "chain_iptm" in json2.columns - and "pae" in json1.columns - ): + elif "chain_iptm" in json2.columns and "pae" in json1.columns: confidence_df = json2 full_data_df = json1 else: @@ -599,7 +615,7 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: "amino_acid_sequences_df": amino_acid_sequences_df, "cif_df": cif_df, "confidence_df": confidence_df, - "full_data_df": full_data_df, + "full_data_df": full_data_df, } check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) @@ -636,60 +652,69 @@ def upload_multimer_prediction( "model_used": model_used, } - metadata_df = pd.DataFrame([data]) - exsisting_metadata_df = get_multimer_metadata_df() - extend_metadata_csv( - entry_id=entry_id, - metadata_csv=paths.AF_MULTIMER_METADATA_CSV_PATH, - exsisting_metadata_df=exsisting_metadata_df, - metadata_df=metadata_df, - messages=messages, - ) + try: + metadata_df = pd.DataFrame([data]) + if persist_upload: + exsisting_metadata_df = get_multimer_metadata_df() + extend_metadata_csv( + entry_id=entry_id, + metadata_csv=paths.AF_MULTIMER_METADATA_CSV_PATH, + exsisting_metadata_df=exsisting_metadata_df, + metadata_df=metadata_df, + messages=messages, + ) + for file_name in [ + amino_acid_sequences, + cif_file, + confidence_file, + full_data_file, + ]: + success, msg = copy_file_to_directory(file_name, work_dir) + if not success: + logger.error(msg) + messages.append(dict(level=logging.ERROR, msg=msg)) + + fasta_dict = fasta_import(str(amino_acid_sequences)) + amino_acid_sequence_df = fasta_dict["fasta_df"] - upload_dir = paths.ALPHAFOLD_MULTIMER_PATH / entry_id.upper() - if not upload_dir.exists(): - upload_dir.mkdir(parents=True, exist_ok=True) + 
confidence_df = pd.read_json(confidence_file) - if persist_upload: - for file_name in [ - amino_acid_sequences, - cif_file, - confidence_file, - full_data_file, - ]: - success, msg = copy_file_to_directory(file_name, upload_dir) - if not success: - logger.error(msg) - messages.append(dict(level=logging.ERROR, msg=msg)) - - fasta_dict = fasta_import(str(amino_acid_sequences)) - amino_acid_sequence_df = fasta_dict["fasta_df"] - - confidence_df = pd.read_json(confidence_file) - - # full_data json has arrays of unequal lengths so we need to normalize - with open(full_data_file, "r") as f: - full_data = json.load(f) - if isinstance(full_data, dict): - full_data_df = pd.json_normalize(full_data) - - cif_df = read_alphafold_mmcif(cif_file) + # full_data json has arrays of unequal lengths so we need to normalize + full_data_df = pd.DataFrame() + with open(full_data_file, "r") as f: + full_data = json.load(f) + if isinstance(full_data, dict): + full_data_df = pd.json_normalize(full_data) + else: + messages.append( + { + "level": logging.WARNING, + "msg": "Could not load full data Json", + } + ) - df_dict = { - "metadata_df": metadata_df, - "cif_df": cif_df, - "confidence_df": confidence_df, - "full_data_df": full_data_df, - "amino_acid_sequences_df": amino_acid_sequence_df, - } + cif_df = read_alphafold_mmcif(cif_file) + + df_dict = { + "metadata_df": metadata_df, + "cif_df": cif_df, + "confidence_df": confidence_df, + "full_data_df": full_data_df, + "amino_acid_sequences_df": amino_acid_sequence_df, + } + + if not any(df.empty for df in df_dict.values()): + success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" + logger.info(success_msg) + messages.append(dict(level=logging.INFO, msg=success_msg)) + else: + message = f"Could not load AlphaFold data for entry '{entry_id}'" + logger.warning(message) + messages.append(dict(level=logging.WARNING, msg=message)) + df_dict["messages"] = messages + + finally: + if temp_dir is not None: + shutil.rmtree(temp_dir, 
ignore_errors=True) - if not any(df.empty for df in df_dict.values()): - success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" - logger.info(success_msg) - messages.append(dict(level=logging.INFO, msg=success_msg)) - else: - message = f"Could not load AlphaFold data for entry '{entry_id}'" - logger.warning(message) - messages.append(dict(level=logging.WARNING, msg=message)) - df_dict["messages"] = messages return df_dict diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 410f67a2d..e163140d7 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -486,9 +486,7 @@ def create_form(self): class ImportMonomerStructurePredictionFromDisk(ImportingStep): display_name = "Monomer Structure Prediction Import from Disk" operation = "Monomer Structure Import" - method_description = ( - "Load an already uploaded monomer structure prediction from disk into current run" - ) + method_description = "Load an already uploaded monomer structure prediction from disk into current run" output_keys = [ "metadata_df", @@ -581,13 +579,11 @@ def create_form(self): class ImportMultimerStructurePredictionFromDisk(ImportingStep): display_name = "Multimer Structure Prediction Import from Disk" operation = "Multimer Structure Import" - method_description = ( - "Load an already uploaded multimer structure prediction from disk into current run" - ) + method_description = "Load an already uploaded multimer structure prediction from disk into current run" output_keys = [ "metadata_df", - "amino_acid_sequences_df", + "amino_acid_sequences_df", "cif_df", "confidence_df", "full_data_df", diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index e6c9fe91e..ede5be51f 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ 
b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -4,7 +4,6 @@ import logging import shutil from pathlib import Path -import tempfile from backend.protzilla.importing.alphafold_protein_structure_load import ( @@ -12,7 +11,9 @@ to_fasta, read_alphafold_mmcif, get_all_available_entry_ids_of_monomer_metadata, + get_all_available_entry_ids_of_multimer_metadata, get_monomer_structure_dfs, + get_multimer_structure_dfs, get_monomer_metadata_df, get_multimer_metadata_df, get_correct_af_directories, @@ -20,6 +21,12 @@ get_amino_acid_sequence_df, handle_alphafold_files, upload_multimer_prediction, + check_and_get_metadata_df, + check_dir, + get_json_files_in_dir, + get_cif_df_from_disk, + get_amino_acid_sequences_df_from_disk, + check_success_of_get_df, ) from backend.protzilla.constants import paths @@ -49,19 +56,19 @@ def test_to_fasta_whitespace(): def test_read_alphafold_mmcif_file_not_found(tmp_path): missing = tmp_path / "unexisting.cif" with pytest.raises(FileNotFoundError): - read_alphafold_mmcif(str(missing)) + read_alphafold_mmcif(missing) def test_read_alphafold_mmcif_is_directory(tmp_path): with pytest.raises(IsADirectoryError): - read_alphafold_mmcif(str(tmp_path)) + read_alphafold_mmcif(tmp_path) def test_read_alphafold_mmcif_empty(tmp_path): cif = tmp_path / "empty.cif" cif.write_text("") with pytest.raises(ValueError, match="No CIF blocks found"): - read_alphafold_mmcif(str(cif)) + read_alphafold_mmcif(cif) def test_read_alphafold_mmcif_atom_site_not_found(tmp_path): @@ -72,7 +79,7 @@ def test_read_alphafold_mmcif_atom_site_not_found(tmp_path): _entry.id test """ ) - df = read_alphafold_mmcif(str(cif)) + df = read_alphafold_mmcif(cif) assert isinstance(df, pd.DataFrame) assert df.empty @@ -91,7 +98,7 @@ def test_read_alphafold_mmcif_valid_atom_site(tmp_path): """ ) - df = read_alphafold_mmcif(str(cif)) + df = read_alphafold_mmcif(cif) assert isinstance(df, pd.DataFrame) assert list(df.columns) == [ @@ -111,7 +118,10 @@ def 
test_fetch_alphafold_protein_structure_wrong_uniprot_id(): def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path / "alphafold_monomer") + monkeypatch.setattr( + paths, "AF_MONOMER_METADATA_CSV_PATH", tmp_path / "alphafold_monomer_metadata.csv" + ) out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) assert out.keys() == { @@ -125,7 +135,10 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): def test_fetch_alphafold_monomer_metadata(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path / "alphafold_monomer") + monkeypatch.setattr( + paths, "AF_MONOMER_METADATA_CSV_PATH", tmp_path / "alphafold_monomer_metadata.csv" + ) out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) assert isinstance(out["metadata_df"], pd.DataFrame) @@ -137,10 +150,15 @@ def test_fetch_alphafold_monomer_metadata(tmp_path, monkeypatch): def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path / "alphafold_monomer") + monkeypatch.setattr( + paths, "AF_MONOMER_METADATA_CSV_PATH", tmp_path / "alphafold_monomer_metadata.csv" + ) + fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) - target_dir = tmp_path / "Q8WP00" + target_dir = (tmp_path / "alphafold_monomer") / "Q8WP00" + assert target_dir.exists() assert target_dir.is_dir() @@ -157,7 +175,11 @@ def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "EXTERNAL_DATA_PATH", tmp_path) + monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path / "alphafold_monomer") + monkeypatch.setattr( + paths, "AF_MONOMER_METADATA_CSV_PATH", 
tmp_path / "alphafold_monomer_metadata.csv" + ) + out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) cif_df = out["cif_df"] @@ -321,7 +343,7 @@ def test_get_monomer_and_multimer_metadata_df_create(tmp_path, monkeypatch): assert isinstance(multi_df, pd.DataFrame) assert list(multi_df.columns) == [ "entry_id", - "protein_ids", + "uniprot_ids", "model_created_date", "model_used", ] @@ -375,25 +397,23 @@ def test_get_amino_acid_sequence_df_and_handle_files(tmp_path, monkeypatch): fasta = tmp_path / "P.fasta" fasta.write_text(">alpha|P\nTESTSEQ\n") messages = [] - seq_df = get_amino_acid_sequence_df("P", tmp_path, fasta, messages) + seq_df = get_amino_acid_sequence_df(fasta, messages) assert isinstance(seq_df, pd.DataFrame) assert not seq_df.empty # test handle_alphafold_files with no remote files (should still create fasta) - monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) metadata_df = pd.DataFrame([{"entry_id": "P", "uniprot_accession": "P"}]) out = handle_alphafold_files( {}, "P", "TESTSEQ", metadata_df, "P", persist_upload=False ) assert "amino_acid_sequence_df" in out - assert out["cif_df"] is None - assert out["pae_df"] is None - assert out["plddt_df"] is None + assert isinstance(out["cif_df"], pd.DataFrame) and out["cif_df"].empty + assert isinstance(out["pae_df"], pd.DataFrame) and out["pae_df"].empty + assert isinstance(out["plddt_df"], pd.DataFrame) and out["plddt_df"].empty assert isinstance(out["amino_acid_sequence_df"], pd.DataFrame) def test_upload_multimer_prediction_basic(tmp_path, monkeypatch): - monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) monkeypatch.setattr(paths, "ALPHAFOLD_MULTIMER_PATH", tmp_path) # prepare files @@ -440,7 +460,7 @@ def _copy(src, dest_dir): # check metadata contents mdf = out["metadata_df"] assert mdf.iloc[0]["entry_id"] == "M1" - assert mdf.iloc[0]["protein_ids"] == ["X"] + assert mdf.iloc[0]["uniprot_ids"] == ["X"] assert mdf.iloc[0]["model_used"] == "m" # cif contents @@ 
-512,7 +532,7 @@ def test_get_multimer_metadata_df_existing_csv(tmp_path, monkeypatch): [ { "entry_id": "M1", - "protein_ids": "P1,P2", + "uniprot_ids": "P1,P2", "model_created_date": "2025-01-01", "model_used": "m1", } @@ -570,7 +590,7 @@ def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): assert isinstance(out["cif_df"], pd.DataFrame) # directory should still exist (created for the entry) upload_dir = tmp_path / "M2" - assert upload_dir.exists() or not upload_dir.exists() + assert not upload_dir.exists() def test_get_prot_structure_dfs_missing_cif(tmp_path, monkeypatch): @@ -645,3 +665,221 @@ def test_extend_metadata_csv_empty_existing(tmp_path): out = pd.read_csv(csv_path, dtype=str) assert out.iloc[0]["entry_id"] == "Z" + + + +def test_get_all_available_entry_ids_of_multimer_metadata_empty(tmp_path, monkeypatch): + metadata_csv = tmp_path / "alphafold_multimer_metadata.csv" + monkeypatch.setattr(paths, "AF_MULTIMER_METADATA_CSV_PATH", metadata_csv) + + assert get_all_available_entry_ids_of_multimer_metadata() == [] + assert metadata_csv.exists() + + df = pd.read_csv(metadata_csv, dtype=str) + assert list(df.columns) == [ + "entry_id", + "uniprot_ids", + "model_created_date", + "model_used", + ] + assert len(df) == 0 + + +def test_get_all_available_entry_ids_of_multimer_metadata_nonempty(tmp_path, monkeypatch): + metadata_csv = tmp_path / "alphafold_multimer_metadata.csv" + monkeypatch.setattr(paths, "AF_MULTIMER_METADATA_CSV_PATH", metadata_csv) + + df = pd.DataFrame( + [ + { + "entry_id": "M1", + "uniprot_ids": "P1,P2", + "model_created_date": "2025-01-01T00:00:00Z", + "model_used": "test", + } + ] + ) + df.to_csv(metadata_csv, index=False) + + assert get_all_available_entry_ids_of_multimer_metadata() == ["M1"] + + + +def test_check_and_get_metadata_df_success(tmp_path): + all_df = pd.DataFrame( + [ + {"entry_id": "A", "x": "1"}, + {"entry_id": "B", "x": "2"}, + ] + ) + out = check_and_get_metadata_df("B", all_df, tmp_path / "meta.csv") + 
assert isinstance(out, pd.DataFrame) + assert len(out) == 1 + assert out.iloc[0]["entry_id"] == "B" + + +def test_check_dir_missing_raises(tmp_path): + d = tmp_path / "MISSING" + with pytest.raises(FileNotFoundError, match="AlphaFold data directory not found"): + check_dir("MISSING", d) + + + +def test_get_json_files_in_dir_success(tmp_path): + d = tmp_path / "D" + d.mkdir() + (d / "a.json").write_text('{"x": 1}') + (d / "b.json").write_text('{"y": 2}') + files = get_json_files_in_dir("E1", d) + assert len(files) == 2 + assert all(f.suffix == ".json" for f in files) + + +def test_get_json_files_in_dir_missing_raises(tmp_path): + d = tmp_path / "D" + d.mkdir() + with pytest.raises(FileNotFoundError, match="No JSON files found"): + get_json_files_in_dir("E1", d) + + +def test_get_cif_df_from_disk_multiple_cif_warns(tmp_path): + d = tmp_path / "E1" + d.mkdir() + + cif1 = d / "a.cif" + cif2 = d / "b.cif" + cif1.write_text( + """ +data_test +loop_ +_atom_site.id +_atom_site.type_symbol +N N +""" + ) + cif2.write_text( + """ +data_test +loop_ +_atom_site.id +_atom_site.type_symbol +CA C +""" + ) + + messages = [] + df = get_cif_df_from_disk("E1", d, messages) + assert isinstance(df, pd.DataFrame) + assert not df.empty + assert any(m.get("level") == logging.WARNING for m in messages) + + + +def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "ALPHAFOLD_MULTIMER_PATH", tmp_path / "multimer") + monkeypatch.setattr( + paths, + "AF_MULTIMER_METADATA_CSV_PATH", + tmp_path / "alphafold_multimer_metadata.csv", + ) + + paths.ALPHAFOLD_MULTIMER_PATH.mkdir(parents=True, exist_ok=True) + + md = pd.DataFrame( + [ + { + "entry_id": "M1", + "uniprot_ids": "P1,P2", + "model_created_date": "2025-01-01T00:00:00Z", + "model_used": "Multimer", + } + ] + ) + md.to_csv(paths.AF_MULTIMER_METADATA_CSV_PATH, index=False) + + prot_dir = paths.ALPHAFOLD_MULTIMER_PATH / "M1" + prot_dir.mkdir(parents=True, exist_ok=True) + + cif = prot_dir / "m1.cif" + 
cif.write_text( + """ +data_test +loop_ +_atom_site.id +_atom_site.type_symbol +N N +""" + ) + + fasta = prot_dir / "m1.fasta" + fasta.write_text(">alpha|M1\nAAAA\n") + + confidence = prot_dir / "confidence.json" + full_data = prot_dir / "full.json" + confidence.write_text(json.dumps({"chain_iptm": [0.75]})) + full_data.write_text(json.dumps({"pae": [[0.1, 0.2], [0.3, 0.4]]})) + + out = get_multimer_structure_dfs("M1") + assert isinstance(out["metadata_df"], pd.DataFrame) + assert isinstance(out["cif_df"], pd.DataFrame) + assert isinstance(out["amino_acid_sequences_df"], pd.DataFrame) + assert isinstance(out["confidence_df"], pd.DataFrame) + assert isinstance(out["full_data_df"], pd.DataFrame) + + assert "chain_iptm" in out["confidence_df"].columns + assert "pae" in out["full_data_df"].columns + + assert any(m.get("level") == logging.INFO for m in out["messages"]) or any( + "Successfully loaded" in str(m.get("msg", "")) for m in out["messages"] + ) + + + + +def test_get_multimer_structure_dfs_json_fallback_warns(tmp_path, monkeypatch): + monkeypatch.setattr(paths, "ALPHAFOLD_MULTIMER_PATH", tmp_path / "multimer") + monkeypatch.setattr( + paths, + "AF_MULTIMER_METADATA_CSV_PATH", + tmp_path / "alphafold_multimer_metadata.csv", + ) + + paths.ALPHAFOLD_MULTIMER_PATH.mkdir(parents=True, exist_ok=True) + + md = pd.DataFrame( + [ + { + "entry_id": "M2", + "uniprot_ids": "P1,P2", + "model_created_date": "2025-01-01T00:00:00Z", + "model_used": "Multimer", + } + ] + ) + md.to_csv(paths.AF_MULTIMER_METADATA_CSV_PATH, index=False) + + prot_dir = paths.ALPHAFOLD_MULTIMER_PATH / "M2" + prot_dir.mkdir(parents=True, exist_ok=True) + + cif = prot_dir / "m2.cif" + cif.write_text( + """ +data_test +loop_ +_atom_site.id +_atom_site.type_symbol +N N +""" + ) + + fasta = prot_dir / "m2.fasta" + fasta.write_text(">alpha|M2\nAAAA\n") + + j1 = prot_dir / "j1.json" + j2 = prot_dir / "j2.json" + j1.write_text(json.dumps({"something": 1})) + j2.write_text(json.dumps({"other": 2})) + + out = 
get_multimer_structure_dfs("M2") + assert any(m.get("level") == logging.WARNING for m in out["messages"]) + assert any("Could not detect confidence scores" in str(m.get("msg", "")) for m in out["messages"]) From 4f3cb9b8aa42dc41ab1ef5e0bfbdb48bc500e7d3 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 14 Feb 2026 11:59:00 +0100 Subject: [PATCH 107/240] fix: format with black --- .../test_alphafold_protein_structure_load.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index ede5be51f..5f11baedf 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -120,7 +120,9 @@ def test_fetch_alphafold_protein_structure_wrong_uniprot_id(): def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path / "alphafold_monomer") monkeypatch.setattr( - paths, "AF_MONOMER_METADATA_CSV_PATH", tmp_path / "alphafold_monomer_metadata.csv" + paths, + "AF_MONOMER_METADATA_CSV_PATH", + tmp_path / "alphafold_monomer_metadata.csv", ) out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) @@ -137,7 +139,9 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): def test_fetch_alphafold_monomer_metadata(tmp_path, monkeypatch): monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path / "alphafold_monomer") monkeypatch.setattr( - paths, "AF_MONOMER_METADATA_CSV_PATH", tmp_path / "alphafold_monomer_metadata.csv" + paths, + "AF_MONOMER_METADATA_CSV_PATH", + tmp_path / "alphafold_monomer_metadata.csv", ) out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) @@ -152,7 +156,9 @@ def test_fetch_alphafold_monomer_metadata(tmp_path, monkeypatch): def 
test_fetch_alphafold_files_exist(tmp_path, monkeypatch): monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path / "alphafold_monomer") monkeypatch.setattr( - paths, "AF_MONOMER_METADATA_CSV_PATH", tmp_path / "alphafold_monomer_metadata.csv" + paths, + "AF_MONOMER_METADATA_CSV_PATH", + tmp_path / "alphafold_monomer_metadata.csv", ) fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) @@ -177,7 +183,9 @@ def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path / "alphafold_monomer") monkeypatch.setattr( - paths, "AF_MONOMER_METADATA_CSV_PATH", tmp_path / "alphafold_monomer_metadata.csv" + paths, + "AF_MONOMER_METADATA_CSV_PATH", + tmp_path / "alphafold_monomer_metadata.csv", ) out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) @@ -667,7 +675,6 @@ def test_extend_metadata_csv_empty_existing(tmp_path): assert out.iloc[0]["entry_id"] == "Z" - def test_get_all_available_entry_ids_of_multimer_metadata_empty(tmp_path, monkeypatch): metadata_csv = tmp_path / "alphafold_multimer_metadata.csv" monkeypatch.setattr(paths, "AF_MULTIMER_METADATA_CSV_PATH", metadata_csv) @@ -685,7 +692,9 @@ def test_get_all_available_entry_ids_of_multimer_metadata_empty(tmp_path, monkey assert len(df) == 0 -def test_get_all_available_entry_ids_of_multimer_metadata_nonempty(tmp_path, monkeypatch): +def test_get_all_available_entry_ids_of_multimer_metadata_nonempty( + tmp_path, monkeypatch +): metadata_csv = tmp_path / "alphafold_multimer_metadata.csv" monkeypatch.setattr(paths, "AF_MULTIMER_METADATA_CSV_PATH", metadata_csv) @@ -704,7 +713,6 @@ def test_get_all_available_entry_ids_of_multimer_metadata_nonempty(tmp_path, mon assert get_all_available_entry_ids_of_multimer_metadata() == ["M1"] - def test_check_and_get_metadata_df_success(tmp_path): all_df = pd.DataFrame( [ @@ -724,7 +732,6 @@ def test_check_dir_missing_raises(tmp_path): 
check_dir("MISSING", d) - def test_get_json_files_in_dir_success(tmp_path): d = tmp_path / "D" d.mkdir() @@ -774,7 +781,6 @@ def test_get_cif_df_from_disk_multiple_cif_warns(tmp_path): assert any(m.get("level") == logging.WARNING for m in messages) - def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): monkeypatch.setattr(paths, "ALPHAFOLD_MULTIMER_PATH", tmp_path / "multimer") monkeypatch.setattr( @@ -834,8 +840,6 @@ def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): ) - - def test_get_multimer_structure_dfs_json_fallback_warns(tmp_path, monkeypatch): monkeypatch.setattr(paths, "ALPHAFOLD_MULTIMER_PATH", tmp_path / "multimer") monkeypatch.setattr( @@ -882,4 +886,7 @@ def test_get_multimer_structure_dfs_json_fallback_warns(tmp_path, monkeypatch): out = get_multimer_structure_dfs("M2") assert any(m.get("level") == logging.WARNING for m in out["messages"]) - assert any("Could not detect confidence scores" in str(m.get("msg", "")) for m in out["messages"]) + assert any( + "Could not detect confidence scores" in str(m.get("msg", "")) + for m in out["messages"] + ) From 4be236f39a5fa766702da573d0fa37850f31d9b3 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 14 Feb 2026 12:35:42 +0100 Subject: [PATCH 108/240] fix: add doc strings --- .../alphafold_protein_structure_load.py | 188 +++++++++++++++++- 1 file changed, 187 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 95de95e7d..a13f1e118 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -131,6 +131,24 @@ def read_alphafold_mmcif(path: Path) -> pd.DataFrame: def get_correct_af_directories( entry_id: str, directory_name: Path, persist_upload: bool ) -> tuple[Path | None, Path]: + """ + Determine and prepare the appropriate working directory for an entry. 
+ + If persist_upload is True, a persistent directory named after the + uppercased entry_id is created inside directory_name and used as the + working directory. If persist_upload is False, a temporary directory + is created and used instead. + + :param entry_id: Identifier of the entry. Used as the name of the + subdirectory (uppercased) when persist_upload is True. + :param directory_name: Base directory under which the persistent + entry-specific directory is created. + :param persist_upload: Whether to create and use a persistent + directory or a temporary one. + :return: A tuple containing the temporary directory (or None if not + created) and the working directory to use. + """ + target_dir = directory_name / entry_id.upper() temp_dir = None @@ -151,6 +169,30 @@ def extend_metadata_csv( metadata_df: pd.DataFrame, messages: list, ) -> None: + """ + Extend or update the AlphaFold metadata CSV with a new entry. + + If an entry with the given entry_id already exists in the provided + existing_metadata_df, it is removed and replaced with the data from + metadata_df. The combined DataFrame is then written to metadata_csv. + + Any warnings or errors encountered during processing are logged and + appended to the provided messages list. + + :param entry_id: The Entry ID used to identify and potentially + overwrite an existing row in the metadata. + :param metadata_csv: Path to the metadata CSV file that should be + updated. + :param exsisting_metadata_df: The current metadata DataFrame loaded + from the CSV file. + :param metadata_df: The new metadata DataFrame to append to the + existing data. + :param messages: A list used to collect structured log messages + with level and message content. + :return: None. + :raises Exception: Propagates unexpected errors that occur during + concatenation or writing to disk after logging them. 
+ """ try: mask = exsisting_metadata_df["entry_id"] == entry_id if mask.any(): @@ -168,6 +210,22 @@ def extend_metadata_csv( def get_amino_acid_sequence_df(fasta_dest: Path, messages: list) -> pd.DataFrame: + """ + Load a FASTA file and return its amino acid sequence DataFrame. + + The function uses fasta_import to parse the FASTA file and extracts + the DataFrame stored under the key "fasta_df". If an error occurs + during parsing, the exception is logged, a message is appended to + the provided messages list, and an empty DataFrame is returned. + + :param fasta_dest: Path to the FASTA file to be imported. + :param messages: A list used to collect structured log messages + with level and message content. + :return: A DataFrame containing the amino acid sequence information, + or an empty DataFrame if parsing fails. + :raises Exception: Propagates unexpected errors from fasta_import + after logging them. + """ try: fasta_dict = fasta_import(str(fasta_dest)) amino_acid_sequence_df = fasta_dict["fasta_df"] @@ -381,6 +439,22 @@ def get_all_available_entry_ids_of_multimer_metadata() -> list[str]: def check_and_get_metadata_df( entry_id: str, all_metadata_df: pd.DataFrame, csv_file: Path ) -> pd.DataFrame: + """ + Retrieve the metadata row for a given Entry ID from a DataFrame. + + The function filters all_metadata_df for rows matching the provided + entry_id. If no matching metadata is found, an error is logged and + a ValueError is raised. + + :param entry_id: The Entry ID used to filter the metadata DataFrame. + :param all_metadata_df: The complete metadata DataFrame containing + all entries. + :param csv_file: Path to the CSV file from which the metadata was + loaded. Used for error reporting. + :return: A DataFrame containing the metadata for the specified + Entry ID. + :raises ValueError: If no metadata for the given Entry ID is found. 
+ """ metadata_df = all_metadata_df[all_metadata_df["entry_id"] == entry_id] if metadata_df.empty: msg = f"No metadata for Entry ID '{entry_id}' in {csv_file}" @@ -389,7 +463,16 @@ def check_and_get_metadata_df( return metadata_df -def check_dir(entry_id: str, dir: Path): +def check_dir(entry_id: str, dir: Path) -> None: + """ + Validate that the given directory exists and is a directory. + + :param entry_id: The Entry ID used for error reporting. + :param dir: Path to the expected AlphaFold data directory. + :return: None. + :raises FileNotFoundError: If the directory does not exist or is + not a valid directory. + """ if not dir.exists() or not dir.is_dir(): msg = f"AlphaFold data directory not found for entry '{entry_id}': {dir}" logger.error(msg) @@ -399,6 +482,23 @@ def check_dir(entry_id: str, dir: Path): def get_cif_df_from_disk( entry_id: str, structure_dir: Path, messages: list ) -> pd.DataFrame: + """ + Load the AlphaFold mmCIF file from disk and return it as a DataFrame. + + The function searches the given structure directory for files with + the .cif extension. If multiple CIF files are found, only the first + one is read and a warning message is logged and appended to the + messages list. If no CIF file is found, a FileNotFoundError is raised. + + :param entry_id: The Entry ID used for error reporting. + :param structure_dir: Path to the directory containing the structure + files. + :param messages: A list used to collect structured log messages + with level and message content. + :return: A DataFrame containing the parsed mmCIF data. + :raises FileNotFoundError: If no CIF file is found in the directory. + :raises RuntimeError: If reading the CIF file fails. 
+ """ cif_files = list(structure_dir.glob("*.cif")) if not cif_files: msg = f"No CIF file found in {structure_dir} for entry '{entry_id}'" @@ -423,6 +523,21 @@ def get_cif_df_from_disk( def get_amino_acid_sequences_df_from_disk( entry_id: str, structure_dir: Path ) -> pd.DataFrame: + """ + Load the amino acid sequence DataFrame from a FASTA file on disk. + + The function searches the given structure directory for files with + the .fasta or .fa extension. The first matching file is parsed using + fasta_import and the DataFrame stored under the key "fasta_df" is + returned. + + :param entry_id: The Entry ID used for error reporting. + :param structure_dir: Path to the directory containing the FASTA file. + :return: A DataFrame containing the amino acid sequence data. + :raises FileNotFoundError: If no FASTA file is found in the directory. + :raises RuntimeError: If loading the FASTA file fails or if the + importer does not return a "fasta_df" entry. + """ fasta_files = list(structure_dir.glob("*.fasta")) + list(structure_dir.glob("*.fa")) if not fasta_files: msg = f"No FASTA file found in {structure_dir} for entry '{entry_id}'" @@ -445,6 +560,19 @@ def get_amino_acid_sequences_df_from_disk( def get_json_files_in_dir(entry_id: str, structure_dir: Path) -> list: + """ + Retrieve all JSON files from a given structure directory. + + The function searches the specified directory for files with the + .json extension and returns them as a list. If no JSON files are + found, an error is logged and a FileNotFoundError is raised. + + :param entry_id: The Entry ID used for error reporting. + :param structure_dir: Path to the directory to search for JSON files. + :return: A list of Path objects representing the JSON files found + in the directory. + :raises FileNotFoundError: If no JSON files are found in the directory. 
+ """ json_files = list(structure_dir.glob("*.json")) if not json_files: msg = f"No JSON files found in {structure_dir} for entry '{entry_id}'" @@ -454,6 +582,21 @@ def get_json_files_in_dir(entry_id: str, structure_dir: Path) -> list: def check_success_of_get_df(entry_id: str, df_dict: dict, messages: list) -> None: + """ + Evaluate whether all retrieved DataFrames contain data and log the result. + + The function checks if any DataFrame in df_dict is empty. If none are + empty, a success message is logged and appended to the messages list. + If at least one DataFrame is empty, a warning message is logged and + appended instead. + + :param entry_id: The Entry ID used for logging the result. + :param df_dict: A dictionary containing DataFrames that were loaded + for the given entry. + :param messages: A list used to collect structured log messages + with level and message content. + :return: None. + """ if not any(df.empty for df in df_dict.values()): success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" logger.info(success_msg) @@ -633,6 +776,49 @@ def upload_multimer_prediction( full_data_file: Path, persist_upload: bool, ) -> dict[str, Any]: + """ + Process an AlphaFold multimer prediction and return its parsed data as DataFrames. + + The function assembles metadata for the prediction, optionally persists both + metadata and input files to the configured multimer storage directory, and + parses the provided files into DataFrames: + - FASTA sequences via fasta_import, key "fasta_df". + - mmCIF structure via read_alphafold_mmcif. + - Confidence JSON via pandas.read_json. + - Full data JSON via json.load and pandas.json_normalize if it is a dict. + + The returned dictionary contains the DataFrames and a "messages" list with + structured log entries describing warnings or errors encountered. + + Temporary working directories created when persist_upload is False are + removed in a finally block. 
+ + :param entry_id: Unique identifier for the prediction entry. Used for + directory naming and metadata. + :param uniprot_ids: UniProt identifiers associated with the multimer + prediction. + :param model_used: Name or identifier of the AlphaFold model used to + create the prediction. + :param amino_acid_sequences: Path to the FASTA file containing the amino + acid sequences. + :param cif_file: Path to the mmCIF structure file. + :param confidence_file: Path to the confidence JSON file. + :param full_data_file: Path to the full data JSON file. If the JSON + content is a dict it is normalized into a single-row DataFrame. + Otherwise, an empty DataFrame is returned and a warning is recorded. + :param persist_upload: If True, persist metadata and copy input files into + the configured multimer directory. If False, use a temporary directory + and do not persist metadata. + :return: A dictionary containing: + - "metadata_df": DataFrame with entry metadata. + - "cif_df": DataFrame parsed from the mmCIF file. + - "confidence_df": DataFrame loaded from the confidence JSON. + - "full_data_df": Normalized DataFrame from the full data JSON or empty. + - "amino_acid_sequences_df": DataFrame from the FASTA import. + - "messages": List of structured log messages with level and msg. + :raises Exception: Any exception raised during parsing or file operations + will propagate after cleanup of any temporary directory. 
+ """ messages = [] From b279ce89cfedc0cf878458174e7cf37447d611c6 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 15 Feb 2026 13:26:48 +0100 Subject: [PATCH 109/240] feat: add vertical lines for crosslinker length and accepted deviation bounds with annotation in legend --- .../data_analysis/crosslinking_validation.py | 175 +++++++++++++----- backend/protzilla/data_preprocessing/plots.py | 67 +++---- backend/protzilla/methods/data_analysis.py | 4 +- 3 files changed, 165 insertions(+), 81 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 1531e3da1..94f3e221e 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,8 +1,10 @@ +import math + import pandas as pd import numpy as np -from isort.wrap_modes import vertical +import plotly.graph_objects as go + from plotly.graph_objects import Figure -from scipy.ndimage import standard_deviation from backend.protzilla.importing.alphafold_protein_structure_load import ( fetch_alphafold_protein_structure, @@ -244,33 +246,73 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: return dict(crosslinking_result_df=checked_crosslinks_df, messages={}) -def bar_plot_of_valid_crosslinks( +def _add_vertical_line_with_annotation_in_legend( + fig: Figure, dash: str, annotation: str, x_value: float, color: str = "blue" +) -> None: + """ + Adds a vertical line to a Plotly figure and includes a corresponding entry in the legend + without displaying an additional visible trace in the plot. + + :param fig: Plotly Figure object to which the vertical line and legend entry are added. + :param dash: Line style for the vertical line (e.g., "solid", "dash", "dot"). + :param annotation: Text to display in the legend corresponding to the vertical line. + :param x_value: X-coordinate at which to draw the vertical line. 
+ :param color: Color of the vertical line and legend entry (default is "blue"). + :return: None + """ + # add vertical line + fig.add_vline(x=x_value, line_color=color, line_dash=dash, line_width=2) + # add annotation of the line to the legend + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="lines", + name=annotation, + line=dict(color=color, width=2, dash=dash), + ) + ) + + +def diagrams_of_crosslinking_validation_data( crosslinking_df: pd.DataFrame, protein_to_validate: str, crosslinker_information: dict[str, list[float]], ) -> list[Figure]: """ - Creates a bar plot summarizing the number of valid and invalid cross-links - based on their distances in the AlphaFold structure compared to cross-linker - lengths and allowed deviations. + Creates for each crosslinker histogram plots summarizing the distribution of valid and invalid + cross-links based on the (AlphaFold-)predicted distances compared to crosslinker lengths and + allowed deviations. - :param crosslinking_df: DataFrame containing cross-linking data. + For each crosslinker, two histograms are generated: + - One covering the full distance range. + - One restricted to the range of mean ± 2 standard deviations of the predicted distances. + + Both histograms include vertical reference lines indicating the + crosslinker length and, if applicable, the upper and/or lower accepted deviation bounds. + + Additionally, a bar plot is created summarizing the total number of cross-links that match + or do not match the predicted structure across all analyzed crosslinkers. + + :param crosslinking_df: DataFrame containing cross-linking data, including AlphaFold-predicted + distances, crosslinker identifiers, and validation results. :param protein_to_validate: UniProt ID of the protein to validate. 
:param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float - :return: List containing a single bar plot object representing counts of - valid and invalid cross-links. - :raises KeyError: If a required crosslinker field is missing in crosslinker_information. + :return: List of Plotly Figure objects. For each crosslinker, the list contains two histogram + figures (mean ± 2 standard deviations first, full range second), followed by a final + bar plot summarizing valid and invalid cross-links across all crosslinkers. + :raises KeyError: If a required crosslinker entry is missing in crosslinker_information. """ validated_df = validate_with_angstrom_deviation( crosslinking_df, protein_to_validate, crosslinker_information - )[ - "crosslinking_result_df" - ] - figures = [] + )["crosslinking_result_df"] validated_df = validated_df.dropna(subset=["valid_crosslink"]) + + figures = [] + for crosslinker, crosslinker_df in validated_df.groupby("Crosslinker"): distances_valid = crosslinker_df.loc[ crosslinker_df["valid_crosslink"] == True, "alphafold_distance" @@ -281,70 +323,119 @@ def bar_plot_of_valid_crosslinks( df_valid = pd.DataFrame({"alphafold_distance": distances_valid}) df_invalid = pd.DataFrame({"alphafold_distance": distances_invalid}) + ( + crosslinker_length, + accepted_deviation_upper_bound, + accepted_deviation_lower_bound, + ) = crosslinker_information[crosslinker] + histogram = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, name_a="Valid Crosslinks", name_b="Invalid Crosslinks", - heading=f"Predicted distances for protein {protein_to_validate} with crosslinker {crosslinker}", + heading=f"Predicted distances for {protein_to_validate}", x_title="Distance (Å)", y_title="Count", overlay=True, visual_transformation="linear", relevant_column_a="alphafold_distance", relevant_column_b="alphafold_distance", - one_bin_per_int=True + one_bin_per_int=True, 
+ ) + _add_vertical_line_with_annotation_in_legend( + fig=histogram, + dash="solid", + annotation=f"{crosslinker} length", + x_value=crosslinker_length, ) mean_predicted_lengths = crosslinker_df["alphafold_distance"].mean() - standard_deviation_predicted_lengths = crosslinker_df["alphafold_distance"].std() - ( - crosslinker_length, - accepted_deviation_upper_bound, - accepted_deviation_lower_bound, - ) = crosslinker_information[crosslinker] - dashed_lines = [] - if accepted_deviation_upper_bound != 0: - dashed_lines.append((crosslinker_length + accepted_deviation_upper_bound, f"allowed_deviation_upper_bound")) - if accepted_deviation_lower_bound != 0: - dashed_lines.append((crosslinker_length - accepted_deviation_lower_bound, f"allowed_deviation_lower_bound")) + standard_deviation_predicted_lengths = crosslinker_df[ + "alphafold_distance" + ].std() + mean_plus_minus_two_std_range = ( + max(0, mean_predicted_lengths - 2 * standard_deviation_predicted_lengths), + mean_predicted_lengths + 2 * standard_deviation_predicted_lengths, + ) histogram_two_standard_deviations = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, name_a="Valid Crosslinks", name_b="Invalid Crosslinks", - heading=f"Predicted distances for protein {protein_to_validate} with crosslinker {crosslinker}, mean +- 2 standard deviations", + heading=f"Predicted distances for {protein_to_validate}, mean +- 2 standard deviations", x_title="Distance (Å)", y_title="Count", overlay=True, visual_transformation="linear", relevant_column_a="alphafold_distance", relevant_column_b="alphafold_distance", - min_value=max(0, mean_predicted_lengths - - 2 * standard_deviation_predicted_lengths), - max_value=mean_predicted_lengths - + 2 * standard_deviation_predicted_lengths, - vertical_lines=[(crosslinker_length, f"{crosslinker}")], - vertical_lines_dashed= dashed_lines if dashed_lines else None, - one_bin_per_int=True + min_value=mean_plus_minus_two_std_range[0], + 
max_value=mean_plus_minus_two_std_range[1], + one_bin_per_int=True, ) + _add_vertical_line_with_annotation_in_legend( + fig=histogram_two_standard_deviations, + dash="solid", + annotation=f"{crosslinker} length", + x_value=crosslinker_length, + ) + + if accepted_deviation_upper_bound != 0: + _add_vertical_line_with_annotation_in_legend( + fig=histogram, + dash="dash", + annotation=f"allowed deviation upper bound", + x_value=crosslinker_length + accepted_deviation_upper_bound, + ) + if ( + math.floor(mean_plus_minus_two_std_range[0]) + <= crosslinker_length + accepted_deviation_upper_bound + <= math.ceil(mean_plus_minus_two_std_range[1]) + ): + _add_vertical_line_with_annotation_in_legend( + fig=histogram_two_standard_deviations, + dash="dash", + annotation=f"allowed deviation upper bound", + x_value=crosslinker_length + accepted_deviation_upper_bound, + ) + if accepted_deviation_lower_bound != 0: + _add_vertical_line_with_annotation_in_legend( + fig=histogram, + dash="dash", + annotation=f"allowed deviation lower bound", + x_value=crosslinker_length - accepted_deviation_lower_bound, + ) + if ( + math.floor(mean_plus_minus_two_std_range[0]) + <= crosslinker_length - accepted_deviation_lower_bound + <= math.ceil(mean_plus_minus_two_std_range[1]) + ): + _add_vertical_line_with_annotation_in_legend( + fig=histogram_two_standard_deviations, + dash="dash", + annotation=f"allowed deviation lower bound", + x_value=crosslinker_length - accepted_deviation_lower_bound, + ) figures.append(histogram_two_standard_deviations) figures.append(histogram) - evaluated = validated_df["valid_crosslink"].dropna() - valid_crosslinks = (evaluated == True).sum() - invalid_crosslinks = (evaluated == False).sum() + valid_crosslinks = (validated_df["Is_intra_crosslink"] == True).sum() + invalid_crosslinks = (validated_df["Is_intra_crosslink"] == False).sum() - bar_plot = create_bar_plot( + bar_plot_over_all_checked_crosslinks = create_bar_plot( values_of_sectors=[ valid_crosslinks, 
invalid_crosslinks, ], - names_of_sectors=["Valid Cross-Links", "Invalid Cross-Links"], - heading="Cross-Links used for Validation", + names_of_sectors=[ + "Cross-Links matching predicted data", + "Cross-Links not matching predicted data", + ], + heading=f"All Cross-Links used for validation of {protein_to_validate}", y_title="Number of Cross-Links", ) - figures.append(bar_plot) + figures.append(bar_plot_over_all_checked_crosslinks) return figures diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index 149b3dec5..500d68b78 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -175,9 +175,7 @@ def create_histograms( relevant_column_b: str = None, min_value: float = None, max_value: float = None, - vertical_lines: list[tuple[float, str]] = None, - vertical_lines_dashed: list[tuple[float, str]] = None, - one_bin_per_int = False + one_bin_per_int: bool = False, ) -> Figure: """ A function to create a histogram for visualisation @@ -197,8 +195,14 @@ def create_histograms( :param x_title: Optional x axis title for graphs. :param overlay: Specifies whether to draw one Histogram with overlay or two separate histograms :param visual_transformation: Visual transformation of the y-axis data. - - :return: returns a pie or bar chart of the data + :param relevant_column_a: Which column of dataframe_a should be used for the histogram. If None, the default_intensity_column will be used. + :param relevant_column_b: Which column of dataframe_b should be used for the histogram. If None, the default_intensity_column will be used. + :param min_value: Where the first bin should start. If None, will be set to the minimum value of the two dataframes. + :param max_value: Where the last bin should end. If None, will be set to the maximum value of the two dataframes. 
+ :param one_bin_per_int: If set to True, min_value will be rounded down to the next int and max_value will be rounded up to the next int and there will\ + be max_value-min_value many bins. + + :return: returns a histogram of the data """ if visual_transformation not in {"linear", "log10"}: raise ValueError( @@ -218,25 +222,32 @@ def create_histograms( values_b = values_b.apply(np.log10) if min_value is None: - min_value = min(values_a.min(skipna=True), values_b.min(skipna=True)) + min_value = np.nanmin([values_a.min(), values_b.min()]) if max_value is None: - max_value = max(values_a.max(skipna=True), values_b.max(skipna=True)) + max_value = np.nanmax([values_a.max(), values_b.max()]) if one_bin_per_int: min_value = math.floor(min_value) max_value = math.ceil(max_value) - - number_of_bins = max_value-min_value if one_bin_per_int else 100 - binsize_a = ( - min(values_a.max(skipna=True), max_value) - - max(values_a.min(skipna=True), min_value) - ) / number_of_bins - binsize_b = ( - min(values_b.max(skipna=True), max_value) - - max(values_b.min(skipna=True), min_value) - ) / number_of_bins - - if overlay: + number_of_bins = max_value - min_value + binsize_a = 1 + binsize_b = 1 + else: + number_of_bins = 100 + if len(values_a) > 0: + binsize_a = ( + values_a.max(skipna=True) - values_a.min(skipna=True) + ) / number_of_bins + else: + binsize_a = 1 # default value of 1 in case that values_a is empty + if len(values_b) > 0: + binsize_b = ( + values_b.max(skipna=True) - values_b.min(skipna=True) + ) / number_of_bins + else: + binsize_b = 1 # default value of 1 in case that values_b is empty + + if overlay and len(values_a) > 0 and len(values_b) > 0: binsize_a = binsize_b = max(binsize_a, binsize_b) trace0 = go.Histogram( @@ -269,24 +280,6 @@ def create_histograms( if visual_transformation == "log10": fig.update_layout(xaxis=generate_tics(0, max_value, True)) - for lines, dash in [ - (vertical_lines, None), - (vertical_lines_dashed, "dash"), - ]: - if lines is None: - 
continue - - for position, annotation in lines: - fig.add_vline( - x=position, - line=dict(color="red", width=2, dash=dash), - annotation_text=annotation, - annotation_position="top left", - annotation_textangle=-90, - annotation_y=1, - annotation_yanchor="top", - ) - fig.update_layout(title={"text": f"{heading}"}) fig.update_xaxes(title=x_title) fig.update_yaxes(title=y_title, rangemode="tozero") diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 071644cc2..2dbebc8c7 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -62,7 +62,7 @@ ) from protzilla.data_analysis.crosslinking_validation import ( validate_with_angstrom_deviation, - bar_plot_of_valid_crosslinks, + diagrams_of_crosslinking_validation_data, ) from backend.protzilla.run import Run @@ -2514,7 +2514,7 @@ def modify_form(self, form: Form, run: Run) -> None: form.add_field(upper_bound_length_deviation_field) form.add_field(lower_bound_length_deviation_field) - plot_method = staticmethod(bar_plot_of_valid_crosslinks) + plot_method = staticmethod(diagrams_of_crosslinking_validation_data) calc_method = staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: From 87c8b18a7f1dced44bed6c18c7f987e6148125bc Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 15 Feb 2026 14:45:13 +0100 Subject: [PATCH 110/240] test: add tests for testing the plot method of crosslinking validation with Angstrom deviation --- .../data_analysis/crosslinking_validation.py | 14 +- .../test_crosslinking_validation.py | 307 +++++++++++++++++- 2 files changed, 312 insertions(+), 9 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 94f3e221e..b3e1fc95f 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ 
b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -246,7 +246,7 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: return dict(crosslinking_result_df=checked_crosslinks_df, messages={}) -def _add_vertical_line_with_annotation_in_legend( +def add_vertical_line_with_annotation_in_legend( fig: Figure, dash: str, annotation: str, x_value: float, color: str = "blue" ) -> None: """ @@ -343,7 +343,7 @@ def diagrams_of_crosslinking_validation_data( relevant_column_b="alphafold_distance", one_bin_per_int=True, ) - _add_vertical_line_with_annotation_in_legend( + add_vertical_line_with_annotation_in_legend( fig=histogram, dash="solid", annotation=f"{crosslinker} length", @@ -374,7 +374,7 @@ def diagrams_of_crosslinking_validation_data( max_value=mean_plus_minus_two_std_range[1], one_bin_per_int=True, ) - _add_vertical_line_with_annotation_in_legend( + add_vertical_line_with_annotation_in_legend( fig=histogram_two_standard_deviations, dash="solid", annotation=f"{crosslinker} length", @@ -382,7 +382,7 @@ def diagrams_of_crosslinking_validation_data( ) if accepted_deviation_upper_bound != 0: - _add_vertical_line_with_annotation_in_legend( + add_vertical_line_with_annotation_in_legend( fig=histogram, dash="dash", annotation=f"allowed deviation upper bound", @@ -393,14 +393,14 @@ def diagrams_of_crosslinking_validation_data( <= crosslinker_length + accepted_deviation_upper_bound <= math.ceil(mean_plus_minus_two_std_range[1]) ): - _add_vertical_line_with_annotation_in_legend( + add_vertical_line_with_annotation_in_legend( fig=histogram_two_standard_deviations, dash="dash", annotation=f"allowed deviation upper bound", x_value=crosslinker_length + accepted_deviation_upper_bound, ) if accepted_deviation_lower_bound != 0: - _add_vertical_line_with_annotation_in_legend( + add_vertical_line_with_annotation_in_legend( fig=histogram, dash="dash", annotation=f"allowed deviation lower bound", @@ -411,7 +411,7 @@ def diagrams_of_crosslinking_validation_data( <= 
crosslinker_length - accepted_deviation_lower_bound <= math.ceil(mean_plus_minus_two_std_range[1]) ): - _add_vertical_line_with_annotation_in_legend( + add_vertical_line_with_annotation_in_legend( fig=histogram_two_standard_deviations, dash="dash", annotation=f"allowed deviation lower bound", diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index d5de0ca93..577d92a72 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -1,13 +1,17 @@ import pandas as pd import pytest -from unittest.mock import patch -from unittest.mock import MagicMock +from unittest.mock import patch, MagicMock +import plotly.graph_objects as go +from plotly.graph_objects import Figure +import pandas.testing as pdt from backend.protzilla.data_analysis.crosslinking_validation import ( get_position_of_amino_acid_crosslinker_bound_to, validate_with_angstrom_deviation, get_distance_between_two_amino_acids_in_angstrom, + diagrams_of_crosslinking_validation_data, + add_vertical_line_with_annotation_in_legend, ) from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation @@ -113,3 +117,302 @@ def test_get_distance_between_two_amino_acids_in_angstrom(): dist = get_distance_between_two_amino_acids_in_angstrom(1, 2, "A", "B", cif_df) assert dist == 5.0 + + +def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend(): + fig = go.Figure() + add_vertical_line_with_annotation_in_legend( + fig=fig, dash="dash", annotation="Test Line", x_value=5.0 + ) + + # add_vline internally adds a shape to layout.shapes + assert len(fig.layout.shapes) == 1 + vline = fig.layout.shapes[0] + assert vline["x0"] == 5.0 + assert vline["line"]["dash"] == "dash" + assert vline["line"]["color"] == "blue" + + # There should be 1 scatter trace for the legend + assert len(fig.data) 
== 1 + trace = fig.data[0] + assert trace.mode == "lines" + assert trace.name == "Test Line" + assert trace.line.dash == "dash" + assert trace.line.color == "blue" + assert trace.x == (None,) + assert trace.y == (None,) + + +def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend_multiple_calls(): + fig = go.Figure() + add_vertical_line_with_annotation_in_legend( + fig=fig, dash="dash", annotation="Line 1", x_value=1.0 + ) + add_vertical_line_with_annotation_in_legend( + fig=fig, dash="dot", annotation="Line 2", x_value=2.0, color="green" + ) + + # Check layout.shapes -> add_vline internally adds a shape to layout.shapes + assert len(fig.layout.shapes) == 2 + vlines_x = [shape.x0 for shape in fig.layout.shapes] + assert vlines_x == [1.0, 2.0] + vlines_colors = [shape["line"]["color"] for shape in fig.layout.shapes] + assert vlines_colors == ["blue", "green"] + vlines_dashes = [shape["line"]["dash"] for shape in fig.layout.shapes] + assert vlines_dashes == ["dash", "dot"] + + # Check legend traces + assert len(fig.data) == 2 + names = [trace.name for trace in fig.data] + colors = [trace.line.color for trace in fig.data] + dashes = [trace.line.dash for trace in fig.data] + x_values = [trace.x for trace in fig.data] + y_values = [trace.y for trace in fig.data] + assert names == ["Line 1", "Line 2"] + assert colors == ["blue", "green"] + assert dashes == ["dash", "dot"] + assert x_values == [(None,), (None,)] + assert y_values == [(None,), (None,)] + + +@pytest.fixture +def sample_crosslinking_df(): + return pd.DataFrame( + { + "Crosslinker": ["CL1", "CL1", "CL2", "CL2"], + "alphafold_distance": [10.0, 12.0, 8.0, 9.0], + "valid_crosslink": [True, False, True, False], + "Is_intra_crosslink": [True, False, True, False], + } + ) + + +@pytest.fixture +def sample_crosslinker_info(): + return { + "CL1": [11.0, 2.0, 0.0], # [length, upper_deviation, lower_deviation] + "CL2": [9.0, 0.0, 1.0], + } + + 
+@patch("backend.protzilla.data_analysis.crosslinking_validation.create_histograms") +@patch("backend.protzilla.data_analysis.crosslinking_validation.create_bar_plot") +@patch( + "backend.protzilla.data_analysis.crosslinking_validation.add_vertical_line_with_annotation_in_legend" +) +@patch( + "backend.protzilla.data_analysis.crosslinking_validation.validate_with_angstrom_deviation" +) +def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_lines( + mock_validate, + mock_add_vline, + mock_create_bar, + mock_create_hist, + sample_crosslinking_df, + sample_crosslinker_info, +): + validated_df = sample_crosslinking_df.copy() + mock_validate.return_value = {"crosslinking_result_df": validated_df} + + hist_mock = Figure() + mock_create_hist.return_value = hist_mock + bar_mock = Figure() + mock_create_bar.return_value = bar_mock + + figures = diagrams_of_crosslinking_validation_data( + crosslinking_df=sample_crosslinking_df, + protein_to_validate="P12345", + crosslinker_information=sample_crosslinker_info, + ) + + # 2 histograms per crosslinker + 1 bar plot + assert len(figures) == 5 + assert all(isinstance(f, Figure) for f in figures) + + mock_validate.assert_called_once_with( + sample_crosslinking_df, "P12345", sample_crosslinker_info + ) + + assert ( + mock_add_vline.call_count == 8 + ) # for both crosslinkers: 1 call for crosslinker length for each histogram and 1 call for bound on deviation for each histogram + + # Check that create_histograms was called 4 times (2 per crosslinker) + assert mock_create_hist.call_count == 4 + + # Check that create_bar_plot was called once + mock_create_bar.assert_called_once() + + +@pytest.fixture +def sample_crosslinking_df_with_no_std(): + return pd.DataFrame( + { + "Crosslinker": ["CL1", "CL1", "CL2", "CL2"], + "alphafold_distance": [10.5, 10.5, 10.5, 10.5], + "valid_crosslink": [True, False, True, False], + "Is_intra_crosslink": [True, False, True, False], + } + ) + + +@pytest.fixture +def 
sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std(): + return { + "CL1": [10.5, 1.0, 1.0], # [length, upper_deviation, lower_deviation] + "CL2": [10.5, 0.5, 0.3], + } + + +@patch("backend.protzilla.data_analysis.crosslinking_validation.create_histograms") +@patch("backend.protzilla.data_analysis.crosslinking_validation.create_bar_plot") +@patch( + "backend.protzilla.data_analysis.crosslinking_validation.add_vertical_line_with_annotation_in_legend" +) +@patch( + "backend.protzilla.data_analysis.crosslinking_validation.validate_with_angstrom_deviation" +) +def test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_lines( + mock_validate, + mock_add_vline, + mock_create_bar, + mock_create_hist, + sample_crosslinking_df_with_no_std, + sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, +): + validated_df = sample_crosslinking_df_with_no_std.copy() + mock_validate.return_value = {"crosslinking_result_df": validated_df} + + hist_mock = Figure() + mock_create_hist.return_value = hist_mock + bar_mock = Figure() + mock_create_bar.return_value = bar_mock + + figures = diagrams_of_crosslinking_validation_data( + crosslinking_df=sample_crosslinking_df_with_no_std, + protein_to_validate="P12345", + crosslinker_information=sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, + ) + + # 2 histograms per crosslinker + 1 bar plot + assert len(figures) == 5 + assert all(isinstance(f, Figure) for f in figures) + + # CL1: all 3 lines are drawn for both histograms, CL2: only crosslinker_length ist drawn for both histograms, + # the bounds are only drawn for the histogram that is not limited to the range of +- 2 standard deviations + assert mock_add_vline.call_count == 10 + + # Check that create_histograms was called 4 times (2 per crosslinker) + assert mock_create_hist.call_count == 4 + + # Check that create_bar_plot was called once + mock_create_bar.assert_called_once() + + +@pytest.fixture +def 
sample_crosslinker_info_with_one_crosslinker(): + return { + "CL1": [11.0, 2.0, 1.0], # [length, upper_deviation, lower_deviation] + } + + +@pytest.fixture +def sample_crosslinking_df_with_one_crosslinker(): + return pd.DataFrame( + { + "Crosslinker": ["CL1", "CL1", "CL1", "CL1"], + "alphafold_distance": [10.0, 12.0, 8.0, 9.0], + "valid_crosslink": [True, False, True, False], + "Is_intra_crosslink": [True, False, True, False], + } + ) + + +def test_diagrams_calls_with_correct_parameters( + sample_crosslinking_df_with_one_crosslinker, + sample_crosslinker_info_with_one_crosslinker, +): + with patch( + "backend.protzilla.data_analysis.crosslinking_validation.validate_with_angstrom_deviation" + ) as mock_validate, patch( + "backend.protzilla.data_analysis.crosslinking_validation.create_histograms" + ) as mock_hist, patch( + "backend.protzilla.data_analysis.crosslinking_validation.add_vertical_line_with_annotation_in_legend" + ) as mock_vline, patch( + "backend.protzilla.data_analysis.crosslinking_validation.create_bar_plot" + ) as mock_bar: + + mock_validate.return_value = { + "crosslinking_result_df": sample_crosslinking_df_with_one_crosslinker + } + + mock_hist.side_effect = lambda **kwargs: f"hist_{kwargs['heading']}" + mock_bar.return_value = "bar_fig" + + figures = diagrams_of_crosslinking_validation_data( + crosslinking_df=sample_crosslinking_df_with_one_crosslinker, + protein_to_validate="P12345", + crosslinker_information=sample_crosslinker_info_with_one_crosslinker, + ) + + mock_validate.assert_called_once_with( + sample_crosslinking_df_with_one_crosslinker, + "P12345", + sample_crosslinker_info_with_one_crosslinker, + ) + + # There should be 2 histogram calls: 2 per crosslinker + assert mock_hist.call_count == 2 + + # Check histogram call parameters for crosslinker full-range + first_hist_call = mock_hist.call_args_list[0].kwargs + assert first_hist_call["name_a"] == "Valid Crosslinks" + assert first_hist_call["name_b"] == "Invalid Crosslinks" + assert 
first_hist_call["heading"] == "Predicted distances for P12345" + assert first_hist_call["relevant_column_a"] == "alphafold_distance" + assert first_hist_call["relevant_column_b"] == "alphafold_distance" + assert first_hist_call["one_bin_per_int"] == True + + valid_crosslinks = sample_crosslinking_df_with_one_crosslinker.loc[ + sample_crosslinking_df_with_one_crosslinker["valid_crosslink"] == True, + "alphafold_distance", + ] + invalid_crosslinks = sample_crosslinking_df_with_one_crosslinker.loc[ + sample_crosslinking_df_with_one_crosslinker["valid_crosslink"] == False, + "alphafold_distance", + ] + dataframe_a = pd.DataFrame({"alphafold_distance": valid_crosslinks}) + dataframe_b = pd.DataFrame({"alphafold_distance": invalid_crosslinks}) + pdt.assert_frame_equal(first_hist_call["dataframe_a"], dataframe_a) + pdt.assert_frame_equal(first_hist_call["dataframe_b"], dataframe_b) + + # Check histogram call parameters for crosslinker ±2 std + second_hist_call = mock_hist.call_args_list[1].kwargs + assert "mean +- 2 standard deviations" in second_hist_call["heading"] + mean_predicted_lengths = sample_crosslinking_df_with_one_crosslinker[ + "alphafold_distance" + ].mean() + standard_deviation_predicted_lengths = ( + sample_crosslinking_df_with_one_crosslinker["alphafold_distance"].std() + ) + mean_plus_minus_two_std_range = ( + max(0, mean_predicted_lengths - 2 * standard_deviation_predicted_lengths), + mean_predicted_lengths + 2 * standard_deviation_predicted_lengths, + ) + assert second_hist_call["min_value"] == mean_plus_minus_two_std_range[0] + assert second_hist_call["max_value"] == mean_plus_minus_two_std_range[1] + + call_args_list = [call.kwargs for call in mock_vline.call_args_list] + assert any( + call["annotation"] == "CL1 length" and call["x_value"] == 11.0 + for call in call_args_list + ) + + mock_bar.assert_called_once() + + expected_figures = [ + "hist_Predicted distances for P12345, mean +- 2 standard deviations", + "hist_Predicted distances for P12345", + 
"bar_fig", + ] + assert figures == expected_figures From 029ea1ad4330d6b9349885f000289e2e7b1e908f Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 15 Feb 2026 14:54:33 +0100 Subject: [PATCH 111/240] test: add tests for testing the create_histograms with one_bin_per_int set to true and empty dataframes --- .../test_plots_data_preprocessing.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py b/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py index a12bf6e47..edac829f6 100644 --- a/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py +++ b/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py @@ -101,6 +101,66 @@ def test_create_histograms( return +def test_create_histograms_one_bin_per_int_is_true(): + + df_a = pd.DataFrame({"value": [1.2, 2.7, 3.5]}) + df_b = pd.DataFrame({"value": [2.1, 4.6, 5.9]}) + + fig = create_histograms( + dataframe_a=df_a, + dataframe_b=df_b, + relevant_column_a="value", + relevant_column_b="value", + name_a="A", + name_b="B", + one_bin_per_int=True, + ) + + trace_a = fig.data[0] + trace_b = fig.data[1] + + # Check bin size is 1 + assert trace_a.xbins["size"] == 1 + assert trace_b.xbins["size"] == 1 + + # Check min and max are rounded correctly + # min_value should be floor(min(values_a.min(), values_b.min())) = floor(1.2) = 1 + # max_value should be ceil(max(values_a.max(), values_b.max())) = ceil(5.9) = 6 + assert trace_a.xbins["start"] == 1 + assert trace_a.xbins["end"] == 6 + assert trace_b.xbins["start"] == 1 + assert trace_b.xbins["end"] == 6 + + +def test_create_histograms_with_empty_dataframe(): + df_empty = pd.DataFrame({"value": []}) + df_nonempty = pd.DataFrame({"value": [1, 2, 3]}) + + fig = create_histograms( + dataframe_a=df_empty, + dataframe_b=df_nonempty, + relevant_column_a="value", + relevant_column_b="value", + name_a="Empty", + name_b="NonEmpty", 
+ one_bin_per_int=True, + ) + + trace_empty = fig.data[0] + trace_nonempty = fig.data[1] + + # Ensure the function did not crash and returned a Figure + assert isinstance(fig, Figure) + + # Even if dataframe_a is empty, trace_a should exist with default bin size 1 + assert trace_empty.xbins["size"] == 1 + + # trace_b should have correct start/end bin values + assert trace_nonempty.xbins["start"] == 1 # floor(min(values_b)) = 1 + assert trace_nonempty.xbins["end"] == 3 # ceil(max(values_b)) = 3 + assert trace_nonempty.xbins["size"] == 1 + + @pytest.mark.order(2) @pytest.mark.dependency( depends=[ From c3a2727b4ab64850ab977cd628328ce6c386d9fc Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 15 Feb 2026 15:23:34 +0100 Subject: [PATCH 112/240] refactor: move add_vertical_line_with_annotation_in_legend to plots.py --- .../data_analysis/crosslinking_validation.py | 33 +------------------ backend/protzilla/data_analysis/plots.py | 30 +++++++++++++++++ .../test_crosslinking_validation.py | 2 +- 3 files changed, 32 insertions(+), 33 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 77846ca8a..49c51e75c 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -6,17 +6,14 @@ import numpy as np import re import logging -import plotly.graph_objects as go from plotly.graph_objects import Figure -from backend.protzilla.importing.alphafold_protein_structure_load import ( - fetch_alphafold_protein_structure, -) from backend.protzilla.data_preprocessing.plots import ( create_histograms, create_bar_plot, ) +from protzilla.data_analysis.plots import add_vertical_line_with_annotation_in_legend def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: @@ -300,34 +297,6 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: return 
dict(crosslinking_result_df=checked_crosslinks_df, messages=messages) -def add_vertical_line_with_annotation_in_legend( - fig: Figure, dash: str, annotation: str, x_value: float, color: str = "blue" -) -> None: - """ - Adds a vertical line to a Plotly figure and includes a corresponding entry in the legend - without displaying an additional visible trace in the plot. - - :param fig: Plotly Figure object to which the vertical line and legend entry are added. - :param dash: Line style for the vertical line (e.g., "solid", "dash", "dot"). - :param annotation: Text to display in the legend corresponding to the vertical line. - :param x_value: X-coordinate at which to draw the vertical line. - :param color: Color of the vertical line and legend entry (default is "blue"). - :return: None - """ - # add vertical line - fig.add_vline(x=x_value, line_color=color, line_dash=dash, line_width=2) - # add annotation of the line to the legend - fig.add_trace( - go.Scatter( - x=[None], - y=[None], - mode="lines", - name=annotation, - line=dict(color=color, width=2, dash=dash), - ) - ) - - def diagrams_of_crosslinking_validation_data( crosslinking_df: pd.DataFrame, protein_to_validate: str, diff --git a/backend/protzilla/data_analysis/plots.py b/backend/protzilla/data_analysis/plots.py index 4e24ee052..5ef1b03b6 100644 --- a/backend/protzilla/data_analysis/plots.py +++ b/backend/protzilla/data_analysis/plots.py @@ -5,6 +5,8 @@ import pandas as pd import plotly.express as px import plotly.graph_objects as go +from plotly import graph_objects as go +from plotly.graph_objs import Figure from scipy import stats from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances @@ -466,3 +468,31 @@ def prot_quant_plot( ) return dict(plots=[fig]) + + +def add_vertical_line_with_annotation_in_legend( + fig: Figure, dash: str, annotation: str, x_value: float, color: str = "blue" +) -> None: + """ + Adds a vertical line to a Plotly figure and includes a corresponding entry in the 
legend + without displaying an additional visible trace in the plot. + + :param fig: Plotly Figure object to which the vertical line and legend entry are added. + :param dash: Line style for the vertical line (e.g., "solid", "dash", "dot"). + :param annotation: Text to display in the legend corresponding to the vertical line. + :param x_value: X-coordinate at which to draw the vertical line. + :param color: Color of the vertical line and legend entry (default is "blue"). + :return: None + """ + # add vertical line + fig.add_vline(x=x_value, line_color=color, line_dash=dash, line_width=2) + # add annotation of the line to the legend + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="lines", + name=annotation, + line=dict(color=color, width=2, dash=dash), + ) + ) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 96ea7d8a8..04d346bbb 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -12,8 +12,8 @@ get_distance_between_two_amino_acids_in_angstrom, add_positions_of_amino_acid_where_crosslinker_bound_to_df, diagrams_of_crosslinking_validation_data, - add_vertical_line_with_annotation_in_legend, ) +from protzilla.data_analysis.plots import add_vertical_line_with_annotation_in_legend from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation From bd2ac89e21e2b64041e17e2e5946c8cfb327a5f2 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 15 Feb 2026 15:40:47 +0100 Subject: [PATCH 113/240] test: fix tests --- .../test_crosslinking_validation.py | 48 ++-------- .../data_analysis/test_plots_data_analysis.py | 93 +++++++++++++++++++ .../test_plots_data_preprocessing.py | 60 ------------ 3 files changed, 101 insertions(+), 100 deletions(-) diff --git 
a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 04d346bbb..d95f3fdd7 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -277,38 +277,6 @@ def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend(): assert trace.y == (None,) -def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend_multiple_calls(): - fig = go.Figure() - add_vertical_line_with_annotation_in_legend( - fig=fig, dash="dash", annotation="Line 1", x_value=1.0 - ) - add_vertical_line_with_annotation_in_legend( - fig=fig, dash="dot", annotation="Line 2", x_value=2.0, color="green" - ) - - # Check layout.shapes -> add_vline internally adds a shape to layout.shapes - assert len(fig.layout.shapes) == 2 - vlines_x = [shape.x0 for shape in fig.layout.shapes] - assert vlines_x == [1.0, 2.0] - vlines_colors = [shape["line"]["color"] for shape in fig.layout.shapes] - assert vlines_colors == ["blue", "green"] - vlines_dashes = [shape["line"]["dash"] for shape in fig.layout.shapes] - assert vlines_dashes == ["dash", "dot"] - - # Check legend traces - assert len(fig.data) == 2 - names = [trace.name for trace in fig.data] - colors = [trace.line.color for trace in fig.data] - dashes = [trace.line.dash for trace in fig.data] - x_values = [trace.x for trace in fig.data] - y_values = [trace.y for trace in fig.data] - assert names == ["Line 1", "Line 2"] - assert colors == ["blue", "green"] - assert dashes == ["dash", "dot"] - assert x_values == [(None,), (None,)] - assert y_values == [(None,), (None,)] - - @pytest.fixture def sample_crosslinking_df(): return pd.DataFrame( @@ -357,15 +325,15 @@ def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_line crosslinking_df=sample_crosslinking_df, protein_to_validate="P12345", 
crosslinker_information=sample_crosslinker_info, + cif_df=pd.DataFrame(), + amino_acid_sequence_df=pd.DataFrame(), ) # 2 histograms per crosslinker + 1 bar plot assert len(figures) == 5 assert all(isinstance(f, Figure) for f in figures) - mock_validate.assert_called_once_with( - sample_crosslinking_df, "P12345", sample_crosslinker_info - ) + mock_validate.assert_called_once() assert ( mock_add_vline.call_count == 8 @@ -426,6 +394,8 @@ def test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_l crosslinking_df=sample_crosslinking_df_with_no_std, protein_to_validate="P12345", crosslinker_information=sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, + cif_df=pd.DataFrame(), + amino_acid_sequence_df=pd.DataFrame(), ) # 2 histograms per crosslinker + 1 bar plot @@ -487,13 +457,11 @@ def test_diagrams_calls_with_correct_parameters( crosslinking_df=sample_crosslinking_df_with_one_crosslinker, protein_to_validate="P12345", crosslinker_information=sample_crosslinker_info_with_one_crosslinker, + cif_df=pd.DataFrame(), + amino_acid_sequence_df=pd.DataFrame(), ) - mock_validate.assert_called_once_with( - sample_crosslinking_df_with_one_crosslinker, - "P12345", - sample_crosslinker_info_with_one_crosslinker, - ) + mock_validate.assert_called_once() # There should be 2 histogram calls: 2 per crosslinker assert mock_hist.call_count == 2 diff --git a/backend/tests/protzilla/data_analysis/test_plots_data_analysis.py b/backend/tests/protzilla/data_analysis/test_plots_data_analysis.py index 8fdc37380..4420a3ee2 100644 --- a/backend/tests/protzilla/data_analysis/test_plots_data_analysis.py +++ b/backend/tests/protzilla/data_analysis/test_plots_data_analysis.py @@ -3,6 +3,7 @@ from backend.protzilla.data_analysis.plots import * from backend.tests.protzilla.data_analysis.test_clustering import * +from backend.protzilla.data_preprocessing.plots import create_histograms @pytest.fixture @@ -286,3 +287,95 @@ def 
test_clustergram_flip_axes(show_figures, wide_4d_df, metadata_df): if show_figures: fig.show() return + + +def test_create_histograms_one_bin_per_int_is_true(): + + df_a = pd.DataFrame({"value": [1.2, 2.7, 3.5]}) + df_b = pd.DataFrame({"value": [2.1, 4.6, 5.9]}) + + fig = create_histograms( + dataframe_a=df_a, + dataframe_b=df_b, + relevant_column_a="value", + relevant_column_b="value", + name_a="A", + name_b="B", + one_bin_per_int=True, + ) + + trace_a = fig.data[0] + trace_b = fig.data[1] + + # Check bin size is 1 + assert trace_a.xbins["size"] == 1 + assert trace_b.xbins["size"] == 1 + + # Check min and max are rounded correctly + # min_value should be floor(min(values_a.min(), values_b.min())) = floor(1.2) = 1 + # max_value should be ceil(max(values_a.max(), values_b.max())) = ceil(5.9) = 6 + assert trace_a.xbins["start"] == 1 + assert trace_a.xbins["end"] == 6 + assert trace_b.xbins["start"] == 1 + assert trace_b.xbins["end"] == 6 + + +def test_create_histograms_with_empty_dataframe(): + df_empty = pd.DataFrame({"value": []}) + df_nonempty = pd.DataFrame({"value": [1, 2, 3]}) + + fig = create_histograms( + dataframe_a=df_empty, + dataframe_b=df_nonempty, + relevant_column_a="value", + relevant_column_b="value", + name_a="Empty", + name_b="NonEmpty", + one_bin_per_int=True, + ) + + trace_empty = fig.data[0] + trace_nonempty = fig.data[1] + + # Ensure the function did not crash and returned a Figure + assert isinstance(fig, Figure) + + # Even if dataframe_a is empty, trace_a should exist with default bin size 1 + assert trace_empty.xbins["size"] == 1 + + # trace_b should have correct start/end bin values + assert trace_nonempty.xbins["start"] == 1 # floor(min(values_b)) = 1 + assert trace_nonempty.xbins["end"] == 3 # ceil(max(values_b)) = 3 + assert trace_nonempty.xbins["size"] == 1 + + +def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend_multiple_calls(): + fig = go.Figure() + add_vertical_line_with_annotation_in_legend( + fig=fig, 
dash="dash", annotation="Line 1", x_value=1.0 + ) + add_vertical_line_with_annotation_in_legend( + fig=fig, dash="dot", annotation="Line 2", x_value=2.0, color="green" + ) + + # Check layout.shapes -> add_vline internally adds a shape to layout.shapes + assert len(fig.layout.shapes) == 2 + vlines_x = [shape.x0 for shape in fig.layout.shapes] + assert vlines_x == [1.0, 2.0] + vlines_colors = [shape["line"]["color"] for shape in fig.layout.shapes] + assert vlines_colors == ["blue", "green"] + vlines_dashes = [shape["line"]["dash"] for shape in fig.layout.shapes] + assert vlines_dashes == ["dash", "dot"] + + # Check legend traces + assert len(fig.data) == 2 + names = [trace.name for trace in fig.data] + colors = [trace.line.color for trace in fig.data] + dashes = [trace.line.dash for trace in fig.data] + x_values = [trace.x for trace in fig.data] + y_values = [trace.y for trace in fig.data] + assert names == ["Line 1", "Line 2"] + assert colors == ["blue", "green"] + assert dashes == ["dash", "dot"] + assert x_values == [(None,), (None,)] + assert y_values == [(None,), (None,)] diff --git a/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py b/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py index edac829f6..a12bf6e47 100644 --- a/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py +++ b/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py @@ -101,66 +101,6 @@ def test_create_histograms( return -def test_create_histograms_one_bin_per_int_is_true(): - - df_a = pd.DataFrame({"value": [1.2, 2.7, 3.5]}) - df_b = pd.DataFrame({"value": [2.1, 4.6, 5.9]}) - - fig = create_histograms( - dataframe_a=df_a, - dataframe_b=df_b, - relevant_column_a="value", - relevant_column_b="value", - name_a="A", - name_b="B", - one_bin_per_int=True, - ) - - trace_a = fig.data[0] - trace_b = fig.data[1] - - # Check bin size is 1 - assert trace_a.xbins["size"] == 1 - assert trace_b.xbins["size"] == 
1 - - # Check min and max are rounded correctly - # min_value should be floor(min(values_a.min(), values_b.min())) = floor(1.2) = 1 - # max_value should be ceil(max(values_a.max(), values_b.max())) = ceil(5.9) = 6 - assert trace_a.xbins["start"] == 1 - assert trace_a.xbins["end"] == 6 - assert trace_b.xbins["start"] == 1 - assert trace_b.xbins["end"] == 6 - - -def test_create_histograms_with_empty_dataframe(): - df_empty = pd.DataFrame({"value": []}) - df_nonempty = pd.DataFrame({"value": [1, 2, 3]}) - - fig = create_histograms( - dataframe_a=df_empty, - dataframe_b=df_nonempty, - relevant_column_a="value", - relevant_column_b="value", - name_a="Empty", - name_b="NonEmpty", - one_bin_per_int=True, - ) - - trace_empty = fig.data[0] - trace_nonempty = fig.data[1] - - # Ensure the function did not crash and returned a Figure - assert isinstance(fig, Figure) - - # Even if dataframe_a is empty, trace_a should exist with default bin size 1 - assert trace_empty.xbins["size"] == 1 - - # trace_b should have correct start/end bin values - assert trace_nonempty.xbins["start"] == 1 # floor(min(values_b)) = 1 - assert trace_nonempty.xbins["end"] == 3 # ceil(max(values_b)) = 3 - assert trace_nonempty.xbins["size"] == 1 - - @pytest.mark.order(2) @pytest.mark.dependency( depends=[ From 31beb7c149189c18b23edcddfb89dcb5b5ac3524 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 15 Feb 2026 15:43:35 +0100 Subject: [PATCH 114/240] chore: move tests to right file --- .../data_analysis/test_plots_data_analysis.py | 92 ------------------ .../test_plots_data_preprocessing.py | 96 +++++++++++++++++++ 2 files changed, 96 insertions(+), 92 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_plots_data_analysis.py b/backend/tests/protzilla/data_analysis/test_plots_data_analysis.py index 4420a3ee2..92f28ba6e 100644 --- a/backend/tests/protzilla/data_analysis/test_plots_data_analysis.py +++ b/backend/tests/protzilla/data_analysis/test_plots_data_analysis.py @@ -287,95 
+287,3 @@ def test_clustergram_flip_axes(show_figures, wide_4d_df, metadata_df): if show_figures: fig.show() return - - -def test_create_histograms_one_bin_per_int_is_true(): - - df_a = pd.DataFrame({"value": [1.2, 2.7, 3.5]}) - df_b = pd.DataFrame({"value": [2.1, 4.6, 5.9]}) - - fig = create_histograms( - dataframe_a=df_a, - dataframe_b=df_b, - relevant_column_a="value", - relevant_column_b="value", - name_a="A", - name_b="B", - one_bin_per_int=True, - ) - - trace_a = fig.data[0] - trace_b = fig.data[1] - - # Check bin size is 1 - assert trace_a.xbins["size"] == 1 - assert trace_b.xbins["size"] == 1 - - # Check min and max are rounded correctly - # min_value should be floor(min(values_a.min(), values_b.min())) = floor(1.2) = 1 - # max_value should be ceil(max(values_a.max(), values_b.max())) = ceil(5.9) = 6 - assert trace_a.xbins["start"] == 1 - assert trace_a.xbins["end"] == 6 - assert trace_b.xbins["start"] == 1 - assert trace_b.xbins["end"] == 6 - - -def test_create_histograms_with_empty_dataframe(): - df_empty = pd.DataFrame({"value": []}) - df_nonempty = pd.DataFrame({"value": [1, 2, 3]}) - - fig = create_histograms( - dataframe_a=df_empty, - dataframe_b=df_nonempty, - relevant_column_a="value", - relevant_column_b="value", - name_a="Empty", - name_b="NonEmpty", - one_bin_per_int=True, - ) - - trace_empty = fig.data[0] - trace_nonempty = fig.data[1] - - # Ensure the function did not crash and returned a Figure - assert isinstance(fig, Figure) - - # Even if dataframe_a is empty, trace_a should exist with default bin size 1 - assert trace_empty.xbins["size"] == 1 - - # trace_b should have correct start/end bin values - assert trace_nonempty.xbins["start"] == 1 # floor(min(values_b)) = 1 - assert trace_nonempty.xbins["end"] == 3 # ceil(max(values_b)) = 3 - assert trace_nonempty.xbins["size"] == 1 - - -def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend_multiple_calls(): - fig = go.Figure() - add_vertical_line_with_annotation_in_legend( - 
fig=fig, dash="dash", annotation="Line 1", x_value=1.0 - ) - add_vertical_line_with_annotation_in_legend( - fig=fig, dash="dot", annotation="Line 2", x_value=2.0, color="green" - ) - - # Check layout.shapes -> add_vline internally adds a shape to layout.shapes - assert len(fig.layout.shapes) == 2 - vlines_x = [shape.x0 for shape in fig.layout.shapes] - assert vlines_x == [1.0, 2.0] - vlines_colors = [shape["line"]["color"] for shape in fig.layout.shapes] - assert vlines_colors == ["blue", "green"] - vlines_dashes = [shape["line"]["dash"] for shape in fig.layout.shapes] - assert vlines_dashes == ["dash", "dot"] - - # Check legend traces - assert len(fig.data) == 2 - names = [trace.name for trace in fig.data] - colors = [trace.line.color for trace in fig.data] - dashes = [trace.line.dash for trace in fig.data] - x_values = [trace.x for trace in fig.data] - y_values = [trace.y for trace in fig.data] - assert names == ["Line 1", "Line 2"] - assert colors == ["blue", "green"] - assert dashes == ["dash", "dot"] - assert x_values == [(None,), (None,)] - assert y_values == [(None,), (None,)] diff --git a/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py b/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py index a12bf6e47..3c08c7e63 100644 --- a/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py +++ b/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py @@ -3,6 +3,10 @@ from backend.protzilla.data_preprocessing import imputation from backend.protzilla.data_preprocessing.plots import * from backend.tests.protzilla.data_preprocessing.test_imputation import * +from backend.protzilla.data_analysis.plots import ( + add_vertical_line_with_annotation_in_legend, +) + # this tests will build some Figures and display them if show_figures==True # it tests only for occurring errors @@ -101,6 +105,98 @@ def test_create_histograms( return +def 
test_create_histograms_one_bin_per_int_is_true(): + + df_a = pd.DataFrame({"value": [1.2, 2.7, 3.5]}) + df_b = pd.DataFrame({"value": [2.1, 4.6, 5.9]}) + + fig = create_histograms( + dataframe_a=df_a, + dataframe_b=df_b, + relevant_column_a="value", + relevant_column_b="value", + name_a="A", + name_b="B", + one_bin_per_int=True, + ) + + trace_a = fig.data[0] + trace_b = fig.data[1] + + # Check bin size is 1 + assert trace_a.xbins["size"] == 1 + assert trace_b.xbins["size"] == 1 + + # Check min and max are rounded correctly + # min_value should be floor(min(values_a.min(), values_b.min())) = floor(1.2) = 1 + # max_value should be ceil(max(values_a.max(), values_b.max())) = ceil(5.9) = 6 + assert trace_a.xbins["start"] == 1 + assert trace_a.xbins["end"] == 6 + assert trace_b.xbins["start"] == 1 + assert trace_b.xbins["end"] == 6 + + +def test_create_histograms_with_empty_dataframe(): + df_empty = pd.DataFrame({"value": []}) + df_nonempty = pd.DataFrame({"value": [1, 2, 3]}) + + fig = create_histograms( + dataframe_a=df_empty, + dataframe_b=df_nonempty, + relevant_column_a="value", + relevant_column_b="value", + name_a="Empty", + name_b="NonEmpty", + one_bin_per_int=True, + ) + + trace_empty = fig.data[0] + trace_nonempty = fig.data[1] + + # Ensure the function did not crash and returned a Figure + assert isinstance(fig, Figure) + + # Even if dataframe_a is empty, trace_a should exist with default bin size 1 + assert trace_empty.xbins["size"] == 1 + + # trace_b should have correct start/end bin values + assert trace_nonempty.xbins["start"] == 1 # floor(min(values_b)) = 1 + assert trace_nonempty.xbins["end"] == 3 # ceil(max(values_b)) = 3 + assert trace_nonempty.xbins["size"] == 1 + + +def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend_multiple_calls(): + fig = go.Figure() + add_vertical_line_with_annotation_in_legend( + fig=fig, dash="dash", annotation="Line 1", x_value=1.0 + ) + add_vertical_line_with_annotation_in_legend( + fig=fig, 
dash="dot", annotation="Line 2", x_value=2.0, color="green" + ) + + # Check layout.shapes -> add_vline internally adds a shape to layout.shapes + assert len(fig.layout.shapes) == 2 + vlines_x = [shape.x0 for shape in fig.layout.shapes] + assert vlines_x == [1.0, 2.0] + vlines_colors = [shape["line"]["color"] for shape in fig.layout.shapes] + assert vlines_colors == ["blue", "green"] + vlines_dashes = [shape["line"]["dash"] for shape in fig.layout.shapes] + assert vlines_dashes == ["dash", "dot"] + + # Check legend traces + assert len(fig.data) == 2 + names = [trace.name for trace in fig.data] + colors = [trace.line.color for trace in fig.data] + dashes = [trace.line.dash for trace in fig.data] + x_values = [trace.x for trace in fig.data] + y_values = [trace.y for trace in fig.data] + assert names == ["Line 1", "Line 2"] + assert colors == ["blue", "green"] + assert dashes == ["dash", "dot"] + assert x_values == [(None,), (None,)] + assert y_values == [(None,), (None,)] + + @pytest.mark.order(2) @pytest.mark.dependency( depends=[ From cb608b65ecfd3c2289074a0ecff15402cbf57f4c Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 16 Feb 2026 11:04:44 +0100 Subject: [PATCH 115/240] feat: add step for generating a json alphafold multimer query --- backend/protzilla/all_steps.py | 1 + .../protzilla/importing/query_generation.py | 46 +++++++++++++++++++ backend/protzilla/methods/importing.py | 31 +++++++++++++ 3 files changed, 78 insertions(+) create mode 100644 backend/protzilla/importing/query_generation.py diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 8bb5d4d75..8a4dac352 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -17,6 +17,7 @@ importing.AlphaFoldPredictionLoad, importing.CrosslinkingImport, importing.ImportStructurePredictionFromDisk, + importing.AlphaFoldMultimerQueryJsonGeneration, data_preprocessing.FilterProteinsBySamplesMissing, data_preprocessing.FilterProteinsBySilacRatios, 
data_preprocessing.FilterByProteinsCount, diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py new file mode 100644 index 000000000..bb0a2ea92 --- /dev/null +++ b/backend/protzilla/importing/query_generation.py @@ -0,0 +1,46 @@ +import pandas as pd +import requests + + +def generate_alphafold_multimer_query_json( + protein_ids: str, number_copies: str +) -> dict: + # extract contents and make sure they have the same length -> otherwise raise error + uniprot_ids = protein_ids.split() + try: + copies_per_id = [int(input) for input in number_copies.split()] + except ValueError as e: + raise ValueError( + "Invalid copies_per_id: please provide space-separated integers" + ) + if len(uniprot_ids) != len(number_copies): + dict(messages={}, tmp_df=pd.DataFrame()) + + data_for_query = { + "name": "_".join(protein_ids.split()) + "_prediction", + "modelSeeds": [], + "sequences": [], + "dialect": "alphafoldserver", + "version": 1, + } + + for uniprot_id, copies in zip(uniprot_ids, copies_per_id): + url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" + + response = requests.get(url) + response.raise_for_status() # TODO: was macht das? 
+ + fasta = response.text + amino_acid_sequence = "".join( + line.strip() for line in fasta.splitlines() if not line.startswith(">") + ) + data_for_query["sequences"].append( + { + "proteinChain": { + "sequence": amino_acid_sequence, + "count": copies, + } + } + ) + + return dict(messages={}, tmp_df=pd.DataFrame()) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 07e7957b1..12f42e816 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -27,6 +27,7 @@ FeatureOrientationType, ) from backend.protzilla.constants.intensity_types import IntensityType, IntensityNameType +from protzilla.importing.query_generation import generate_alphafold_multimer_query_json class ImportingStep(Step): @@ -508,3 +509,33 @@ def create_form(self): ) calc_method = staticmethod(get_prot_structure_dfs) + + +class AlphaFoldMultimerQueryJsonGeneration(ImportingStep): + display_name = "AlphaFold Multimer Query JSON Generation" + operation = "Query Generation" + method_description = "Generate a JSON to upload to AlphaFold-Server to generate a prediction on a multimer." + + output_keys = ["tmp_df"] + + def create_form(self): + return Form( + label="AlphaFold Multimer Query JSON Generation", + input_fields=[ + TextField( + name="protein_ids", + label="Protein UniProt IDs", + ), + InfoField(label="IDs should be separated by a space."), + TextField( + name="number_copies", + label="Number of copies for each protein ID", + ), + InfoField( + label="For each entered ID a number should be entered.\n" + "Numbers should be separated by a space." 
+ ), + ], + ) + + calc_method = staticmethod(generate_alphafold_multimer_query_json) From 32c657fd597e809ffbde15db5a2bd97e175ce60f Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 16 Feb 2026 11:05:11 +0100 Subject: [PATCH 116/240] feat: add download tab to outputs --- backend/tests/main/test_views_helper.py | 1 + frontend/src/components/app/run-screen/run-screen.tsx | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 35df025d7..a6243f46b 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -16,6 +16,7 @@ def test_get_all_possible_step_names(): "AlphaFoldPredictionLoad", "CrosslinkingImport", "ImportStructurePredictionFromDisk", + "AlphaFoldMultimerQueryJsonGeneration", "FilterProteinsBySamplesMissing", "FilterProteinsBySilacRatios", "FilterByProteinsCount", diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index 55cb7b180..364d3010e 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -235,6 +235,10 @@ export const RunScreen: React.FC = () => { ); + const downloadComponent = ( + + ); + const listEditorComponent = ( { components={[ { name: "Plots", value: plotComponent }, { name: "Tables", value: tableComponent }, + { name: "Downloads", value: downloadComponent }, { name: "Other Output", value: otherComponent }, ]} hasCardTitle={false} From aa0edfa65b567f73b8f8b33efce5f5b0c218061c Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 16 Feb 2026 14:28:16 +0100 Subject: [PATCH 117/240] fix: minor changes from review --- backend/main/views_settings.py | 24 ++++++++----------- .../alphafold_protein_structure_load.py | 8 +++---- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 
8c3669373..9cd52dba9 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -241,8 +241,8 @@ def check_and_copy_files_to_directory(file_names: list, target_dir: str): target_dir.mkdir(parents=True, exist_ok=True) for file_name in file_names: - source_dir = settings.FILE_UPLOAD_TEMP_DIR / file_name - success, message = copy_file_to_directory(source_dir, target_dir) + source_file = settings.FILE_UPLOAD_TEMP_DIR / file_name + success, message = copy_file_to_directory(source_file, target_dir) if not success: return False, message return True, "All files successfully uploaded" @@ -272,7 +272,7 @@ def delete_structure(dir_path: str, csv_file_path: str, request): ) # delete folder with files for the monomer structure - target_dir = dir_path / entry_id.upper() + target_dir = dir_path / entry_id metadata_csv = csv_file_path if not target_dir.exists() or not target_dir.is_dir(): @@ -297,9 +297,7 @@ def delete_structure(dir_path: str, csv_file_path: str, request): ): try: df = pandas.read_csv(metadata_csv, dtype=str) - df = df[ - df["entry_id"].fillna("").str.strip().str.upper() != entry_id.upper() - ] + df = df[df["entry_id"].fillna("").str.strip().str != entry_id] df.to_csv(metadata_csv, index=False) except Exception as e: @@ -358,7 +356,7 @@ def upload_monomer_structure(request): # Copy files to source directory out of temp directory - target_dir = ALPHAFOLD_MONOMER_PATH / entry_id.upper() + target_dir = ALPHAFOLD_MONOMER_PATH / entry_id file_names = [cif_file, confidence, pae, fasta_file] success, message = check_and_copy_files_to_directory( file_names=file_names, target_dir=target_dir @@ -385,13 +383,12 @@ def upload_monomer_structure(request): csv_file_path=metadata_csv, expected_columns=expected_columns ) - now_utc = datetime.now(timezone.utc) - formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") new_row = { "entry_id": entry_id, "uniprot_accession": uniprot_id, - 
"model_created_date": formatted, + "model_created_date": timestamp, "gene": gene, "model_used": model_used, } @@ -462,7 +459,7 @@ def upload_multimer_structure(request): # Copy files to source directory out of temp directory - target_dir = ALPHAFOLD_MULTIMER_PATH / entry_id.upper() + target_dir = ALPHAFOLD_MULTIMER_PATH / entry_id file_names = [fasta_file, cif_file, confidence_file, full_data_file] success, message = check_and_copy_files_to_directory( file_names=file_names, target_dir=target_dir @@ -486,13 +483,12 @@ def upload_multimer_structure(request): csv_file_path=metadata_csv, expected_columns=expected_columns ) - now_utc = datetime.now(timezone.utc) - formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") new_row = { "entry_id": entry_id, "uniprot_ids": uniprot_ids, - "model_created_date": formatted, + "model_created_date": timestamp, "model_used": model_used, } diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index a13f1e118..46346c58b 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -149,7 +149,7 @@ def get_correct_af_directories( created) and the working directory to use. 
""" - target_dir = directory_name / entry_id.upper() + target_dir = directory_name / entry_id temp_dir = None if persist_upload: @@ -306,7 +306,7 @@ def handle_alphafold_files( fasta_dest: Path | None = None try: sequence = to_fasta(seq=seq, header=uniprot) - fasta_dest = work_dir / f"{entry_id.upper()}.fasta" + fasta_dest = work_dir / f"{entry_id}.fasta" fasta_dest.parent.mkdir(parents=True, exist_ok=True) with open(fasta_dest, "w") as f: f.write(sequence) @@ -623,7 +623,7 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: csv_file=paths.AF_MONOMER_METADATA_CSV_PATH, ) - structure_dir = paths.ALPHAFOLD_MONOMER_PATH / entry_id.upper() + structure_dir = paths.ALPHAFOLD_MONOMER_PATH / entry_id check_dir(entry_id=entry_id, dir=structure_dir) # get cif file @@ -705,7 +705,7 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: csv_file=paths.AF_MULTIMER_METADATA_CSV_PATH, ) - structure_dir = paths.ALPHAFOLD_MULTIMER_PATH / entry_id.upper() + structure_dir = paths.ALPHAFOLD_MULTIMER_PATH / entry_id check_dir(entry_id=entry_id, dir=structure_dir) # get cif file From 81d1741ee1dc3c4d4acf007748d5b57b8133d627 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 16 Feb 2026 14:42:09 +0100 Subject: [PATCH 118/240] style: address code review feedback --- .../data_analysis/crosslinking_validation.py | 31 ++++++++++--------- backend/protzilla/data_analysis/plots.py | 6 +++- .../test_crosslinking_validation.py | 30 +++++++++++------- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 49c51e75c..096a65955 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -365,7 +365,7 @@ def diagrams_of_crosslinking_validation_data( dataframe_b=df_invalid, name_a="Valid Crosslinks", name_b="Invalid Crosslinks", - heading=f"Predicted 
distances for {protein_to_validate}", + heading=f"Predicted distances for {protein_to_validate} with crosslinker {crosslinker}", x_title="Distance (Å)", y_title="Count", overlay=True, @@ -381,28 +381,31 @@ def diagrams_of_crosslinking_validation_data( x_value=crosslinker_length, ) - mean_predicted_lengths = crosslinker_df["alphafold_distance"].mean() + mean_of_predicted_lengths = crosslinker_df["alphafold_distance"].mean() standard_deviation_predicted_lengths = crosslinker_df[ "alphafold_distance" ].std() - mean_plus_minus_two_std_range = ( - max(0, mean_predicted_lengths - 2 * standard_deviation_predicted_lengths), - mean_predicted_lengths + 2 * standard_deviation_predicted_lengths, + mean_plus_two_std = ( + mean_of_predicted_lengths + 2 * standard_deviation_predicted_lengths ) + mean_minus_two_std = max( + 0, mean_of_predicted_lengths - 2 * standard_deviation_predicted_lengths + ) + histogram_two_standard_deviations = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, name_a="Valid Crosslinks", name_b="Invalid Crosslinks", - heading=f"Predicted distances for {protein_to_validate}, mean +- 2 standard deviations", + heading=f"Predicted distances for {protein_to_validate} with crosslinker {crosslinker}, mean +/- 2 σ", x_title="Distance (Å)", y_title="Count", overlay=True, visual_transformation="linear", relevant_column_a="alphafold_distance", relevant_column_b="alphafold_distance", - min_value=mean_plus_minus_two_std_range[0], - max_value=mean_plus_minus_two_std_range[1], + min_value=mean_minus_two_std, + max_value=mean_plus_two_std, one_bin_per_int=True, ) add_vertical_line_with_annotation_in_legend( @@ -420,9 +423,9 @@ def diagrams_of_crosslinking_validation_data( x_value=crosslinker_length + accepted_deviation_upper_bound, ) if ( - math.floor(mean_plus_minus_two_std_range[0]) + math.floor(mean_minus_two_std) <= crosslinker_length + accepted_deviation_upper_bound - <= math.ceil(mean_plus_minus_two_std_range[1]) + <= math.ceil(mean_plus_two_std) ): 
add_vertical_line_with_annotation_in_legend( fig=histogram_two_standard_deviations, @@ -438,9 +441,9 @@ def diagrams_of_crosslinking_validation_data( x_value=crosslinker_length - accepted_deviation_lower_bound, ) if ( - math.floor(mean_plus_minus_two_std_range[0]) + math.floor(mean_minus_two_std) <= crosslinker_length - accepted_deviation_lower_bound - <= math.ceil(mean_plus_minus_two_std_range[1]) + <= math.ceil(mean_plus_two_std) ): add_vertical_line_with_annotation_in_legend( fig=histogram_two_standard_deviations, @@ -452,8 +455,8 @@ def diagrams_of_crosslinking_validation_data( figures.append(histogram_two_standard_deviations) figures.append(histogram) - valid_crosslinks = (validated_df["Is_intra_crosslink"] == True).sum() - invalid_crosslinks = (validated_df["Is_intra_crosslink"] == False).sum() + valid_crosslinks = (validated_df["valid_crosslink"] == True).sum() + invalid_crosslinks = (validated_df["valid_crosslink"] == False).sum() bar_plot_over_all_checked_crosslinks = create_bar_plot( values_of_sectors=[ diff --git a/backend/protzilla/data_analysis/plots.py b/backend/protzilla/data_analysis/plots.py index 5ef1b03b6..da2b45f32 100644 --- a/backend/protzilla/data_analysis/plots.py +++ b/backend/protzilla/data_analysis/plots.py @@ -471,7 +471,11 @@ def prot_quant_plot( def add_vertical_line_with_annotation_in_legend( - fig: Figure, dash: str, annotation: str, x_value: float, color: str = "blue" + fig: Figure, + dash: str, + annotation: str, + x_value: float, + color: str = PLOT_PRIMARY_COLOR, ) -> None: """ Adds a vertical line to a Plotly figure and includes a corresponding entry in the legend diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index d95f3fdd7..33a52727e 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -13,8 +13,13 @@ 
add_positions_of_amino_acid_where_crosslinker_bound_to_df, diagrams_of_crosslinking_validation_data, ) -from protzilla.data_analysis.plots import add_vertical_line_with_annotation_in_legend -from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation +from backend.protzilla.constants.colors import PLOT_PRIMARY_COLOR +from backend.protzilla.data_analysis.plots import ( + add_vertical_line_with_annotation_in_legend, +) +from backend.protzilla.methods.data_analysis import ( + CrossLinkingValidationWithAngstromDeviation, +) @pytest.mark.parametrize( @@ -264,7 +269,7 @@ def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend(): vline = fig.layout.shapes[0] assert vline["x0"] == 5.0 assert vline["line"]["dash"] == "dash" - assert vline["line"]["color"] == "blue" + assert vline["line"]["color"] == PLOT_PRIMARY_COLOR # There should be 1 scatter trace for the legend assert len(fig.data) == 1 @@ -272,7 +277,7 @@ def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend(): assert trace.mode == "lines" assert trace.name == "Test Line" assert trace.line.dash == "dash" - assert trace.line.color == "blue" + assert trace.line.color == PLOT_PRIMARY_COLOR assert trace.x == (None,) assert trace.y == (None,) @@ -284,7 +289,6 @@ def sample_crosslinking_df(): "Crosslinker": ["CL1", "CL1", "CL2", "CL2"], "alphafold_distance": [10.0, 12.0, 8.0, 9.0], "valid_crosslink": [True, False, True, False], - "Is_intra_crosslink": [True, False, True, False], } ) @@ -353,7 +357,6 @@ def sample_crosslinking_df_with_no_std(): "Crosslinker": ["CL1", "CL1", "CL2", "CL2"], "alphafold_distance": [10.5, 10.5, 10.5, 10.5], "valid_crosslink": [True, False, True, False], - "Is_intra_crosslink": [True, False, True, False], } ) @@ -427,7 +430,6 @@ def sample_crosslinking_df_with_one_crosslinker(): "Crosslinker": ["CL1", "CL1", "CL1", "CL1"], "alphafold_distance": [10.0, 12.0, 8.0, 9.0], "valid_crosslink": [True, False, True, False], - 
"Is_intra_crosslink": [True, False, True, False], } ) @@ -470,7 +472,10 @@ def test_diagrams_calls_with_correct_parameters( first_hist_call = mock_hist.call_args_list[0].kwargs assert first_hist_call["name_a"] == "Valid Crosslinks" assert first_hist_call["name_b"] == "Invalid Crosslinks" - assert first_hist_call["heading"] == "Predicted distances for P12345" + assert ( + first_hist_call["heading"] + == "Predicted distances for P12345 with crosslinker CL1" + ) assert first_hist_call["relevant_column_a"] == "alphafold_distance" assert first_hist_call["relevant_column_b"] == "alphafold_distance" assert first_hist_call["one_bin_per_int"] == True @@ -490,7 +495,10 @@ def test_diagrams_calls_with_correct_parameters( # Check histogram call parameters for crosslinker ±2 std second_hist_call = mock_hist.call_args_list[1].kwargs - assert "mean +- 2 standard deviations" in second_hist_call["heading"] + assert ( + second_hist_call["heading"] + == "Predicted distances for P12345 with crosslinker CL1, mean +/- 2 σ" + ) mean_predicted_lengths = sample_crosslinking_df_with_one_crosslinker[ "alphafold_distance" ].mean() @@ -513,8 +521,8 @@ def test_diagrams_calls_with_correct_parameters( mock_bar.assert_called_once() expected_figures = [ - "hist_Predicted distances for P12345, mean +- 2 standard deviations", - "hist_Predicted distances for P12345", + "hist_Predicted distances for P12345 with crosslinker CL1, mean +/- 2 σ", + "hist_Predicted distances for P12345 with crosslinker CL1", "bar_fig", ] assert figures == expected_figures From 62a2cd3e622bec51b1763193e109753edafb80c1 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 16 Feb 2026 15:37:06 +0100 Subject: [PATCH 119/240] fix: update test --- .../importing/test_alphafold_protein_structure_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 
5f11baedf..bfdd9ea0d 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -362,7 +362,7 @@ def test_get_correct_af_directories_persist_and_temp(tmp_path): # persist_upload True temp, work = get_correct_af_directories("abc", tmp_path, True) assert temp is None - assert work == tmp_path / "ABC" + assert work == tmp_path / "abc" assert work.exists() # persist_upload False -> temporary directory created From aa3356203a36158960d59bfe115660df9fcab899 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 17 Feb 2026 13:23:38 +0100 Subject: [PATCH 120/240] fix: add some explanations for entry fields --- backend/protzilla/methods/importing.py | 3 +++ .../other-settings/monomer-structure-upload.tsx | 6 ++++++ .../other-settings/multimer-structure-upload.tsx | 13 +++++++++++++ 3 files changed, 22 insertions(+) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index e163140d7..c8d08528c 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -534,6 +534,9 @@ def create_form(self): name="entry_id", label="Entry ID of the prediction to be loaded into the run.", ), + InfoField( + label="The entry ID should be a unique name given to the uploaded prediction.", + ), TextField( name="uniprot_ids", label="Protein IDs of all proteins used in the sequence.", diff --git a/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx index 453084efb..4122e09ee 100644 --- a/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx @@ -191,6 +191,12 @@ export const MonomerStructureUpload = () => { label: "Entry ID (required):", isVisible: true, }, + { + type: "info-field", + name: 
"entry_id_info", + label: "The entry ID should be a unique name given to the uploaded prediction.", + isVisible: true, + }, { type: "text", name: "model_used", diff --git a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx index 32e5d3f69..a504c36a7 100644 --- a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx @@ -178,12 +178,25 @@ export const MultimerStructureUpload = () => { label: "Entry ID (required):", isVisible: true, }, + { + type: "info-field", + name: "entry_id_info", + label: "The entry ID should be a unique name given to the uploaded prediction.", + isVisible: true, + }, { type: "text", name: "uniprot_ids", label: "Protein IDs of all proteins used in the sequence (required):", isVisible: true, }, + { + type: "info-field", + name: "uniprot_ids_info", + label: + "Please provide the list of Protein IDs separated by a comma e.g.: P68871, P69905, Q5VSL9", + isVisible: true, + }, { type: "text", name: "model_used", From ad1f0dcb6a8ce2bfcf51b42e34cf88c0c21e543f Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 17 Feb 2026 14:18:27 +0100 Subject: [PATCH 121/240] fix: entry ids are treated case insensitively and according warnings are given to users --- backend/main/views_settings.py | 132 ++++++++++++------ .../alphafold_protein_structure_load.py | 31 ++-- .../test_alphafold_protein_structure_load.py | 2 +- 3 files changed, 107 insertions(+), 58 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 9cd52dba9..618df5f35 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -5,7 +5,7 @@ from io import BytesIO -import pandas +import pandas as pd import plotly.graph_objects as go import plotly.io as pio from PIL import Image @@ -236,7 +236,10 @@ def 
save_ptm_settings(request, default_file_stem: str = DEFAULT_PTM_SETTINGS_FIL # <--- helper functions for monomer and multimer structure prediction ---> def check_and_copy_files_to_directory(file_names: list, target_dir: str): if target_dir.exists(): - return False, "Entry ID is not unique." + return ( + False, + 'Entry ID is not unique. Entry IDs are compared case insensitively, so "ABC" and "abc" are treated as the same ID.', + ) else: target_dir.mkdir(parents=True, exist_ok=True) @@ -248,13 +251,11 @@ def check_and_copy_files_to_directory(file_names: list, target_dir: str): return True, "All files successfully uploaded" -def get_metadata_df( - csv_file_path: str, expected_columns: list[str] -) -> pandas.DataFrame: +def get_metadata_df(csv_file_path: str, expected_columns: list[str]) -> pd.DataFrame: if csv_file_path.exists(): - df = pandas.read_csv(csv_file_path, usecols=lambda c: c in expected_columns) + df = pd.read_csv(csv_file_path, usecols=lambda c: c in expected_columns) else: - df = pandas.DataFrame(columns=expected_columns) + df = pd.DataFrame(columns=expected_columns) return df @@ -272,7 +273,7 @@ def delete_structure(dir_path: str, csv_file_path: str, request): ) # delete folder with files for the monomer structure - target_dir = dir_path / entry_id + target_dir = dir_path / entry_id.upper() metadata_csv = csv_file_path if not target_dir.exists() or not target_dir.is_dir(): @@ -296,8 +297,10 @@ def delete_structure(dir_path: str, csv_file_path: str, request): and metadata_csv.stat().st_size > 0 ): try: - df = pandas.read_csv(metadata_csv, dtype=str) - df = df[df["entry_id"].fillna("").str.strip().str != entry_id] + df = pd.read_csv(metadata_csv, dtype=str) + df = df[ + (df["entry_id"].fillna("").str.strip().str).upper() != entry_id.upper() + ] df.to_csv(metadata_csv, index=False) except Exception as e: @@ -314,6 +317,29 @@ def delete_structure(dir_path: str, csv_file_path: str, request): ) +def extend_metadata_csv( + entry_id: str, + metadata_csv: str, 
+ existing_metadata_df: pd.DataFrame, + metadata_df: pd.DataFrame, +) -> None: + try: + mask = ( + existing_metadata_df["entry_id"].astype(str).str.upper() == entry_id.upper() + ) + if mask.any(): + msg = f'Entry ID "{entry_id}" not unique. Entry IDs are compared case insensitively, so "ABC" and "abc" are treated as the same ID.' + return False, msg + + combined = pd.concat([existing_metadata_df, metadata_df], ignore_index=True) + combined.to_csv(metadata_csv, index=False) + return True, f'"{metadata_csv}" updated successfully.' + + except Exception: + msg = f'Failed to write AlphaFold metadata CSV to "{metadata_csv}".' + return False, msg + + # <--- Monomer Structure Predictions ---> @@ -354,19 +380,6 @@ def upload_monomer_structure(request): pae = data.get("pae") fasta_file = data.get("fasta_file") - # Copy files to source directory out of temp directory - - target_dir = ALPHAFOLD_MONOMER_PATH / entry_id - file_names = [cif_file, confidence, pae, fasta_file] - success, message = check_and_copy_files_to_directory( - file_names=file_names, target_dir=target_dir - ) - if not success: - return JsonResponse( - {"success": False, "message": message}, - status=500, - ) - # add row to metadata csv ALPHAFOLD_MONOMER_PATH.mkdir(parents=True, exist_ok=True) metadata_csv = AF_MONOMER_METADATA_CSV_PATH @@ -379,7 +392,7 @@ def upload_monomer_structure(request): "model_used", ] - df = get_metadata_df( + existing_metadata_df = get_metadata_df( csv_file_path=metadata_csv, expected_columns=expected_columns ) @@ -393,8 +406,31 @@ def upload_monomer_structure(request): "model_used": model_used, } - df = pandas.concat([df, pandas.DataFrame([new_row])], ignore_index=True) - df.to_csv(metadata_csv, index=False) + metadata_df = pd.DataFrame([new_row]) + success, message = extend_metadata_csv( + entry_id=entry_id, + metadata_csv=metadata_csv, + existing_metadata_df=existing_metadata_df, + metadata_df=metadata_df, + ) + if not success: + return JsonResponse( + {"success": False, 
"message": message}, + status=500, + ) + + # Copy files to source directory out of temp directory + + target_dir = ALPHAFOLD_MONOMER_PATH / entry_id.upper() + file_names = [cif_file, confidence, pae, fasta_file] + success, message = check_and_copy_files_to_directory( + file_names=file_names, target_dir=target_dir + ) + if not success: + return JsonResponse( + {"success": False, "message": message}, + status=500, + ) return JsonResponse( { @@ -457,19 +493,6 @@ def upload_multimer_structure(request): confidence_file = data.get("confidence_file") full_data_file = data.get("full_data_file") - # Copy files to source directory out of temp directory - - target_dir = ALPHAFOLD_MULTIMER_PATH / entry_id - file_names = [fasta_file, cif_file, confidence_file, full_data_file] - success, message = check_and_copy_files_to_directory( - file_names=file_names, target_dir=target_dir - ) - if not success: - return JsonResponse( - {"success": False, "message": message}, - status=500, - ) - # add row to metadata csv metadata_csv = AF_MULTIMER_METADATA_CSV_PATH expected_columns = [ @@ -479,7 +502,7 @@ def upload_multimer_structure(request): "model_used", ] - df = get_metadata_df( + existing_metadata_df = get_metadata_df( csv_file_path=metadata_csv, expected_columns=expected_columns ) @@ -492,8 +515,31 @@ def upload_multimer_structure(request): "model_used": model_used, } - df = pandas.concat([df, pandas.DataFrame([new_row])], ignore_index=True) - df.to_csv(metadata_csv, index=False) + metadata_df = pd.DataFrame([new_row]) + success, message = extend_metadata_csv( + entry_id=entry_id, + metadata_csv=metadata_csv, + existing_metadata_df=existing_metadata_df, + metadata_df=metadata_df, + ) + if not success: + return JsonResponse( + {"success": False, "message": message}, + status=500, + ) + + # Copy files to source directory out of temp directory + + target_dir = ALPHAFOLD_MULTIMER_PATH / entry_id.upper() + file_names = [fasta_file, cif_file, confidence_file, full_data_file] + success, 
message = check_and_copy_files_to_directory( + file_names=file_names, target_dir=target_dir + ) + if not success: + return JsonResponse( + {"success": False, "message": message}, + status=500, + ) return JsonResponse( { @@ -578,7 +624,7 @@ def database_upload(request): return JsonResponse({"success": False, "message": msg}, status=400) try: - dataframe = pandas.read_csv(path, sep="\t") + dataframe = pd.read_csv(path, sep="\t") except UnicodeDecodeError: msg = "File could not be decoded." messages.add_message(request, messages.ERROR, msg, "alert-danger") diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 46346c58b..cb41dba73 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -149,7 +149,7 @@ def get_correct_af_directories( created) and the working directory to use. """ - target_dir = directory_name / entry_id + target_dir = directory_name / entry_id.upper() temp_dir = None if persist_upload: @@ -165,7 +165,7 @@ def get_correct_af_directories( def extend_metadata_csv( entry_id: str, metadata_csv: Path, - exsisting_metadata_df: pd.DataFrame, + existing_metadata_df: pd.DataFrame, metadata_df: pd.DataFrame, messages: list, ) -> None: @@ -194,14 +194,16 @@ def extend_metadata_csv( concatenation or writing to disk after logging them. """ try: - mask = exsisting_metadata_df["entry_id"] == entry_id + mask = ( + existing_metadata_df["entry_id"].astype(str).str.upper() == entry_id.upper() + ) if mask.any(): - msg = f'Existing entry with Entry ID "{entry_id}" was overwritten.' + msg = f'Existing entry with Entry ID "{entry_id}" was overwritten. Entry IDs are compared case insensitively, so "ABC" and "abc" are treated as the same ID.' 
logger.warning(msg) messages.append(dict(level=logging.WARNING, msg=msg)) - exsisting_metadata_df = exsisting_metadata_df[~mask] + existing_metadata_df = existing_metadata_df[~mask] - combined = pd.concat([exsisting_metadata_df, metadata_df], ignore_index=True) + combined = pd.concat([existing_metadata_df, metadata_df], ignore_index=True) combined.to_csv(metadata_csv, index=False) except Exception: msg = f'Failed to write AlphaFold metadata CSV to "{metadata_csv}".' @@ -280,7 +282,7 @@ def handle_alphafold_files( extend_metadata_csv( entry_id=entry_id, metadata_csv=paths.AF_MONOMER_METADATA_CSV_PATH, - exsisting_metadata_df=existing_metadata_df, + existing_metadata_df=existing_metadata_df, metadata_df=metadata_df, messages=messages, ) @@ -455,7 +457,9 @@ def check_and_get_metadata_df( Entry ID. :raises ValueError: If no metadata for the given Entry ID is found. """ - metadata_df = all_metadata_df[all_metadata_df["entry_id"] == entry_id] + metadata_df = all_metadata_df[ + all_metadata_df["entry_id"].upper() == entry_id.upper() + ] if metadata_df.empty: msg = f"No metadata for Entry ID '{entry_id}' in {csv_file}" logger.error(msg) @@ -623,7 +627,7 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: csv_file=paths.AF_MONOMER_METADATA_CSV_PATH, ) - structure_dir = paths.ALPHAFOLD_MONOMER_PATH / entry_id + structure_dir = paths.ALPHAFOLD_MONOMER_PATH / entry_id.upper() check_dir(entry_id=entry_id, dir=structure_dir) # get cif file @@ -705,7 +709,7 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: csv_file=paths.AF_MULTIMER_METADATA_CSV_PATH, ) - structure_dir = paths.ALPHAFOLD_MULTIMER_PATH / entry_id + structure_dir = paths.ALPHAFOLD_MULTIMER_PATH / entry_id.upper() check_dir(entry_id=entry_id, dir=structure_dir) # get cif file @@ -828,13 +832,12 @@ def upload_multimer_prediction( persist_upload=persist_upload, ) - now_utc = datetime.now(timezone.utc) - formatted = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ") + timestamp = 
datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") data: dict[str, Any] = { "entry_id": entry_id, "uniprot_ids": uniprot_ids, - "model_created_date": formatted, + "model_created_date": timestamp, "model_used": model_used, } @@ -845,7 +848,7 @@ def upload_multimer_prediction( extend_metadata_csv( entry_id=entry_id, metadata_csv=paths.AF_MULTIMER_METADATA_CSV_PATH, - exsisting_metadata_df=exsisting_metadata_df, + existing_metadata_df=exsisting_metadata_df, metadata_df=metadata_df, messages=messages, ) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index bfdd9ea0d..5f11baedf 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -362,7 +362,7 @@ def test_get_correct_af_directories_persist_and_temp(tmp_path): # persist_upload True temp, work = get_correct_af_directories("abc", tmp_path, True) assert temp is None - assert work == tmp_path / "abc" + assert work == tmp_path / "ABC" assert work.exists() # persist_upload False -> temporary directory created From c532c3eac3a8f6db875712a091da200edb0b0859 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 17 Feb 2026 14:54:41 +0100 Subject: [PATCH 122/240] fix: minor bug --- backend/protzilla/importing/alphafold_protein_structure_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index cb41dba73..7e65adb58 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -458,7 +458,7 @@ def check_and_get_metadata_df( :raises ValueError: If no metadata for the given Entry ID is found. 
""" metadata_df = all_metadata_df[ - all_metadata_df["entry_id"].upper() == entry_id.upper() + all_metadata_df["entry_id"].astype(str).str.upper() == entry_id.upper() ] if metadata_df.empty: msg = f"No metadata for Entry ID '{entry_id}' in {csv_file}" From e706ea4238fc0cb7104277fabdc184b32149ba24 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sun, 22 Feb 2026 20:45:18 +0100 Subject: [PATCH 123/240] fix: refactor naming to make it more general for multimer structure use --- .../data_analysis/crosslinking_validation.py | 29 +++++++++---------- .../alphafold_protein_structure_load.py | 22 +++++++------- backend/protzilla/methods/data_analysis.py | 12 ++++---- backend/protzilla/methods/importing.py | 4 +-- .../test_crosslinking_validation.py | 7 ++--- .../test_alphafold_protein_structure_load.py | 24 +++++++-------- 6 files changed, 47 insertions(+), 51 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 7d6a2b0fb..bd4cc216c 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -6,9 +6,6 @@ import logging from plotly.graph_objects import Figure -from protzilla.importing.alphafold_protein_structure_load import ( - fetch_alphafold_protein_structure, -) from protzilla.data_preprocessing.plots import create_bar_plot @@ -191,10 +188,10 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( def validate_with_angstrom_deviation( crosslinking_df: pd.DataFrame, - protein_to_validate: str, + structure_to_validate: str, crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, - amino_acid_sequence_df: pd.DataFrame, + amino_acid_sequences_df: pd.DataFrame, ) -> dict: """ Validates cross-links by comparing the cross-linker lengths with the distances between the linked @@ -203,26 +200,26 @@ def validate_with_angstrom_deviation( and more than (cross-linker length - the lower 
allowed deviation). If one of the bounds is zero only the other bound will be applied. :param crosslinking_df: DataFrame containing cross-linking data. - :param protein_to_validate: UniProt ID of the protein to validate. + :param structure_to_validate: UniProt ID of the protein to validate. :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) - :param amino_acid_sequence_df: DataFrame containing the protein sequence + :param amino_acid_sequences_df: DataFrame containing the protein sequence :return: dict (crosslinking_df_result, messages), crosslinking_df_result contains the relevant rows (rows of intra-crosslinks within the protein to validate) of crosslinking_df and two more columns containing the distances in AlphaFold and whether the crosslink matches the AlphaFold data or not :raises KeyError: If a required crosslinker field is missing in crosslinker_information. :raises ValueError: If peptide sequences cannot be matched to the protein sequence. 
""" - protein_sequence = amino_acid_sequence_df.at[0, "Protein Sequence"] + protein_sequence = amino_acid_sequences_df.at[0, "Protein Sequence"] all_crosslinks_df = crosslinking_df.copy() # we are only interested in intra-crosslinks of the protein we want to validate - mask = (all_crosslinks_df.Protein_id1 == protein_to_validate) & ( - all_crosslinks_df.Protein_id2 == protein_to_validate + mask = (all_crosslinks_df.Protein_id1 == structure_to_validate) & ( + all_crosslinks_df.Protein_id2 == structure_to_validate ) relevant_crosslinks_df = all_crosslinks_df[mask].copy() @@ -295,10 +292,10 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: def bar_plot_of_valid_crosslinks( crosslinking_df: pd.DataFrame, - protein_to_validate: str, + structure_to_validate: str, crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, - amino_acid_sequence_df: pd.DataFrame, + amino_acid_sequences_df: pd.DataFrame, ) -> list[Figure]: """ Creates a bar plot summarizing the number of valid and invalid cross-links @@ -306,23 +303,23 @@ def bar_plot_of_valid_crosslinks( lengths and allowed deviations. :param crosslinking_df: DataFrame containing cross-linking data. - :param protein_to_validate: UniProt ID of the protein to validate. + :param structure_to_validate: UniProt ID of the protein to validate. :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) - :param amino_acid_sequence_df: DataFrame containing the protein sequence + :param amino_acid_sequences_df: DataFrame containing the protein sequence :return: List containing a single bar plot object representing counts of valid and invalid cross-links. :raises KeyError: If a required crosslinker field is missing in crosslinker_information. 
""" validated_df = validate_with_angstrom_deviation( crosslinking_df, - protein_to_validate, + structure_to_validate, crosslinker_information, cif_df, - amino_acid_sequence_df, + amino_acid_sequences_df, )["crosslinking_result_df"] evaluated = validated_df["valid_crosslink"].dropna() diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 7e65adb58..bf42da50e 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -211,7 +211,7 @@ def extend_metadata_csv( messages.append(dict(level=logging.ERROR, msg=msg)) -def get_amino_acid_sequence_df(fasta_dest: Path, messages: list) -> pd.DataFrame: +def get_amino_acid_sequences_df(fasta_dest: Path, messages: list) -> pd.DataFrame: """ Load a FASTA file and return its amino acid sequence DataFrame. @@ -230,8 +230,8 @@ def get_amino_acid_sequence_df(fasta_dest: Path, messages: list) -> pd.DataFrame """ try: fasta_dict = fasta_import(str(fasta_dest)) - amino_acid_sequence_df = fasta_dict["fasta_df"] - return amino_acid_sequence_df + amino_acid_sequences_df = fasta_dict["fasta_df"] + return amino_acid_sequences_df except Exception: msg = "Failed to create sequence dataframe" logger.exception(msg) @@ -266,7 +266,7 @@ def handle_alphafold_files( cif_df = pd.DataFrame() pae_df = pd.DataFrame() plddt_df = pd.DataFrame() - amino_acid_sequence_df = pd.DataFrame() + amino_acid_sequences_df = pd.DataFrame() messages = [] temp_dir, work_dir = get_correct_af_directories( @@ -317,7 +317,7 @@ def handle_alphafold_files( logger.exception(msg) messages.append(dict(level=logging.ERROR, msg=msg)) if fasta_dest is not None: - amino_acid_sequence_df = get_amino_acid_sequence_df( + amino_acid_sequences_df = get_amino_acid_sequences_df( fasta_dest=fasta_dest, messages=messages, ) @@ -330,7 +330,7 @@ def handle_alphafold_files( "cif_df": cif_df, "pae_df": pae_df, 
"plddt_df": plddt_df, - "amino_acid_sequence_df": amino_acid_sequence_df, + "amino_acid_sequences_df": amino_acid_sequences_df, "messages": messages, } @@ -405,7 +405,7 @@ def fetch_alphafold_protein_structure( "cif_df": alpha_dfs["cif_df"], "pae_df": alpha_dfs["pae_df"], "plddt_df": alpha_dfs["plddt_df"], - "amino_acid_sequence_df": alpha_dfs["amino_acid_sequence_df"], + "amino_acid_sequences_df": alpha_dfs["amino_acid_sequences_df"], } messages = alpha_dfs["messages"] if not any(df.empty for df in df_dict.values()): @@ -636,7 +636,7 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: ) # get fasta file - amino_acid_sequence_df = get_amino_acid_sequences_df_from_disk( + amino_acid_sequences_df = get_amino_acid_sequences_df_from_disk( entry_id=entry_id, structure_dir=structure_dir ) @@ -686,7 +686,7 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: "cif_df": cif_df, "pae_df": pae_df, "plddt_df": plddt_df, - "amino_acid_sequence_df": amino_acid_sequence_df, + "amino_acid_sequences_df": amino_acid_sequences_df, } check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) df_dict["messages"] = messages @@ -864,7 +864,7 @@ def upload_multimer_prediction( messages.append(dict(level=logging.ERROR, msg=msg)) fasta_dict = fasta_import(str(amino_acid_sequences)) - amino_acid_sequence_df = fasta_dict["fasta_df"] + amino_acid_sequences_df = fasta_dict["fasta_df"] confidence_df = pd.read_json(confidence_file) @@ -889,7 +889,7 @@ def upload_multimer_prediction( "cif_df": cif_df, "confidence_df": confidence_df, "full_data_df": full_data_df, - "amino_acid_sequences_df": amino_acid_sequence_df, + "amino_acid_sequences_df": amino_acid_sequences_df, } if not any(df.empty for df in df_dict.values()): diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index d09d742ef..c94346ba3 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ 
-2564,14 +2564,14 @@ def create_form(self): label="Ångström Deviation", input_fields=[ DropdownField( - name="protein_to_validate", + name="structure_to_validate", label="Protein prediction that should be validated", ), ], ) def modify_form(self, form: Form, run: Run) -> None: - # add all loaded protein entry ids to the dropdown of protein_to_validate_field + # add all loaded protein entry ids to the dropdown of structure_to_validate_field loaded_protein_entry_ids = list( set( run.steps.get_inputs_of_step_type( @@ -2582,7 +2582,7 @@ def modify_form(self, form: Form, run: Run) -> None: ) ) ) - form["protein_to_validate"].set_options( + form["structure_to_validate"].set_options( form_helper.to_choices(loaded_protein_entry_ids) ) # create fields for every crosslink @@ -2613,7 +2613,7 @@ def modify_form(self, form: Form, run: Run) -> None: calc_method = staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - entry_id = inputs["protein_to_validate"] + entry_id = inputs["structure_to_validate"] correct_input_step_identifier = steps.get_step_identifier_of_step_with_input( ImportMonomerStructurePredictionFromDisk, "entry_id", entry_id ) or steps.get_step_identifier_of_step_with_input( @@ -2622,8 +2622,8 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["cif_df"] = steps.get_step_output( Step, "cif_df", correct_input_step_identifier ) - inputs["amino_acid_sequence_df"] = steps.get_step_output( - Step, "amino_acid_sequence_df", correct_input_step_identifier + inputs["amino_acid_sequences_df"] = steps.get_step_output( + Step, "amino_acid_sequences_df", correct_input_step_identifier ) inputs["crosslinking_df"] = steps.get_step_output( Step, diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index c8d08528c..4790c97e7 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -429,7 +429,7 @@ class 
AlphaFoldPredictionLoad(ImportingStep): "cif_df", "pae_df", "plddt_df", - "amino_acid_sequence_df", + "amino_acid_sequences_df", ] plot_method = None @@ -493,7 +493,7 @@ class ImportMonomerStructurePredictionFromDisk(ImportingStep): "cif_df", "pae_df", "plddt_df", - "amino_acid_sequence_df", + "amino_acid_sequences_df", ] def create_form(self): diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 4fdbf63eb..ca960ef6c 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -1,7 +1,6 @@ import pandas as pd import pytest import logging -from unittest.mock import patch from unittest.mock import MagicMock @@ -34,7 +33,7 @@ def test_validate_with_angstrom_deviation(distance, expected): } ) - amino_acid_sequence_df = pd.DataFrame({"Protein Sequence": ["AB"]}) + amino_acid_sequences_df = pd.DataFrame({"Protein Sequence": ["AB"]}) # Fake Crosslink Data crosslinking_df = pd.DataFrame( @@ -53,9 +52,9 @@ def test_validate_with_angstrom_deviation(distance, expected): result = validate_with_angstrom_deviation( crosslinking_df, - protein_to_validate="P12345", + structure_to_validate="P12345", crosslinker_information=crosslinker_information, - amino_acid_sequence_df=amino_acid_sequence_df, + amino_acid_sequences_df=amino_acid_sequences_df, cif_df=cif_df, ) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 5f11baedf..3225851cb 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -18,7 +18,7 @@ get_multimer_metadata_df, get_correct_af_directories, extend_metadata_csv, - get_amino_acid_sequence_df, + get_amino_acid_sequences_df, 
handle_alphafold_files, upload_multimer_prediction, check_and_get_metadata_df, @@ -131,7 +131,7 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): "cif_df", "pae_df", "plddt_df", - "amino_acid_sequence_df", + "amino_acid_sequences_df", "messages", } @@ -203,7 +203,7 @@ def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): assert isinstance(plddt_df, pd.DataFrame) assert not plddt_df.empty - seq_df = out["amino_acid_sequence_df"] + seq_df = out["amino_acid_sequences_df"] assert isinstance(seq_df, pd.DataFrame) assert not seq_df.empty @@ -320,10 +320,10 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): assert out["plddt_df"]["residueNumber"].tolist() == [1] assert out["plddt_df"]["confidenceScore"].tolist() == [90] - assert isinstance(out["amino_acid_sequence_df"], pd.DataFrame) - assert not out["amino_acid_sequence_df"].empty - assert out["amino_acid_sequence_df"]["Protein ID"].tolist() == ["Q8WP00-1"] - assert out["amino_acid_sequence_df"]["Protein Sequence"].tolist() == ["AAAA"] + assert isinstance(out["amino_acid_sequences_df"], pd.DataFrame) + assert not out["amino_acid_sequences_df"].empty + assert out["amino_acid_sequences_df"]["Protein ID"].tolist() == ["Q8WP00-1"] + assert out["amino_acid_sequences_df"]["Protein Sequence"].tolist() == ["AAAA"] assert any(d.get("level") == logging.INFO for d in out["messages"]) or any( "Successfully loaded" in d.get("msg", "") for d in out["messages"] @@ -400,12 +400,12 @@ def test_extend_metadata_csv_overwrite_and_new(tmp_path): assert out2.iloc[0]["entry_id"] == "C" -def test_get_amino_acid_sequence_df_and_handle_files(tmp_path, monkeypatch): - # create a fasta and call get_amino_acid_sequence_df directly +def test_get_amino_acid_sequences_df_and_handle_files(tmp_path, monkeypatch): + # create a fasta and call get_amino_acid_sequences_df directly fasta = tmp_path / "P.fasta" fasta.write_text(">alpha|P\nTESTSEQ\n") messages = [] - seq_df = get_amino_acid_sequence_df(fasta, messages) + 
seq_df = get_amino_acid_sequences_df(fasta, messages) assert isinstance(seq_df, pd.DataFrame) assert not seq_df.empty @@ -414,11 +414,11 @@ def test_get_amino_acid_sequence_df_and_handle_files(tmp_path, monkeypatch): out = handle_alphafold_files( {}, "P", "TESTSEQ", metadata_df, "P", persist_upload=False ) - assert "amino_acid_sequence_df" in out + assert "amino_acid_sequences_df" in out assert isinstance(out["cif_df"], pd.DataFrame) and out["cif_df"].empty assert isinstance(out["pae_df"], pd.DataFrame) and out["pae_df"].empty assert isinstance(out["plddt_df"], pd.DataFrame) and out["plddt_df"].empty - assert isinstance(out["amino_acid_sequence_df"], pd.DataFrame) + assert isinstance(out["amino_acid_sequences_df"], pd.DataFrame) def test_upload_multimer_prediction_basic(tmp_path, monkeypatch): From baa7434428045f249a6a83f90e691c394c28057c Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sun, 22 Feb 2026 21:42:34 +0100 Subject: [PATCH 124/240] feat: add validation for multimer step (with a not working calc and plot method) --- backend/protzilla/all_steps.py | 3 +- .../data_analysis/crosslinking_validation.py | 12 +- backend/protzilla/methods/data_analysis.py | 175 +++++++++++++----- backend/tests/main/test_views_helper.py | 3 +- .../test_crosslinking_validation.py | 4 +- 5 files changed, 140 insertions(+), 57 deletions(-) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 0095ac1f9..b684db818 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -70,7 +70,8 @@ data_analysis.PTMOverviewVisualization, data_analysis.PTMBarVisualization, data_analysis.PTMDetailsVisualization, - data_analysis.CrossLinkingValidationWithAngstromDeviation, + data_analysis.CrosslinkingValidationWithAngstromDeviation, + data_analysis.CrosslinkingValidationWithAngstromDeviationForMultimer, data_preprocessing.ImputationByMinPerSample, data_integration.EnrichmentAnalysisGOAnalysisWithString, 
data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr, diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index bd4cc216c..a9a518824 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -192,6 +192,7 @@ def validate_with_angstrom_deviation( crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, + is_multimer: bool, ) -> dict: """ Validates cross-links by comparing the cross-linker lengths with the distances between the linked @@ -217,10 +218,13 @@ def validate_with_angstrom_deviation( all_crosslinks_df = crosslinking_df.copy() - # we are only interested in intra-crosslinks of the protein we want to validate - mask = (all_crosslinks_df.Protein_id1 == structure_to_validate) & ( - all_crosslinks_df.Protein_id2 == structure_to_validate - ) + if not is_multimer: + # we are only interested in intra-crosslinks of the protein we want to validate + mask = (all_crosslinks_df.Protein_id1 == structure_to_validate) & ( + all_crosslinks_df.Protein_id2 == structure_to_validate + ) + else: + mask = all_crosslinks_df.Protein_id1 == structure_to_validate relevant_crosslinks_df = all_crosslinks_df[mask].copy() relevant_crosslinks_df, messages = ( diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index c94346ba3..651696f0e 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -69,6 +69,8 @@ from backend.protzilla.methods.importing import ( ImportMonomerStructurePredictionFromDisk, AlphaFoldPredictionLoad, + ImportMultimerStructurePredictionFromDisk, + UploadMultimerPredictions, ) @@ -2544,13 +2546,7 @@ def create_form(self): ) -class CrossLinkingValidationWithAngstromDeviation(DataAnalysisStep): - display_name = "Ångström Deviation" - operation = "Cross 
Linking Validation" - method_description = "Validates cross links based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. (in Ångström)" - - output_keys = ["crosslinking_result_df"] - +class CrosslinkingValidation(DataAnalysisStep): @staticmethod def _get_crosslinker_names_from_crosslinker_df(steps: StepManager) -> list[str]: df = steps.get_step_output(Step, output_key="crosslinking_df") @@ -2559,9 +2555,74 @@ def _get_crosslinker_names_from_crosslinker_df(steps: StepManager) -> list[str]: crosslinkers = df["Crosslinker"].dropna().unique() return list(crosslinkers) + def create_crosslink_input_fields(self, form: Form, run: Run): + crosslinkers = self._get_crosslinker_names_from_crosslinker_df(run.steps) + for crosslinker in crosslinkers: + field_name = f"{crosslinker}_length" + if field_name not in form: + crosslinker_length_field = FloatField( + name=field_name, + label=f"Length of {crosslinker} in Ångström", + min=0, + ) + upper_bound_length_deviation_field = FloatField( + name=f"{crosslinker}_upper_accepted_deviation", + label=f"Upper bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", + min=0, + ) + lower_bound_length_deviation_field = FloatField( + name=f"{crosslinker}_lower_accepted_deviation", + label=f"Lower bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", + min=0, + ) + form.add_field(crosslinker_length_field) + form.add_field(upper_bound_length_deviation_field) + form.add_field(lower_bound_length_deviation_field) + + def collect_crosslinking_information(self, steps: StepManager, inputs) -> dict: + # although crosslinker_information is not a dataframe we need to insert the user information regarding the crosslinks as a dictionary into the inputs + crosslinker_to_length_and_deviation = {} + for crosslinker in self._get_crosslinker_names_from_crosslinker_df(steps): + 
crosslinker_to_length_and_deviation[crosslinker] = [ + inputs.get(f"{crosslinker}_length"), + inputs.get(f"{crosslinker}_upper_accepted_deviation"), + inputs.get(f"{crosslinker}_lower_accepted_deviation"), + ] + return crosslinker_to_length_and_deviation + + def insert_dataframes_with_correct_input_step_id( + self, steps, inputs, correct_input_step_identifier: str + ) -> dict: + inputs["cif_df"] = steps.get_step_output( + Step, "cif_df", correct_input_step_identifier + ) + inputs["amino_acid_sequences_df"] = steps.get_step_output( + Step, "amino_acid_sequences_df", correct_input_step_identifier + ) + inputs["crosslinking_df"] = steps.get_step_output( + Step, + "crosslinking_df", + ) + if inputs.get("crosslinking_df") is None: + raise ValueError("No cross linking data found.") + + inputs["crosslinker_information"] = self.collect_crosslinking_information( + steps=steps, inputs=inputs + ) + + return inputs + + +class CrosslinkingValidationWithAngstromDeviation(CrosslinkingValidation): + display_name = "Ångström Deviation For Monomer Structures" + operation = "Cross Linking Validation" + method_description = "Validates cross links within the one protein structure based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. 
(in Ångström)" + + output_keys = ["crosslinking_result_df"] + def create_form(self): return Form( - label="Ångström Deviation", + label="Ångström Deviation - Monomer", input_fields=[ DropdownField( name="structure_to_validate", @@ -2586,28 +2647,7 @@ def modify_form(self, form: Form, run: Run) -> None: form_helper.to_choices(loaded_protein_entry_ids) ) # create fields for every crosslink - crosslinkers = self._get_crosslinker_names_from_crosslinker_df(run.steps) - for crosslinker in crosslinkers: - field_name = f"{crosslinker}_length" - if field_name not in form: - crosslinker_length_field = FloatField( - name=field_name, - label=f"Length of {crosslinker} in Ångström", - min=0, - ) - upper_bound_length_deviation_field = FloatField( - name=f"{crosslinker}_upper_accepted_deviation", - label=f"Upper bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", - min=0, - ) - lower_bound_length_deviation_field = FloatField( - name=f"{crosslinker}_lower_accepted_deviation", - label=f"Lower bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", - min=0, - ) - form.add_field(crosslinker_length_field) - form.add_field(upper_bound_length_deviation_field) - form.add_field(lower_bound_length_deviation_field) + self.create_crosslink_input_fields(form=form, run=run) plot_method = staticmethod(bar_plot_of_valid_crosslinks) calc_method = staticmethod(validate_with_angstrom_deviation) @@ -2619,27 +2659,64 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) or steps.get_step_identifier_of_step_with_input( AlphaFoldPredictionLoad, "uniprot_id", entry_id ) - inputs["cif_df"] = steps.get_step_output( - Step, "cif_df", correct_input_step_identifier + + inputs["is_multimer"] = False + return self.insert_dataframes_with_correct_input_step_id( + steps=steps, + inputs=inputs, + correct_input_step_identifier=correct_input_step_identifier, ) - inputs["amino_acid_sequences_df"] = 
steps.get_step_output( - Step, "amino_acid_sequences_df", correct_input_step_identifier + + +class CrosslinkingValidationWithAngstromDeviationForMultimer(CrosslinkingValidation): + display_name = "Ångström Deviation For Multimer Structures" + operation = "Cross Linking Validation" + method_description = "Validates cross links between proteins based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. (in Ångström)" + + output_keys = ["crosslinking_result_df"] + + def create_form(self): + return Form( + label="Ångström Deviation - Multimer", + input_fields=[ + DropdownField( + name="structure_to_validate", + label="Multimer prediction that should be validated", + ), + ], ) - inputs["crosslinking_df"] = steps.get_step_output( - Step, - "crosslinking_df", + + def modify_form(self, form: Form, run: Run) -> None: + # add all loaded protein entry ids to the dropdown of structure_to_validate_field + loaded_proteins_entry_ids = list( + set( + run.steps.get_inputs_of_step_type( + ImportMultimerStructurePredictionFromDisk, "entry_id" + ) + + run.steps.get_inputs_of_step_type( + UploadMultimerPredictions, "entry_id" + ) + ) ) - if inputs.get("crosslinking_df") is None: - raise ValueError("No cross linking data found.") + form["structure_to_validate"].set_options( + form_helper.to_choices(loaded_proteins_entry_ids) + ) + self.create_crosslink_input_fields(form=form, run=run) - # although crosslinker_information is not a dataframe we need to insert the user information regarding the crosslinks as a dictionary into the inputs - crosslinker_to_length_and_deviation = {} - for crosslinker in self._get_crosslinker_names_from_crosslinker_df(steps): - crosslinker_to_length_and_deviation[crosslinker] = [ - inputs.get(f"{crosslinker}_length"), - inputs.get(f"{crosslinker}_upper_accepted_deviation"), - inputs.get(f"{crosslinker}_lower_accepted_deviation"), - ] - inputs["crosslinker_information"] = 
crosslinker_to_length_and_deviation + plot_method = staticmethod(bar_plot_of_valid_crosslinks) + calc_method = staticmethod(validate_with_angstrom_deviation) - return inputs + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + entry_id = inputs["structure_to_validate"] + correct_input_step_identifier = steps.get_step_identifier_of_step_with_input( + ImportMultimerStructurePredictionFromDisk, "entry_id", entry_id + ) or steps.get_step_identifier_of_step_with_input( + UploadMultimerPredictions, "entry_id", entry_id + ) + + inputs["is_multimer"] = False + return self.insert_dataframes_with_correct_input_step_id( + steps=steps, + inputs=inputs, + correct_input_step_identifier=correct_input_step_identifier, + ) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 175199013..a70102419 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -81,7 +81,8 @@ def test_get_all_possible_step_names(): "PlotGOEnrichmentDotPlot", "PlotGSEADotPlot", "PlotGSEAEnrichmentPlot", - "CrossLinkingValidationWithAngstromDeviation", + "CrosslinkingValidationWithAngstromDeviation", + "CrosslinkingValidationWithAngstromDeviationForMultimer", } steps = get_all_possible_steps() diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index ca960ef6c..b14dbfdb8 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -9,7 +9,7 @@ get_distance_between_two_amino_acids_in_angstrom, add_positions_of_amino_acid_where_crosslinker_bound_to_df, ) -from protzilla.methods.data_analysis import CrossLinkingValidationWithAngstromDeviation +from protzilla.methods.data_analysis import CrosslinkingValidationWithAngstromDeviation @pytest.mark.parametrize( @@ -75,7 +75,7 @@ def 
test_modify_form_creates_crosslinker_fields(): run = MagicMock() run.steps = steps - step = CrossLinkingValidationWithAngstromDeviation() + step = CrosslinkingValidationWithAngstromDeviation() form = step.create_form() step.modify_form(form, run) From d90539c750a4e1b600f4020cecabb67289d000c4 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Fri, 27 Feb 2026 17:31:36 +0100 Subject: [PATCH 125/240] fix: prevent visibility toggling by legend clicks in histogram --- backend/protzilla/data_preprocessing/plots.py | 9 ++++++++- .../data_preprocessing/test_plots_data_preprocessing.py | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index 500d68b78..fbb3986d1 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -229,7 +229,6 @@ def create_histograms( if one_bin_per_int: min_value = math.floor(min_value) max_value = math.ceil(max_value) - number_of_bins = max_value - min_value binsize_a = 1 binsize_b = 1 else: @@ -283,6 +282,14 @@ def create_histograms( fig.update_layout(title={"text": f"{heading}"}) fig.update_xaxes(title=x_title) fig.update_yaxes(title=y_title, rangemode="tozero") + + # Disable toggling of the visibility of the traces by clicking on the legend + fig.update_layout( + legend=dict( + itemclick=False, + itemdoubleclick=False + ) + ) return fig diff --git a/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py b/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py index 3c08c7e63..7e4b0739a 100644 --- a/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py +++ b/backend/tests/protzilla/data_preprocessing/test_plots_data_preprocessing.py @@ -179,7 +179,7 @@ def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend_multip vlines_x = [shape.x0 for shape in fig.layout.shapes] assert vlines_x == [1.0, 2.0] 
vlines_colors = [shape["line"]["color"] for shape in fig.layout.shapes] - assert vlines_colors == ["blue", "green"] + assert vlines_colors == [PLOT_PRIMARY_COLOR, "green"] vlines_dashes = [shape["line"]["dash"] for shape in fig.layout.shapes] assert vlines_dashes == ["dash", "dot"] @@ -191,7 +191,7 @@ def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend_multip x_values = [trace.x for trace in fig.data] y_values = [trace.y for trace in fig.data] assert names == ["Line 1", "Line 2"] - assert colors == ["blue", "green"] + assert colors == [PLOT_PRIMARY_COLOR, "green"] assert dashes == ["dash", "dot"] assert x_values == [(None,), (None,)] assert y_values == [(None,), (None,)] From 626729e54217ec253abab90891bd383ef974d058 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Fri, 27 Feb 2026 17:46:00 +0100 Subject: [PATCH 126/240] style: run black --- backend/protzilla/data_preprocessing/plots.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index fbb3986d1..02395f68e 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -284,12 +284,7 @@ def create_histograms( fig.update_yaxes(title=y_title, rangemode="tozero") # Disable toggling of the visibility of the traces by clicking on the legend - fig.update_layout( - legend=dict( - itemclick=False, - itemdoubleclick=False - ) - ) + fig.update_layout(legend=dict(itemclick=False, itemdoubleclick=False)) return fig From 4b64570bbf3eaf5cb77deaacc6011dcde870538c Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 27 Feb 2026 19:01:04 +0100 Subject: [PATCH 127/240] feat: add validation logic for multimer structures --- .../data_analysis/crosslinking_validation.py | 134 +++++++++++++----- .../alphafold_protein_structure_load.py | 8 +- backend/protzilla/methods/data_analysis.py | 38 +++-- .../test_crosslinking_validation.py | 2 +- 4 files 
changed, 128 insertions(+), 54 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index a9a518824..ec8fae575 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -102,8 +102,26 @@ def get_distance_between_two_amino_acids_in_angstrom( return float(np.linalg.norm(pos2 - pos1)) +def get_protein_sequence_from_df( + amino_acid_sequences_df: pd.DataFrame, protein_id: str +) -> str: + # because protein ids like O43242 are saved as O43242-1 in amino_acid_sequences_df + if "-" not in protein_id: + protein_id = f"{protein_id}-1" + + matches = amino_acid_sequences_df.loc[ + amino_acid_sequences_df["Protein ID"] == protein_id, "Protein Sequence" + ] + + if matches.empty: + return "" + + return matches.iloc[0] + + def add_positions_of_amino_acid_where_crosslinker_bound_to_df( - input_crosslinking_df: pd.DataFrame, protein_sequence: str + input_crosslinking_df: pd.DataFrame, + amino_acid_sequences_df: pd.DataFrame, ) -> tuple[pd.DataFrame, list[dict]]: """ Adds for each crosslink the 1-based positions of amino acids where the crosslink bound to a crosslinking DataFrame. @@ -116,7 +134,7 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( - 'Peptide2': second peptide sequence - 'CL_position_within_peptide1': 0-based crosslinker position within Peptide1 - 'CL_position_within_peptide2': 0-based crosslinker position within Peptide2 - :param protein_sequence: Full protein sequence in which the peptides are located. + :param protein_sequences: Full protein sequences in which the peptides are located. 
:return: tuple (updated_crosslinking_df, messages) - updated_crosslinking_df: input DataFrame with two new columns: - 'crosslinker_position1': 1-based crosslinker position in Peptide1 @@ -130,34 +148,47 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( rows_to_duplicate = {} rows_to_delete = [] messages = [] - for idx, crosslinker_row in crosslinking_df.iterrows(): - peptide_sequence1 = crosslinker_row.Peptide1 - peptide_sequence2 = crosslinker_row.Peptide2 - peptide1_positions = [ - m.start() + crosslinker_row.CL_position_within_peptide1 + 1 - for m in re.finditer(f"(?={peptide_sequence1})", protein_sequence) - ] - peptide2_positions = [ - m.start() + crosslinker_row.CL_position_within_peptide2 + 1 - for m in re.finditer(f"(?={peptide_sequence2})", protein_sequence) + + def get_positions_for( + peptide: str, protein_id: str, cl_position_within_peptide: int + ) -> list: + protein_sequence = get_protein_sequence_from_df( + amino_acid_sequences_df=amino_acid_sequences_df, protein_id=protein_id + ) + positions = [ + m.start() + cl_position_within_peptide + 1 + for m in re.finditer(f"(?={peptide})", protein_sequence) ] + return positions + + for idx, crosslinker_row in crosslinking_df.iterrows(): + peptide_sequence1 = re.escape(crosslinker_row.Peptide1) + peptide_sequence2 = re.escape(crosslinker_row.Peptide2) + protein_id1 = crosslinker_row.Protein_id1 + protein_id2 = crosslinker_row.Protein_id2 + + peptide1_positions = get_positions_for( + peptide_sequence1, protein_id1, crosslinker_row.CL_position_within_peptide1 + ) + peptide2_positions = get_positions_for( + peptide_sequence2, protein_id2, crosslinker_row.CL_position_within_peptide2 + ) + all_position_combinations = list( itertools.product(peptide1_positions, peptide2_positions) ) if not all_position_combinations: if not peptide1_positions and not peptide2_positions: - msg = f"Peptide sequences {peptide_sequence1} and {peptide_sequence2} of crosslink entry {idx} were not found in the protein 
sequence. The entry was deleted." + msg = f"Peptide sequences {peptide_sequence1} and {peptide_sequence2} of crosslink entry {idx} were not found in the protein sequences. The entry was deleted." else: - msg = f"Peptide sequence {peptide_sequence1 if not peptide1_positions else peptide_sequence2} of crosslink entry {idx} was not found in the protein sequence. The entry was deleted." + msg = f"Peptide sequence {peptide_sequence1 if not peptide1_positions else peptide_sequence2} of crosslink entry {idx} was not found in the protein sequences. The entry was deleted." messages.append(dict(level=logging.WARNING, msg=msg)) rows_to_delete.append(idx) continue - crosslinking_df.at[idx, "crosslinker_position1"] = all_position_combinations[0][ - 0 - ] - crosslinking_df.at[idx, "crosslinker_position2"] = all_position_combinations[0][ - 1 - ] + crosslinker_position1, crosslinker_position2 = all_position_combinations[0] + + crosslinking_df.at[idx, "crosslinker_position1"] = crosslinker_position1 + crosslinking_df.at[idx, "crosslinker_position2"] = crosslinker_position2 if len(all_position_combinations) > 1: rows_to_duplicate[idx] = all_position_combinations[1:] @@ -188,7 +219,7 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( def validate_with_angstrom_deviation( crosslinking_df: pd.DataFrame, - structure_to_validate: str, + structures_to_validate: list[str], crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, @@ -201,44 +232,68 @@ def validate_with_angstrom_deviation( and more than (cross-linker length - the lower allowed deviation). If one of the bounds is zero only the other bound will be applied. :param crosslinking_df: DataFrame containing cross-linking data. - :param structure_to_validate: UniProt ID of the protein to validate. + :param structures_to_validate: UniProt IDs of the proteins to validate. 
:param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) :param amino_acid_sequences_df: DataFrame containing the protein sequence + :param is_multimer: Whether the structures we want to check are monomer or multimer :return: dict (crosslinking_df_result, messages), crosslinking_df_result contains the relevant rows (rows of intra-crosslinks within the protein to validate) of crosslinking_df and two more columns containing the distances in AlphaFold and whether the crosslink matches the AlphaFold data or not :raises KeyError: If a required crosslinker field is missing in crosslinker_information. :raises ValueError: If peptide sequences cannot be matched to the protein sequence. """ - protein_sequence = amino_acid_sequences_df.at[0, "Protein Sequence"] all_crosslinks_df = crosslinking_df.copy() - if not is_multimer: # we are only interested in intra-crosslinks of the protein we want to validate - mask = (all_crosslinks_df.Protein_id1 == structure_to_validate) & ( - all_crosslinks_df.Protein_id2 == structure_to_validate + mask = (all_crosslinks_df.Protein_id1 == structures_to_validate[0]) & ( + all_crosslinks_df.Protein_id2 == structures_to_validate[0] ) else: - mask = all_crosslinks_df.Protein_id1 == structure_to_validate + mask = (all_crosslinks_df["Protein_id1"].isin(structures_to_validate)) & ( + all_crosslinks_df["Protein_id2"].isin(structures_to_validate) + ) + relevant_crosslinks_df = all_crosslinks_df[mask].copy() + # Check if dataframe is empty + if relevant_crosslinks_df.empty: + msg = "There are no cross links between the structures to validate." 
+ messages = [dict(level=logging.WARNING, msg=msg)] + return dict(crosslinking_result_df=pd.DataFrame(), messages=messages) + relevant_crosslinks_df, messages = ( add_positions_of_amino_acid_where_crosslinker_bound_to_df( - relevant_crosslinks_df, protein_sequence + relevant_crosslinks_df, amino_acid_sequences_df ) ) def check_crosslink(crosslink: pd.Series) -> pd.Series: + protein_id1 = crosslink.Protein_id1 + protein_id2 = crosslink.Protein_id1 + protein_sequence1 = get_protein_sequence_from_df( + amino_acid_sequences_df=amino_acid_sequences_df, protein_id=protein_id1 + ) + protein_sequence2 = get_protein_sequence_from_df( + amino_acid_sequences_df=amino_acid_sequences_df, protein_id=protein_id2 + ) + + relevant_crosslinks_df["crosslinker_position1"] = relevant_crosslinks_df[ + "crosslinker_position1" + ].astype("Int64") + + relevant_crosslinks_df["crosslinker_position2"] = relevant_crosslinks_df[ + "crosslinker_position2" + ].astype("Int64") predicted_distance = get_distance_between_two_amino_acids_in_angstrom( amino_acid_position1=crosslink.crosslinker_position1, amino_acid_position2=crosslink.crosslinker_position2, - amino_acid_type1=protein_sequence[crosslink.crosslinker_position1 - 1], - amino_acid_type2=protein_sequence[crosslink.crosslinker_position2 - 1], + amino_acid_type1=protein_sequence1[crosslink.crosslinker_position1 - 1], + amino_acid_type2=protein_sequence2[crosslink.crosslinker_position2 - 1], cif_df=cif_df, ) try: @@ -296,10 +351,11 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: def bar_plot_of_valid_crosslinks( crosslinking_df: pd.DataFrame, - structure_to_validate: str, + structures_to_validate: list[str], crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, + is_multimer: bool, ) -> list[Figure]: """ Creates a bar plot summarizing the number of valid and invalid cross-links @@ -307,29 +363,35 @@ def bar_plot_of_valid_crosslinks( lengths and allowed deviations. 
:param crosslinking_df: DataFrame containing cross-linking data. - :param structure_to_validate: UniProt ID of the protein to validate. + :param structures_to_validate: UniProt ID of the protein to validate. :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) :param amino_acid_sequences_df: DataFrame containing the protein sequence + :param is_multimer: Whether the structures we want to check are monomer or multimer :return: List containing a single bar plot object representing counts of valid and invalid cross-links. :raises KeyError: If a required crosslinker field is missing in crosslinker_information. """ validated_df = validate_with_angstrom_deviation( crosslinking_df, - structure_to_validate, + structures_to_validate, crosslinker_information, cif_df, amino_acid_sequences_df, + is_multimer, )["crosslinking_result_df"] - evaluated = validated_df["valid_crosslink"].dropna() + if validated_df.empty: + valid_crosslinks = 0 + invalid_crosslinks = 0 + else: + evaluated = validated_df["valid_crosslink"].dropna() - valid_crosslinks = (evaluated == True).sum() - invalid_crosslinks = (evaluated == False).sum() + valid_crosslinks = (evaluated == True).sum() + invalid_crosslinks = (evaluated == False).sum() return [ create_bar_plot( diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index bf42da50e..eef2ce78a 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -756,7 +756,6 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: msg = f"Failed to read JSON files in {structure_dir}: {e}" logger.exception(msg) raise RuntimeError(msg) from e - df_dict = { 
"metadata_df": metadata_df, "amino_acid_sequences_df": amino_acid_sequences_df, @@ -764,7 +763,6 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: "confidence_df": confidence_df, "full_data_df": full_data_df, } - check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) df_dict["messages"] = messages return df_dict @@ -772,7 +770,7 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: def upload_multimer_prediction( entry_id: str, - uniprot_ids: list[str], + uniprot_ids: str, model_used: str, amino_acid_sequences: Path, cif_file: Path, @@ -834,9 +832,11 @@ def upload_multimer_prediction( timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + uniprot_ids_as_list = uniprot_ids.split(", ") + data: dict[str, Any] = { "entry_id": entry_id, - "uniprot_ids": uniprot_ids, + "uniprot_ids": uniprot_ids_as_list, "model_created_date": timestamp, "model_used": model_used, } diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 651696f0e..5442fc322 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -1,4 +1,5 @@ import logging +import ast from backend.protzilla import form_helper from backend.protzilla.constants.option_types import MultipleTestingCorrectionMethod @@ -2610,6 +2611,20 @@ def insert_dataframes_with_correct_input_step_id( steps=steps, inputs=inputs ) + metadata_df = steps.get_step_output( + Step, "metadata_df", correct_input_step_identifier + ) + if "uniprot_accession" in metadata_df.columns: + inputs["structures_to_validate"] = metadata_df["uniprot_accession"].tolist() + elif "uniprot_ids" in metadata_df.columns: + inputs["structures_to_validate"] = ast.literal_eval( + metadata_df["uniprot_ids"].iloc[0] + ) + l = inputs["structures_to_validate"] + else: + raise ValueError( + "No correct metadata found. Metadata must contain 'uniprot_ids' or 'uniprot_accession'." 
+ ) return inputs @@ -2625,14 +2640,14 @@ def create_form(self): label="Ångström Deviation - Monomer", input_fields=[ DropdownField( - name="structure_to_validate", + name="entry_id", label="Protein prediction that should be validated", ), ], ) def modify_form(self, form: Form, run: Run) -> None: - # add all loaded protein entry ids to the dropdown of structure_to_validate_field + # add all loaded protein entry ids to the dropdown of structures_to_validate_field loaded_protein_entry_ids = list( set( run.steps.get_inputs_of_step_type( @@ -2643,9 +2658,7 @@ def modify_form(self, form: Form, run: Run) -> None: ) ) ) - form["structure_to_validate"].set_options( - form_helper.to_choices(loaded_protein_entry_ids) - ) + form["entry_id"].set_options(form_helper.to_choices(loaded_protein_entry_ids)) # create fields for every crosslink self.create_crosslink_input_fields(form=form, run=run) @@ -2653,7 +2666,7 @@ def modify_form(self, form: Form, run: Run) -> None: calc_method = staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - entry_id = inputs["structure_to_validate"] + entry_id = inputs["entry_id"] correct_input_step_identifier = steps.get_step_identifier_of_step_with_input( ImportMonomerStructurePredictionFromDisk, "entry_id", entry_id ) or steps.get_step_identifier_of_step_with_input( @@ -2680,14 +2693,14 @@ def create_form(self): label="Ångström Deviation - Multimer", input_fields=[ DropdownField( - name="structure_to_validate", + name="entry_id", label="Multimer prediction that should be validated", ), ], ) def modify_form(self, form: Form, run: Run) -> None: - # add all loaded protein entry ids to the dropdown of structure_to_validate_field + # add all loaded protein entry ids to the dropdown of structures_to_validate_field loaded_proteins_entry_ids = list( set( run.steps.get_inputs_of_step_type( @@ -2698,23 +2711,22 @@ def modify_form(self, form: Form, run: Run) -> None: ) ) ) - 
form["structure_to_validate"].set_options( - form_helper.to_choices(loaded_proteins_entry_ids) - ) + form["entry_id"].set_options(form_helper.to_choices(loaded_proteins_entry_ids)) self.create_crosslink_input_fields(form=form, run=run) plot_method = staticmethod(bar_plot_of_valid_crosslinks) calc_method = staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - entry_id = inputs["structure_to_validate"] + entry_id = inputs["entry_id"] correct_input_step_identifier = steps.get_step_identifier_of_step_with_input( ImportMultimerStructurePredictionFromDisk, "entry_id", entry_id ) or steps.get_step_identifier_of_step_with_input( UploadMultimerPredictions, "entry_id", entry_id ) - inputs["is_multimer"] = False + inputs["is_multimer"] = True + return self.insert_dataframes_with_correct_input_step_id( steps=steps, inputs=inputs, diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index b14dbfdb8..63b08de6a 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -52,7 +52,7 @@ def test_validate_with_angstrom_deviation(distance, expected): result = validate_with_angstrom_deviation( crosslinking_df, - structure_to_validate="P12345", + structures_to_validate=["P12345"], crosslinker_information=crosslinker_information, amino_acid_sequences_df=amino_acid_sequences_df, cif_df=cif_df, From fe19dc603f24321b8e63837206683fee7c09ecf8 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 1 Mar 2026 11:12:48 +0100 Subject: [PATCH 128/240] fix: fix broken all_steps after merge --- backend/protzilla/all_steps.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 8ca0b5810..643977da8 100644 --- a/backend/protzilla/all_steps.py +++ 
b/backend/protzilla/all_steps.py @@ -16,7 +16,6 @@ importing.FastaImport, importing.AlphaFoldPredictionLoad, importing.CrosslinkingImport, - importing.ImportStructurePredictionFromDisk, importing.AlphaFoldMultimerQueryJsonGeneration, importing.ImportMonomerStructurePredictionFromDisk, importing.UploadMultimerPredictions, From 158f134727a5a18ea5ccc4f2df3c1229a90b3b04 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sun, 1 Mar 2026 18:46:47 +0100 Subject: [PATCH 129/240] feat: add and adjust tests --- .../test_crosslinking_validation.py | 248 +++++++++++++++++- .../test_alphafold_protein_structure_load.py | 4 +- 2 files changed, 239 insertions(+), 13 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 63b08de6a..8828dff74 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -33,7 +33,9 @@ def test_validate_with_angstrom_deviation(distance, expected): } ) - amino_acid_sequences_df = pd.DataFrame({"Protein Sequence": ["AB"]}) + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P12345-1"], "Protein Sequence": ["AB"]} + ) # Fake Crosslink Data crosslinking_df = pd.DataFrame( @@ -56,6 +58,7 @@ def test_validate_with_angstrom_deviation(distance, expected): crosslinker_information=crosslinker_information, amino_acid_sequences_df=amino_acid_sequences_df, cif_df=cif_df, + is_multimer=False, ) df = result["crosslinking_result_df"] @@ -108,6 +111,8 @@ def test_get_distance_between_two_amino_acids_in_angstrom(): def test_add_crosslinker_positions_with_exactly_one_possible_position(): df = pd.DataFrame( { + "Protein_id1": ["P1"], + "Protein_id2": ["P1"], "Peptide1": ["ABC"], "Peptide2": ["DEF"], "CL_position_within_peptide1": [1], @@ -115,10 +120,12 @@ def test_add_crosslinker_positions_with_exactly_one_possible_position(): } ) - protein_sequence 
= "XXABCYYYDEFZZ" + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P1-1"], "Protein Sequence": ["XXABCYYYDEFZZ"]} + ) df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, protein_sequence + df, amino_acid_sequences_df ) assert messages == [] @@ -133,6 +140,8 @@ def test_add_crosslinker_positions_with_exactly_one_possible_position(): def test_add_crosslinker_positions_with_more_than_one_possible_position(): df = pd.DataFrame( { + "Protein_id1": ["P1"], + "Protein_id2": ["P1"], "Peptide1": ["AA"], "Peptide2": ["BB"], "CL_position_within_peptide1": [0], @@ -140,10 +149,12 @@ def test_add_crosslinker_positions_with_more_than_one_possible_position(): } ) - protein_sequence = "AAXXAAZZBBYYBB" + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P1-1"], "Protein Sequence": ["AAXXAAZZBBYYBB"]} + ) df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, protein_sequence + df, amino_acid_sequences_df ) # 2 AA matches × 2 BB matches = 4 combinations @@ -162,6 +173,8 @@ def test_add_crosslinker_positions_with_more_than_one_possible_position(): def test_add_crosslinker_positions_but_one_peptide_not_found_deletes_row(): df = pd.DataFrame( { + "Protein_id1": ["P1"], + "Protein_id2": ["P1"], "Peptide1": ["ABC"], "Peptide2": ["DEF"], "CL_position_within_peptide1": [0], @@ -169,10 +182,12 @@ def test_add_crosslinker_positions_but_one_peptide_not_found_deletes_row(): } ) - protein_sequence = "XXXXXXXX" + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P1-1"], "Protein Sequence": ["XXXXXXXX"]} + ) df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, protein_sequence + df, amino_acid_sequences_df ) assert len(messages) == 1 @@ -186,6 +201,8 @@ def test_add_crosslinker_positions_but_one_peptide_not_found_deletes_row(): def test_add_crosslinker_positions_with_valid_and_invalid_rows_mixed(): df = pd.DataFrame( { + "Protein_id1": ["P1", "P1", "P1"], + "Protein_id2": ["P1", "P1", 
"P1"], "Peptide1": ["ABC", "XXX", "ABC"], "Peptide2": ["DEF", "DEF", "YYY"], "CL_position_within_peptide1": [0, 0, 0], @@ -193,10 +210,12 @@ def test_add_crosslinker_positions_with_valid_and_invalid_rows_mixed(): } ) - protein_sequence = "ABCDEF" + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P1-1"], "Protein Sequence": ["ABCDEF"]} + ) df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, protein_sequence + df, amino_acid_sequences_df ) assert len(messages) == 2 @@ -213,6 +232,8 @@ def test_add_crosslinker_positions_with_valid_and_invalid_rows_mixed(): def test_add_crosslinker_positions_with_overlapping_peptide_matches(): df = pd.DataFrame( { + "Protein_id1": ["P1"], + "Protein_id2": ["P1"], "Peptide1": ["AAA"], "Peptide2": ["B"], "CL_position_within_peptide1": [0], @@ -220,10 +241,12 @@ def test_add_crosslinker_positions_with_overlapping_peptide_matches(): } ) - protein_sequence = "AAAAB" + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P1-1"], "Protein Sequence": ["AAAAB"]} + ) df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, protein_sequence + df, amino_acid_sequences_df ) # AAA -> positions 0, 1 @@ -246,3 +269,206 @@ def test_add_crosslinker_positions_with_overlapping_peptide_matches(): expected_positions = {(1, 5), (2, 5)} assert observed_positions == expected_positions + + +def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): + rows = [ + ("P1-1", "ABCDE"), + ("P2-1", "VWXYZ"), + ("P3-1", "KLMNO"), + ] + sequences_df = pd.DataFrame(rows, columns=["Protein ID", "Protein Sequence"]) + + crosslinking_df = pd.DataFrame( + [ + # within set: P1-P2 (should be kept when validating ["P1","P2"]) + ("P1", "P2", "BC", "WX", 0, 0, "XL"), + # within set: P2-P2 + ("P2", "P2", "WX", "WX", 0, 0, "XL"), + # outside set: P1-P3 (should be filtered out) + ("P1", "P3", "BC", "LM", 0, 0, "XL"), + ], + columns=[ + "Protein_id1", + "Protein_id2", + "Peptide1", + "Peptide2", + 
"CL_position_within_peptide1", + "CL_position_within_peptide2", + "Crosslinker", + ], + ) + + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA"] * 5, + "_atom_site.label_seq_id": list(range(1, 6)), + "_atom_site.Cartn_x": [float(i) for i in range(1, 6)], + "_atom_site.Cartn_y": [0.0] * 5, + "_atom_site.Cartn_z": [0.0] * 5, + } + ) + + # Very permissive bounds: always valid as long as distance is defined. + # Format is [length, upper_deviation, lower_deviation]. + crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + + out = validate_with_angstrom_deviation( + crosslinking_df=crosslinking_df, + structures_to_validate=["P1", "P2"], + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=sequences_df, + is_multimer=True, + ) + + result_df = out["crosslinking_result_df"] + assert isinstance(result_df, pd.DataFrame) + assert not result_df.empty + + # Only the first two rows should remain after filtering. + assert len(result_df) == 2 + + assert set(result_df["Protein_id1"].unique()).issubset({"P1", "P2"}) + assert set(result_df["Protein_id2"].unique()).issubset({"P1", "P2"}) + + assert "alphafold_distance" in result_df.columns + assert "valid_crosslink" in result_df.columns + assert "crosslinker_position1" in result_df.columns + assert "crosslinker_position2" in result_df.columns + + +def test_validate_multimer_no_links_between_structures_returns_empty_and_warning(): + sequences_df = pd.DataFrame( + [ + ("P1-1", "ABCDE"), + ("P2-1", "VWXYZ"), + ("P3-1", "KLMNO"), + ], + columns=["Protein ID", "Protein Sequence"], + ) + + crosslinking_df = pd.DataFrame( + [ + ("P1", "P3", "BC", "LM", 0, 0, "XL"), + ("P3", "P2", "LM", "WX", 0, 0, "XL"), + ], + columns=[ + "Protein_id1", + "Protein_id2", + "Peptide1", + "Peptide2", + "CL_position_within_peptide1", + "CL_position_within_peptide2", + "Crosslinker", + ], + ) + + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA"] * 5, + "_atom_site.label_seq_id": list(range(1, 6)), + 
"_atom_site.Cartn_x": [float(i) for i in range(1, 6)], + "_atom_site.Cartn_y": [0.0] * 5, + "_atom_site.Cartn_z": [0.0] * 5, + } + ) + crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + + out = validate_with_angstrom_deviation( + crosslinking_df=crosslinking_df, + structures_to_validate=["P1", "P2"], + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=sequences_df, + is_multimer=True, + ) + + result_df = out["crosslinking_result_df"] + messages = out["messages"] + + assert isinstance(result_df, pd.DataFrame) + assert result_df.empty + + assert isinstance(messages, list) + assert len(messages) == 1 + assert messages[0].get("level") is not None + assert "There are no cross links between the structures to validate." in messages[ + 0 + ].get("msg", "") + + +def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_validates_all(): + # AB occurs twice in ABAB: at positions 1 and 3 (1-based). + sequences_df = pd.DataFrame( + [ + ("P1-1", "ABAB"), + ("P2-1", "ABAB"), + ], + columns=["Protein ID", "Protein Sequence"], + ) + + crosslinking_df = pd.DataFrame( + [ + ("P1", "P2", "AB", "AB", 0, 0, "XL"), + ], + columns=[ + "Protein_id1", + "Protein_id2", + "Peptide1", + "Peptide2", + "CL_position_within_peptide1", + "CL_position_within_peptide2", + "Crosslinker", + ], + ) + + cif_df = cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA"] * 4, + "_atom_site.label_seq_id": list(range(1, 5)), + "_atom_site.Cartn_x": [float(i) for i in range(1, 5)], + "_atom_site.Cartn_y": [0.0] * 4, + "_atom_site.Cartn_z": [0.0] * 4, + } + ) + + # Always-valid bounds so we focus on duplication and distance computation. 
+ crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + + out = validate_with_angstrom_deviation( + crosslinking_df=crosslinking_df, + structures_to_validate=["P1", "P2"], + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=sequences_df, + is_multimer=True, + ) + + result_df = out["crosslinking_result_df"] + messages = out["messages"] + + assert isinstance(result_df, pd.DataFrame) + assert len(result_df) == 4 + + # Crosslinker positions should cover the product of {1,3} x {1,3}. + combos = set( + zip( + result_df["crosslinker_position1"].astype(int).tolist(), + result_df["crosslinker_position2"].astype(int).tolist(), + ) + ) + assert combos == {(1, 1), (1, 3), (3, 1), (3, 3)} + + # Distances in our 1D coordinate system are abs(pos2 - pos1). + distances = sorted(result_df["alphafold_distance"].astype(float).tolist()) + assert distances == [0.0, 0.0, 2.0, 2.0] + + # With permissive bounds, all should be valid. + assert result_df["valid_crosslink"].dropna().all() + + # Expect a duplication warning message. 
+ assert any( + ("duplicated" in str(m.get("msg", "")).lower()) and (m.get("level") is not None) + for m in messages + ) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 3225851cb..0c5ac3ecf 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -455,7 +455,7 @@ def _copy(src, dest_dir): out = upload_multimer_prediction( entry_id="M1", - uniprot_ids=["X"], + uniprot_ids="X", model_used="m", amino_acid_sequences=fasta, cif_file=cif, @@ -584,7 +584,7 @@ def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): out = upload_multimer_prediction( entry_id="M2", - uniprot_ids=["Y"], + uniprot_ids="Y", model_used="test", amino_acid_sequences=fasta, cif_file=cif, From ce49260b46d20c55d29dcb2a99da47955965dca1 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 1 Mar 2026 21:09:56 +0100 Subject: [PATCH 130/240] feat: add download_methods --- backend/main/urls.py | 1 + backend/main/views.py | 20 ++++++++++++ backend/protzilla/run.py | 4 +++ backend/protzilla/steps.py | 64 +++++++++++++++++++++++++++++++++++++- 4 files changed, 88 insertions(+), 1 deletion(-) diff --git a/backend/main/urls.py b/backend/main/urls.py index d8e8166e0..e17513bee 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -48,6 +48,7 @@ ), # might function? 
path("api/get_step_form/", views.get_step_form, name="get_step_form"), path("api/get_step_plots/", views.get_step_plots, name="get_step_plots"), + path("api/get_step_downloads/", views.get_step_downloads, name="get_step_downloads"), path( "api/get_current_step_output_labels/", views.get_current_step_output_labels, diff --git a/backend/main/views.py b/backend/main/views.py index 0575d7e46..5efe6d935 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -623,6 +623,26 @@ def get_step_plots(request): {"success": False, "message": "Invalid request method"}, status=405 ) +def get_step_downloads(request): + if request.method == "POST": + data = json.loads(request.body) + run_name = data.get("run_name") + + run = Run(run_name) + if run.current_step is not None: + downloads = run.current_downloads.downloads + else: + downloads = {} + + return JsonResponse( + {"success": True, "message": "Got the available download(s) for the step", "data": downloads}, + safe=False, + ) + else: + return JsonResponse( + {"success": False, "message": "Invalid request method"}, status=405 + ) + # TODO: Move somewhere else def _step_output_as_serialised_table( diff --git a/backend/protzilla/run.py b/backend/protzilla/run.py index 30383b014..7dc17daa6 100644 --- a/backend/protzilla/run.py +++ b/backend/protzilla/run.py @@ -354,6 +354,10 @@ def current_messages(self) -> Messages: def current_plots(self) -> Plots | None: return self.steps.current_step.plots + @property + def current_downloads(self): # ToDo: Type Hint + return self.steps.current_step.downloads + @property def current_outputs(self) -> Output: return self.steps.current_step.output diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 951ac5630..7cf133ca7 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -48,6 +48,7 @@ def __init__(self, instance_identifier: str | None = None): self.output: Output = Output() self.filtered_datatable: dict = {} self.plots: Plots = Plots() + 
self.downloads: Downloads = Downloads() self.messages: Messages = Messages([]) self.instance_identifier = instance_identifier self.disk_write_mutex = Lock() @@ -65,6 +66,10 @@ def __init__(self, instance_identifier: str | None = None): "generated": 0, "dumped": 0, }, + "downloads": { + "generated": 0, + "dumped": 0, + }, } if self.instance_identifier is None: @@ -136,6 +141,11 @@ def calculate(self, steps: StepManager) -> bool: self.handle_plot_outputs(plot_output) self.artifact_versions["plots"]["generated"] += 1 + if self.download_method: + download_output = self.download_method(**self.download_input) + self.handle_download_outputs(download_output) + self.artifact_versions["downloads"]["generated"] += 1 + self.calculation_status = "complete" # delete tempfiles @@ -225,6 +235,19 @@ def handle_plot_outputs(self, outputs: dict | list) -> None: self.plots = Plots(plots) + def handle_download_outputs(self, outputs: dict | list) -> None: + #ToDo: Docstring + + if not isinstance(outputs, dict): + raise TypeError("Output of download method is not a dictionary.") + + downloads = outputs.pop("downloads", {}) + self.output.output.update(outputs) + self.handle_messages(outputs) + + + self.downloads = Downloads(downloads) + def handle_messages(self, outputs: dict) -> None: """ Handles the messages from the calculation method and creates a Messages object from it. 
@@ -237,6 +260,7 @@ def handle_messages(self, outputs: dict) -> None: calc_method = None plot_method = None # if the plot method uses the output of the calculation method, it should be prefixed with "output_" + download_method = None @property def calculation_input(self) -> dict: @@ -249,7 +273,7 @@ def calculation_input(self) -> dict: for key in required_keys: if key not in self.inputs: raise ValueError( - f"Missing required input '{key}' for the calulation method" + f"Missing required input '{key}' for the calculation method" ) return { @@ -281,6 +305,26 @@ def plot_input(self) -> dict: key: plot_input[key] for key in input_parameters.keys() if key in plot_input } + @property + def download_input(self) -> dict: + input_parameters = inspect.signature(self.download_method).parameters + required_keys = [ + key + for key, param in input_parameters.items() + if param.default == inspect.Parameter.empty + ] + for key in required_keys: + if key not in self.inputs: + raise ValueError( + f"Missing required input '{key}' for the plot method" + ) + + return { + key: self.inputs[key] + for key in input_parameters.keys() + if key in self.inputs + } + def validate_outputs(self, soft_check: bool = False) -> bool: """ Validates the outputs of the step. 
Uses the output_keys attribute to check if all required keys are present in @@ -435,6 +479,24 @@ def empty(self) -> bool: return len(self.plots) == 0 +class Downloads: + # maps file name to file content (a string) + def __init__(self, downloads: dict[str, str] | None = None): + if downloads is None: + downloads: dict[str,str] = {} + self.downloads = downloads + + def __iter__(self): + return iter(self.downloads) + + def __repr__(self): + return f"Downloads: {len(self.downloads)}" + + @property + def empty(self) -> bool: + return len(self.downloads) == 0 + + class StepManager: def __repr__(self): return f"IMP: {self.importing} PRE: {self.data_preprocessing} ANA: {self.data_analysis} INT: {self.data_integration}" From 237f90350810af6198ac14ae4b6fdb577510d695 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 1 Mar 2026 21:47:32 +0100 Subject: [PATCH 131/240] feat: add download button for generated alphafold json queries --- .../protzilla/importing/query_generation.py | 6 ++- backend/protzilla/methods/importing.py | 8 +-- .../components/app/run-screen/run-screen.tsx | 51 +++++++++++++++++-- 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index bb0a2ea92..a3c3f53a0 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -1,3 +1,5 @@ +import json + import pandas as pd import requests @@ -13,7 +15,7 @@ def generate_alphafold_multimer_query_json( raise ValueError( "Invalid copies_per_id: please provide space-separated integers" ) - if len(uniprot_ids) != len(number_copies): + if len(uniprot_ids) != len(copies_per_id): dict(messages={}, tmp_df=pd.DataFrame()) data_for_query = { @@ -43,4 +45,4 @@ def generate_alphafold_multimer_query_json( } ) - return dict(messages={}, tmp_df=pd.DataFrame()) + return dict(messages={}, downloads={f"prediction_query_{'_'.join(uniprot_ids)}" : 
f"[{json.dumps(data_for_query)}]"}) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index ccf11658b..e117c0734 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pandas as pd + from backend.protzilla.form import * from backend.protzilla import form_helper from backend.protzilla.importing.metadata_import import ( @@ -615,7 +617,7 @@ class AlphaFoldMultimerQueryJsonGeneration(ImportingStep): operation = "Query Generation" method_description = "Generate a JSON to upload to AlphaFold-Server to generate a prediction on a multimer." - output_keys = ["tmp_df"] + output_keys = ["downloads"] def create_form(self): return Form( @@ -636,5 +638,5 @@ def create_form(self): ), ], ) - - calc_method = staticmethod(generate_alphafold_multimer_query_json) + calc_method = staticmethod(lambda: dict(downloads=pd.DataFrame())) + download_method = staticmethod(generate_alphafold_multimer_query_json) diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index 364d3010e..f5a54535d 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -89,6 +89,7 @@ export const RunScreen: React.FC = () => { const [plots, setPlots] = useState(); const [selectedPlot, setSelectedPlot] = useState
({ data: [], layout: {} }); const [availableTables, setAvailableTables] = useState(); + const [downloads, setDownloads] = useState | undefined>(); const [isDownloadModalOpen, openDownloadModal, closeDownloadModal] = useToggleableState(false); @@ -106,6 +107,7 @@ export const RunScreen: React.FC = () => { }).then(() => { void getRunData(); void getStepPlots(); + void getStepDownloads(); void getCurrentStepOutputLabels(); }); } else { @@ -140,6 +142,16 @@ export const RunScreen: React.FC = () => { } }, [runName]); + const getStepDownloads = useCallback(async () => { + const response = await callApiWithParameters("get_step_downloads/", { + run_name: runName, + }); + if (response) { + const downloads = response.data; + setDownloads(downloads); + } + }, [runName]); + const getCurrentStepOutputLabels = useCallback(async () => { const response = await callApiWithParameters("get_current_step_output_labels/", { run_name: runName, @@ -152,15 +164,21 @@ export const RunScreen: React.FC = () => { useEffect(() => { const fetchData = async () => { - await Promise.all([getRunData(), getStepPlots(), getCurrentStepOutputLabels()]); + await Promise.all([ + getRunData(), + getStepPlots(), + getStepDownloads(), + getCurrentStepOutputLabels(), + ]); }; void fetchData(); - }, [getRunData, getStepPlots, getCurrentStepOutputLabels]); + }, [getRunData, getStepPlots, getStepDownloads, getCurrentStepOutputLabels]); const onFormSubmit = () => { void getRunData(); void getStepPlots(); + void getStepDownloads(); void getCurrentStepOutputLabels(); }; @@ -235,8 +253,35 @@ export const RunScreen: React.FC = () => { ); + const downloadJson = (filename: string, content: string) => { + const blob = new Blob([content], { type: "application/json" }); + const url = URL.createObjectURL(blob); + + const a = document.createElement("a"); + a.href = url; + a.download = filename; + a.click(); + + URL.revokeObjectURL(url); + }; + const downloadComponent = ( - + + {downloads && Object.keys(downloads).length > 0 
? ( + Object.entries(downloads).map(([filename, content]) => ( + { + downloadJson(filename, content); + }} + /> + )) + ) : ( + + )} + ); const listEditorComponent = ( From 126898836e0dacdf23d494c8ee9a01ea7a4705bc Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 2 Mar 2026 11:26:15 +0100 Subject: [PATCH 132/240] refactor: tidy up code --- backend/main/urls.py | 4 +- backend/main/views.py | 7 +- .../protzilla/importing/query_generation.py | 54 +++++++++++--- backend/protzilla/methods/importing.py | 5 +- backend/protzilla/run.py | 11 ++- backend/protzilla/steps.py | 72 ++++++++----------- 6 files changed, 94 insertions(+), 59 deletions(-) diff --git a/backend/main/urls.py b/backend/main/urls.py index e17513bee..590923266 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -48,7 +48,9 @@ ), # might function? path("api/get_step_form/", views.get_step_form, name="get_step_form"), path("api/get_step_plots/", views.get_step_plots, name="get_step_plots"), - path("api/get_step_downloads/", views.get_step_downloads, name="get_step_downloads"), + path( + "api/get_step_downloads/", views.get_step_downloads, name="get_step_downloads" + ), path( "api/get_current_step_output_labels/", views.get_current_step_output_labels, diff --git a/backend/main/views.py b/backend/main/views.py index 5efe6d935..cc2ae35bf 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -623,6 +623,7 @@ def get_step_plots(request): {"success": False, "message": "Invalid request method"}, status=405 ) + def get_step_downloads(request): if request.method == "POST": data = json.loads(request.body) @@ -635,7 +636,11 @@ def get_step_downloads(request): downloads = {} return JsonResponse( - {"success": True, "message": "Got the available download(s) for the step", "data": downloads}, + { + "success": True, + "message": "Got the available download(s) for the step", + "data": downloads, + }, safe=False, ) else: diff --git a/backend/protzilla/importing/query_generation.py 
b/backend/protzilla/importing/query_generation.py index a3c3f53a0..b9b74ef28 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -1,4 +1,5 @@ import json +import logging import pandas as pd import requests @@ -7,18 +8,50 @@ def generate_alphafold_multimer_query_json( protein_ids: str, number_copies: str ) -> dict: - # extract contents and make sure they have the same length -> otherwise raise error + """ + Generates an AlphaFold Multimer JSON query for a set of UniProt protein IDs. + For each provided UniProt ID, the corresponding amino acid sequence is fetched + from the UniProt REST API and added to the query with the specified copy number. + + Protein IDs and copy numbers must be provided as space-separated strings and + must have the same length. If an invalid copy number is provided or if the + lengths do not match, an error message is generated and an exception may be raised. + + :param protein_ids: Space-separated list of UniProt protein IDs (e.g. "P69905 P68871"). + :param number_copies: Space-separated list of integers specifying the number of copies + for each protein ID (e.g. "2 2"). + :return: dict (messages, downloads), downloads contains a dictionary mapping a generated filename + to the AlphaFold Multimer query JSON string (wrapped in square brackets as required by AlphaFold server) + :raises ValueError: If the number of copies cannot be parsed as integers. + :raises requests.exceptions.HTTPError: If fetching a UniProt FASTA sequence fails. 
+ """ + messages = [] + + # extract protein_ids and number of copies per id and make sure they have the same length uniprot_ids = protein_ids.split() try: copies_per_id = [int(input) for input in number_copies.split()] except ValueError as e: + messages.append( + dict( + level=logging.ERROR, + msg=f"Invalid list of number of copies per id: please provide space-separated integers", + ) + ) raise ValueError( - "Invalid copies_per_id: please provide space-separated integers" + "Invalid list of number of copies per id: please provide space-separated integers" ) if len(uniprot_ids) != len(copies_per_id): - dict(messages={}, tmp_df=pd.DataFrame()) + messages.append( + dict( + level=logging.ERROR, + msg=f"For at least one protein id, the number of copies is missing in the input.", + ) + ) + return dict(messages=messages, downloads={}) - data_for_query = { + # create the json query for alphafold + query = { "name": "_".join(protein_ids.split()) + "_prediction", "modelSeeds": [], "sequences": [], @@ -29,14 +62,14 @@ def generate_alphafold_multimer_query_json( for uniprot_id, copies in zip(uniprot_ids, copies_per_id): url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" - response = requests.get(url) - response.raise_for_status() # TODO: was macht das? 
+ response = requests.get(url, timeout=20) + response.raise_for_status() fasta = response.text amino_acid_sequence = "".join( line.strip() for line in fasta.splitlines() if not line.startswith(">") ) - data_for_query["sequences"].append( + query["sequences"].append( { "proteinChain": { "sequence": amino_acid_sequence, @@ -44,5 +77,8 @@ def generate_alphafold_multimer_query_json( } } ) - - return dict(messages={}, downloads={f"prediction_query_{'_'.join(uniprot_ids)}" : f"[{json.dumps(data_for_query)}]"}) + query_as_string = f"[{json.dumps(query)}]" + return dict( + messages={}, + downloads={f"prediction_query_{'_'.join(uniprot_ids)}": query_as_string}, + ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index e117c0734..bd275a45d 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -612,7 +612,8 @@ def create_form(self): calc_method = staticmethod(get_multimer_structure_dfs) -class AlphaFoldMultimerQueryJsonGeneration(ImportingStep): +class AlphaFoldMultimerQueryJsonGeneration(Step): + section = "importing" display_name = "AlphaFold Multimer Query JSON Generation" operation = "Query Generation" method_description = "Generate a JSON to upload to AlphaFold-Server to generate a prediction on a multimer." 
@@ -638,5 +639,5 @@ def create_form(self): ), ], ) - calc_method = staticmethod(lambda: dict(downloads=pd.DataFrame())) + download_method = staticmethod(generate_alphafold_multimer_query_json) diff --git a/backend/protzilla/run.py b/backend/protzilla/run.py index 7dc17daa6..2fc72f320 100644 --- a/backend/protzilla/run.py +++ b/backend/protzilla/run.py @@ -13,7 +13,14 @@ import backend.protzilla.constants.paths as paths from backend.protzilla.constants.date_format import metadata_date_format from backend.protzilla.form import Form -from backend.protzilla.steps import Messages, Output, Plots, Step, StepManager +from backend.protzilla.steps import ( + Messages, + Output, + Plots, + Downloads, + Step, + StepManager, +) from backend.protzilla.utilities import format_trace @@ -355,7 +362,7 @@ def current_plots(self) -> Plots | None: return self.steps.current_step.plots @property - def current_downloads(self): # ToDo: Type Hint + def current_downloads(self) -> Downloads | None: return self.steps.current_step.downloads @property diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 7cf133ca7..8e47cd244 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -7,7 +7,7 @@ from enum import Enum from pathlib import Path from types import MethodType -from typing import Any, Literal +from typing import Any, Literal, Callable import pandas as pd @@ -235,8 +235,14 @@ def handle_plot_outputs(self, outputs: dict | list) -> None: self.plots = Plots(plots) - def handle_download_outputs(self, outputs: dict | list) -> None: - #ToDo: Docstring + def handle_download_outputs(self, outputs: dict) -> None: + """ + Handles the dictionary from the download method and creates a Download object from it. + Responsible for validating that the output is a dictionary, handling any messages contained in the output + and setting the downloads attribute of the class. 
+ :param outputs: A dictionary received after the download method + :return: None + """ if not isinstance(outputs, dict): raise TypeError("Output of download method is not a dictionary.") @@ -245,7 +251,6 @@ def handle_download_outputs(self, outputs: dict | list) -> None: self.output.output.update(outputs) self.handle_messages(outputs) - self.downloads = Downloads(downloads) def handle_messages(self, outputs: dict) -> None: @@ -262,26 +267,33 @@ def handle_messages(self, outputs: dict) -> None: plot_method = None # if the plot method uses the output of the calculation method, it should be prefixed with "output_" download_method = None - @property - def calculation_input(self) -> dict: - input_parameters = inspect.signature(self.calc_method).parameters + def _get_input_parameters( + self, function: Callable[..., Any], relevant_inputs: dict | None = None + ) -> dict: + if relevant_inputs is None: + relevant_inputs = self.inputs + input_parameters = inspect.signature(function).parameters required_keys = [ key for key, param in input_parameters.items() if param.default == inspect.Parameter.empty ] for key in required_keys: - if key not in self.inputs: + if key not in relevant_inputs: raise ValueError( - f"Missing required input '{key}' for the calculation method" + f"Missing required input '{key}' for the '{function.__name__}' method" ) return { - key: self.inputs[key] + key: relevant_inputs[key] for key in input_parameters.keys() - if key in self.inputs + if key in relevant_inputs } + @property + def calculation_input(self) -> dict: + return self._get_input_parameters(self.calc_method) + @property def plot_input(self) -> dict: # if the plot method uses the output of the calculation method, it should be prefixed with "output_" @@ -289,41 +301,13 @@ def plot_input(self) -> dict: "output_" + key: value for key, value in self.output.output.items() } plot_input = self.inputs | prefixed_output - - input_parameters = inspect.signature(self.plot_method).parameters - - 
required_keys = [ - key - for key, param in input_parameters.items() - if param.default == inspect.Parameter.empty - ] - for key in required_keys: - if key not in plot_input: - raise ValueError(f"Missing required input '{key}' for the plot method") - - return { - key: plot_input[key] for key in input_parameters.keys() if key in plot_input - } + return self._get_input_parameters( + function=self.plot_method, relevant_inputs=plot_input + ) @property def download_input(self) -> dict: - input_parameters = inspect.signature(self.download_method).parameters - required_keys = [ - key - for key, param in input_parameters.items() - if param.default == inspect.Parameter.empty - ] - for key in required_keys: - if key not in self.inputs: - raise ValueError( - f"Missing required input '{key}' for the plot method" - ) - - return { - key: self.inputs[key] - for key in input_parameters.keys() - if key in self.inputs - } + return self._get_input_parameters(self.download_method) def validate_outputs(self, soft_check: bool = False) -> bool: """ @@ -483,7 +467,7 @@ class Downloads: # maps file name to file content (a string) def __init__(self, downloads: dict[str, str] | None = None): if downloads is None: - downloads: dict[str,str] = {} + downloads: dict[str, str] = {} self.downloads = downloads def __iter__(self): From 081a9120db91197c8603bcfa3f850806840ba147 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 2 Mar 2026 11:58:52 +0100 Subject: [PATCH 133/240] test: add tests for alphafold multimer query json generation --- .../protzilla/importing/query_generation.py | 1 + backend/tests/main/test_views_helper.py | 1 + .../importing/test_query_generation.py | 90 +++++++++++++++++++ 3 files changed, 92 insertions(+) create mode 100644 backend/tests/protzilla/importing/test_query_generation.py diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index b9b74ef28..a1deaff35 100644 --- 
a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -12,6 +12,7 @@ def generate_alphafold_multimer_query_json( Generates an AlphaFold Multimer JSON query for a set of UniProt protein IDs. For each provided UniProt ID, the corresponding amino acid sequence is fetched from the UniProt REST API and added to the query with the specified copy number. + Format of the json is as defined here: https://github.com/google-deepmind/alphafold/blob/main/server/README.md Protein IDs and copy numbers must be provided as space-separated strings and must have the same length. If an invalid copy number is provided or if the diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 080124103..355418127 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -15,6 +15,7 @@ def test_get_all_possible_step_names(): "FastaImport", "AlphaFoldPredictionLoad", "CrosslinkingImport", + "AlphaFoldMultimerQueryJsonGeneration", "ImportStructurePredictionFromDisk", "AlphaFoldMultimerQueryJsonGeneration", "ImportMonomerStructurePredictionFromDisk", diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py new file mode 100644 index 000000000..fbdddfa46 --- /dev/null +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ -0,0 +1,90 @@ +import json + +import pytest +from unittest.mock import patch, Mock +import requests +from backend.protzilla.importing.query_generation import ( + generate_alphafold_multimer_query_json, +) + +FAKE_FASTA_1 = ">P69905 Hemoglobin subunit alpha\nMVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG" +FAKE_FASTA_2 = ">P68871 Hemoglobin subunit beta\nVLSPADKTNVKAAWGKVGGHAAEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG" + + +@patch("backend.protzilla.importing.query_generation.requests.get") +def 
test_generate_alphafold_multimer_json_query_for_multiple_proteins(mock_get): + mock_resp1 = Mock() + mock_resp1.status_code = 200 + mock_resp1.text = FAKE_FASTA_1 + mock_resp1.raise_for_status = Mock() + + mock_resp2 = Mock() + mock_resp2.status_code = 200 + mock_resp2.text = FAKE_FASTA_2 + mock_resp2.raise_for_status = Mock() + + mock_get.side_effect = [mock_resp1, mock_resp2] + + result = generate_alphafold_multimer_query_json("P69905 P68871", "2 3") + downloads = result["downloads"] + + assert len(downloads) == 1 + key = list(downloads.keys())[0] + assert key == "prediction_query_P69905_P68871" + + # Parse JSON string (after removing outer brackets) + json_str = downloads[key] + parsed_json = json.loads(json_str[1:-1]) + + # Check top-level keys + expected_keys = {"name", "modelSeeds", "sequences", "dialect", "version"} + assert set(parsed_json.keys()) == expected_keys + + # Check name, version, dialect, modelSeeds + assert parsed_json["name"] == "P69905_P68871_prediction" + assert parsed_json["version"] == 1 + assert parsed_json["dialect"] == "alphafoldserver" + assert parsed_json["modelSeeds"] == [] + + # Check sequences + sequences = parsed_json["sequences"] + assert len(sequences) == 2 + + # First protein + protein_chain_1 = sequences[0]["proteinChain"] + assert ( + protein_chain_1["sequence"] + == "MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG" + ) + assert protein_chain_1["count"] == 2 + + # Second protein + protein_chain_2 = sequences[1]["proteinChain"] + assert ( + protein_chain_2["sequence"] + == "VLSPADKTNVKAAWGKVGGHAAEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG" + ) + assert protein_chain_2["count"] == 3 + + +def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): + result = generate_alphafold_multimer_query_json("P69905 P68871", "2") + messages = result["messages"] + assert len(messages) >= 1 + msg = messages[0]["msg"] + assert "number of copies is missing" in msg + + +def 
test_generate_alphafold_multimer_json_query_with_invalid_copy_number(): + with pytest.raises(ValueError, match="Invalid list of number of copies per id"): + generate_alphafold_multimer_query_json("P69905", "abc") + + +@patch("backend.protzilla.importing.query_generation.requests.get") +def test_generate_alphafold_multimer_json_query_with_http_error(mock_get): + mock_resp = Mock() + mock_resp.raise_for_status.side_effect = requests.exceptions.HTTPError() + mock_get.return_value = mock_resp + + with pytest.raises(requests.exceptions.HTTPError): + generate_alphafold_multimer_query_json("P69905", "2") From 0a817b53abf7d6ec08c9b1598f5f8fda8d45ac34 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 2 Mar 2026 13:32:10 +0100 Subject: [PATCH 134/240] feat: make sure that at least 2 protein ids or 2 copies of one protein were entered --- backend/protzilla/importing/query_generation.py | 9 +++++++++ backend/tests/main/test_views_helper.py | 1 - 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index a1deaff35..42fd0416d 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -51,6 +51,15 @@ def generate_alphafold_multimer_query_json( ) return dict(messages=messages, downloads={}) + if sum(copies_per_id) < 2: + messages.append( + dict( + level=logging.ERROR, + msg=f"Please use the monomer steps, for only validating one protein.", + ) + ) + return dict(messages=messages, downloads={}) + # create the json query for alphafold query = { "name": "_".join(protein_ids.split()) + "_prediction", diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 355418127..f5da3335e 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -16,7 +16,6 @@ def test_get_all_possible_step_names(): "AlphaFoldPredictionLoad", 
"CrosslinkingImport", "AlphaFoldMultimerQueryJsonGeneration", - "ImportStructurePredictionFromDisk", "AlphaFoldMultimerQueryJsonGeneration", "ImportMonomerStructurePredictionFromDisk", "UploadMultimerPredictions", From d791ed8253a7c24ddfbf01dfa63ee22d48a3c9b8 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 2 Mar 2026 15:19:48 +0100 Subject: [PATCH 135/240] feat: add input to use a specific seed and allow not only space- but also comma-separated input --- .../protzilla/importing/query_generation.py | 26 +++++++++++++------ backend/protzilla/methods/importing.py | 19 +++++++++++--- .../importing/test_query_generation.py | 24 ++++++++++++++--- 3 files changed, 53 insertions(+), 16 deletions(-) diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index 42fd0416d..1516504d8 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -6,7 +6,7 @@ def generate_alphafold_multimer_query_json( - protein_ids: str, number_copies: str + protein_ids: str, number_copies: str, model_seed: int ) -> dict: """ Generates an AlphaFold Multimer JSON query for a set of UniProt protein IDs. @@ -14,24 +14,31 @@ def generate_alphafold_multimer_query_json( from the UniProt REST API and added to the query with the specified copy number. Format of the json is as defined here: https://github.com/google-deepmind/alphafold/blob/main/server/README.md - Protein IDs and copy numbers must be provided as space-separated strings and + Protein IDs and copy numbers must be provided as space- or comma-separated strings and must have the same length. If an invalid copy number is provided or if the lengths do not match, an error message is generated and an exception may be raised. - :param protein_ids: Space-separated list of UniProt protein IDs (e.g. "P69905 P68871"). 
- :param number_copies: Space-separated list of integers specifying the number of copies + :param protein_ids: Space- or comma-separated list of UniProt protein IDs (e.g. "P69905 P68871"). + :param number_copies: Space- or comma-separated list of integers specifying the number of copies for each protein ID (e.g. "2 2"). + :param model_seed: Model seed for the AlphaFold query. If -1 we want AlphaFold to use a random seed. :return: dict (messages, downloads), downloads contains a dictionary mapping a generated filename to the AlphaFold Multimer query JSON string (wrapped in square brackets as required by AlphaFold server) - :raises ValueError: If the number of copies cannot be parsed as integers. + :raises ValueError: If the number of copies or the model seeds cannot be parsed as integers. :raises requests.exceptions.HTTPError: If fetching a UniProt FASTA sequence fails. """ messages = [] # extract protein_ids and number of copies per id and make sure they have the same length - uniprot_ids = protein_ids.split() + if "," in protein_ids: + uniprot_ids = protein_ids.split(",") + else: + uniprot_ids = protein_ids.split() try: - copies_per_id = [int(input) for input in number_copies.split()] + if "," in number_copies: + copies_per_id = [int(input) for input in number_copies.split(",")] + else: + copies_per_id = [int(input) for input in number_copies.split()] except ValueError as e: messages.append( dict( @@ -62,13 +69,16 @@ def generate_alphafold_multimer_query_json( # create the json query for alphafold query = { - "name": "_".join(protein_ids.split()) + "_prediction", + "name": "_".join(uniprot_ids) + "_prediction", "modelSeeds": [], "sequences": [], "dialect": "alphafoldserver", "version": 1, } + if model_seed != -1: + query["modelSeeds"] = [model_seed] + for uniprot_id, copies in zip(uniprot_ids, copies_per_id): url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta" diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py 
index bd275a45d..6367316f9 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -626,16 +626,27 @@ def create_form(self): input_fields=[ TextField( name="protein_ids", - label="Protein UniProt IDs", + label="UniProt Protein IDs", ), - InfoField(label="IDs should be separated by a space."), + InfoField(label="IDs should be space- or comma-separated."), TextField( name="number_copies", - label="Number of copies for each protein ID", + label="Number of copies of each protein monomer", ), InfoField( label="For each entered ID a number should be entered.\n" - "Numbers should be separated by a space." + "Numbers should be should be space- or comma-separated." + ), + NumberField( + name="model_seed", + label="Model seed for AlphaFold", + min=-1, + max=4294967295, + value=-1, + ), + InfoField( + label="Leave -1 if you want to use a random seed.\n" + "Otherwise enter a seed (=integer between 0 and 4294967295)" ), ], ) diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py index fbdddfa46..e1effe52d 100644 --- a/backend/tests/protzilla/importing/test_query_generation.py +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ -25,7 +25,7 @@ def test_generate_alphafold_multimer_json_query_for_multiple_proteins(mock_get): mock_get.side_effect = [mock_resp1, mock_resp2] - result = generate_alphafold_multimer_query_json("P69905 P68871", "2 3") + result = generate_alphafold_multimer_query_json("P69905 P68871", "2,3", -1) downloads = result["downloads"] assert len(downloads) == 1 @@ -67,8 +67,24 @@ def test_generate_alphafold_multimer_json_query_for_multiple_proteins(mock_get): assert protein_chain_2["count"] == 3 +@patch("backend.protzilla.importing.query_generation.requests.get") +def test_generate_alphafold_multimer_json_query_with_model_seed(mock_get): + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.text = FAKE_FASTA_1 + 
mock_resp.raise_for_status = Mock() + mock_get.return_value = mock_resp + + seed = 12345 + result = generate_alphafold_multimer_query_json("P69905", "2", model_seed=seed) + downloads = result["downloads"] + key = list(downloads.keys())[0] + parsed_json = json.loads(downloads[key][1:-1]) + assert parsed_json["modelSeeds"] == seed + + def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): - result = generate_alphafold_multimer_query_json("P69905 P68871", "2") + result = generate_alphafold_multimer_query_json("P69905 P68871", "2", -1) messages = result["messages"] assert len(messages) >= 1 msg = messages[0]["msg"] @@ -77,7 +93,7 @@ def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_an def test_generate_alphafold_multimer_json_query_with_invalid_copy_number(): with pytest.raises(ValueError, match="Invalid list of number of copies per id"): - generate_alphafold_multimer_query_json("P69905", "abc") + generate_alphafold_multimer_query_json("P69905", "abc", -1) @patch("backend.protzilla.importing.query_generation.requests.get") @@ -87,4 +103,4 @@ def test_generate_alphafold_multimer_json_query_with_http_error(mock_get): mock_get.return_value = mock_resp with pytest.raises(requests.exceptions.HTTPError): - generate_alphafold_multimer_query_json("P69905", "2") + generate_alphafold_multimer_query_json("P69905", "2", -1) From 068329e4528835aa4691c723cc3919eace85545e Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 2 Mar 2026 15:43:07 +0100 Subject: [PATCH 136/240] fix: fix broken test --- backend/tests/protzilla/importing/test_query_generation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py index e1effe52d..836ce987c 100644 --- a/backend/tests/protzilla/importing/test_query_generation.py +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ 
-75,12 +75,11 @@ def test_generate_alphafold_multimer_json_query_with_model_seed(mock_get): mock_resp.raise_for_status = Mock() mock_get.return_value = mock_resp - seed = 12345 - result = generate_alphafold_multimer_query_json("P69905", "2", model_seed=seed) + result = generate_alphafold_multimer_query_json("P69905", "2", model_seed=12345) downloads = result["downloads"] key = list(downloads.keys())[0] parsed_json = json.loads(downloads[key][1:-1]) - assert parsed_json["modelSeeds"] == seed + assert parsed_json["modelSeeds"] == [12345] def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): From ed7bb2a524fbbce65dd0f0b4232ea2bcbe3f4c62 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Tue, 3 Mar 2026 13:17:43 +0100 Subject: [PATCH 137/240] fix: also accept input separated with comma and space, add success message after successful query generation --- .../protzilla/importing/query_generation.py | 31 +++++++++++-------- backend/protzilla/methods/importing.py | 2 +- backend/tests/main/test_views_helper.py | 1 - 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index 1516504d8..af635aee5 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -30,24 +30,19 @@ def generate_alphafold_multimer_query_json( messages = [] # extract protein_ids and number of copies per id and make sure they have the same length - if "," in protein_ids: - uniprot_ids = protein_ids.split(",") - else: - uniprot_ids = protein_ids.split() + uniprot_ids = protein_ids.replace(",", " ").split() try: - if "," in number_copies: - copies_per_id = [int(input) for input in number_copies.split(",")] - else: - copies_per_id = [int(input) for input in number_copies.split()] + copies_per_id = [int(input) for input in number_copies.replace(",", " ").split()] except ValueError as e: + 
msg=f"Invalid list of number of copies per id: please provide space-separated integers" messages.append( dict( level=logging.ERROR, - msg=f"Invalid list of number of copies per id: please provide space-separated integers", + msg=msg, ) ) raise ValueError( - "Invalid list of number of copies per id: please provide space-separated integers" + msg ) if len(uniprot_ids) != len(copies_per_id): messages.append( @@ -57,12 +52,19 @@ def generate_alphafold_multimer_query_json( ) ) return dict(messages=messages, downloads={}) - + if min(copies_per_id) < 1: + messages.append( + dict( + level=logging.ERROR, + msg=f"There can't be a non-positive number of copies.", + ) + ) + return dict(messages=messages, downloads={}) if sum(copies_per_id) < 2: messages.append( dict( level=logging.ERROR, - msg=f"Please use the monomer steps, for only validating one protein.", + msg=f"Please use the monomer steps to validate only one protein.", ) ) return dict(messages=messages, downloads={}) @@ -98,7 +100,10 @@ def generate_alphafold_multimer_query_json( } ) query_as_string = f"[{json.dumps(query)}]" + messages.append( + dict(level=logging.INFO, msg=f"Successfully generated a json file for AlphaFold.") + ) return dict( - messages={}, + messages=messages, downloads={f"prediction_query_{'_'.join(uniprot_ids)}": query_as_string}, ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 6367316f9..d2d8b87d9 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -646,7 +646,7 @@ def create_form(self): ), InfoField( label="Leave -1 if you want to use a random seed.\n" - "Otherwise enter a seed (=integer between 0 and 4294967295)" + "Otherwise enter a seed (integer between 0 and 4294967295)" ), ], ) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index f5da3335e..78757ddf4 100644 --- a/backend/tests/main/test_views_helper.py +++ 
b/backend/tests/main/test_views_helper.py @@ -16,7 +16,6 @@ def test_get_all_possible_step_names(): "AlphaFoldPredictionLoad", "CrosslinkingImport", "AlphaFoldMultimerQueryJsonGeneration", - "AlphaFoldMultimerQueryJsonGeneration", "ImportMonomerStructurePredictionFromDisk", "UploadMultimerPredictions", "ImportMultimerStructurePredictionFromDisk", From 288e01a38c641912d7a1a0e8093fb0bf9641fc9f Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Tue, 3 Mar 2026 13:26:35 +0100 Subject: [PATCH 138/240] feat: add input field for file name of prediction query file --- .../protzilla/importing/query_generation.py | 21 +++++++++++-------- backend/protzilla/methods/importing.py | 4 ++++ .../importing/test_query_generation.py | 16 +++++++------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index af635aee5..8f5742061 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -6,7 +6,7 @@ def generate_alphafold_multimer_query_json( - protein_ids: str, number_copies: str, model_seed: int + protein_ids: str, number_copies: str, model_seed: int, name: str ) -> dict: """ Generates an AlphaFold Multimer JSON query for a set of UniProt protein IDs. @@ -22,6 +22,7 @@ def generate_alphafold_multimer_query_json( :param number_copies: Space- or comma-separated list of integers specifying the number of copies for each protein ID (e.g. "2 2"). :param model_seed: Model seed for the AlphaFold query. If -1 we want AlphaFold to use a random seed. + :param name: How the AlphaFold job and the generated file should be named. :return: dict (messages, downloads), downloads contains a dictionary mapping a generated filename to the AlphaFold Multimer query JSON string (wrapped in square brackets as required by AlphaFold server) :raises ValueError: If the number of copies or the model seeds cannot be parsed as integers. 
@@ -32,18 +33,18 @@ def generate_alphafold_multimer_query_json( # extract protein_ids and number of copies per id and make sure they have the same length uniprot_ids = protein_ids.replace(",", " ").split() try: - copies_per_id = [int(input) for input in number_copies.replace(",", " ").split()] + copies_per_id = [ + int(input) for input in number_copies.replace(",", " ").split() + ] except ValueError as e: - msg=f"Invalid list of number of copies per id: please provide space-separated integers" + msg = f"Invalid list of number of copies per id: please provide space-separated integers" messages.append( dict( level=logging.ERROR, msg=msg, ) ) - raise ValueError( - msg - ) + raise ValueError(msg) if len(uniprot_ids) != len(copies_per_id): messages.append( dict( @@ -71,7 +72,7 @@ def generate_alphafold_multimer_query_json( # create the json query for alphafold query = { - "name": "_".join(uniprot_ids) + "_prediction", + "name": name, "modelSeeds": [], "sequences": [], "dialect": "alphafoldserver", @@ -101,9 +102,11 @@ def generate_alphafold_multimer_query_json( ) query_as_string = f"[{json.dumps(query)}]" messages.append( - dict(level=logging.INFO, msg=f"Successfully generated a json file for AlphaFold.") + dict( + level=logging.INFO, msg=f"Successfully generated a json file for AlphaFold." 
+ ) ) return dict( messages=messages, - downloads={f"prediction_query_{'_'.join(uniprot_ids)}": query_as_string}, + downloads={name: query_as_string}, ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index d2d8b87d9..e45c1983d 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -624,6 +624,10 @@ def create_form(self): return Form( label="AlphaFold Multimer Query JSON Generation", input_fields=[ + TextField( + name="name", + label="File name and AlphaFold job name for generated query", + ), TextField( name="protein_ids", label="UniProt Protein IDs", diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py index 836ce987c..a27cb0144 100644 --- a/backend/tests/protzilla/importing/test_query_generation.py +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ -25,12 +25,12 @@ def test_generate_alphafold_multimer_json_query_for_multiple_proteins(mock_get): mock_get.side_effect = [mock_resp1, mock_resp2] - result = generate_alphafold_multimer_query_json("P69905 P68871", "2,3", -1) + result = generate_alphafold_multimer_query_json("P69905 P68871", "2,3", -1, "name") downloads = result["downloads"] assert len(downloads) == 1 key = list(downloads.keys())[0] - assert key == "prediction_query_P69905_P68871" + assert key == "name" # Parse JSON string (after removing outer brackets) json_str = downloads[key] @@ -41,7 +41,7 @@ def test_generate_alphafold_multimer_json_query_for_multiple_proteins(mock_get): assert set(parsed_json.keys()) == expected_keys # Check name, version, dialect, modelSeeds - assert parsed_json["name"] == "P69905_P68871_prediction" + assert parsed_json["name"] == "name" assert parsed_json["version"] == 1 assert parsed_json["dialect"] == "alphafoldserver" assert parsed_json["modelSeeds"] == [] @@ -75,7 +75,9 @@ def 
test_generate_alphafold_multimer_json_query_with_model_seed(mock_get): mock_resp.raise_for_status = Mock() mock_get.return_value = mock_resp - result = generate_alphafold_multimer_query_json("P69905", "2", model_seed=12345) + result = generate_alphafold_multimer_query_json( + "P69905", "2", model_seed=12345, file_name="name" + ) downloads = result["downloads"] key = list(downloads.keys())[0] parsed_json = json.loads(downloads[key][1:-1]) @@ -83,7 +85,7 @@ def test_generate_alphafold_multimer_json_query_with_model_seed(mock_get): def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): - result = generate_alphafold_multimer_query_json("P69905 P68871", "2", -1) + result = generate_alphafold_multimer_query_json("P69905 P68871", "2", -1, "name") messages = result["messages"] assert len(messages) >= 1 msg = messages[0]["msg"] @@ -92,7 +94,7 @@ def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_an def test_generate_alphafold_multimer_json_query_with_invalid_copy_number(): with pytest.raises(ValueError, match="Invalid list of number of copies per id"): - generate_alphafold_multimer_query_json("P69905", "abc", -1) + generate_alphafold_multimer_query_json("P69905", "abc", -1, "name") @patch("backend.protzilla.importing.query_generation.requests.get") @@ -102,4 +104,4 @@ def test_generate_alphafold_multimer_json_query_with_http_error(mock_get): mock_get.return_value = mock_resp with pytest.raises(requests.exceptions.HTTPError): - generate_alphafold_multimer_query_json("P69905", "2", -1) + generate_alphafold_multimer_query_json("P69905", "2", -1, "name") From 2cdf9dd99d819a1344de3430a3d9975e95c9e2b3 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Tue, 3 Mar 2026 13:47:09 +0100 Subject: [PATCH 139/240] fix: step only turns green if file was generated --- backend/protzilla/importing/query_generation.py | 15 +++++++++------ backend/protzilla/steps.py | 1 - .../protzilla/importing/test_query_generation.py 
| 9 +++------ 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index 8f5742061..e7b6e5906 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -46,29 +46,32 @@ def generate_alphafold_multimer_query_json( ) raise ValueError(msg) if len(uniprot_ids) != len(copies_per_id): + msg = f"For at least one protein id, the number of copies is missing in the input." messages.append( dict( level=logging.ERROR, - msg=f"For at least one protein id, the number of copies is missing in the input.", + msg=msg, ) ) - return dict(messages=messages, downloads={}) + raise ValueError(msg) if min(copies_per_id) < 1: + msg = f"There can't be a non-positive number of copies." messages.append( dict( level=logging.ERROR, - msg=f"There can't be a non-positive number of copies.", + msg=msg, ) ) - return dict(messages=messages, downloads={}) + raise ValueError(msg) if sum(copies_per_id) < 2: + msg = f"Please use the monomer steps to validate only one protein." 
messages.append( dict( level=logging.ERROR, - msg=f"Please use the monomer steps to validate only one protein.", + msg=msg, ) ) - return dict(messages=messages, downloads={}) + raise ValueError(msg) # create the json query for alphafold query = { diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 8e47cd244..06af14831 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -132,7 +132,6 @@ def calculate(self, steps: StepManager) -> bool: self.validate_outputs() self.artifact_versions["output"]["generated"] += 1 - self.calculation_status = "complete" if steps.failed_step_index == stepIndex: steps.failed_step_index = -1 diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py index a27cb0144..80f494fcb 100644 --- a/backend/tests/protzilla/importing/test_query_generation.py +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ -76,7 +76,7 @@ def test_generate_alphafold_multimer_json_query_with_model_seed(mock_get): mock_get.return_value = mock_resp result = generate_alphafold_multimer_query_json( - "P69905", "2", model_seed=12345, file_name="name" + "P69905", "2", model_seed=12345, name="name" ) downloads = result["downloads"] key = list(downloads.keys())[0] @@ -85,11 +85,8 @@ def test_generate_alphafold_multimer_json_query_with_model_seed(mock_get): def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): - result = generate_alphafold_multimer_query_json("P69905 P68871", "2", -1, "name") - messages = result["messages"] - assert len(messages) >= 1 - msg = messages[0]["msg"] - assert "number of copies is missing" in msg + with pytest.raises(ValueError, match="number of copies is missing"): + generate_alphafold_multimer_query_json("P69905 P68871", "2", -1, "name") def test_generate_alphafold_multimer_json_query_with_invalid_copy_number(): From c043e76ffd286c29987d8750aeeb544abb4fd5d3 Mon Sep 17 
00:00:00 2001 From: Anna Polensky Date: Tue, 3 Mar 2026 17:58:32 +0100 Subject: [PATCH 140/240] fix: address code review feedback --- backend/main/views.py | 2 +- backend/protzilla/importing/query_generation.py | 2 +- backend/protzilla/methods/importing.py | 3 +++ backend/protzilla/run.py | 4 ++-- .../tests/protzilla/importing/test_query_generation.py | 9 +++++++++ 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/backend/main/views.py b/backend/main/views.py index cc2ae35bf..c347bda15 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -631,7 +631,7 @@ def get_step_downloads(request): run = Run(run_name) if run.current_step is not None: - downloads = run.current_downloads.downloads + downloads = run.current_downloads else: downloads = {} diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index e7b6e5906..c95ab5466 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -46,7 +46,7 @@ def generate_alphafold_multimer_query_json( ) raise ValueError(msg) if len(uniprot_ids) != len(copies_per_id): - msg = f"For at least one protein id, the number of copies is missing in the input." + msg = f"There are {len(uniprot_ids)} ids. However, there are {len(copies_per_id)} entries for number of copies. Please make sure that these numbers match." messages.append( dict( level=logging.ERROR, diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index e45c1983d..a6a1b4246 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -628,6 +628,9 @@ def create_form(self): name="name", label="File name and AlphaFold job name for generated query", ), + InfoField( + label="Only enter file stem, '.json' will be added automatically." 
+ ), TextField( name="protein_ids", label="UniProt Protein IDs", diff --git a/backend/protzilla/run.py b/backend/protzilla/run.py index 2fc72f320..4d5268924 100644 --- a/backend/protzilla/run.py +++ b/backend/protzilla/run.py @@ -362,8 +362,8 @@ def current_plots(self) -> Plots | None: return self.steps.current_step.plots @property - def current_downloads(self) -> Downloads | None: - return self.steps.current_step.downloads + def current_downloads(self) -> dict[str, str] | None: + return self.steps.current_step.downloads.downloads @property def current_outputs(self) -> Output: diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py index 80f494fcb..cf3125a49 100644 --- a/backend/tests/protzilla/importing/test_query_generation.py +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ -89,6 +89,15 @@ def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_an generate_alphafold_multimer_query_json("P69905 P68871", "2", -1, "name") +def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): + with pytest.raises(ValueError) as error: + generate_alphafold_multimer_query_json("P69905 P68871", "2", -1, "name") + + msg = str(error.value) + assert "2 ids" in msg + assert "1 entries for number of copies" in msg + + def test_generate_alphafold_multimer_json_query_with_invalid_copy_number(): with pytest.raises(ValueError, match="Invalid list of number of copies per id"): generate_alphafold_multimer_query_json("P69905", "abc", -1, "name") From dedfba18935ba5c6fd11946d5735ccf1b12f4b20 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 4 Mar 2026 15:30:22 +0100 Subject: [PATCH 141/240] refactor: rename alphafold-multimer-query-json-generation to alphafold-query-json-generation --- backend/protzilla/all_steps.py | 2 +- .../protzilla/importing/query_generation.py | 15 +++------------ backend/protzilla/methods/importing.py | 
14 ++++++++------ backend/tests/main/test_views_helper.py | 2 +- .../importing/test_query_generation.py | 19 ++++++------------- 5 files changed, 19 insertions(+), 33 deletions(-) diff --git a/backend/protzilla/all_steps.py b/backend/protzilla/all_steps.py index 643977da8..a21e66595 100644 --- a/backend/protzilla/all_steps.py +++ b/backend/protzilla/all_steps.py @@ -16,7 +16,7 @@ importing.FastaImport, importing.AlphaFoldPredictionLoad, importing.CrosslinkingImport, - importing.AlphaFoldMultimerQueryJsonGeneration, + importing.AlphaFoldQueryJsonGeneration, importing.ImportMonomerStructurePredictionFromDisk, importing.UploadMultimerPredictions, importing.ImportMultimerStructurePredictionFromDisk, diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index c95ab5466..6941612c4 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -5,11 +5,11 @@ import requests -def generate_alphafold_multimer_query_json( +def generate_alphafold_query_json( protein_ids: str, number_copies: str, model_seed: int, name: str ) -> dict: """ - Generates an AlphaFold Multimer JSON query for a set of UniProt protein IDs. + Generates an AlphaFold JSON query for a set of UniProt protein IDs. For each provided UniProt ID, the corresponding amino acid sequence is fetched from the UniProt REST API and added to the query with the specified copy number. Format of the json is as defined here: https://github.com/google-deepmind/alphafold/blob/main/server/README.md @@ -24,7 +24,7 @@ def generate_alphafold_multimer_query_json( :param model_seed: Model seed for the AlphaFold query. If -1 we want AlphaFold to use a random seed. :param name: How the AlphaFold job and the generated file should be named. 
:return: dict (messages, downloads), downloads contains a dictionary mapping a generated filename - to the AlphaFold Multimer query JSON string (wrapped in square brackets as required by AlphaFold server) + to the AlphaFold query JSON string (wrapped in square brackets as required by AlphaFold server) :raises ValueError: If the number of copies or the model seeds cannot be parsed as integers. :raises requests.exceptions.HTTPError: If fetching a UniProt FASTA sequence fails. """ @@ -63,15 +63,6 @@ def generate_alphafold_multimer_query_json( ) ) raise ValueError(msg) - if sum(copies_per_id) < 2: - msg = f"Please use the monomer steps to validate only one protein." - messages.append( - dict( - level=logging.ERROR, - msg=msg, - ) - ) - raise ValueError(msg) # create the json query for alphafold query = { diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index a6a1b4246..921a780aa 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -32,7 +32,7 @@ FeatureOrientationType, ) from backend.protzilla.constants.intensity_types import IntensityType, IntensityNameType -from protzilla.importing.query_generation import generate_alphafold_multimer_query_json +from protzilla.importing.query_generation import generate_alphafold_query_json class ImportingStep(Step): @@ -612,17 +612,19 @@ def create_form(self): calc_method = staticmethod(get_multimer_structure_dfs) -class AlphaFoldMultimerQueryJsonGeneration(Step): +class AlphaFoldQueryJsonGeneration(Step): section = "importing" - display_name = "AlphaFold Multimer Query JSON Generation" + display_name = "AlphaFold Query JSON Generation" operation = "Query Generation" - method_description = "Generate a JSON to upload to AlphaFold-Server to generate a prediction on a multimer." + method_description = ( + "Generate a JSON to upload to AlphaFold-Server to generate a prediction." 
+ ) output_keys = ["downloads"] def create_form(self): return Form( - label="AlphaFold Multimer Query JSON Generation", + label="AlphaFold Query JSON Generation", input_fields=[ TextField( name="name", @@ -658,4 +660,4 @@ def create_form(self): ], ) - download_method = staticmethod(generate_alphafold_multimer_query_json) + download_method = staticmethod(generate_alphafold_query_json) diff --git a/backend/tests/main/test_views_helper.py b/backend/tests/main/test_views_helper.py index 78757ddf4..0062cac52 100644 --- a/backend/tests/main/test_views_helper.py +++ b/backend/tests/main/test_views_helper.py @@ -15,7 +15,7 @@ def test_get_all_possible_step_names(): "FastaImport", "AlphaFoldPredictionLoad", "CrosslinkingImport", - "AlphaFoldMultimerQueryJsonGeneration", + "AlphaFoldQueryJsonGeneration", "ImportMonomerStructurePredictionFromDisk", "UploadMultimerPredictions", "ImportMultimerStructurePredictionFromDisk", diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py index cf3125a49..7a83b2d94 100644 --- a/backend/tests/protzilla/importing/test_query_generation.py +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ -4,7 +4,7 @@ from unittest.mock import patch, Mock import requests from backend.protzilla.importing.query_generation import ( - generate_alphafold_multimer_query_json, + generate_alphafold_query_json, ) FAKE_FASTA_1 = ">P69905 Hemoglobin subunit alpha\nMVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG" @@ -25,7 +25,7 @@ def test_generate_alphafold_multimer_json_query_for_multiple_proteins(mock_get): mock_get.side_effect = [mock_resp1, mock_resp2] - result = generate_alphafold_multimer_query_json("P69905 P68871", "2,3", -1, "name") + result = generate_alphafold_query_json("P69905 P68871", "2,3", -1, "name") downloads = result["downloads"] assert len(downloads) == 1 @@ -75,23 +75,16 @@ def 
test_generate_alphafold_multimer_json_query_with_model_seed(mock_get): mock_resp.raise_for_status = Mock() mock_get.return_value = mock_resp - result = generate_alphafold_multimer_query_json( - "P69905", "2", model_seed=12345, name="name" - ) + result = generate_alphafold_query_json("P69905", "2", model_seed=12345, name="name") downloads = result["downloads"] key = list(downloads.keys())[0] parsed_json = json.loads(downloads[key][1:-1]) assert parsed_json["modelSeeds"] == [12345] -def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): - with pytest.raises(ValueError, match="number of copies is missing"): - generate_alphafold_multimer_query_json("P69905 P68871", "2", -1, "name") - - def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): with pytest.raises(ValueError) as error: - generate_alphafold_multimer_query_json("P69905 P68871", "2", -1, "name") + generate_alphafold_query_json("P69905 P68871", "2", -1, "name") msg = str(error.value) assert "2 ids" in msg @@ -100,7 +93,7 @@ def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_an def test_generate_alphafold_multimer_json_query_with_invalid_copy_number(): with pytest.raises(ValueError, match="Invalid list of number of copies per id"): - generate_alphafold_multimer_query_json("P69905", "abc", -1, "name") + generate_alphafold_query_json("P69905", "abc", -1, "name") @patch("backend.protzilla.importing.query_generation.requests.get") @@ -110,4 +103,4 @@ def test_generate_alphafold_multimer_json_query_with_http_error(mock_get): mock_get.return_value = mock_resp with pytest.raises(requests.exceptions.HTTPError): - generate_alphafold_multimer_query_json("P69905", "2", -1, "name") + generate_alphafold_query_json("P69905", "2", -1, "name") From 6ecc4308b400d86fcb0f0a68fc759b6810fe5635 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 4 Mar 2026 16:35:55 +0100 Subject: [PATCH 142/240] fix: format with 
black --- .../protzilla/data_analysis/test_crosslinking_validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 1afe7c2cc..a52fad3f6 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -483,6 +483,8 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali ("duplicated" in str(m.get("msg", "")).lower()) and (m.get("level") is not None) for m in messages ) + + def test_add_vertical_line_with_annotation_in_legend_adds_line_and_legend(): fig = go.Figure() add_vertical_line_with_annotation_in_legend( From 6cb97794992907e70bff6287eecf4fa36752e352 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 4 Mar 2026 17:15:54 +0100 Subject: [PATCH 143/240] fix: changes proposed by reviews amd fixed a few tests --- .../data_analysis/crosslinking_validation.py | 45 ++++++++--- backend/protzilla/methods/data_analysis.py | 7 +- .../test_crosslinking_validation.py | 75 +++++++++++++++++-- 3 files changed, 102 insertions(+), 25 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 935153f18..3a8c041fe 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -112,6 +112,18 @@ def get_distance_between_two_amino_acids_in_angstrom( def get_protein_sequence_from_df( amino_acid_sequences_df: pd.DataFrame, protein_id: str ) -> str: + """ + Returns the amino acid sequence for a given protein ID from a DataFrame. + + If the provided protein ID does not contain an isoform suffix, the default suffix "-1" + is appended to match the format used in the DataFrame (e.g. "O43242" becomes "O43242-1"). 
+ + :param amino_acid_sequences_df: DataFrame containing at least the columns + "Protein ID" and "Protein Sequence" + :param protein_id: UniProt protein identifier, with or without isoform suffix + :return: the corresponding amino acid sequence as a string, or an empty string + if the protein ID is not found + """ # because protein ids like O43242 are saved as O43242-1 in amino_acid_sequences_df if "-" not in protein_id: protein_id = f"{protein_id}-1" @@ -156,9 +168,18 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( rows_to_delete = [] messages = [] - def get_positions_for( + def get_crosslink_positions_in_protein( peptide: str, protein_id: str, cl_position_within_peptide: int ) -> list: + """ + Returns the 1-based positions of the crosslinked residue within the full + protein sequence for all occurrences of a given peptide. + + :param peptide: peptide sequence to search for in the protein + :param protein_id: UniProt protein identifier + :param cl_position_within_peptide: 1-based position of the crosslinked residue within the peptide + :return: list of 1-based residue positions in the protein sequence + """ protein_sequence = get_protein_sequence_from_df( amino_acid_sequences_df=amino_acid_sequences_df, protein_id=protein_id ) @@ -174,10 +195,10 @@ def get_positions_for( protein_id1 = crosslinker_row.Protein_id1 protein_id2 = crosslinker_row.Protein_id2 - peptide1_positions = get_positions_for( + peptide1_positions = get_crosslink_positions_in_protein( peptide_sequence1, protein_id1, crosslinker_row.CL_position_within_peptide1 ) - peptide2_positions = get_positions_for( + peptide2_positions = get_crosslink_positions_in_protein( peptide_sequence2, protein_id2, crosslinker_row.CL_position_within_peptide2 ) @@ -230,7 +251,6 @@ def validate_with_angstrom_deviation( crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, - is_multimer: bool, ) -> dict: """ Validates cross-links by comparing the 
cross-linker lengths with the distances between the linked @@ -246,7 +266,6 @@ def validate_with_angstrom_deviation( - upper_accepted_deviation_for_: float :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) :param amino_acid_sequences_df: DataFrame containing the protein sequence - :param is_multimer: Whether the structures we want to check are monomer or multimer :return: dict (crosslinking_df_result, messages), crosslinking_df_result contains the relevant rows (rows of intra-crosslinks within the protein to validate) of crosslinking_df and two more columns containing the distances in AlphaFold and whether the crosslink matches the AlphaFold data or not @@ -255,6 +274,10 @@ def validate_with_angstrom_deviation( """ all_crosslinks_df = crosslinking_df.copy() + if len(structures_to_validate) == 1: + is_multimer = False + else: + is_multimer = True if not is_multimer: # we are only interested in intra-crosslinks of the protein we want to validate mask = (all_crosslinks_df.Protein_id1 == structures_to_validate[0]) & ( @@ -281,7 +304,7 @@ def validate_with_angstrom_deviation( def check_crosslink(crosslink: pd.Series) -> pd.Series: protein_id1 = crosslink.Protein_id1 - protein_id2 = crosslink.Protein_id1 + protein_id2 = crosslink.Protein_id2 protein_sequence1 = get_protein_sequence_from_df( amino_acid_sequences_df=amino_acid_sequences_df, protein_id=protein_id1 ) @@ -362,7 +385,6 @@ def diagrams_of_crosslinking_validation_data( crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, - is_multimer: bool, ) -> list[Figure]: """ Creates for each crosslinker histogram plots summarizing the distribution of valid and invalid @@ -399,7 +421,6 @@ def diagrams_of_crosslinking_validation_data( crosslinker_information, cif_df, amino_acid_sequences_df, - is_multimer, )["crosslinking_result_df"] validated_df = validated_df.dropna(subset=["valid_crosslink"]) @@ -420,13 +441,13 @@ 
def diagrams_of_crosslinking_validation_data( accepted_deviation_upper_bound, accepted_deviation_lower_bound, ) = crosslinker_information[crosslinker] - + structures_to_validate_str = ", ".join(structures_to_validate) histogram = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, name_a="Valid Crosslinks", name_b="Invalid Crosslinks", - heading=f"Predicted distances for {structures_to_validate} with crosslinker {crosslinker}", + heading=f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}", x_title="Distance (Å)", y_title="Count", overlay=True, @@ -458,7 +479,7 @@ def diagrams_of_crosslinking_validation_data( dataframe_b=df_invalid, name_a="Valid Crosslinks", name_b="Invalid Crosslinks", - heading=f"Predicted distances for {structures_to_validate} with crosslinker {crosslinker}, mean +/- 2 σ", + heading=f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}, mean +/- 2 σ", x_title="Distance (Å)", y_title="Count", overlay=True, @@ -528,7 +549,7 @@ def diagrams_of_crosslinking_validation_data( "Cross-Links matching predicted data", "Cross-Links not matching predicted data", ], - heading=f"All Cross-Links used for validation of {structures_to_validate}", + heading=f"All Cross-Links used for validation of {structures_to_validate_str}", y_title="Number of Cross-Links", ) figures.append(bar_plot_over_all_checked_crosslinks) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 9351d54ce..f3855e7ef 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2662,7 +2662,7 @@ def modify_form(self, form: Form, run: Run) -> None: # create fields for every crosslink self.create_crosslink_input_fields(form=form, run=run) - plot_method = staticmethod(bar_plot_of_valid_crosslinks) + plot_method = staticmethod(diagrams_of_crosslinking_validation_data) calc_method = 
staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: @@ -2673,7 +2673,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: AlphaFoldPredictionLoad, "uniprot_id", entry_id ) - inputs["is_multimer"] = False return self.insert_dataframes_with_correct_input_step_id( steps=steps, inputs=inputs, @@ -2714,7 +2713,7 @@ def modify_form(self, form: Form, run: Run) -> None: form["entry_id"].set_options(form_helper.to_choices(loaded_proteins_entry_ids)) self.create_crosslink_input_fields(form=form, run=run) - plot_method = staticmethod(bar_plot_of_valid_crosslinks) + plot_method = staticmethod(diagrams_of_crosslinking_validation_data) calc_method = staticmethod(validate_with_angstrom_deviation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: @@ -2725,8 +2724,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: UploadMultimerPredictions, "entry_id", entry_id ) - inputs["is_multimer"] = True - return self.insert_dataframes_with_correct_input_step_id( steps=steps, inputs=inputs, diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index a52fad3f6..83847b718 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -17,9 +17,7 @@ from backend.protzilla.data_analysis.plots import ( add_vertical_line_with_annotation_in_legend, ) -from backend.protzilla.methods.data_analysis import ( - CrossLinkingValidationWithAngstromDeviation, -) + from protzilla.methods.data_analysis import CrosslinkingValidationWithAngstromDeviation @@ -69,7 +67,6 @@ def test_validate_with_angstrom_deviation(distance, expected): crosslinker_information=crosslinker_information, amino_acid_sequences_df=amino_acid_sequences_df, cif_df=cif_df, - is_multimer=False, ) df = result["crosslinking_result_df"] @@ 
-330,7 +327,6 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, - is_multimer=True, ) result_df = out["crosslinking_result_df"] @@ -392,7 +388,6 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, - is_multimer=True, ) result_df = out["crosslinking_result_df"] @@ -402,7 +397,7 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning assert result_df.empty assert isinstance(messages, list) - assert len(messages) == 1 + assert len(messages) >= 1 assert messages[0].get("level") is not None assert "There are no cross links between the structures to validate." in messages[ 0 @@ -453,7 +448,6 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, - is_multimer=True, ) result_df = out["crosslinking_result_df"] @@ -753,3 +747,68 @@ def test_diagrams_calls_with_correct_parameters( "bar_fig", ] assert figures == expected_figures + + +def test_validate_multimer_with_invalid_crosslinks(): + sequences_df = pd.DataFrame( + [ + ("P1-1", "ABAB"), + ("P2-1", "ABAB"), + ], + columns=["Protein ID", "Protein Sequence"], + ) + + crosslinking_df = pd.DataFrame( + [ + ("P1", "P2", "AB", "AB", 0, 0, "XL"), + ], + columns=[ + "Protein_id1", + "Protein_id2", + "Peptide1", + "Peptide2", + "CL_position_within_peptide1", + "CL_position_within_peptide2", + "Crosslinker", + ], + ) + + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA"] * 4, + "_atom_site.label_seq_id": [1, 2, 3, 4], + "_atom_site.Cartn_x": [1.0, 2.0, 3.0, 4.0], + "_atom_site.Cartn_y": [0.0, 0.0, 0.0, 0.0], + "_atom_site.Cartn_z": [0.0, 0.0, 0.0, 0.0], + } + ) + + # length = 1.5, upper_dev = 0.6, 
lower_dev = 0.6. + # Distances will be [0.0, 0.0, 2.0, 2.0] -> two valid (2.0) and two invalid (0.0). + crosslinker_information = {"XL": [1.5, 0.6, 0.6]} + + out = validate_with_angstrom_deviation( + crosslinking_df=crosslinking_df, + structures_to_validate=["P1", "P2"], + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=sequences_df, + ) + + result_df = out["crosslinking_result_df"] + assert isinstance(result_df, pd.DataFrame) + assert len(result_df) == 4 + + distances = sorted(result_df["alphafold_distance"].astype(float).tolist()) + assert distances == [0.0, 0.0, 2.0, 2.0] + + valid_counts = result_df["valid_crosslink"].value_counts() + assert valid_counts.get(True, 0) == 2 + assert valid_counts.get(False, 0) == 2 + + valid_distances = sorted( + result_df.loc[result_df["valid_crosslink"] == True, "alphafold_distance"] + .astype(float) + .tolist() + ) + assert valid_distances == [2.0, 2.0] From e7c619a6bf781e05cc300fc723ca7176c9180bef Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 4 Mar 2026 17:23:01 +0100 Subject: [PATCH 144/240] fix: fixes wrong variable naming --- .../protzilla/data_analysis/test_crosslinking_validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 83847b718..5ad3d9666 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -548,7 +548,7 @@ def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_line figures = diagrams_of_crosslinking_validation_data( crosslinking_df=sample_crosslinking_df, - protein_to_validate="P12345", + structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info, cif_df=pd.DataFrame(), amino_acid_sequence_df=pd.DataFrame(), @@ -616,7 +616,7 @@ def 
test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_l figures = diagrams_of_crosslinking_validation_data( crosslinking_df=sample_crosslinking_df_with_no_std, - protein_to_validate="P12345", + structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, cif_df=pd.DataFrame(), amino_acid_sequence_df=pd.DataFrame(), @@ -678,7 +678,7 @@ def test_diagrams_calls_with_correct_parameters( figures = diagrams_of_crosslinking_validation_data( crosslinking_df=sample_crosslinking_df_with_one_crosslinker, - protein_to_validate="P12345", + structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info_with_one_crosslinker, cif_df=pd.DataFrame(), amino_acid_sequence_df=pd.DataFrame(), From b8726fc57d3aa59609f483ee85f949378b4afb02 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 4 Mar 2026 17:29:22 +0100 Subject: [PATCH 145/240] fix: renamed amino_acid_sequence_df --- backend/protzilla/data_analysis/crosslinking_validation.py | 2 +- .../protzilla/data_analysis/test_crosslinking_validation.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 3a8c041fe..b7c7bdb73 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -409,7 +409,7 @@ def diagrams_of_crosslinking_validation_data( - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) - :param amino_acid_sequence_df: DataFrame containing the protein sequence + :param amino_acid_sequences_df: DataFrame containing the protein sequence :return: List of Plotly Figure objects. 
For each crosslinker, the list contains two histogram figures (mean ± 2 standard deviations first, full range second), followed by a final bar plot summarizing valid and invalid cross-links across all crosslinkers. diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 5ad3d9666..ef9142f1e 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -551,7 +551,7 @@ def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_line structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info, cif_df=pd.DataFrame(), - amino_acid_sequence_df=pd.DataFrame(), + amino_acid_sequences_df=pd.DataFrame(), ) # 2 histograms per crosslinker + 1 bar plot @@ -619,7 +619,7 @@ def test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_l structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, cif_df=pd.DataFrame(), - amino_acid_sequence_df=pd.DataFrame(), + amino_acid_sequences_df=pd.DataFrame(), ) # 2 histograms per crosslinker + 1 bar plot @@ -681,7 +681,7 @@ def test_diagrams_calls_with_correct_parameters( structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info_with_one_crosslinker, cif_df=pd.DataFrame(), - amino_acid_sequence_df=pd.DataFrame(), + amino_acid_sequences_df=pd.DataFrame(), ) mock_validate.assert_called_once() From 09d50dac971c9d7055ccc6055e552558b2c3ceb6 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 6 Mar 2026 18:27:59 +0100 Subject: [PATCH 146/240] fix: fix type bug and refactor a bit --- .../data_analysis/crosslinking_validation.py | 21 +++++++------------ .../alphafold_protein_structure_load.py | 3 ++- backend/protzilla/methods/data_analysis.py | 10 +++++---- 3 files changed, 16 insertions(+), 18 
deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index b7c7bdb73..86630b2ce 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -133,17 +133,17 @@ def get_protein_sequence_from_df( ] if matches.empty: - return "" + raise KeyError("Protein ID not found in the given fasta file.") return matches.iloc[0] -def add_positions_of_amino_acid_where_crosslinker_bound_to_df( +def add_protein_crosslink_positions_to_df( input_crosslinking_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, ) -> tuple[pd.DataFrame, list[dict]]: """ - Adds for each crosslink the 1-based positions of amino acids where the crosslink bound to a crosslinking DataFrame. + For each crosslink the 1-based positions of amino acids where the crosslink bound to a crosslinking DataFrame. If a peptide sequence occurs multiple times in the protein, the row is duplicated for each additional combination of positions. If a peptide sequence can't be matched the row will be deleted and a warning emitted. @@ -153,7 +153,7 @@ def add_positions_of_amino_acid_where_crosslinker_bound_to_df( - 'Peptide2': second peptide sequence - 'CL_position_within_peptide1': 0-based crosslinker position within Peptide1 - 'CL_position_within_peptide2': 0-based crosslinker position within Peptide2 - :param protein_sequences: Full protein sequences in which the peptides are located. 
+ :param amino_acid_sequences_df: Dataframe that contains all amino acid sequences :return: tuple (updated_crosslinking_df, messages) - updated_crosslinking_df: input DataFrame with two new columns: - 'crosslinker_position1': 1-based crosslinker position in Peptide1 @@ -265,7 +265,7 @@ def validate_with_angstrom_deviation( - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) - :param amino_acid_sequences_df: DataFrame containing the protein sequence + :param amino_acid_sequences_df: Dataframe that contains all amino acid sequences :return: dict (crosslinking_df_result, messages), crosslinking_df_result contains the relevant rows (rows of intra-crosslinks within the protein to validate) of crosslinking_df and two more columns containing the distances in AlphaFold and whether the crosslink matches the AlphaFold data or not @@ -274,10 +274,7 @@ def validate_with_angstrom_deviation( """ all_crosslinks_df = crosslinking_df.copy() - if len(structures_to_validate) == 1: - is_multimer = False - else: - is_multimer = True + is_multimer = len(structures_to_validate) > 1 if not is_multimer: # we are only interested in intra-crosslinks of the protein we want to validate mask = (all_crosslinks_df.Protein_id1 == structures_to_validate[0]) & ( @@ -296,10 +293,8 @@ def validate_with_angstrom_deviation( messages = [dict(level=logging.WARNING, msg=msg)] return dict(crosslinking_result_df=pd.DataFrame(), messages=messages) - relevant_crosslinks_df, messages = ( - add_positions_of_amino_acid_where_crosslinker_bound_to_df( - relevant_crosslinks_df, amino_acid_sequences_df - ) + relevant_crosslinks_df, messages = add_protein_crosslink_positions_to_df( + relevant_crosslinks_df, amino_acid_sequences_df ) def check_crosslink(crosslink: pd.Series) -> pd.Series: diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py 
b/backend/protzilla/importing/alphafold_protein_structure_load.py index eef2ce78a..4d5b86522 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -12,6 +12,7 @@ import gemmi import pandas as pd import requests +import re from backend.protzilla.constants import paths from backend.protzilla.constants.protzilla_logging import logger @@ -832,7 +833,7 @@ def upload_multimer_prediction( timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - uniprot_ids_as_list = uniprot_ids.split(", ") + uniprot_ids_as_list = re.split(r"\s*,\s*", uniprot_ids.strip()) data: dict[str, Any] = { "entry_id": entry_id, diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index f3855e7ef..fc7051750 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2617,10 +2617,12 @@ def insert_dataframes_with_correct_input_step_id( if "uniprot_accession" in metadata_df.columns: inputs["structures_to_validate"] = metadata_df["uniprot_accession"].tolist() elif "uniprot_ids" in metadata_df.columns: - inputs["structures_to_validate"] = ast.literal_eval( - metadata_df["uniprot_ids"].iloc[0] - ) - l = inputs["structures_to_validate"] + value = metadata_df["uniprot_ids"].iloc[0] + if isinstance(value, str): + value = ast.literal_eval(value) + + inputs["structures_to_validate"] = value + else: raise ValueError( "No correct metadata found. Metadata must contain 'uniprot_ids' or 'uniprot_accession'." 
From fc7f7a7f84fc9680653b2cc41b183cbf436fbd74 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 6 Mar 2026 18:39:52 +0100 Subject: [PATCH 147/240] fix: function naming --- .../test_crosslinking_validation.py | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index ef9142f1e..0461bd1d2 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -10,7 +10,7 @@ from backend.protzilla.data_analysis.crosslinking_validation import ( validate_with_angstrom_deviation, get_distance_between_two_amino_acids_in_angstrom, - add_positions_of_amino_acid_where_crosslinker_bound_to_df, + add_protein_crosslink_positions_to_df, diagrams_of_crosslinking_validation_data, ) from backend.protzilla.constants.colors import PLOT_PRIMARY_COLOR @@ -132,9 +132,7 @@ def test_add_crosslinker_positions_with_exactly_one_possible_position(): {"Protein ID": ["P1-1"], "Protein Sequence": ["XXABCYYYDEFZZ"]} ) - df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, amino_acid_sequences_df - ) + df, messages = add_protein_crosslink_positions_to_df(df, amino_acid_sequences_df) assert messages == [] @@ -161,9 +159,7 @@ def test_add_crosslinker_positions_with_more_than_one_possible_position(): {"Protein ID": ["P1-1"], "Protein Sequence": ["AAXXAAZZBBYYBB"]} ) - df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, amino_acid_sequences_df - ) + df, messages = add_protein_crosslink_positions_to_df(df, amino_acid_sequences_df) # 2 AA matches × 2 BB matches = 4 combinations assert len(df) == 4 @@ -194,9 +190,7 @@ def test_add_crosslinker_positions_but_one_peptide_not_found_deletes_row(): {"Protein ID": ["P1-1"], "Protein Sequence": ["XXXXXXXX"]} ) - df, messages = 
add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, amino_acid_sequences_df - ) + df, messages = add_protein_crosslink_positions_to_df(df, amino_acid_sequences_df) assert len(messages) == 1 assert messages[0]["level"] == logging.WARNING @@ -222,9 +216,7 @@ def test_add_crosslinker_positions_with_valid_and_invalid_rows_mixed(): {"Protein ID": ["P1-1"], "Protein Sequence": ["ABCDEF"]} ) - df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, amino_acid_sequences_df - ) + df, messages = add_protein_crosslink_positions_to_df(df, amino_acid_sequences_df) assert len(messages) == 2 assert messages[0]["level"] == logging.WARNING @@ -253,9 +245,7 @@ def test_add_crosslinker_positions_with_overlapping_peptide_matches(): {"Protein ID": ["P1-1"], "Protein Sequence": ["AAAAB"]} ) - df, messages = add_positions_of_amino_acid_where_crosslinker_bound_to_df( - df, amino_acid_sequences_df - ) + df, messages = add_protein_crosslink_positions_to_df(df, amino_acid_sequences_df) # AAA -> positions 0, 1 # B -> position 4 From 8d7e24152fae1679bd7272a652e9d03d23dd37ac Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 6 Mar 2026 18:46:08 +0100 Subject: [PATCH 148/240] fix: improve doc string --- .../data_analysis/crosslinking_validation.py | 45 ++++++++++++------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 86630b2ce..6e973ce85 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -143,23 +143,34 @@ def add_protein_crosslink_positions_to_df( amino_acid_sequences_df: pd.DataFrame, ) -> tuple[pd.DataFrame, list[dict]]: """ - For each crosslink the 1-based positions of amino acids where the crosslink bound to a crosslinking DataFrame. 
- If a peptide sequence occurs multiple times in the protein, the row is duplicated for each - additional combination of positions. - If a peptide sequence can't be matched the row will be deleted and a warning emitted. - - :param input_crosslinking_df: DataFrame containing cross-linking data with at least the following columns: - - 'Peptide1': first peptide sequence - - 'Peptide2': second peptide sequence - - 'CL_position_within_peptide1': 0-based crosslinker position within Peptide1 - - 'CL_position_within_peptide2': 0-based crosslinker position within Peptide2 - :param amino_acid_sequences_df: Dataframe that contains all amino acid sequences - :return: tuple (updated_crosslinking_df, messages) - - updated_crosslinking_df: input DataFrame with two new columns: - - 'crosslinker_position1': 1-based crosslinker position in Peptide1 - - 'crosslinker_position2': 1-based crosslinker position in Peptide2 - Rows are duplicated for multiple peptide matches. - - messages: list of warning dictionaries with if the peptide was not found or a row was duplicated + Add protein-level crosslink residue positions to a crosslinking DataFrame. + + For each row, this function finds the 1-based residue positions in the full protein + sequence(s) that correspond to the crosslinked residue within each peptide. The + protein-level positions are written to two new columns: + + - 'crosslinker_position1': 1-based residue position in Protein_id1 for Peptide1. + - 'crosslinker_position2': 1-based residue position in Protein_id2 for Peptide2. + + If a peptide occurs multiple times in the corresponding protein sequence, all + combinations of (position1, position2) are generated. The first combination is + kept in the original row and the row is duplicated for each additional combination. + + If either peptide cannot be matched in its corresponding protein sequence, the row + is removed and a warning message is recorded. 
+ + :param input_crosslinking_df: DataFrame containing cross-linking data with at least the following columns: + - 'Peptide1': first peptide sequence + - 'Peptide2': second peptide sequence + - 'CL_position_within_peptide1': 0-based crosslinker position within Peptide1 + - 'CL_position_within_peptide2': 0-based crosslinker position within Peptide2 + :param amino_acid_sequences_df: Dataframe that contains all amino acid sequences + :return: tuple (updated_crosslinking_df, messages) + - updated_crosslinking_df: input DataFrame with two new columns: + - 'crosslinker_position1': 1-based crosslinker position in Peptide1 + - 'crosslinker_position2': 1-based crosslinker position in Peptide2 + Rows are duplicated for multiple peptide matches. + - messages: list of warning dictionaries with if the peptide was not found or a row was duplicated """ crosslinking_df = input_crosslinking_df.copy() crosslinking_df["crosslinker_position1"] = pd.Series(dtype="Int64") From e590a08f7b40fdb818a8f0c8b9acffe2cb6f0fda Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 9 Mar 2026 12:08:18 +0100 Subject: [PATCH 149/240] fix typo --- backend/protzilla/data_analysis/crosslinking_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 6e973ce85..72a57f66c 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -170,7 +170,7 @@ def add_protein_crosslink_positions_to_df( - 'crosslinker_position1': 1-based crosslinker position in Peptide1 - 'crosslinker_position2': 1-based crosslinker position in Peptide2 Rows are duplicated for multiple peptide matches. 
- - messages: list of warning dictionaries with if the peptide was not found or a row was duplicated + - messages: list of warning dictionaries if the peptide was not found or a row was duplicated """ crosslinking_df = input_crosslinking_df.copy() crosslinking_df["crosslinker_position1"] = pd.Series(dtype="Int64") From 23ce0c3420cdcd86f78a3a96b17e6fb8ab5c64b8 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 25 Mar 2026 03:48:47 +0100 Subject: [PATCH 150/240] feat: Add link_type to checked_crosslinks_df and show link_type in plot labels --- .../data_analysis/crosslinking_validation.py | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 72a57f66c..f1ac976eb 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -382,6 +382,11 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: relevant_crosslinks_df["valid_crosslink"].notna() ] + checked_crosslinks_df["link_type"] = checked_crosslinks_df.apply( + lambda row: "intra" if row["Protein_id1"] == row["Protein_id2"] else "inter", + axis=1, + ) + return dict(crosslinking_result_df=checked_crosslinks_df, messages=messages) @@ -442,6 +447,18 @@ def diagrams_of_crosslinking_validation_data( df_valid = pd.DataFrame({"alphafold_distance": distances_valid}) df_invalid = pd.DataFrame({"alphafold_distance": distances_invalid}) + # Count intra/inter for valid and invalid crosslinks + valid_mask = crosslinker_df["valid_crosslink"] + invalid_mask = ~crosslinker_df["valid_crosslink"] + valid_intra = ((valid_mask) & (crosslinker_df["link_type"] == "intra")).sum() + valid_inter = ((valid_mask) & (crosslinker_df["link_type"] == "inter")).sum() + invalid_intra = ( + (invalid_mask) & (crosslinker_df["link_type"] == "intra") + ).sum() + invalid_inter = ( + (invalid_mask) & 
(crosslinker_df["link_type"] == "inter") + ).sum() + ( crosslinker_length, accepted_deviation_upper_bound, @@ -451,8 +468,8 @@ def diagrams_of_crosslinking_validation_data( histogram = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, - name_a="Valid Crosslinks", - name_b="Invalid Crosslinks", + name_a=f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", + name_b=f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", heading=f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}", x_title="Distance (Å)", y_title="Count", @@ -483,8 +500,8 @@ def diagrams_of_crosslinking_validation_data( histogram_two_standard_deviations = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, - name_a="Valid Crosslinks", - name_b="Invalid Crosslinks", + name_a=f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", + name_b=f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", heading=f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}, mean +/- 2 σ", x_title="Distance (Å)", y_title="Count", @@ -543,8 +560,20 @@ def diagrams_of_crosslinking_validation_data( figures.append(histogram_two_standard_deviations) figures.append(histogram) - valid_crosslinks = (validated_df["valid_crosslink"] == True).sum() - invalid_crosslinks = (validated_df["valid_crosslink"] == False).sum() + valid_crosslinks = (validated_df["valid_crosslink"]).sum() + invalid_crosslinks = (~validated_df["valid_crosslink"]).sum() + valid_intra_total = ( + (validated_df["valid_crosslink"]) & (validated_df["link_type"] == "intra") + ).sum() + valid_inter_total = ( + (validated_df["valid_crosslink"]) & (validated_df["link_type"] == "inter") + ).sum() + invalid_intra_total = ( + (~validated_df["valid_crosslink"]) & (validated_df["link_type"] == "intra") + ).sum() + invalid_inter_total = ( + (~validated_df["valid_crosslink"]) & (validated_df["link_type"] == "inter") + ).sum() 
bar_plot_over_all_checked_crosslinks = create_bar_plot( values_of_sectors=[ @@ -552,8 +581,8 @@ def diagrams_of_crosslinking_validation_data( invalid_crosslinks, ], names_of_sectors=[ - "Cross-Links matching predicted data", - "Cross-Links not matching predicted data", + f"Cross-Links matching predicted data (intra: {valid_intra_total}, inter: {valid_inter_total})", + f"Cross-Links not matching predicted data (intra: {invalid_intra_total}, inter: {invalid_inter_total})", ], heading=f"All Cross-Links used for validation of {structures_to_validate_str}", y_title="Number of Cross-Links", From 585a48fe1e07dd409ac9ceb581a5ebe16e8d5090 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 25 Mar 2026 04:00:36 +0100 Subject: [PATCH 151/240] feat: add tests for new column and plot label change --- .../data_analysis/test_crosslinking_validation.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 0461bd1d2..a20b713bf 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -73,8 +73,10 @@ def test_validate_with_angstrom_deviation(distance, expected): assert "alphafold_distance" in df.columns assert "valid_crosslink" in df.columns + assert "link_type" in df.columns assert df.loc[0, "alphafold_distance"] == distance assert df.loc[0, "valid_crosslink"] == expected + assert df.loc[0, "link_type"] == "intra" def test_modify_form_creates_crosslinker_fields(): @@ -333,6 +335,7 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): assert "valid_crosslink" in result_df.columns assert "crosslinker_position1" in result_df.columns assert "crosslinker_position2" in result_df.columns + assert "link_type" in result_df.columns def 
test_validate_multimer_no_links_between_structures_returns_empty_and_warning(): @@ -462,6 +465,10 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali # With permissive bounds, all should be valid. assert result_df["valid_crosslink"].dropna().all() + # Check link_type column + assert "link_type" in result_df.columns + assert result_df["link_type"].isin(["intra", "inter"]).all() + # Expect a duplication warning message. assert any( ("duplicated" in str(m.get("msg", "")).lower()) and (m.get("level") is not None) @@ -500,6 +507,7 @@ def sample_crosslinking_df(): "Crosslinker": ["CL1", "CL1", "CL2", "CL2"], "alphafold_distance": [10.0, 12.0, 8.0, 9.0], "valid_crosslink": [True, False, True, False], + "link_type": ["intra", "intra", "inter", "inter"], } ) @@ -568,6 +576,7 @@ def sample_crosslinking_df_with_no_std(): "Crosslinker": ["CL1", "CL1", "CL2", "CL2"], "alphafold_distance": [10.5, 10.5, 10.5, 10.5], "valid_crosslink": [True, False, True, False], + "link_type": ["intra", "intra", "inter", "inter"], } ) @@ -641,6 +650,7 @@ def sample_crosslinking_df_with_one_crosslinker(): "Crosslinker": ["CL1", "CL1", "CL1", "CL1"], "alphafold_distance": [10.0, 12.0, 8.0, 9.0], "valid_crosslink": [True, False, True, False], + "link_type": ["intra", "intra", "inter", "inter"], } ) @@ -681,8 +691,8 @@ def test_diagrams_calls_with_correct_parameters( # Check histogram call parameters for crosslinker full-range first_hist_call = mock_hist.call_args_list[0].kwargs - assert first_hist_call["name_a"] == "Valid Crosslinks" - assert first_hist_call["name_b"] == "Invalid Crosslinks" + assert first_hist_call["name_a"] == "Valid Crosslinks (intra: 2, inter: 0)" + assert first_hist_call["name_b"] == "Invalid Crosslinks (intra: 0, inter: 2)" assert ( first_hist_call["heading"] == "Predicted distances for P12345 with crosslinker CL1" @@ -802,3 +812,4 @@ def test_validate_multimer_with_invalid_crosslinks(): .tolist() ) assert valid_distances == [2.0, 2.0] + assert 
"link_type" in result_df.columns From bacb0c21e234552095e08569a61bf2702cb9cb0b Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 25 Mar 2026 16:07:18 +0100 Subject: [PATCH 152/240] fix: adapt crosslinking branch to refactoring from 179 --- backend/protzilla/constants/data_types.py | 7 + .../data_analysis/crosslinking_validation.py | 27 +++- backend/protzilla/form.py | 2 +- backend/protzilla/form_helper.py | 1 + .../importing/crosslinking_import.py | 2 +- backend/protzilla/methods/data_analysis.py | 151 +++--------------- backend/protzilla/methods/importing.py | 60 +++---- backend/protzilla/steps.py | 4 +- .../test_crosslinking_validation.py | 36 +++-- 9 files changed, 110 insertions(+), 180 deletions(-) diff --git a/backend/protzilla/constants/data_types.py b/backend/protzilla/constants/data_types.py index b035f6657..358ee20f3 100644 --- a/backend/protzilla/constants/data_types.py +++ b/backend/protzilla/constants/data_types.py @@ -20,6 +20,13 @@ class DataKey(StrEnum): LOG2_FOLD_CHANGE_DF = "log2_fold_change_df" ENRICHMENT_DF = "enrichment_df" GENE_MAPPING_DF = "gene_mapping_df" + CIF_DF = "cif_df" + AMINO_ACID_SEQUENCES_DF = "amino_acid_sequences_df" + PAE_DF = "pae_df" # pae = predicted aligned error + PLDDT_DF = "plddt_df" # plddt = predicted local distance difference test + CROSSLINKING_DF = "crosslinking_df" + CONFIDENCE_DF = "confidence_df" + FULL_DATA_DF = "full_data_df" ProteinDf = NewType("ProteinDf", pd.DataFrame) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 72a57f66c..9965d2528 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,5 +1,5 @@ import itertools - +import ast import math import pandas as pd @@ -256,9 +256,21 @@ def get_crosslink_positions_in_protein( return crosslinking_df, messages +def _get_structures_to_validate(metadata_df: pd.DataFrame) -> list[str]: 
+ if "uniprot_accession" in metadata_df.columns: + return metadata_df["uniprot_accession"].tolist() + elif "uniprot_ids" in metadata_df.columns: + value = metadata_df["uniprot_ids"].iloc[0] + if isinstance(value, str): + value = ast.literal_eval(value) + return value + else: + raise ValueError("Metadata must contain 'uniprot_ids' or 'uniprot_accession'.") + + def validate_with_angstrom_deviation( crosslinking_df: pd.DataFrame, - structures_to_validate: list[str], + metadata_df: pd.DataFrame, crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, @@ -270,7 +282,7 @@ def validate_with_angstrom_deviation( and more than (cross-linker length - the lower allowed deviation). If one of the bounds is zero only the other bound will be applied. :param crosslinking_df: DataFrame containing cross-linking data. - :param structures_to_validate: UniProt IDs of the proteins to validate. + :param metadata_df: DataFrame containing metadata :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float @@ -283,7 +295,7 @@ def validate_with_angstrom_deviation( :raises KeyError: If a required crosslinker field is missing in crosslinker_information. :raises ValueError: If peptide sequences cannot be matched to the protein sequence. 
""" - + structures_to_validate = _get_structures_to_validate(metadata_df) all_crosslinks_df = crosslinking_df.copy() is_multimer = len(structures_to_validate) > 1 if not is_multimer: @@ -387,7 +399,7 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: def diagrams_of_crosslinking_validation_data( crosslinking_df: pd.DataFrame, - structures_to_validate: list[str], + metadata_df: pd.DataFrame, crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, @@ -409,7 +421,7 @@ def diagrams_of_crosslinking_validation_data( :param crosslinking_df: DataFrame containing cross-linking data, including AlphaFold-predicted distances, crosslinker identifiers, and validation results. - :param structures_to_validate: UniProt IDs of the proteins to validate. + :param metadata_df: Dataframe containing metadata. :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float @@ -421,9 +433,10 @@ def diagrams_of_crosslinking_validation_data( bar plot summarizing valid and invalid cross-links across all crosslinkers. :raises KeyError: If a required crosslinker entry is missing in crosslinker_information. """ + structures_to_validate = _get_structures_to_validate(metadata_df) validated_df = validate_with_angstrom_deviation( crosslinking_df, - structures_to_validate, + metadata_df, crosslinker_information, cif_df, amino_acid_sequences_df, diff --git a/backend/protzilla/form.py b/backend/protzilla/form.py index f79b50f48..2f7b86c9d 100644 --- a/backend/protzilla/form.py +++ b/backend/protzilla/form.py @@ -13,7 +13,7 @@ pass -FormInputType = str | int | float | bool | list[str] +FormInputType = str | int | float | bool | list[str] | dict # Backwards compatibility for older imports that expect `inputs` from this module. 
inputs = FormInputType diff --git a/backend/protzilla/form_helper.py b/backend/protzilla/form_helper.py index 1367b3e8c..85c87028a 100644 --- a/backend/protzilla/form_helper.py +++ b/backend/protzilla/form_helper.py @@ -4,6 +4,7 @@ def to_choices(choices: list[str], required: bool = True) -> list[Option]: + """should probably only be used in modify_form and not when defining the form""" return sorted( [Option(str(el), str(el)) for el in choices] + [Option(None, "---------")] if not required diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 4eb2ddad8..e7747a20a 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -14,7 +14,7 @@ from typing import Callable, Optional from enum import Enum -from backend.protzilla.utilities import format_trace +from backend.protzilla.utilities.utilities import format_trace from backend.protzilla.importing.import_utils import ( columns_in_crosslinking_df, rename_columns_csm_format, diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index ab1a02ca3..d9a164513 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2369,10 +2369,17 @@ def create_form(self): ) -class CrosslinkingValidation(DataAnalysisStep): - @staticmethod - def _get_crosslinker_names_from_crosslinker_df(steps: StepManager) -> list[str]: - df = steps.get_step_output(Step, output_key="crosslinking_df") +class CrosslinkingValidationWithAngstromStep(DataAnalysisStep): + output_keys = ["crosslinking_result_df"] + internal_inputs = {"crosslinker_information"} + + plot_method = staticmethod(diagrams_of_crosslinking_validation_data) + calc_method = staticmethod(validate_with_angstrom_deviation) + + def _get_crosslinker_names_from_crosslinker_df( + self, steps: StepManager + ) -> list[str]: + df = self.get_input(steps, 
DataKey.CROSSLINKING_DF) if df is None or "Crosslinker" not in df.columns: return [] crosslinkers = df["Crosslinker"].dropna().unique() @@ -2413,143 +2420,37 @@ def collect_crosslinking_information(self, steps: StepManager, inputs) -> dict: ] return crosslinker_to_length_and_deviation - def insert_dataframes_with_correct_input_step_id( - self, steps, inputs, correct_input_step_identifier: str - ) -> dict: - inputs["cif_df"] = steps.get_step_output( - Step, "cif_df", correct_input_step_identifier - ) - inputs["amino_acid_sequences_df"] = steps.get_step_output( - Step, "amino_acid_sequences_df", correct_input_step_identifier - ) - inputs["crosslinking_df"] = steps.get_step_output( - Step, - "crosslinking_df", - ) - if inputs.get("crosslinking_df") is None: - raise ValueError("No cross linking data found.") - - inputs["crosslinker_information"] = self.collect_crosslinking_information( - steps=steps, inputs=inputs - ) + def modify_form(self, run: Run) -> None: + # create input fields for every crosslink + self.create_crosslink_input_fields(form=self.form, run=run) - metadata_df = steps.get_step_output( - Step, "metadata_df", correct_input_step_identifier + def insert_dataframes(self, steps: StepManager) -> None: + super().insert_dataframes(steps) + self.inputs["crosslinker_information"] = self.collect_crosslinking_information( + steps=steps, inputs=self.form_inputs ) - if "uniprot_accession" in metadata_df.columns: - inputs["structures_to_validate"] = metadata_df["uniprot_accession"].tolist() - elif "uniprot_ids" in metadata_df.columns: - value = metadata_df["uniprot_ids"].iloc[0] - if isinstance(value, str): - value = ast.literal_eval(value) - inputs["structures_to_validate"] = value - else: - raise ValueError( - "No correct metadata found. Metadata must contain 'uniprot_ids' or 'uniprot_accession'." 
- ) - return inputs - - -class CrosslinkingValidationWithAngstromDeviation(CrosslinkingValidation): +class CrosslinkingValidationWithAngstromDeviation( + CrosslinkingValidationWithAngstromStep +): display_name = "Ångström Deviation For Monomer Structures" operation = "Cross Linking Validation" method_description = "Validates cross links within the one protein structure based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. (in Ångström)" - output_keys = ["crosslinking_result_df"] - def create_form(self): - return Form( - label="Ångström Deviation - Monomer", - input_fields=[ - DropdownField( - name="entry_id", - label="Protein prediction that should be validated", - ), - ], - ) - - def modify_form(self, form: Form, run: Run) -> None: - # add all loaded protein entry ids to the dropdown of structures_to_validate_field - loaded_protein_entry_ids = list( - set( - run.steps.get_inputs_of_step_type( - ImportMonomerStructurePredictionFromDisk, "entry_id" - ) - + run.steps.get_inputs_of_step_type( - AlphaFoldPredictionLoad, "uniprot_id" - ) - ) - ) - form["entry_id"].set_options(form_helper.to_choices(loaded_protein_entry_ids)) - # create fields for every crosslink - self.create_crosslink_input_fields(form=form, run=run) - - plot_method = staticmethod(diagrams_of_crosslinking_validation_data) - calc_method = staticmethod(validate_with_angstrom_deviation) - - def insert_dataframes(self, steps: StepManager, inputs) -> dict: - entry_id = inputs["entry_id"] - correct_input_step_identifier = steps.get_step_identifier_of_step_with_input( - ImportMonomerStructurePredictionFromDisk, "entry_id", entry_id - ) or steps.get_step_identifier_of_step_with_input( - AlphaFoldPredictionLoad, "uniprot_id", entry_id - ) + return Form(label="Ångström Deviation - Monomer", input_fields=[]) - return self.insert_dataframes_with_correct_input_step_id( - steps=steps, - inputs=inputs, - 
correct_input_step_identifier=correct_input_step_identifier, - ) - -class CrosslinkingValidationWithAngstromDeviationForMultimer(CrosslinkingValidation): +class CrosslinkingValidationWithAngstromDeviationForMultimer( + CrosslinkingValidationWithAngstromStep +): display_name = "Ångström Deviation For Multimer Structures" operation = "Cross Linking Validation" method_description = "Validates cross links between proteins based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. (in Ångström)" - output_keys = ["crosslinking_result_df"] - def create_form(self): return Form( label="Ångström Deviation - Multimer", - input_fields=[ - DropdownField( - name="entry_id", - label="Multimer prediction that should be validated", - ), - ], - ) - - def modify_form(self, form: Form, run: Run) -> None: - # add all loaded protein entry ids to the dropdown of structures_to_validate_field - loaded_proteins_entry_ids = list( - set( - run.steps.get_inputs_of_step_type( - ImportMultimerStructurePredictionFromDisk, "entry_id" - ) - + run.steps.get_inputs_of_step_type( - UploadMultimerPredictions, "entry_id" - ) - ) - ) - form["entry_id"].set_options(form_helper.to_choices(loaded_proteins_entry_ids)) - self.create_crosslink_input_fields(form=form, run=run) - - plot_method = staticmethod(diagrams_of_crosslinking_validation_data) - calc_method = staticmethod(validate_with_angstrom_deviation) - - def insert_dataframes(self, steps: StepManager, inputs) -> dict: - entry_id = inputs["entry_id"] - correct_input_step_identifier = steps.get_step_identifier_of_step_with_input( - ImportMultimerStructurePredictionFromDisk, "entry_id", entry_id - ) or steps.get_step_identifier_of_step_with_input( - UploadMultimerPredictions, "entry_id", entry_id - ) - - return self.insert_dataframes_with_correct_input_step_id( - steps=steps, - inputs=inputs, - correct_input_step_identifier=correct_input_step_identifier, + 
input_fields=[], ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 926c7bede..aa565a62d 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -430,11 +430,11 @@ class AlphaFoldPredictionLoad(ImportingStep): method_description = "Loads the predicted structure of the monomer with the given protein ID out of the AlphaFold DB." output_keys = [ - "metadata_df", - "cif_df", - "pae_df", - "plddt_df", - "amino_acid_sequences_df", + DataKey.METADATA_DF, + DataKey.CIF_DF, + DataKey.PAE_DF, + DataKey.PLDDT_DF, + DataKey.AMINO_ACID_SEQUENCES_DF, ] plot_method = None @@ -463,7 +463,7 @@ class CrosslinkingImport(ImportingStep): operation = "Crosslinking Data Import" method_description = "Import a file containing crosslinking data" - output_keys = ["crosslinking_df", "imported_rows_with_errors_df"] + output_keys = [DataKey.CROSSLINKING_DF] def create_form(self): return Form( @@ -494,11 +494,11 @@ class ImportMonomerStructurePredictionFromDisk(ImportingStep): method_description = "Load an already uploaded monomer structure prediction from disk into current run" output_keys = [ - "metadata_df", - "cif_df", - "pae_df", - "plddt_df", - "amino_acid_sequences_df", + DataKey.METADATA_DF, + DataKey.CIF_DF, + DataKey.PAE_DF, + DataKey.PLDDT_DF, + DataKey.AMINO_ACID_SEQUENCES_DF, ] def create_form(self): @@ -508,13 +508,16 @@ def create_form(self): DropdownField( name="entry_id", label="Entry ID of the monomer prediction to be loaded into the run. 
(Unless specified otherwise this is the Protein ID)", - options=form_helper.to_choices( - get_all_available_entry_ids_of_monomer_metadata() - ), ) ], ) + def modify_form(self, run: Run): + entry_id_field = self.form["entry_id"] + entry_id_field.set_options( + form_helper.to_choices(get_all_available_entry_ids_of_monomer_metadata()) + ) + calc_method = staticmethod(get_monomer_structure_dfs) @@ -524,11 +527,11 @@ class UploadMultimerPredictions(ImportingStep): method_description = "Upload a multimer protein prediction" output_keys = [ - "metadata_df", - "cif_df", - "confidence_df", - "full_data_df", - "amino_acid_sequences_df", + DataKey.METADATA_DF, + DataKey.CIF_DF, + DataKey.CONFIDENCE_DF, + DataKey.FULL_DATA_DF, + DataKey.AMINO_ACID_SEQUENCES_DF, ] def create_form(self): @@ -590,11 +593,11 @@ class ImportMultimerStructurePredictionFromDisk(ImportingStep): method_description = "Load an already uploaded multimer structure prediction from disk into current run" output_keys = [ - "metadata_df", - "amino_acid_sequences_df", - "cif_df", - "confidence_df", - "full_data_df", + DataKey.METADATA_DF, + DataKey.CIF_DF, + DataKey.CONFIDENCE_DF, + DataKey.FULL_DATA_DF, + DataKey.AMINO_ACID_SEQUENCES_DF, ] def create_form(self): @@ -604,11 +607,14 @@ def create_form(self): DropdownField( name="entry_id", label="Entry ID of the multimer prediction to be loaded into the run.", - options=form_helper.to_choices( - get_all_available_entry_ids_of_multimer_metadata() - ), ) ], ) + def modify_form(self, run: Run): + entry_id_field = self.form["entry_id"] + entry_id_field.set_options( + form_helper.to_choices(get_all_available_entry_ids_of_multimer_metadata()) + ) + calc_method = staticmethod(get_multimer_structure_dfs) diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 22878a0e0..2a6da15a9 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -62,7 +62,7 @@ def __init__( self, instance_identifier: StepID | None = None, ): - self.inputs: 
dict[DataKey, pd.DataFrame | FormInputType] = {} + self.inputs: dict[DataKey | str, pd.DataFrame | FormInputType] = {} self.output: Output = Output() self.visual_data = {"node_position": {"x": 0, "y": 0}} self.plots: Plots = Plots() @@ -101,7 +101,7 @@ def __eq__(self, other): ) def get_form_values(self) -> None: - self.inputs = self.form_inputs.copy() + self.inputs |= self.form_inputs.copy() @classmethod def to_dict(cls): diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 0461bd1d2..5673de3fe 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -17,6 +17,7 @@ from backend.protzilla.data_analysis.plots import ( add_vertical_line_with_annotation_in_legend, ) +from backend.protzilla.form import Form from protzilla.methods.data_analysis import CrosslinkingValidationWithAngstromDeviation @@ -60,10 +61,9 @@ def test_validate_with_angstrom_deviation(distance, expected): ) crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Länge 5 Å ± 1 Å - result = validate_with_angstrom_deviation( crosslinking_df, - structures_to_validate=["P12345"], + metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), crosslinker_information=crosslinker_information, amino_acid_sequences_df=amino_acid_sequences_df, cif_df=cif_df, @@ -87,17 +87,19 @@ def test_modify_form_creates_crosslinker_fields(): run.steps = steps step = CrosslinkingValidationWithAngstromDeviation() - form = step.create_form() + step.form = step.create_form() + + step.input_source = MagicMock(return_value=("dummy_step", "dummy_handle")) - step.modify_form(form, run) + step.modify_form(run) - assert "DSS_length" in form - assert "DSS_upper_accepted_deviation" in form - assert "DSS_lower_accepted_deviation" in form + assert "DSS_length" in step.form + assert "DSS_upper_accepted_deviation" in step.form + 
assert "DSS_lower_accepted_deviation" in step.form - assert "BS3_length" in form - assert "BS3_upper_accepted_deviation" in form - assert "BS3_lower_accepted_deviation" in form + assert "BS3_length" in step.form + assert "BS3_upper_accepted_deviation" in step.form + assert "BS3_lower_accepted_deviation" in step.form def test_get_distance_between_two_amino_acids_in_angstrom(): @@ -313,7 +315,7 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - structures_to_validate=["P1", "P2"], + metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -374,7 +376,7 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - structures_to_validate=["P1", "P2"], + metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -434,7 +436,7 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - structures_to_validate=["P1", "P2"], + metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -538,7 +540,7 @@ def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_line figures = diagrams_of_crosslinking_validation_data( crosslinking_df=sample_crosslinking_df, - structures_to_validate=["P12345"], + metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), crosslinker_information=sample_crosslinker_info, cif_df=pd.DataFrame(), amino_acid_sequences_df=pd.DataFrame(), @@ -606,7 +608,7 @@ def 
test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_l figures = diagrams_of_crosslinking_validation_data( crosslinking_df=sample_crosslinking_df_with_no_std, - structures_to_validate=["P12345"], + metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), crosslinker_information=sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, cif_df=pd.DataFrame(), amino_acid_sequences_df=pd.DataFrame(), @@ -668,7 +670,7 @@ def test_diagrams_calls_with_correct_parameters( figures = diagrams_of_crosslinking_validation_data( crosslinking_df=sample_crosslinking_df_with_one_crosslinker, - structures_to_validate=["P12345"], + metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), crosslinker_information=sample_crosslinker_info_with_one_crosslinker, cif_df=pd.DataFrame(), amino_acid_sequences_df=pd.DataFrame(), @@ -779,7 +781,7 @@ def test_validate_multimer_with_invalid_crosslinks(): out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - structures_to_validate=["P1", "P2"], + metadata_df=pd.DataFrame({"uniprot_ids": ["['P1', 'P2']"]}), crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, From 37c9630c6981cdc64cd43e46a4090f2b4a9df149 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 27 Mar 2026 08:38:16 +0100 Subject: [PATCH 153/240] Change metadata_df of predicted structure to structure_metadata_df --- backend/protzilla/constants/data_types.py | 1 + .../data_analysis/crosslinking_validation.py | 24 +++---- .../alphafold_protein_structure_load.py | 68 +++++++++---------- backend/protzilla/methods/importing.py | 8 +-- 4 files changed, 51 insertions(+), 50 deletions(-) diff --git a/backend/protzilla/constants/data_types.py b/backend/protzilla/constants/data_types.py index 358ee20f3..956b50ad1 100644 --- a/backend/protzilla/constants/data_types.py +++ b/backend/protzilla/constants/data_types.py @@ -11,6 +11,7 @@ class DataKey(StrEnum): PEPTIDE_DF = "peptide_df" PSM_DF = 
"psm_df" # psm = peptide spectrum match METADATA_DF = "metadata_df" + STRUCTURE_METADATA_DF = "structure_metadata_df" FASTA_DF = "fasta_df" SIGNIFICANT_PROTEINS_DF = "significant_proteins_df" PTM_DF = "ptm_df" diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 9965d2528..45c91d3c3 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -256,11 +256,11 @@ def get_crosslink_positions_in_protein( return crosslinking_df, messages -def _get_structures_to_validate(metadata_df: pd.DataFrame) -> list[str]: - if "uniprot_accession" in metadata_df.columns: - return metadata_df["uniprot_accession"].tolist() - elif "uniprot_ids" in metadata_df.columns: - value = metadata_df["uniprot_ids"].iloc[0] +def _get_structures_to_validate(structure_metadata_df: pd.DataFrame) -> list[str]: + if "uniprot_accession" in structure_metadata_df.columns: + return structure_metadata_df["uniprot_accession"].tolist() + elif "uniprot_ids" in structure_metadata_df.columns: + value = structure_metadata_df["uniprot_ids"].iloc[0] if isinstance(value, str): value = ast.literal_eval(value) return value @@ -270,7 +270,7 @@ def _get_structures_to_validate(metadata_df: pd.DataFrame) -> list[str]: def validate_with_angstrom_deviation( crosslinking_df: pd.DataFrame, - metadata_df: pd.DataFrame, + structure_metadata_df: pd.DataFrame, crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, @@ -282,7 +282,7 @@ def validate_with_angstrom_deviation( and more than (cross-linker length - the lower allowed deviation). If one of the bounds is zero only the other bound will be applied. :param crosslinking_df: DataFrame containing cross-linking data. 
- :param metadata_df: DataFrame containing metadata + :param structure_metadata_df: DataFrame containing metadata :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float @@ -295,7 +295,7 @@ def validate_with_angstrom_deviation( :raises KeyError: If a required crosslinker field is missing in crosslinker_information. :raises ValueError: If peptide sequences cannot be matched to the protein sequence. """ - structures_to_validate = _get_structures_to_validate(metadata_df) + structures_to_validate = _get_structures_to_validate(structure_metadata_df) all_crosslinks_df = crosslinking_df.copy() is_multimer = len(structures_to_validate) > 1 if not is_multimer: @@ -399,7 +399,7 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: def diagrams_of_crosslinking_validation_data( crosslinking_df: pd.DataFrame, - metadata_df: pd.DataFrame, + structure_metadata_df: pd.DataFrame, crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, @@ -421,7 +421,7 @@ def diagrams_of_crosslinking_validation_data( :param crosslinking_df: DataFrame containing cross-linking data, including AlphaFold-predicted distances, crosslinker identifiers, and validation results. - :param metadata_df: Dataframe containing metadata. + :param structure_metadata_df: Dataframe containing metadata. :param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float @@ -433,10 +433,10 @@ def diagrams_of_crosslinking_validation_data( bar plot summarizing valid and invalid cross-links across all crosslinkers. :raises KeyError: If a required crosslinker entry is missing in crosslinker_information. 
""" - structures_to_validate = _get_structures_to_validate(metadata_df) + structures_to_validate = _get_structures_to_validate(structure_metadata_df) validated_df = validate_with_angstrom_deviation( crosslinking_df, - metadata_df, + structure_metadata_df, crosslinker_information, cif_df, amino_acid_sequences_df, diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 4d5b86522..d56d1226b 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -26,8 +26,8 @@ def get_monomer_metadata_df() -> pd.DataFrame: Returns all data from alphafold_monomer_metadata.csv in form of a dataframe. If no such csv exist, it returns a dataframe with the corresponding keys but no values and creates a csv with the expected column names. """ - metadata_csv = paths.AF_MONOMER_METADATA_CSV_PATH - if not metadata_csv.exists(): + monomer_metadata_csv = paths.AF_MONOMER_METADATA_CSV_PATH + if not monomer_metadata_csv.exists(): metadata_df = pd.DataFrame( columns=[ "entry_id", @@ -37,10 +37,10 @@ def get_monomer_metadata_df() -> pd.DataFrame: "model_used", ] ) - metadata_csv.parent.mkdir(parents=True, exist_ok=True) - metadata_df.to_csv(metadata_csv, index=False) + monomer_metadata_csv.parent.mkdir(parents=True, exist_ok=True) + metadata_df.to_csv(monomer_metadata_csv, index=False) return metadata_df - return pd.read_csv(metadata_csv, dtype=str) + return pd.read_csv(monomer_metadata_csv, dtype=str) def get_multimer_metadata_df() -> pd.DataFrame: @@ -48,9 +48,9 @@ def get_multimer_metadata_df() -> pd.DataFrame: Returns all data from alphafold_multimer_metadata.csv in form of a dataframe. If no such csv exist, it returns a dataframe with the corresponding keys but no values and creates a csv with the expected column names. 
""" - metadata_csv = paths.AF_MULTIMER_METADATA_CSV_PATH + multimer_metadata_csv = paths.AF_MULTIMER_METADATA_CSV_PATH - if not metadata_csv.exists(): + if not multimer_metadata_csv.exists(): metadata_df = pd.DataFrame( columns=[ "entry_id", @@ -59,10 +59,10 @@ def get_multimer_metadata_df() -> pd.DataFrame: "model_used", ] ) - metadata_csv.parent.mkdir(parents=True, exist_ok=True) - metadata_df.to_csv(metadata_csv, index=False) + multimer_metadata_csv.parent.mkdir(parents=True, exist_ok=True) + metadata_df.to_csv(multimer_metadata_csv, index=False) return metadata_df - return pd.read_csv(metadata_csv, dtype=str) + return pd.read_csv(multimer_metadata_csv, dtype=str) def to_fasta(seq: str, header: str = "protein_sequence", width: int = 60) -> str: @@ -244,7 +244,7 @@ def handle_alphafold_files( files_urls: dict[str, Any], uniprot: str, seq: str, - metadata_df: pd.DataFrame, + monomer_metadata_df: pd.DataFrame, entry_id: str, persist_upload: bool = False, ) -> dict[str, pd.DataFrame | None]: @@ -258,10 +258,10 @@ def handle_alphafold_files( :param files_urls: Dictionary containing URLs for CIF, PAE, and pLDDT files :param uniprot: The UniProt ID of the protein :param seq: The protein sequence - :param metadata_df: DataFrame containing AlphaFold metadata + :param monomer_metadata_df: DataFrame containing AlphaFold monomer metadata :param entry_id: The entry_id (in the case of fetching from AF DB the same as uniprot id) (used for directory naming) :param persist_upload: If True, files are saved persistently; if False, only loaded into memory - :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, sequence data or None values for + :return: A dictionary containing DataFrames for monomer metadata, CIF, PAE, pLDDT, sequence data or None values for failed loads and messages such as warnings """ cif_df = pd.DataFrame() @@ -284,7 +284,7 @@ def handle_alphafold_files( entry_id=entry_id, metadata_csv=paths.AF_MONOMER_METADATA_CSV_PATH, 
existing_metadata_df=existing_metadata_df, - metadata_df=metadata_df, + metadata_df=monomer_metadata_df, messages=messages, ) @@ -342,12 +342,12 @@ def fetch_alphafold_protein_structure( """ Fetch AlphaFold protein structure data from the AlphaFold Database API. - Retrieves metadata and structure files (CIF, PAE, pLDDT) from the AlphaFold Database + Retrieves monomer metadata and structure files (CIF, PAE, pLDDT) from the AlphaFold Database for the given UniProt ID. Optionally persists the downloaded files to disk. :param uniprot_id: The UniProt ID of the protein :param persist_upload: If True, files are saved persistently; if False, only loaded into memory - :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data + :return: A dictionary containing DataFrames for monomer metadata, CIF, PAE, pLDDT, and sequence data :raises RuntimeError: If the API request fails or returns invalid data :raises ValueError: If no predictions are found for the given UniProt ID """ @@ -391,18 +391,18 @@ def fetch_alphafold_protein_structure( if isinstance(r.get(key), str) and r.get(key): files_urls[key] = r[key] - metadata_df = pd.DataFrame([data]) + monomer_metadata_df = pd.DataFrame([data]) alpha_dfs = handle_alphafold_files( files_urls=files_urls, uniprot=uniprot_id, seq=seq_tmp, - metadata_df=metadata_df, + monomer_metadata_df=monomer_metadata_df, entry_id=uniprot_id, persist_upload=persist_upload, ) df_dict = { - "metadata_df": metadata_df, + "structure_metadata_df": monomer_metadata_df, "cif_df": alpha_dfs["cif_df"], "pae_df": alpha_dfs["pae_df"], "plddt_df": alpha_dfs["plddt_df"], @@ -617,12 +617,12 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: Writes monomer structure data from disk of a specific entry ID into dataframes. 
:param entry_id: entry_id of the uploaded monomer structure - :return: A dictionary containing DataFrames for metadata, CIF, PAE, pLDDT, and sequence data + :return: A dictionary containing DataFrames for monomer metadata, CIF, PAE, pLDDT, and sequence data """ messages: list[dict[str, str | int]] = [] all_metadata_df = get_monomer_metadata_df() - metadata_df = check_and_get_metadata_df( + monomer_metadata_df = check_and_get_metadata_df( entry_id=entry_id, all_metadata_df=all_metadata_df, csv_file=paths.AF_MONOMER_METADATA_CSV_PATH, @@ -683,7 +683,7 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: raise RuntimeError(msg) from e df_dict = { - "metadata_df": metadata_df, + "structure_metadata_df": monomer_metadata_df, "cif_df": cif_df, "pae_df": pae_df, "plddt_df": plddt_df, @@ -699,12 +699,12 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: Writes multimer structure data from disk of a specific entry ID into dataframes. :param entry_id: entry_id of the uploaded monomer structure - :return: A dictionary containing DataFrames for metadata, CIF, confidence, full data, and sequence data + :return: A dictionary containing DataFrames for multimer metadata, CIF, confidence, full data, and sequence data """ messages: list[dict[str, str | int]] = [] all_metadata_df = get_multimer_metadata_df() - metadata_df = check_and_get_metadata_df( + multimer_metadata_df = check_and_get_metadata_df( entry_id=entry_id, all_metadata_df=all_metadata_df, csv_file=paths.AF_MULTIMER_METADATA_CSV_PATH, @@ -758,7 +758,7 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: logger.exception(msg) raise RuntimeError(msg) from e df_dict = { - "metadata_df": metadata_df, + "structure_metadata_df": multimer_metadata_df, "amino_acid_sequences_df": amino_acid_sequences_df, "cif_df": cif_df, "confidence_df": confidence_df, @@ -782,8 +782,8 @@ def upload_multimer_prediction( """ Process an AlphaFold multimer prediction and return its parsed data as 
DataFrames. - The function assembles metadata for the prediction, optionally persists both - metadata and input files to the configured multimer storage directory, and + The function assembles multimer metadata for the prediction, optionally persists both + multimer metadata and input files to the configured multimer storage directory, and parses the provided files into DataFrames: - FASTA sequences via fasta_import, key "fasta_df". - mmCIF structure via read_alphafold_mmcif. @@ -797,7 +797,7 @@ def upload_multimer_prediction( removed in a finally block. :param entry_id: Unique identifier for the prediction entry. Used for - directory naming and metadata. + directory naming and mulitmer metadata. :param uniprot_ids: UniProt identifiers associated with the multimer prediction. :param model_used: Name or identifier of the AlphaFold model used to @@ -809,11 +809,11 @@ def upload_multimer_prediction( :param full_data_file: Path to the full data JSON file. If the JSON content is a dict it is normalized into a single-row DataFrame. Otherwise, an empty DataFrame is returned and a warning is recorded. - :param persist_upload: If True, persist metadata and copy input files into + :param persist_upload: If True, persist multimer metadata and copy input files into the configured multimer directory. If False, use a temporary directory - and do not persist metadata. + and do not persist multimer metadata. :return: A dictionary containing: - - "metadata_df": DataFrame with entry metadata. + - "structure_metadata_df": DataFrame with entry multimer metadata. - "cif_df": DataFrame parsed from the mmCIF file. - "confidence_df": DataFrame loaded from the confidence JSON. - "full_data_df": Normalized DataFrame from the full data JSON or empty. 
@@ -843,14 +843,14 @@ def upload_multimer_prediction( } try: - metadata_df = pd.DataFrame([data]) + multimer_metadata_df = pd.DataFrame([data]) if persist_upload: exsisting_metadata_df = get_multimer_metadata_df() extend_metadata_csv( entry_id=entry_id, metadata_csv=paths.AF_MULTIMER_METADATA_CSV_PATH, existing_metadata_df=exsisting_metadata_df, - metadata_df=metadata_df, + metadata_df=multimer_metadata_df, messages=messages, ) for file_name in [ @@ -886,7 +886,7 @@ def upload_multimer_prediction( cif_df = read_alphafold_mmcif(cif_file) df_dict = { - "metadata_df": metadata_df, + "structure_metadata_df": multimer_metadata_df, "cif_df": cif_df, "confidence_df": confidence_df, "full_data_df": full_data_df, diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index aa565a62d..7865a751e 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -430,7 +430,7 @@ class AlphaFoldPredictionLoad(ImportingStep): method_description = "Loads the predicted structure of the monomer with the given protein ID out of the AlphaFold DB." 
output_keys = [ - DataKey.METADATA_DF, + DataKey.STRUCTURE_METADATA_DF, DataKey.CIF_DF, DataKey.PAE_DF, DataKey.PLDDT_DF, @@ -494,7 +494,7 @@ class ImportMonomerStructurePredictionFromDisk(ImportingStep): method_description = "Load an already uploaded monomer structure prediction from disk into current run" output_keys = [ - DataKey.METADATA_DF, + DataKey.STRUCTURE_METADATA_DF, DataKey.CIF_DF, DataKey.PAE_DF, DataKey.PLDDT_DF, @@ -527,7 +527,7 @@ class UploadMultimerPredictions(ImportingStep): method_description = "Upload a multimer protein prediction" output_keys = [ - DataKey.METADATA_DF, + DataKey.STRUCTURE_METADATA_DF, DataKey.CIF_DF, DataKey.CONFIDENCE_DF, DataKey.FULL_DATA_DF, @@ -593,7 +593,7 @@ class ImportMultimerStructurePredictionFromDisk(ImportingStep): method_description = "Load an already uploaded multimer structure prediction from disk into current run" output_keys = [ - DataKey.METADATA_DF, + DataKey.STRUCTURE_METADATA_DF, DataKey.CIF_DF, DataKey.CONFIDENCE_DF, DataKey.FULL_DATA_DF, From 1f3f16267b9083bed28f180c14bf5c479a2fca1c Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 28 Mar 2026 09:30:48 +0100 Subject: [PATCH 154/240] Change metadata_df naming in tests --- .../test_crosslinking_validation.py | 16 +++++------ .../test_alphafold_protein_structure_load.py | 28 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 5673de3fe..c8100c6af 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -63,7 +63,7 @@ def test_validate_with_angstrom_deviation(distance, expected): crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Länge 5 Å ± 1 Å result = validate_with_angstrom_deviation( crosslinking_df, - metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), + 
structure_metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), crosslinker_information=crosslinker_information, amino_acid_sequences_df=amino_acid_sequences_df, cif_df=cif_df, @@ -315,7 +315,7 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -376,7 +376,7 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -436,7 +436,7 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -540,7 +540,7 @@ def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_line figures = diagrams_of_crosslinking_validation_data( crosslinking_df=sample_crosslinking_df, - metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), + structure_metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), crosslinker_information=sample_crosslinker_info, cif_df=pd.DataFrame(), amino_acid_sequences_df=pd.DataFrame(), @@ -608,7 +608,7 @@ def test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_l figures = diagrams_of_crosslinking_validation_data( 
crosslinking_df=sample_crosslinking_df_with_no_std, - metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), + structure_metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), crosslinker_information=sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, cif_df=pd.DataFrame(), amino_acid_sequences_df=pd.DataFrame(), @@ -670,7 +670,7 @@ def test_diagrams_calls_with_correct_parameters( figures = diagrams_of_crosslinking_validation_data( crosslinking_df=sample_crosslinking_df_with_one_crosslinker, - metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), + structure_metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), crosslinker_information=sample_crosslinker_info_with_one_crosslinker, cif_df=pd.DataFrame(), amino_acid_sequences_df=pd.DataFrame(), @@ -781,7 +781,7 @@ def test_validate_multimer_with_invalid_crosslinks(): out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - metadata_df=pd.DataFrame({"uniprot_ids": ["['P1', 'P2']"]}), + structure_metadata_df=pd.DataFrame({"uniprot_ids": ["['P1', 'P2']"]}), crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 0c5ac3ecf..9c775c903 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -127,7 +127,7 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) assert out.keys() == { - "metadata_df", + "structure_metadata_df", "cif_df", "pae_df", "plddt_df", @@ -145,12 +145,12 @@ def test_fetch_alphafold_monomer_metadata(tmp_path, monkeypatch): ) out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) - assert isinstance(out["metadata_df"], 
pd.DataFrame) - assert not out["metadata_df"].empty - assert out["metadata_df"].iloc[0]["uniprot_accession"] == "Q8WP00" - assert out["metadata_df"].iloc[0]["model_created_date"] == "2025-08-01T00:00:00Z" - assert out["metadata_df"].iloc[0]["gene"] == "PRM1" - assert out["metadata_df"].iloc[0]["model_used"] == "AlphaFold Monomer v2.0 pipeline" + assert isinstance(out["structure_metadata_df"], pd.DataFrame) + assert not out["structure_metadata_df"].empty + assert out["structure_metadata_df"].iloc[0]["uniprot_accession"] == "Q8WP00" + assert out["structure_metadata_df"].iloc[0]["model_created_date"] == "2025-08-01T00:00:00Z" + assert out["structure_metadata_df"].iloc[0]["gene"] == "PRM1" + assert out["structure_metadata_df"].iloc[0]["model_used"] == "AlphaFold Monomer v2.0 pipeline" def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): @@ -296,9 +296,9 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): out = get_monomer_structure_dfs("Q8WP00") - assert isinstance(out["metadata_df"], pd.DataFrame) - assert not out["metadata_df"].empty - assert out["metadata_df"].iloc[0]["entry_id"] == "Q8WP00" + assert isinstance(out["structure_metadata_df"], pd.DataFrame) + assert not out["structure_metadata_df"].empty + assert out["structure_metadata_df"].iloc[0]["entry_id"] == "Q8WP00" assert isinstance(out["cif_df"], pd.DataFrame) assert not out["cif_df"].empty @@ -464,9 +464,9 @@ def _copy(src, dest_dir): persist_upload=True, ) - assert isinstance(out["metadata_df"], pd.DataFrame) + assert isinstance(out["structure_metadata_df"], pd.DataFrame) # check metadata contents - mdf = out["metadata_df"] + mdf = out["structure_metadata_df"] assert mdf.iloc[0]["entry_id"] == "M1" assert mdf.iloc[0]["uniprot_ids"] == ["X"] assert mdf.iloc[0]["model_used"] == "m" @@ -594,7 +594,7 @@ def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): ) # verify dataframes are returned - assert isinstance(out["metadata_df"], pd.DataFrame) + assert 
isinstance(out["structure_metadata_df"], pd.DataFrame) assert isinstance(out["cif_df"], pd.DataFrame) # directory should still exist (created for the entry) upload_dir = tmp_path / "M2" @@ -826,7 +826,7 @@ def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): full_data.write_text(json.dumps({"pae": [[0.1, 0.2], [0.3, 0.4]]})) out = get_multimer_structure_dfs("M1") - assert isinstance(out["metadata_df"], pd.DataFrame) + assert isinstance(out["structure_metadata_df"], pd.DataFrame) assert isinstance(out["cif_df"], pd.DataFrame) assert isinstance(out["amino_acid_sequences_df"], pd.DataFrame) assert isinstance(out["confidence_df"], pd.DataFrame) From 98ef80920d7576097dc8f341e4a33376c1d10150 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 28 Mar 2026 09:41:43 +0100 Subject: [PATCH 155/240] fix: format with black --- .../importing/test_alphafold_protein_structure_load.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 9c775c903..b0bf84f3e 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -148,9 +148,15 @@ def test_fetch_alphafold_monomer_metadata(tmp_path, monkeypatch): assert isinstance(out["structure_metadata_df"], pd.DataFrame) assert not out["structure_metadata_df"].empty assert out["structure_metadata_df"].iloc[0]["uniprot_accession"] == "Q8WP00" - assert out["structure_metadata_df"].iloc[0]["model_created_date"] == "2025-08-01T00:00:00Z" + assert ( + out["structure_metadata_df"].iloc[0]["model_created_date"] + == "2025-08-01T00:00:00Z" + ) assert out["structure_metadata_df"].iloc[0]["gene"] == "PRM1" - assert out["structure_metadata_df"].iloc[0]["model_used"] == "AlphaFold Monomer v2.0 pipeline" + assert ( + 
out["structure_metadata_df"].iloc[0]["model_used"] + == "AlphaFold Monomer v2.0 pipeline" + ) def test_fetch_alphafold_files_exist(tmp_path, monkeypatch): From d14e2c0cbd218a29c44c583585f9d5492f0f42d2 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sun, 29 Mar 2026 03:41:36 +0200 Subject: [PATCH 156/240] fix test --- .../protzilla/data_analysis/test_crosslinking_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 241d590cb..9489a01f7 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -693,7 +693,7 @@ def test_diagrams_calls_with_correct_parameters( # Check histogram call parameters for crosslinker full-range first_hist_call = mock_hist.call_args_list[0].kwargs - assert first_hist_call["name_a"] == "Valid Crosslinks (intra: 2, inter: 0)" + assert first_hist_call["name_a"] == "Valid Crosslinks (intra: 1, inter: 1)" assert first_hist_call["name_b"] == "Invalid Crosslinks (intra: 0, inter: 2)" assert ( first_hist_call["heading"] From 4d1c92a0bbf6c4c0fc49996f55d4973d075f21be Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sun, 29 Mar 2026 03:49:53 +0200 Subject: [PATCH 157/240] fix test 2 --- .../protzilla/data_analysis/test_crosslinking_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 9489a01f7..fc0c04ad7 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -694,7 +694,7 @@ def test_diagrams_calls_with_correct_parameters( # Check histogram call parameters for crosslinker full-range first_hist_call = 
mock_hist.call_args_list[0].kwargs assert first_hist_call["name_a"] == "Valid Crosslinks (intra: 1, inter: 1)" - assert first_hist_call["name_b"] == "Invalid Crosslinks (intra: 0, inter: 2)" + assert first_hist_call["name_b"] == "Invalid Crosslinks (intra: 1, inter: 1)" assert ( first_hist_call["heading"] == "Predicted distances for P12345 with crosslinker CL1" From e494242a49ee6215f95c1105103f477111b44324 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 30 Mar 2026 11:47:28 +0200 Subject: [PATCH 158/240] refactor: add broken refactoring of download outputs --- backend/main/urls.py | 2 +- backend/main/views.py | 41 ++-- .../protzilla/importing/query_generation.py | 10 +- backend/protzilla/methods/importing.py | 2 +- backend/protzilla/steps.py | 29 +-- frontend/package-lock.json | 198 ++++++++++++++++++ .../components/app/run-screen/run-screen.tsx | 51 +++-- frontend/src/utils/protzilla-types.ts | 5 + 8 files changed, 260 insertions(+), 78 deletions(-) diff --git a/backend/main/urls.py b/backend/main/urls.py index f655e7a93..b368ef6ba 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -49,7 +49,7 @@ path("api/get_step_form/", views.get_step_form, name="get_step_form"), path("api/get_step_plots/", views.get_step_plots, name="get_step_plots"), path( - "api/get_step_downloads/", views.get_step_downloads, name="get_step_downloads" + "api/get_downloads_from_step/", views.get_downloads_from_step, name="get_downloads_from_step" ), path( "api/get_current_step_output_labels/", diff --git a/backend/main/views.py b/backend/main/views.py index 665e68779..2f3fd18dd 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -682,30 +682,31 @@ def get_step_plots(request): ) -def get_step_downloads(request): - if request.method == "POST": - data = json.loads(request.body) - run_name = data.get("run_name") - - run = Run(run_name) - if run.current_step is not None: - downloads = run.current_downloads - else: - downloads = {} - - return JsonResponse( - { - 
"success": True, - "message": "Got the available download(s) for the step", - "data": downloads, - }, - safe=False, - ) - else: +def get_downloads_from_step(request: HttpRequest): + """ + API call. Returns a base64-encoded PNG of a step output to the front-end + """ + if request.method != "POST": return JsonResponse( {"success": False, "message": "Invalid request method"}, status=405 ) + data = json.loads(request.body) + run_name = data.get("run_name") + step_id = data.get("step_id") + output_key = data.get("output_key") + + run = Run(run_name) + step = run.steps.get_step_by_id(step_id) + #output = step.output.get(output_key) + + if run.current_step is not None: + downloads = step.output["downloads"] + else: + downloads = {} + + return JsonResponse({"success": True, "message": "Got the available download(s) for the step", "data": downloads}) + # TODO: Move somewhere else def _step_output_as_serialised_table( diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index 6941612c4..fb5e01749 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -1,9 +1,9 @@ import json import logging - -import pandas as pd import requests +from backend.protzilla.steps import OutputItem, OutputType + def generate_alphafold_query_json( protein_ids: str, number_copies: str, model_seed: int, name: str @@ -100,7 +100,7 @@ def generate_alphafold_query_json( level=logging.INFO, msg=f"Successfully generated a json file for AlphaFold." 
) ) - return dict( - messages=messages, - downloads={name: query_as_string}, + return dict( #TODO: Messages + messages = messages, + downloads=OutputItem(output_type=OutputType.DOWNLOAD, value = {name: query_as_string}) ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index f0bed1dd3..997412c99 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -671,4 +671,4 @@ def create_form(self): ], ) - download_method = staticmethod(generate_alphafold_query_json) + calc_method = staticmethod(generate_alphafold_query_json) diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index a8c75c851..61d51f995 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -160,11 +160,6 @@ def calculate(self, steps: StepManager) -> bool: self.handle_plot_outputs(plot_output) self.artifact_versions["plots"]["generated"] += 1 - if self.download_method: - download_output = self.download_method(**self.download_input) - self.handle_download_outputs(download_output) - self.artifact_versions["downloads"]["generated"] += 1 - self.calculation_status = "complete" # delete tempfiles @@ -339,24 +334,6 @@ def handle_plot_outputs(self, outputs: dict | list) -> None: self.plots = Plots(plots) - def handle_download_outputs(self, outputs: dict) -> None: - """ - Handles the dictionary from the download method and creates a Download object from it. - Responsible for validating that the output is a dictionary, handling any messages contained in the output - and setting the downloads attribute of the class. 
- :param outputs: A dictionary received after the download method - :return: None - """ - - if not isinstance(outputs, dict): - raise TypeError("Output of download method is not a dictionary.") - - downloads = outputs.pop("downloads", {}) - self.output.output.update(outputs) - self.handle_messages(outputs) - - self.downloads = Downloads(downloads) - def handle_messages(self, outputs: dict) -> None: """ Handles the messages from the calculation method and creates a Messages object from it. @@ -369,7 +346,6 @@ def handle_messages(self, outputs: dict) -> None: calc_method = None plot_method = None # if the plot method uses the output of the calculation method, it should be prefixed with "output_" - download_method = None def _get_input_parameters( self, function: Callable[..., Any], relevant_inputs: dict | None = None @@ -412,10 +388,6 @@ def plot_input(self) -> dict: function=self.plot_method, relevant_inputs=plot_input ) - @property - def download_input(self) -> dict: - return self._get_input_parameters(self.download_method) - def validate_outputs(self, soft_check: bool = False) -> bool: """ Validates the outputs of the step. 
Uses the output_keys attribute to check if all required keys are present in @@ -507,6 +479,7 @@ class OutputType(StrEnum): FLOAT = "float" INT = "int" PNG_BASE64 = "png_base64" + DOWNLOAD = "download" # for every data type that is not yaml serializable JOBLIB_ARTIFACT = "joblib_artifact" diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 0ea3e32fd..807a70e82 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -8,6 +8,7 @@ "name": "frontend", "version": "0.0.0", "dependencies": { + "@dagrejs/dagre": "^2.0.4", "@emotion/react": "^11.14.0", "@emotion/styled": "^11.14.0", "@mui/material": "^6.4.12", @@ -15,6 +16,7 @@ "@mui/x-data-grid": "^7.29.6", "@storybook/addon-actions": "^8.6.14", "@types/node": "^24.3.1", + "@xyflow/react": "^12.10.0", "axios": "^1.9.0", "bootstrap": "^5.3.6", "corepack": "^0.34.0", @@ -628,6 +630,21 @@ "node": ">=18" } }, + "node_modules/@dagrejs/dagre": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@dagrejs/dagre/-/dagre-2.0.4.tgz", + "integrity": "sha512-J6vCWTNpicHF4zFlZG1cS5DkGzMr9941gddYkakjrg3ZNev4bbqEgLHFTWiFrcJm7UCRu7olO3K6IRDd9gSGhA==", + "license": "MIT", + "dependencies": { + "@dagrejs/graphlib": "3.0.4" + } + }, + "node_modules/@dagrejs/graphlib": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@dagrejs/graphlib/-/graphlib-3.0.4.tgz", + "integrity": "sha512-HxZ7fCvAwTLCWCO0WjDkzAFQze8LdC6iOpKbetDKHIuDfIgMlIzYzqZ4nxwLlclQX+3ZVeZ1K2OuaOE2WWcyOg==", + "license": "MIT" + }, "node_modules/@emnapi/core": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.5.0.tgz", @@ -3602,6 +3619,55 @@ "@types/deep-eql": "*" } }, + "node_modules/@types/d3-color": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", + "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", + "license": "MIT" + }, + "node_modules/@types/d3-drag": { + 
"version": "3.0.7", + "resolved": "https://registry.npmjs.org/@types/d3-drag/-/d3-drag-3.0.7.tgz", + "integrity": "sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==", + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-interpolate": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", + "integrity": "sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", + "license": "MIT", + "dependencies": { + "@types/d3-color": "*" + } + }, + "node_modules/@types/d3-selection": { + "version": "3.0.11", + "resolved": "https://registry.npmjs.org/@types/d3-selection/-/d3-selection-3.0.11.tgz", + "integrity": "sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==", + "license": "MIT" + }, + "node_modules/@types/d3-transition": { + "version": "3.0.9", + "resolved": "https://registry.npmjs.org/@types/d3-transition/-/d3-transition-3.0.9.tgz", + "integrity": "sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==", + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-zoom": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/@types/d3-zoom/-/d3-zoom-3.0.8.tgz", + "integrity": "sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw==", + "license": "MIT", + "dependencies": { + "@types/d3-interpolate": "*", + "@types/d3-selection": "*" + } + }, "node_modules/@types/deep-eql": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/@types/deep-eql/-/deep-eql-4.0.2.tgz", @@ -4513,6 +4579,38 @@ "url": "https://opencollective.com/vitest" } }, + "node_modules/@xyflow/react": { + "version": "12.10.1", + "resolved": "https://registry.npmjs.org/@xyflow/react/-/react-12.10.1.tgz", + "integrity": 
"sha512-5eSWtIK/+rkldOuFbOOz44CRgQRjtS9v5nufk77DV+XBnfCGL9HAQ8PG00o2ZYKqkEU/Ak6wrKC95Tu+2zuK3Q==", + "license": "MIT", + "dependencies": { + "@xyflow/system": "0.0.75", + "classcat": "^5.0.3", + "zustand": "^4.4.0" + }, + "peerDependencies": { + "react": ">=17", + "react-dom": ">=17" + } + }, + "node_modules/@xyflow/system": { + "version": "0.0.75", + "resolved": "https://registry.npmjs.org/@xyflow/system/-/system-0.0.75.tgz", + "integrity": "sha512-iXs+AGFLi8w/VlAoc/iSxk+CxfT6o64Uw/k0CKASOPqjqz6E0rb5jFZgJtXGZCpfQI6OQpu5EnumP5fGxQheaQ==", + "license": "MIT", + "dependencies": { + "@types/d3-drag": "^3.0.7", + "@types/d3-interpolate": "^3.0.4", + "@types/d3-selection": "^3.0.10", + "@types/d3-transition": "^3.0.8", + "@types/d3-zoom": "^3.0.8", + "d3-drag": "^3.0.0", + "d3-interpolate": "^3.0.1", + "d3-selection": "^3.0.0", + "d3-zoom": "^3.0.0" + } + }, "node_modules/abs-svg-path": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/abs-svg-path/-/abs-svg-path-0.1.1.tgz", @@ -5178,6 +5276,12 @@ "resolved": "https://registry.npmjs.org/clamp/-/clamp-1.0.1.tgz", "integrity": "sha512-kgMuFyE78OC6Dyu3Dy7vcx4uy97EIbVxJB/B0eJ3bUNAkwdNcxYzgKltnyADiYwsR7SEqkkUPsEUT//OVS6XMA==" }, + "node_modules/classcat": { + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/classcat/-/classcat-5.0.5.tgz", + "integrity": "sha512-JhZUT7JFcQy/EzW605k/ktHtncoo9vnyW/2GspNYwFlN1C/WmjuV/xtS04e9SOkL2sTdw0VAZ2UGCcQ9lR6p6w==", + "license": "MIT" + }, "node_modules/cli-cursor": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-5.0.0.tgz", @@ -5580,6 +5684,28 @@ "resolved": "https://registry.npmjs.org/d3-dispatch/-/d3-dispatch-1.0.6.tgz", "integrity": "sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA==" }, + "node_modules/d3-drag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-drag/-/d3-drag-3.0.0.tgz", + "integrity": 
"sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg==", + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-selection": "3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-ease": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", + "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=12" + } + }, "node_modules/d3-force": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/d3-force/-/d3-force-1.2.1.tgz", @@ -5653,6 +5779,15 @@ "resolved": "https://registry.npmjs.org/d3-quadtree/-/d3-quadtree-1.0.7.tgz", "integrity": "sha512-RKPAeXnkC59IDGD0Wu5mANy0Q2V28L+fNe65pOCXVdVuTJS3WPKaJlFHer32Rbh9gIo9qMuJXio8ra4+YmIymA==" }, + "node_modules/d3-selection": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", + "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, "node_modules/d3-shape": { "version": "1.3.7", "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-1.3.7.tgz", @@ -5679,6 +5814,41 @@ "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-1.0.10.tgz", "integrity": "sha512-B1JDm0XDaQC+uvo4DT79H0XmBskgS3l6Ve+1SBCfxgmtIb1AVrPIoqd+nPSv+loMX8szQ0sVUhGngL7D5QPiXw==" }, + "node_modules/d3-transition": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-transition/-/d3-transition-3.0.1.tgz", + "integrity": "sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3", + "d3-dispatch": "1 - 3", + "d3-ease": "1 - 3", + "d3-interpolate": "1 - 3", + "d3-timer": "1 - 3" + }, + "engines": { + "node": ">=12" + }, + "peerDependencies": { + 
"d3-selection": "2 - 3" + } + }, + "node_modules/d3-zoom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-zoom/-/d3-zoom-3.0.0.tgz", + "integrity": "sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw==", + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-drag": "2 - 3", + "d3-interpolate": "1 - 3", + "d3-selection": "2 - 3", + "d3-transition": "2 - 3" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/data-urls": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz", @@ -12887,6 +13057,34 @@ "funding": { "url": "https://github.com/sponsors/sindresorhus" } + }, + "node_modules/zustand": { + "version": "4.5.7", + "resolved": "https://registry.npmjs.org/zustand/-/zustand-4.5.7.tgz", + "integrity": "sha512-CHOUy7mu3lbD6o6LJLfllpjkzhHXSBlX8B9+qPddUsIfeF5S/UZ5q0kmCsnRqT1UHFQZchNFDDzMbQsuesHWlw==", + "license": "MIT", + "dependencies": { + "use-sync-external-store": "^1.2.2" + }, + "engines": { + "node": ">=12.7.0" + }, + "peerDependencies": { + "@types/react": ">=16.8", + "immer": ">=9.0.6", + "react": ">=16.8" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "immer": { + "optional": true + }, + "react": { + "optional": true + } + } } } } diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index 4babae946..f9e4ce28d 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -13,6 +13,7 @@ import { useToggleableState } from "@protzilla/hooks"; import { spacing } from "@protzilla/theme"; import { callApiWithParameters, + Download, emptyRunData, footerMessages, Image, @@ -91,7 +92,8 @@ export const RunScreen: React.FC = () => { const [plots, setPlots] = useState(); const [selectedPlot, setSelectedPlot] = useState
({ data: [], layout: {} }); const [availableTables, setAvailableTables] = useState(); - const [downloads, setDownloads] = useState | undefined>(); + const [availableDownloads, setAvailableDownloads] = useState([]); + const [downloads, setDownloads] = useState([]); // Static PNGs sent as base64 const [images, setImages] = useState([]); @@ -135,12 +137,12 @@ export const RunScreen: React.FC = () => { step_id: stepID, }).then(() => { setAvailableTables(undefined); + setAvailableDownloads([]); setPlots(undefined); setAvailableImages([]); void getRunData(); void getStepPlots(); - void getStepDownloads(); void getCurrentStepOutputLabels(); }); } else { @@ -176,14 +178,15 @@ export const RunScreen: React.FC = () => { }, [runName]); const getStepDownloads = useCallback(async () => { - const response = await callApiWithParameters("get_step_downloads/", { + const response = await callApiWithParameters("get_downloads_from_step/", { run_name: runName, + step_id: runData.current_step_id, }); if (response) { const downloads = response.data; setDownloads(downloads); } - }, [runName]); + }, [runName, runData]); const getCurrentStepOutputLabels = useCallback(async () => { const response = await callApiWithParameters("get_current_step_output_labels/", { @@ -192,33 +195,32 @@ export const RunScreen: React.FC = () => { if (response) { const tableOutputs = []; const imageOutputs = []; + const downloadOutputs = []; for (const output of response.outputs) { if (output.output_type === "dataframe" || output.output_type === "list") tableOutputs.push(output); else if (output.output_type === "png_base64") imageOutputs.push(output); + else if (output.output_type === "download") downloadOutputs.push(output); } setAvailableTables(tableOutputs); setAvailableImages(imageOutputs); + setAvailableDownloads(downloadOutputs); } }, [runName]); useEffect(() => { const fetchData = async () => { - await Promise.all([ - getRunData(), - getStepPlots(), - getStepDownloads(), - getCurrentStepOutputLabels(), - 
]); + await Promise.all([getRunData(), getStepPlots(), getCurrentStepOutputLabels()]); }; void fetchData(); - }, [getRunData, getStepPlots, getStepDownloads, getCurrentStepOutputLabels]); + }, [getRunData, getStepPlots, getCurrentStepOutputLabels]); const onFormSubmit = () => { setAvailableTables(undefined); setAvailableImages([]); setPlots(undefined); + setAvailableDownloads([]); void getRunData(); void getStepPlots(); void getStepDownloads(); @@ -355,22 +357,25 @@ export const RunScreen: React.FC = () => { const downloadComponent = ( - {downloads && Object.keys(downloads).length > 0 ? ( - Object.entries(downloads).map(([filename, content]) => ( - { - downloadJson(filename, content); - }} - /> - )) + {downloads.length > 0 ? ( + downloads.flatMap((download) => + Object.entries(download.data || {}).map(([filename, content]) => ( + { + downloadJson(filename, JSON.stringify(content)); + }} + /> + )), + ) ) : ( )} ); + const nodeEditorComponent = ( { plots && plots.length > 0 && { name: "Plots", value: plotComponent }, availableTables && availableTables.length > 0 && { name: "Tables", value: tableComponent }, availableImages.length > 0 && { name: "Images", value: imageComponent }, - { name: "Downloads", value: downloadComponent }, + availableDownloads.length > 0 && { name: "Downloads", value: downloadComponent }, ].filter(Boolean) as { name: string; value: React.ReactNode }[]; return ( diff --git a/frontend/src/utils/protzilla-types.ts b/frontend/src/utils/protzilla-types.ts index bb4840dd7..0aa592d1d 100644 --- a/frontend/src/utils/protzilla-types.ts +++ b/frontend/src/utils/protzilla-types.ts @@ -23,6 +23,11 @@ export interface Image { data: string; } +export interface Download { + title: string; + data: string; +} + // We assume these are the only data types we receive for tables export type TableRecord = Record; From a0c864426b1ece422c8052d8701918577294aa83 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 30 Mar 2026 14:58:17 +0200 Subject: [PATCH 159/240] 
refactor: add working refactoring of download outputs --- backend/main/urls.py | 4 +- backend/main/views.py | 24 ++++++++---- .../protzilla/importing/query_generation.py | 7 ++-- backend/protzilla/steps.py | 2 +- .../components/app/run-screen/run-screen.tsx | 38 ++++++++++++------- 5 files changed, 49 insertions(+), 26 deletions(-) diff --git a/backend/main/urls.py b/backend/main/urls.py index b368ef6ba..5f1ae3264 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -49,7 +49,9 @@ path("api/get_step_form/", views.get_step_form, name="get_step_form"), path("api/get_step_plots/", views.get_step_plots, name="get_step_plots"), path( - "api/get_downloads_from_step/", views.get_downloads_from_step, name="get_downloads_from_step" + "api/get_downloads_from_step/", + views.get_downloads_from_step, + name="get_downloads_from_step", ), path( "api/get_current_step_output_labels/", diff --git a/backend/main/views.py b/backend/main/views.py index 2f3fd18dd..193253c40 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -698,14 +698,24 @@ def get_downloads_from_step(request: HttpRequest): run = Run(run_name) step = run.steps.get_step_by_id(step_id) - #output = step.output.get(output_key) - - if run.current_step is not None: - downloads = step.output["downloads"] - else: + downloads = step.output.get(output_key) + if downloads is None: downloads = {} - - return JsonResponse({"success": True, "message": "Got the available download(s) for the step", "data": downloads}) + if not isinstance(downloads, dict): + return JsonResponse( + { + "success": False, + "message": f"Requested output must be dict object, is {str(type(downloads))}", + }, + status=405, + ) + return JsonResponse( + { + "success": True, + "message": "Got the available download(s) for the step", + "data": downloads, + } + ) # TODO: Move somewhere else diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index fb5e01749..937788a16 100644 --- 
a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -94,13 +94,12 @@ def generate_alphafold_query_json( } } ) - query_as_string = f"[{json.dumps(query)}]" messages.append( dict( level=logging.INFO, msg=f"Successfully generated a json file for AlphaFold." ) ) - return dict( #TODO: Messages - messages = messages, - downloads=OutputItem(output_type=OutputType.DOWNLOAD, value = {name: query_as_string}) + return dict( + messages=messages, + downloads=OutputItem(output_type=OutputType.DOWNLOAD, value={name: [query]}), ) diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 61d51f995..352397bfc 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -642,4 +642,4 @@ def __repr__(self): @property def empty(self) -> bool: - return len(self.downloads) == 0 \ No newline at end of file + return len(self.downloads) == 0 diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index f9e4ce28d..fa8ebde01 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -177,17 +177,6 @@ export const RunScreen: React.FC = () => { } }, [runName]); - const getStepDownloads = useCallback(async () => { - const response = await callApiWithParameters("get_downloads_from_step/", { - run_name: runName, - step_id: runData.current_step_id, - }); - if (response) { - const downloads = response.data; - setDownloads(downloads); - } - }, [runName, runData]); - const getCurrentStepOutputLabels = useCallback(async () => { const response = await callApiWithParameters("get_current_step_output_labels/", { run_name: runName, @@ -223,7 +212,6 @@ export const RunScreen: React.FC = () => { setAvailableDownloads([]); void getRunData(); void getStepPlots(); - void getStepDownloads(); void getCurrentStepOutputLabels(); }; @@ -269,6 +257,30 @@ export const RunScreen: React.FC = () => { 
} }, [availableImages, runName, runData.current_step_id]); + useEffect(() => { + if (!runData.current_step_id || availableDownloads.length === 0) return; + + const fetchDownloads = async () => { + const responses = await Promise.all( + availableDownloads.map(async (output) => { + const response = await callApiWithParameters("get_downloads_from_step/", { + run_name: runName, + step_id: runData.current_step_id, + output_key: output.label, + }); + + return { + title: output.label, + data: response?.data, + }; + }), + ); + setDownloads(responses); + }; + + void fetchDownloads(); + }, [runName, runData.current_step_id, availableDownloads]); + const plotComponent = ( {plots && plots.length > 0 ? ( @@ -365,7 +377,7 @@ export const RunScreen: React.FC = () => { text={filename} style={{ width: "fit-content" }} onClick={() => { - downloadJson(filename, JSON.stringify(content)); + downloadJson(filename, JSON.stringify(content, null, 2)); }} /> )), From 84ff4f813ccfd132b2beb4174348e84ffc456d82 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 30 Mar 2026 17:03:38 +0200 Subject: [PATCH 160/240] refactor: add useCertainStepOutputs hook --- .../components/app/run-screen/run-screen.tsx | 140 +++++++++++------- frontend/src/utils/protzilla-types.ts | 3 +- 2 files changed, 84 insertions(+), 59 deletions(-) diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index fa8ebde01..d312631e1 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -82,6 +82,59 @@ const FooterText = styled.div` width: 100%; `; +interface UseStepOutputsParams { + available_outputs: TOutput[]; + endpoint: string; + runName: string; + stepId?: string; + transform: (output: TOutput, response: TResponse) => TResult; +} + +export function useCertainStepOutputs< + TOutput extends StepOutputInfo, + TResponse = any, + TResult = any, +>({ + available_outputs, + endpoint, 
+ runName, + stepId, + transform, +}: UseStepOutputsParams): TResult[] { + const [data, setData] = useState([]); + + useEffect(() => { + if (!stepId || available_outputs.length === 0) { + setData([]); + return; + } + + const fetchData = async () => { + try { + const responses = await Promise.all( + available_outputs.map(async (output) => { + const response: TResponse = await callApiWithParameters(endpoint, { + run_name: runName, + step_id: stepId, + output_key: output.label, + }); + + return transform(output, response); + }), + ); + + setData(responses); + } catch (error) { + console.error("Failed to fetch outputs:", error); + } + }; + + void fetchData(); + }, [available_outputs, endpoint, runName, stepId, transform]); + + return data; +} + export const RunScreen: React.FC = () => { const navigate = useNavigate(); const location = useLocation(); @@ -93,11 +146,38 @@ export const RunScreen: React.FC = () => { const [selectedPlot, setSelectedPlot] = useState
({ data: [], layout: {} }); const [availableTables, setAvailableTables] = useState(); const [availableDownloads, setAvailableDownloads] = useState([]); - const [downloads, setDownloads] = useState([]); + const downloads = useCertainStepOutputs< + StepOutputInfo, + Download, + { title: string; data: Record } + >({ + available_outputs: availableDownloads, + endpoint: "get_downloads_from_step/", + runName: runName, + stepId: runData.current_step_id, + transform: (output, response) => ({ + title: output.label, + data: response.data, + }), + }); // Static PNGs sent as base64 - const [images, setImages] = useState([]); const [availableImages, setAvailableImages] = useState([]); + const images = useCertainStepOutputs< + StepOutputInfo, + Image, + { title: string; alt: string; data: string } + >({ + available_outputs: availableImages, + endpoint: "get_png_from_step/", + runName: runName, + stepId: runData.current_step_id, + transform: (output, response) => ({ + title: output.label, + alt: output.label, + data: "data:image/png;base64," + response.data, + }), + }); const [isDownloadModalOpen, openDownloadModal, closeDownloadModal] = useToggleableState(false); @@ -227,60 +307,6 @@ export const RunScreen: React.FC = () => { plotPlaceholderMessage = "No plot available for this step."; } - useEffect(() => { - const fetchImages = async () => { - const imagePromises = availableImages.map(async (output_info) => { - const response = await callApiWithParameters("get_png_from_step/", { - run_name: runName, - step_id: runData.current_step_id, - output_key: output_info.label, - }); - return { - title: output_info.label, - alt: output_info.label, - data: "data:image/png;base64,".concat(response.data), - }; - }); - - try { - const resolvedImages = await Promise.all(imagePromises); - setImages(resolvedImages); - } catch (error) { - console.error("Failed to fetch image data:", error); - } - }; - - if (availableImages.length > 0) { - void fetchImages(); - } else { - setImages([]); - } - }, 
[availableImages, runName, runData.current_step_id]); - - useEffect(() => { - if (!runData.current_step_id || availableDownloads.length === 0) return; - - const fetchDownloads = async () => { - const responses = await Promise.all( - availableDownloads.map(async (output) => { - const response = await callApiWithParameters("get_downloads_from_step/", { - run_name: runName, - step_id: runData.current_step_id, - output_key: output.label, - }); - - return { - title: output.label, - data: response?.data, - }; - }), - ); - setDownloads(responses); - }; - - void fetchDownloads(); - }, [runName, runData.current_step_id, availableDownloads]); - const plotComponent = ( {plots && plots.length > 0 ? ( @@ -371,7 +397,7 @@ export const RunScreen: React.FC = () => { {downloads.length > 0 ? ( downloads.flatMap((download) => - Object.entries(download.data || {}).map(([filename, content]) => ( + Object.entries(download.data).map(([filename, content]) => ( ; } // We assume these are the only data types we receive for tables From 1fdd8976ba850770efeed32d4bad30dc75b329b6 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 30 Mar 2026 17:09:46 +0200 Subject: [PATCH 161/240] refactor: remove 'any' type hint --- frontend/src/components/app/run-screen/run-screen.tsx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index d312631e1..2b9479ada 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -90,11 +90,7 @@ interface UseStepOutputsParams { transform: (output: TOutput, response: TResponse) => TResult; } -export function useCertainStepOutputs< - TOutput extends StepOutputInfo, - TResponse = any, - TResult = any, ->({ +export function useCertainStepOutputs({ available_outputs, endpoint, runName, From 464da38d0b337fed4b1d9a1d9bdf334c594747e8 Mon Sep 17 00:00:00 2001 From: Anna 
Polensky Date: Mon, 30 Mar 2026 17:18:59 +0200 Subject: [PATCH 162/240] fix: fix broken tests --- .../protzilla/importing/test_query_generation.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py index 7a83b2d94..4a768664c 100644 --- a/backend/tests/protzilla/importing/test_query_generation.py +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ -26,15 +26,13 @@ def test_generate_alphafold_multimer_json_query_for_multiple_proteins(mock_get): mock_get.side_effect = [mock_resp1, mock_resp2] result = generate_alphafold_query_json("P69905 P68871", "2,3", -1, "name") - downloads = result["downloads"] + downloads = result["downloads"].value assert len(downloads) == 1 key = list(downloads.keys())[0] assert key == "name" - # Parse JSON string (after removing outer brackets) - json_str = downloads[key] - parsed_json = json.loads(json_str[1:-1]) + parsed_json = downloads[key][0] # Check top-level keys expected_keys = {"name", "modelSeeds", "sequences", "dialect", "version"} @@ -76,10 +74,10 @@ def test_generate_alphafold_multimer_json_query_with_model_seed(mock_get): mock_get.return_value = mock_resp result = generate_alphafold_query_json("P69905", "2", model_seed=12345, name="name") - downloads = result["downloads"] + downloads = result["downloads"].value key = list(downloads.keys())[0] - parsed_json = json.loads(downloads[key][1:-1]) - assert parsed_json["modelSeeds"] == [12345] + query_json = downloads[key][0] + assert query_json["modelSeeds"] == [12345] def test_generate_alphafold_multimer_json_query_with_mismatched_number_of_ids_and_number_of_copies(): From a6e791f854764571335d64145cc3105d38ef1a7b Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 30 Mar 2026 17:49:30 +0200 Subject: [PATCH 163/240] style: remove unused parts of the first downloads implementation --- 
backend/protzilla/methods/importing.py | 2 - backend/protzilla/steps.py | 23 --- frontend/package-lock.json | 198 ------------------------- 3 files changed, 223 deletions(-) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 997412c99..cd34197e2 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -631,8 +631,6 @@ class AlphaFoldQueryJsonGeneration(Step): "Generate a JSON to upload to AlphaFold-Server to generate a prediction." ) - output_keys = ["downloads"] - def create_form(self): return Form( label="AlphaFold Query JSON Generation", diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 352397bfc..14eecbb7a 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -67,7 +67,6 @@ def __init__( self.output: Output = Output() self.visual_data = {"node_position": {"x": 0, "y": 0}} self.plots: Plots = Plots() - self.downloads: Downloads = Downloads() self.messages: Messages = Messages([]) self.disk_write_mutex = Lock() @@ -83,10 +82,6 @@ def __init__( "generated": 0, "dumped": 0, }, - "downloads": { - "generated": 0, - "dumped": 0, - }, } if instance_identifier is None: @@ -625,21 +620,3 @@ def __repr__(self): @property def empty(self) -> bool: return len(self.plots) == 0 - - -class Downloads: - # maps file name to file content (a string) - def __init__(self, downloads: dict[str, str] | None = None): - if downloads is None: - downloads: dict[str, str] = {} - self.downloads = downloads - - def __iter__(self): - return iter(self.downloads) - - def __repr__(self): - return f"Downloads: {len(self.downloads)}" - - @property - def empty(self) -> bool: - return len(self.downloads) == 0 diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 807a70e82..0ea3e32fd 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -8,7 +8,6 @@ "name": "frontend", "version": "0.0.0", "dependencies": { - 
"@dagrejs/dagre": "^2.0.4", "@emotion/react": "^11.14.0", "@emotion/styled": "^11.14.0", "@mui/material": "^6.4.12", @@ -16,7 +15,6 @@ "@mui/x-data-grid": "^7.29.6", "@storybook/addon-actions": "^8.6.14", "@types/node": "^24.3.1", - "@xyflow/react": "^12.10.0", "axios": "^1.9.0", "bootstrap": "^5.3.6", "corepack": "^0.34.0", @@ -630,21 +628,6 @@ "node": ">=18" } }, - "node_modules/@dagrejs/dagre": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/@dagrejs/dagre/-/dagre-2.0.4.tgz", - "integrity": "sha512-J6vCWTNpicHF4zFlZG1cS5DkGzMr9941gddYkakjrg3ZNev4bbqEgLHFTWiFrcJm7UCRu7olO3K6IRDd9gSGhA==", - "license": "MIT", - "dependencies": { - "@dagrejs/graphlib": "3.0.4" - } - }, - "node_modules/@dagrejs/graphlib": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@dagrejs/graphlib/-/graphlib-3.0.4.tgz", - "integrity": "sha512-HxZ7fCvAwTLCWCO0WjDkzAFQze8LdC6iOpKbetDKHIuDfIgMlIzYzqZ4nxwLlclQX+3ZVeZ1K2OuaOE2WWcyOg==", - "license": "MIT" - }, "node_modules/@emnapi/core": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.5.0.tgz", @@ -3619,55 +3602,6 @@ "@types/deep-eql": "*" } }, - "node_modules/@types/d3-color": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", - "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", - "license": "MIT" - }, - "node_modules/@types/d3-drag": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/@types/d3-drag/-/d3-drag-3.0.7.tgz", - "integrity": "sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==", - "license": "MIT", - "dependencies": { - "@types/d3-selection": "*" - } - }, - "node_modules/@types/d3-interpolate": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", - "integrity": 
"sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", - "license": "MIT", - "dependencies": { - "@types/d3-color": "*" - } - }, - "node_modules/@types/d3-selection": { - "version": "3.0.11", - "resolved": "https://registry.npmjs.org/@types/d3-selection/-/d3-selection-3.0.11.tgz", - "integrity": "sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==", - "license": "MIT" - }, - "node_modules/@types/d3-transition": { - "version": "3.0.9", - "resolved": "https://registry.npmjs.org/@types/d3-transition/-/d3-transition-3.0.9.tgz", - "integrity": "sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==", - "license": "MIT", - "dependencies": { - "@types/d3-selection": "*" - } - }, - "node_modules/@types/d3-zoom": { - "version": "3.0.8", - "resolved": "https://registry.npmjs.org/@types/d3-zoom/-/d3-zoom-3.0.8.tgz", - "integrity": "sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw==", - "license": "MIT", - "dependencies": { - "@types/d3-interpolate": "*", - "@types/d3-selection": "*" - } - }, "node_modules/@types/deep-eql": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/@types/deep-eql/-/deep-eql-4.0.2.tgz", @@ -4579,38 +4513,6 @@ "url": "https://opencollective.com/vitest" } }, - "node_modules/@xyflow/react": { - "version": "12.10.1", - "resolved": "https://registry.npmjs.org/@xyflow/react/-/react-12.10.1.tgz", - "integrity": "sha512-5eSWtIK/+rkldOuFbOOz44CRgQRjtS9v5nufk77DV+XBnfCGL9HAQ8PG00o2ZYKqkEU/Ak6wrKC95Tu+2zuK3Q==", - "license": "MIT", - "dependencies": { - "@xyflow/system": "0.0.75", - "classcat": "^5.0.3", - "zustand": "^4.4.0" - }, - "peerDependencies": { - "react": ">=17", - "react-dom": ">=17" - } - }, - "node_modules/@xyflow/system": { - "version": "0.0.75", - "resolved": "https://registry.npmjs.org/@xyflow/system/-/system-0.0.75.tgz", - "integrity": 
"sha512-iXs+AGFLi8w/VlAoc/iSxk+CxfT6o64Uw/k0CKASOPqjqz6E0rb5jFZgJtXGZCpfQI6OQpu5EnumP5fGxQheaQ==", - "license": "MIT", - "dependencies": { - "@types/d3-drag": "^3.0.7", - "@types/d3-interpolate": "^3.0.4", - "@types/d3-selection": "^3.0.10", - "@types/d3-transition": "^3.0.8", - "@types/d3-zoom": "^3.0.8", - "d3-drag": "^3.0.0", - "d3-interpolate": "^3.0.1", - "d3-selection": "^3.0.0", - "d3-zoom": "^3.0.0" - } - }, "node_modules/abs-svg-path": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/abs-svg-path/-/abs-svg-path-0.1.1.tgz", @@ -5276,12 +5178,6 @@ "resolved": "https://registry.npmjs.org/clamp/-/clamp-1.0.1.tgz", "integrity": "sha512-kgMuFyE78OC6Dyu3Dy7vcx4uy97EIbVxJB/B0eJ3bUNAkwdNcxYzgKltnyADiYwsR7SEqkkUPsEUT//OVS6XMA==" }, - "node_modules/classcat": { - "version": "5.0.5", - "resolved": "https://registry.npmjs.org/classcat/-/classcat-5.0.5.tgz", - "integrity": "sha512-JhZUT7JFcQy/EzW605k/ktHtncoo9vnyW/2GspNYwFlN1C/WmjuV/xtS04e9SOkL2sTdw0VAZ2UGCcQ9lR6p6w==", - "license": "MIT" - }, "node_modules/cli-cursor": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-5.0.0.tgz", @@ -5684,28 +5580,6 @@ "resolved": "https://registry.npmjs.org/d3-dispatch/-/d3-dispatch-1.0.6.tgz", "integrity": "sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA==" }, - "node_modules/d3-drag": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-drag/-/d3-drag-3.0.0.tgz", - "integrity": "sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg==", - "license": "ISC", - "dependencies": { - "d3-dispatch": "1 - 3", - "d3-selection": "3" - }, - "engines": { - "node": ">=12" - } - }, - "node_modules/d3-ease": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", - "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", - "license": "BSD-3-Clause", - 
"engines": { - "node": ">=12" - } - }, "node_modules/d3-force": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/d3-force/-/d3-force-1.2.1.tgz", @@ -5779,15 +5653,6 @@ "resolved": "https://registry.npmjs.org/d3-quadtree/-/d3-quadtree-1.0.7.tgz", "integrity": "sha512-RKPAeXnkC59IDGD0Wu5mANy0Q2V28L+fNe65pOCXVdVuTJS3WPKaJlFHer32Rbh9gIo9qMuJXio8ra4+YmIymA==" }, - "node_modules/d3-selection": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", - "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", - "license": "ISC", - "engines": { - "node": ">=12" - } - }, "node_modules/d3-shape": { "version": "1.3.7", "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-1.3.7.tgz", @@ -5814,41 +5679,6 @@ "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-1.0.10.tgz", "integrity": "sha512-B1JDm0XDaQC+uvo4DT79H0XmBskgS3l6Ve+1SBCfxgmtIb1AVrPIoqd+nPSv+loMX8szQ0sVUhGngL7D5QPiXw==" }, - "node_modules/d3-transition": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-transition/-/d3-transition-3.0.1.tgz", - "integrity": "sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w==", - "license": "ISC", - "dependencies": { - "d3-color": "1 - 3", - "d3-dispatch": "1 - 3", - "d3-ease": "1 - 3", - "d3-interpolate": "1 - 3", - "d3-timer": "1 - 3" - }, - "engines": { - "node": ">=12" - }, - "peerDependencies": { - "d3-selection": "2 - 3" - } - }, - "node_modules/d3-zoom": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-zoom/-/d3-zoom-3.0.0.tgz", - "integrity": "sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw==", - "license": "ISC", - "dependencies": { - "d3-dispatch": "1 - 3", - "d3-drag": "2 - 3", - "d3-interpolate": "1 - 3", - "d3-selection": "2 - 3", - "d3-transition": "2 - 3" - }, - "engines": { - "node": ">=12" - } - }, 
"node_modules/data-urls": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz", @@ -13057,34 +12887,6 @@ "funding": { "url": "https://github.com/sponsors/sindresorhus" } - }, - "node_modules/zustand": { - "version": "4.5.7", - "resolved": "https://registry.npmjs.org/zustand/-/zustand-4.5.7.tgz", - "integrity": "sha512-CHOUy7mu3lbD6o6LJLfllpjkzhHXSBlX8B9+qPddUsIfeF5S/UZ5q0kmCsnRqT1UHFQZchNFDDzMbQsuesHWlw==", - "license": "MIT", - "dependencies": { - "use-sync-external-store": "^1.2.2" - }, - "engines": { - "node": ">=12.7.0" - }, - "peerDependencies": { - "@types/react": ">=16.8", - "immer": ">=9.0.6", - "react": ">=16.8" - }, - "peerDependenciesMeta": { - "@types/react": { - "optional": true - }, - "immer": { - "optional": true - }, - "react": { - "optional": true - } - } } } } From 79a105985457d28959c15c5ec1d026bed05b82e5 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 30 Mar 2026 17:51:24 +0200 Subject: [PATCH 164/240] style: remove unused parts of the first downloads implementation - part 2 --- backend/protzilla/run.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/backend/protzilla/run.py b/backend/protzilla/run.py index 1a133ac11..3231f68ba 100644 --- a/backend/protzilla/run.py +++ b/backend/protzilla/run.py @@ -13,7 +13,7 @@ import backend.protzilla.constants.paths as paths from backend.protzilla.constants.date_format import metadata_date_format from backend.protzilla.form import Form -from backend.protzilla.steps import Messages, Output, Plots, Downloads, Step +from backend.protzilla.steps import Messages, Output, Plots, Step from backend.protzilla.step_manager import StepManager from backend.protzilla.utilities.utilities import format_trace @@ -351,10 +351,6 @@ def current_messages(self) -> Messages: def current_plots(self) -> Plots | None: return self.steps.current_step.plots - @property - def current_downloads(self) -> dict[str, str] | None: - return 
self.steps.current_step.downloads.downloads - @property def current_outputs(self) -> Output: return self.steps.current_step.output From de2604f82ef91a263a44da22206c8bda2ca99a33 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Tue, 31 Mar 2026 10:32:00 +0200 Subject: [PATCH 165/240] chore: fix docstring --- backend/main/views.py | 3 --- backend/protzilla/steps.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/backend/main/views.py b/backend/main/views.py index 193253c40..7c5d3ce2a 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -683,9 +683,6 @@ def get_step_plots(request): def get_downloads_from_step(request: HttpRequest): - """ - API call. Returns a base64-encoded PNG of a step output to the front-end - """ if request.method != "POST": return JsonResponse( {"success": False, "message": "Invalid request method"}, status=405 diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index 14eecbb7a..a7fb93ee2 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -474,7 +474,7 @@ class OutputType(StrEnum): FLOAT = "float" INT = "int" PNG_BASE64 = "png_base64" - DOWNLOAD = "download" + DOWNLOAD = "download" # right now only JSONs are supported, value should be dict(filename, json content) # for every data type that is not yaml serializable JOBLIB_ARTIFACT = "joblib_artifact" From 97a39a0ac80700b183c5e89bc23225d0345a2802 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 1 Apr 2026 13:26:40 +0200 Subject: [PATCH 166/240] fix: fix broken change between different output tabs --- .../components/app/run-screen/run-screen.tsx | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index 2b9479ada..8a45908af 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -141,7 +141,15 @@ export 
const RunScreen: React.FC = () => { const [plots, setPlots] = useState(); const [selectedPlot, setSelectedPlot] = useState
({ data: [], layout: {} }); const [availableTables, setAvailableTables] = useState(); + const [availableDownloads, setAvailableDownloads] = useState([]); + const transformDownload = useCallback( + (output: StepOutputInfo, response: Download) => ({ + title: output.label, + data: response.data, + }), + [], + ); const downloads = useCertainStepOutputs< StepOutputInfo, Download, @@ -151,14 +159,19 @@ export const RunScreen: React.FC = () => { endpoint: "get_downloads_from_step/", runName: runName, stepId: runData.current_step_id, - transform: (output, response) => ({ - title: output.label, - data: response.data, - }), + transform: transformDownload, }); // Static PNGs sent as base64 const [availableImages, setAvailableImages] = useState([]); + const transformImage = useCallback( + (output: StepOutputInfo, response: Image) => ({ + title: output.label, + alt: output.label, + data: "data:image/png;base64," + response.data, + }), + [], + ); const images = useCertainStepOutputs< StepOutputInfo, Image, @@ -168,11 +181,7 @@ export const RunScreen: React.FC = () => { endpoint: "get_png_from_step/", runName: runName, stepId: runData.current_step_id, - transform: (output, response) => ({ - title: output.label, - alt: output.label, - data: "data:image/png;base64," + response.data, - }), + transform: transformImage, }); const [isDownloadModalOpen, openDownloadModal, closeDownloadModal] = useToggleableState(false); From a4adc07fff49119229a5eaa4bb74db7198391a54 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 10 Apr 2026 09:07:26 +0200 Subject: [PATCH 167/240] fix: first upload in settings when alphafold folder does not exist yet --- backend/main/views_settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 42bfd6840..6c04f7b03 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -493,6 +493,8 @@ def upload_multimer_structure(request): confidence_file = 
data.get("confidence_file") full_data_file = data.get("full_data_file") + ALPHAFOLD_MULTIMER_PATH.mkdir(parents=True, exist_ok=True) + # add row to metadata csv metadata_csv = AF_MULTIMER_METADATA_CSV_PATH expected_columns = [ From 7e9b5e16616a007629e3cebd05a0bb46fe46b5b1 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 10 Apr 2026 09:19:57 +0200 Subject: [PATCH 168/240] fix: import via settings for multimers --- backend/main/views_settings.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 6c04f7b03..99199426d 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -1,6 +1,7 @@ import json import os import shutil +import re from datetime import date, datetime, timezone from io import BytesIO @@ -510,9 +511,11 @@ def upload_multimer_structure(request): timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + uniprot_ids_as_list = re.split(r"\s*,\s*", uniprot_ids.strip()) + new_row = { "entry_id": entry_id, - "uniprot_ids": uniprot_ids, + "uniprot_ids": uniprot_ids_as_list, "model_created_date": timestamp, "model_used": model_used, } From 51669f0c6f14229d38162c26cc09da52f240016a Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 11 Apr 2026 11:53:34 +0200 Subject: [PATCH 169/240] add workflow to gitignore --- backend/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/.gitignore b/backend/.gitignore index 1a49ec347..a13a10f26 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -10,6 +10,7 @@ !user_data/workflows/only_import_and_filter_proteins.yaml !user_data/workflows/standard.yaml !user_data/workflows/.test-run-empty.yaml +!user_data/workflows/cl_monomer.yaml !user_data/example_dataset # Uploads (e.g. 
Uniprot databases) From cc307f2cb205622d88620d043aa405df5ddfd3f4 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 11 Apr 2026 11:54:46 +0200 Subject: [PATCH 170/240] change order of workflow in git ignore --- backend/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/.gitignore b/backend/.gitignore index a13a10f26..bddc63870 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -9,8 +9,8 @@ !user_data/workflows/only_import.yaml !user_data/workflows/only_import_and_filter_proteins.yaml !user_data/workflows/standard.yaml -!user_data/workflows/.test-run-empty.yaml !user_data/workflows/cl_monomer.yaml +!user_data/workflows/.test-run-empty.yaml !user_data/example_dataset # Uploads (e.g. Uniprot databases) From c47552aee511937bec2965d053142567795e9437 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Sat, 11 Apr 2026 11:58:09 +0200 Subject: [PATCH 171/240] add cl monomer workflow --- backend/user_data/workflows/cl_monomer.yaml | 62 +++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 backend/user_data/workflows/cl_monomer.yaml diff --git a/backend/user_data/workflows/cl_monomer.yaml b/backend/user_data/workflows/cl_monomer.yaml new file mode 100644 index 000000000..32c4e2fd7 --- /dev/null +++ b/backend/user_data/workflows/cl_monomer.yaml @@ -0,0 +1,62 @@ +current_step_id: s00004_AlphaFoldPredictionLoad +df_mode: Standard +graph_edges: +- !!python/tuple + - s00003_CrosslinkingImport + - s00005_CrosslinkingValidationWithAngstromDeviation + - source_handle: crosslinking_df + target_handle: crosslinking_df +- !!python/tuple + - s00004_AlphaFoldPredictionLoad + - s00005_CrosslinkingValidationWithAngstromDeviation + - source_handle: structure_metadata_df + target_handle: structure_metadata_df +- !!python/tuple + - s00004_AlphaFoldPredictionLoad + - s00005_CrosslinkingValidationWithAngstromDeviation + - source_handle: cif_df + target_handle: cif_df +- !!python/tuple + - s00004_AlphaFoldPredictionLoad + - 
s00005_CrosslinkingValidationWithAngstromDeviation + - source_handle: amino_acid_sequences_df + target_handle: amino_acid_sequences_df +id_clock: 5 +steps: +- calculation_status: incomplete + form_inputs: + file_path: null + organism_ids: '' + instance_identifier: s00003_CrosslinkingImport + messages: [] + output: {} + plots: {} + type: CrosslinkingImport + visual_data: + node_position: + x: -58.407821229050285 + y: -3.4357541899441344 +- calculation_status: incomplete + form_inputs: + persist_upload: true + uniprot_id: '' + instance_identifier: s00004_AlphaFoldPredictionLoad + messages: [] + output: {} + plots: {} + type: AlphaFoldPredictionLoad + visual_data: + node_position: + x: 284.1888049686339 + y: -3.6223900627321264 +- calculation_status: incomplete + form_inputs: {} + instance_identifier: s00005_CrosslinkingValidationWithAngstromDeviation + messages: [] + output: {} + plots: {} + type: CrosslinkingValidationWithAngstromDeviation + visual_data: + node_position: + x: 88.81734340376391 + y: 184.78418276583125 From f981762749c34bb403924cd0312a43363b75d413 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 20 Apr 2026 10:15:16 +0200 Subject: [PATCH 172/240] feat: account for chain id and protein id in crosslinking validation --- .../data_analysis/crosslinking_validation.py | 138 ++++++++++++++++-- backend/protzilla/methods/importing.py | 4 +- 2 files changed, 124 insertions(+), 18 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index eb4e4f618..29d317616 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -35,6 +35,7 @@ def get_coordinates_of_atom_crosslinker_bound_to( amino_acid_position_where_crosslinker_bound: int, amino_acid_type: str, cif_df: pd.DataFrame, + chain_id: str, ) -> tuple[float, float, float]: """ Returns the Cartesian coordinates of the atom to which the 
cross-linker is @@ -43,25 +44,25 @@ def get_coordinates_of_atom_crosslinker_bound_to( :param amino_acid_position_where_crosslinker_bound: 1-based position of the amino acid residue :param amino_acid_type: amino acid type at the given position :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) + :param chain_id: ID of the chain of the atom the crosslinker bounds to :return: a tuple (x, y, z) containing the Cartesian coordinates of the atom in Ångström :raises ValueError: if the specified atom cannot be found in the CIF data """ relevant_atom = get_reactive_atom_of_amino_acid_residue(amino_acid_type) + seq_ids = pd.to_numeric(cif_df["_atom_site.label_seq_id"], errors="coerce") # Filter to the exact reactive atom of the amino acid residue # where the crosslinker is bound (e.g. CA at position 45) cif_df = cif_df[ (cif_df["_atom_site.label_atom_id"] == relevant_atom) - & ( - cif_df["_atom_site.label_seq_id"].astype(int) - == amino_acid_position_where_crosslinker_bound - ) + & (seq_ids == amino_acid_position_where_crosslinker_bound) + & (cif_df["_atom_site.auth_asym_id"] == chain_id) ] if cif_df.empty: raise ValueError( - f"No {relevant_atom} atom found for amino acid at position {amino_acid_position_where_crosslinker_bound}." + f"No {relevant_atom} atom found for amino acid at position {amino_acid_position_where_crosslinker_bound} in chain {chain_id}." 
) row = cif_df.iloc[0] @@ -79,6 +80,8 @@ def get_distance_between_two_amino_acids_in_angstrom( amino_acid_type1: str, amino_acid_type2: str, cif_df: pd.DataFrame, + chain_id1: str, + chain_id2: str, ) -> float: """ Calculates the Euclidean distance in Ångström between two amino acid residues @@ -89,19 +92,27 @@ def get_distance_between_two_amino_acids_in_angstrom( :param amino_acid_type1: amino acid type at the first position :param amino_acid_type2: amino acid type at the second position :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) + :param chain_id1: ID of the chain of the first amino acid residue in which crosslinker binds + :param chain_id2: ID of the chain of the second amino acid residue in which crosslinker binds :return: the distance between the two residues in Ångström """ pos1 = np.array( get_coordinates_of_atom_crosslinker_bound_to( - amino_acid_position1, amino_acid_type1, cif_df + amino_acid_position1, + amino_acid_type1, + cif_df, + chain_id1, ), dtype=float, ) pos2 = np.array( get_coordinates_of_atom_crosslinker_bound_to( - amino_acid_position2, amino_acid_type2, cif_df + amino_acid_position2, + amino_acid_type2, + cif_df, + chain_id2, ), dtype=float, ) @@ -268,6 +279,86 @@ def _get_structures_to_validate(structure_metadata_df: pd.DataFrame) -> list[str raise ValueError("Metadata must contain 'uniprot_ids' or 'uniprot_accession'.") +def get_chain_starts( + cif_df: pd.DataFrame, + protein_id: str, +) -> dict: + """ + Returns the starting row positions and chain IDs for all chains + belonging to a given protein in an mmCIF-derived DataFrame. + + :param cif_df: mmCIF data as a pandas DataFrame. + :param protein_id: identifier of the protein you want to query. 
+ :return: set of residue numbers where the different chains start indexed by chain_id + """ + relevant_df = cif_df[ + cif_df["_atom_site.pdbx_sifts_xref_db_acc"] == protein_id + ].copy() + relevant_df["_atom_site.label_seq_id"] = pd.to_numeric( + relevant_df["_atom_site.label_seq_id"], errors="coerce" + ) + relevant_df = relevant_df.dropna( + subset=["_atom_site.auth_asym_id", "_atom_site.label_seq_id"] + ) + + # Find, for each chain, the first row position in the original DataFrame + # and the corresponding first residue number which is the smallest number of the chain + results = {} + for chain_id, group in relevant_df.groupby("_atom_site.auth_asym_id", sort=False): + results[chain_id] = int(group["_atom_site.label_seq_id"].min()) + + return results + + +def expand_crosslinks_to_chain_combinations( + relevant_crosslinks_df: pd.DataFrame, + chains_per_protein: dict[str, dict[str, int]], +) -> pd.DataFrame: + """ + Duplicate each crosslink row so that all possible chain combinations + are represented. 
+ + + :param relevant_crosslinks_df: dataframe that contains information on the crosslinks between the proteins + :param chains_per_protein: dictionary that contains for each protein id a dictionary that gives the first residue position for each chain id + return: crosslinks dataframe with the additional columns of Chain_id1 and Chain_id2 + """ + expanded_rows = [] + + for _, crosslink in relevant_crosslinks_df.iterrows(): + protein_id1 = crosslink["Protein_id1"] + protein_id2 = crosslink["Protein_id2"] + + chains_protein1 = chains_per_protein.get(protein_id1, {}) + chains_protein2 = chains_per_protein.get(protein_id2, {}) + + chain_ids1 = list(chains_protein1.keys()) + chain_ids2 = list(chains_protein2.keys()) + + if not chain_ids1 or not chain_ids2: + continue + + # we do not want the same combination twice if the protein_ids are the same + # e.g.: protein 1 chain A - protein 1 chain B and protein 1 chain B - protein 1 chain A + if protein_id1 == protein_id2: + chain_pairs = itertools.combinations_with_replacement(chain_ids1, 2) + else: + chain_pairs = itertools.product(chain_ids1, chain_ids2) + + for chain_id1, chain_id2 in chain_pairs: + new_row = crosslink.copy() + new_row["Chain_id1"] = chain_id1 + new_row["Chain_id2"] = chain_id2 + expanded_rows.append(new_row) + + if not expanded_rows: + return pd.DataFrame( + columns=list(relevant_crosslinks_df.columns) + ["Chain_id1", "Chain_id2"] + ) + + return pd.DataFrame(expanded_rows).reset_index(drop=True) + + def validate_with_angstrom_deviation( crosslinking_df: pd.DataFrame, structure_metadata_df: pd.DataFrame, @@ -316,6 +407,17 @@ def validate_with_angstrom_deviation( messages = [dict(level=logging.WARNING, msg=msg)] return dict(crosslinking_result_df=pd.DataFrame(), messages=messages) + chains_per_protein = {} + for protein_id in set(structures_to_validate): + chains_per_protein[protein_id] = get_chain_starts( + cif_df=cif_df, protein_id=protein_id + ) + + relevant_crosslinks_df = 
expand_crosslinks_to_chain_combinations( + relevant_crosslinks_df=relevant_crosslinks_df, + chains_per_protein=chains_per_protein, + ) + relevant_crosslinks_df, messages = add_protein_crosslink_positions_to_df( relevant_crosslinks_df, amino_acid_sequences_df ) @@ -330,19 +432,14 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: amino_acid_sequences_df=amino_acid_sequences_df, protein_id=protein_id2 ) - relevant_crosslinks_df["crosslinker_position1"] = relevant_crosslinks_df[ - "crosslinker_position1" - ].astype("Int64") - - relevant_crosslinks_df["crosslinker_position2"] = relevant_crosslinks_df[ - "crosslinker_position2" - ].astype("Int64") predicted_distance = get_distance_between_two_amino_acids_in_angstrom( amino_acid_position1=crosslink.crosslinker_position1, amino_acid_position2=crosslink.crosslinker_position2, amino_acid_type1=protein_sequence1[crosslink.crosslinker_position1 - 1], amino_acid_type2=protein_sequence2[crosslink.crosslinker_position2 - 1], cif_df=cif_df, + chain_id1=crosslink.Chain_id1, + chain_id2=crosslink.Chain_id2, ) try: ( @@ -385,6 +482,14 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: "crosslinker_position1", "crosslinker_position2", ] + + relevant_crosslinks_df["crosslinker_position1"] = relevant_crosslinks_df[ + "crosslinker_position1" + ].astype("Int64") + relevant_crosslinks_df["crosslinker_position2"] = relevant_crosslinks_df[ + "crosslinker_position2" + ].astype("Int64") + relevant_crosslinks_df[new_columns] = relevant_crosslinks_df.apply( check_crosslink, axis=1 ) @@ -395,7 +500,7 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: ] checked_crosslinks_df["link_type"] = checked_crosslinks_df.apply( - lambda row: "intra" if row["Protein_id1"] == row["Protein_id2"] else "inter", + lambda row: "intra" if row["Chain_id1"] == row["Chain_id2"] else "inter", axis=1, ) @@ -450,6 +555,8 @@ def diagrams_of_crosslinking_validation_data( figures = [] + structures_to_validate_str = ", ".join(structures_to_validate) 
+ for crosslinker, crosslinker_df in validated_df.groupby("Crosslinker"): distances_valid = crosslinker_df.loc[ crosslinker_df["valid_crosslink"] == True, "alphafold_distance" @@ -477,7 +584,6 @@ def diagrams_of_crosslinking_validation_data( accepted_deviation_upper_bound, accepted_deviation_lower_bound, ) = crosslinker_information[crosslinker] - structures_to_validate_str = ", ".join(structures_to_validate) histogram = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 90f65adc5..f1eee9941 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -550,10 +550,10 @@ def create_form(self): ), TextField( name="uniprot_ids", - label="Protein IDs of all proteins used in the sequence.", + label="Protein IDs of all proteins used in the sequence. ", ), InfoField( - label="Please provide a list of Protein IDs separated by a comma \n e.g.: P68871, P69905, Q5VSL9" + label="Please provide a list of Protein IDs separated by a comma \n e.g.: P68871, P69905, Q5VSL9. List them in the same order as provided to the AlphaFold model. If a protein appears multiple times, include it each time it occurs in the corresponding position." 
), TextField( name="model_used", From 1d7e940dd19b43f1c3758be1a1bcaea79ee6d5af Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 20 Apr 2026 11:40:25 +0200 Subject: [PATCH 173/240] feat: add job request json upload to all multimer imports --- backend/main/views_settings.py | 3 +- backend/protzilla/constants/data_types.py | 1 + .../alphafold_protein_structure_load.py | 40 ++++++++++++------- backend/protzilla/methods/importing.py | 9 ++++- .../multimer-structure-upload.tsx | 10 +++++ 5 files changed, 47 insertions(+), 16 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 99199426d..c550e5d14 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -493,6 +493,7 @@ def upload_multimer_structure(request): cif_file = data.get("cif_file") confidence_file = data.get("confidence_file") full_data_file = data.get("full_data_file") + job_request_file = data.get("job_request_file") ALPHAFOLD_MULTIMER_PATH.mkdir(parents=True, exist_ok=True) @@ -536,7 +537,7 @@ def upload_multimer_structure(request): # Copy files to source directory out of temp directory target_dir = ALPHAFOLD_MULTIMER_PATH / entry_id.upper() - file_names = [fasta_file, cif_file, confidence_file, full_data_file] + file_names = [fasta_file, cif_file, confidence_file, full_data_file, job_request_file] success, message = check_and_copy_files_to_directory( file_names=file_names, target_dir=target_dir ) diff --git a/backend/protzilla/constants/data_types.py b/backend/protzilla/constants/data_types.py index 956b50ad1..2665cfe3e 100644 --- a/backend/protzilla/constants/data_types.py +++ b/backend/protzilla/constants/data_types.py @@ -28,6 +28,7 @@ class DataKey(StrEnum): CROSSLINKING_DF = "crosslinking_df" CONFIDENCE_DF = "confidence_df" FULL_DATA_DF = "full_data_df" + JOB_REQUEST_DF = "job_request_df" ProteinDf = NewType("ProteinDf", pd.DataFrame) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py 
b/backend/protzilla/importing/alphafold_protein_structure_load.py index d56d1226b..45cf91aa2 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -723,7 +723,7 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: entry_id=entry_id, structure_dir=structure_dir ) - # get jsons (PAE and pLDDT) + # get jsons (full data and confidence and job requests) json_files = get_json_files_in_dir(entry_id=entry_id, structure_dir=structure_dir) try: @@ -731,28 +731,35 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: msg = f"Only one json file found in {structure_dir} for entry '{entry_id}'. Two json files are expected" logger.error(msg) raise RuntimeError() + elif len(json_files) == 2: + msg = f"Only two json file found in {structure_dir} for entry '{entry_id}'. Three json files are expected" + logger.error(msg) + raise RuntimeError() else: with open(json_files[0], "r") as f: obj1 = json.load(f) with open(json_files[1], "r") as f: obj2 = json.load(f) + with open(json_files[2], "r") as f: + obj3 = json.load(f) json1 = pd.json_normalize(obj1) json2 = pd.json_normalize(obj2) + json3 = pd.json_normalize(obj3) # iptm stands for interface predicted TM score - if "chain_iptm" in json1.columns and "pae" in json2.columns: - confidence_df = json1 - full_data_df = json2 - elif "chain_iptm" in json2.columns and "pae" in json1.columns: - confidence_df = json2 - full_data_df = json1 - else: - # Fallback: assign and warn - confidence_df = json1 - full_data_df = json2 - warn = f"Could not detect confidence scores/full data information in JSON files for entry '{entry_id}'; ''{json_files[0]} is read as confidenc, {json_files[1]} is read as full data summary." 
- logger.warning(warn) - messages.append(dict(level=logging.WARNING, msg=warn)) + + confidence_df, full_data_df, job_request_df = None, None, None + for json_df in [json1, json2, json3]: + if "chain_iptm" in json_df.columns: + confidence_df = json_df + elif "pae" in json_df.columns: + full_data_df = json_df + elif "sequences" in json_df.columns: + job_request_df = json_df + if confidence_df is None or full_data_df is None or job_request_df is None: + msg = f"Could not detect confidence scores/full data/job request in JSON files for entry '{entry_id}'." + logger.exception(msg) + raise RuntimeError(msg) except Exception as e: msg = f"Failed to read JSON files in {structure_dir}: {e}" logger.exception(msg) @@ -763,6 +770,7 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: "cif_df": cif_df, "confidence_df": confidence_df, "full_data_df": full_data_df, + "job_request_df": job_request_df, } check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) df_dict["messages"] = messages @@ -777,6 +785,7 @@ def upload_multimer_prediction( cif_file: Path, confidence_file: Path, full_data_file: Path, + job_request_file: Path, persist_upload: bool, ) -> dict[str, Any]: """ @@ -858,6 +867,7 @@ def upload_multimer_prediction( cif_file, confidence_file, full_data_file, + job_request_file, ]: success, msg = copy_file_to_directory(file_name, work_dir) if not success: @@ -868,6 +878,7 @@ def upload_multimer_prediction( amino_acid_sequences_df = fasta_dict["fasta_df"] confidence_df = pd.read_json(confidence_file) + job_request_df = pd.read_json(job_request_file) # full_data json has arrays of unequal lengths so we need to normalize full_data_df = pd.DataFrame() @@ -891,6 +902,7 @@ def upload_multimer_prediction( "confidence_df": confidence_df, "full_data_df": full_data_df, "amino_acid_sequences_df": amino_acid_sequences_df, + "job_request_df": job_request_df, } if not any(df.empty for df in df_dict.values()): diff --git 
a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index f1eee9941..2c486e2f8 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -534,6 +534,7 @@ class UploadMultimerPredictions(ImportingStep): DataKey.CIF_DF, DataKey.CONFIDENCE_DF, DataKey.FULL_DATA_DF, + DataKey.JOB_REQUEST_DF, DataKey.AMINO_ACID_SEQUENCES_DF, ] @@ -553,7 +554,7 @@ def create_form(self): label="Protein IDs of all proteins used in the sequence. ", ), InfoField( - label="Please provide a list of Protein IDs separated by a comma \n e.g.: P68871, P69905, Q5VSL9. List them in the same order as provided to the AlphaFold model. If a protein appears multiple times, include it each time it occurs in the corresponding position." + label="Please provide a list of Protein IDs separated by a comma \n e.g.: P68871, P69905, Q5VSL9." ), TextField( name="model_used", @@ -579,6 +580,11 @@ def create_form(self): label="Full data json file (required)", value=None, ), + FileInput( + name="job_request_file", + label="Job request json file (required)", + value=None, + ), CheckboxField( name="persist_upload", label="Upload should be saved persistently across runs", @@ -600,6 +606,7 @@ class ImportMultimerStructurePredictionFromDisk(ImportingStep): DataKey.CIF_DF, DataKey.CONFIDENCE_DF, DataKey.FULL_DATA_DF, + DataKey.JOB_REQUEST_DF, DataKey.AMINO_ACID_SEQUENCES_DF, ] diff --git a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx index a504c36a7..11ec79ea6 100644 --- a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx @@ -95,6 +95,7 @@ export const MultimerStructureUpload = () => { cif_file: string, confidence_file: string, full_data_file: string, + job_request_file: string, ) => { const response = await 
callApiWithParameters("upload_multimer_structure", { entry_id: entry_id, @@ -104,6 +105,7 @@ export const MultimerStructureUpload = () => { cif_file: cif_file, confidence_file: confidence_file, full_data_file: full_data_file, + job_request_file: job_request_file, }); if (response?.success) { notify({ @@ -231,6 +233,13 @@ export const MultimerStructureUpload = () => { isVisible: true, accept: ".json", }, + { + type: "file", + name: "job_request_file", + label: "Job request json file (required):", + isVisible: true, + accept: ".json", + }, ], }} onChange={(data) => { @@ -242,6 +251,7 @@ export const MultimerStructureUpload = () => { data.cif_file as string, data.confidence_file as string, data.full_data_file as string, + data.job_request_file as string, ); }} /> From 54e57f6470b24ad48b803bdf7a4da23970a50bf9 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 20 Apr 2026 11:54:09 +0200 Subject: [PATCH 174/240] fix: import of job request json file --- backend/main/views_settings.py | 8 +++++++- .../importing/alphafold_protein_structure_load.py | 8 ++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index c550e5d14..63ede4254 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -537,7 +537,13 @@ def upload_multimer_structure(request): # Copy files to source directory out of temp directory target_dir = ALPHAFOLD_MULTIMER_PATH / entry_id.upper() - file_names = [fasta_file, cif_file, confidence_file, full_data_file, job_request_file] + file_names = [ + fasta_file, + cif_file, + confidence_file, + full_data_file, + job_request_file, + ] success, message = check_and_copy_files_to_directory( file_names=file_names, target_dir=target_dir ) diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 45cf91aa2..fa1317c37 100644 --- 
a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -732,9 +732,9 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: logger.error(msg) raise RuntimeError() elif len(json_files) == 2: - msg = f"Only two json file found in {structure_dir} for entry '{entry_id}'. Three json files are expected" - logger.error(msg) - raise RuntimeError() + msg = f"Only two json file found in {structure_dir} for entry '{entry_id}'. Three json files are expected" + logger.error(msg) + raise RuntimeError() else: with open(json_files[0], "r") as f: obj1 = json.load(f) @@ -750,7 +750,7 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: confidence_df, full_data_df, job_request_df = None, None, None for json_df in [json1, json2, json3]: - if "chain_iptm" in json_df.columns: + if "chain_iptm" in json_df.columns: confidence_df = json_df elif "pae" in json_df.columns: full_data_df = json_df From 4063321eab27cb4bea7435a0138a178b0461af6a Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 20 Apr 2026 11:54:37 +0200 Subject: [PATCH 175/240] update tests for job request json upload --- .../test_alphafold_protein_structure_load.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index b0bf84f3e..c48a168a5 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -447,6 +447,28 @@ def test_upload_multimer_prediction_basic(tmp_path, monkeypatch): conf.write_text('[{"residueNumber":1, "confidenceScore":99}]') full = tmp_path / "full.json" full.write_text('{"a": [1,2]}') + job_request = tmp_path / "job_request.json" + job_request.write_text( + json.dumps( + [ + { + "name": "test_job", + "modelSeeds": 
["123456789"], + "sequences": [ + { + "proteinChain": { + "sequence": "AAAA", + "count": 1, + "useStructureTemplate": True, + } + } + ], + "dialect": "alphafoldserver", + "version": 3, + } + ] + ) + ) # monkeypatch copy to actually copy files def _copy(src, dest_dir): @@ -467,6 +489,7 @@ def _copy(src, dest_dir): cif_file=cif, confidence_file=conf, full_data_file=full, + job_request_file=job_request, persist_upload=True, ) @@ -495,6 +518,12 @@ def _copy(src, dest_dir): assert isinstance(full_df, pd.DataFrame) assert full_df.iloc[0]["a"] == [1, 2] + # job request JSON + job_df = out["job_request_df"] + assert isinstance(job_df, pd.DataFrame) + assert job_df.iloc[0]["name"] == "test_job" + assert job_df.iloc[0]["dialect"] == "alphafoldserver" + # sequences seqs = out["amino_acid_sequences_df"] assert isinstance(seqs, pd.DataFrame) @@ -587,6 +616,28 @@ def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): conf.write_text('[{"residueNumber":1, "confidenceScore":99}]') full = tmp_path / "full.json" full.write_text('{"a": [1,2]}') + job_request = tmp_path / "job_request.json" + job_request.write_text( + json.dumps( + [ + { + "name": "test_job_2", + "modelSeeds": ["987654321"], + "sequences": [ + { + "proteinChain": { + "sequence": "BBBB", + "count": 1, + "useStructureTemplate": True, + } + } + ], + "dialect": "alphafoldserver", + "version": 3, + } + ] + ) + ) out = upload_multimer_prediction( entry_id="M2", @@ -596,12 +647,15 @@ def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): cif_file=cif, confidence_file=conf, full_data_file=full, + job_request_file=job_request, persist_upload=False, ) # verify dataframes are returned assert isinstance(out["structure_metadata_df"], pd.DataFrame) assert isinstance(out["cif_df"], pd.DataFrame) + assert isinstance(out["job_request_df"], pd.DataFrame) + assert out["job_request_df"].iloc[0]["name"] == "test_job_2" # directory should still exist (created for the entry) upload_dir = tmp_path / "M2" 
assert not upload_dir.exists() @@ -828,8 +882,30 @@ def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): confidence = prot_dir / "confidence.json" full_data = prot_dir / "full.json" + job_request = prot_dir / "job_request.json" confidence.write_text(json.dumps({"chain_iptm": [0.75]})) full_data.write_text(json.dumps({"pae": [[0.1, 0.2], [0.3, 0.4]]})) + job_request.write_text( + json.dumps( + [ + { + "name": "multimer_job", + "modelSeeds": ["111111111"], + "sequences": [ + { + "proteinChain": { + "sequence": "AAAA", + "count": 2, + "useStructureTemplate": True, + } + } + ], + "dialect": "alphafoldserver", + "version": 3, + } + ] + ) + ) out = get_multimer_structure_dfs("M1") assert isinstance(out["structure_metadata_df"], pd.DataFrame) @@ -837,9 +913,12 @@ def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): assert isinstance(out["amino_acid_sequences_df"], pd.DataFrame) assert isinstance(out["confidence_df"], pd.DataFrame) assert isinstance(out["full_data_df"], pd.DataFrame) + assert isinstance(out["job_request_df"], pd.DataFrame) assert "chain_iptm" in out["confidence_df"].columns assert "pae" in out["full_data_df"].columns + assert out["job_request_df"].iloc[0]["name"] == "multimer_job" + assert out["job_request_df"].iloc[0]["version"] == 3 assert any(m.get("level") == logging.INFO for m in out["messages"]) or any( "Successfully loaded" in str(m.get("msg", "")) for m in out["messages"] From 24d3779be3f87a517dbad6f3a63291d5611c4e57 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 21 Apr 2026 10:07:18 +0200 Subject: [PATCH 176/240] fix multimer validation step and refactor --- .../data_analysis/crosslinking_validation.py | 351 ++++++++++++++---- backend/protzilla/methods/data_analysis.py | 13 +- 2 files changed, 278 insertions(+), 86 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 29d317616..f5745f660 100644 --- 
a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -267,47 +267,26 @@ def get_crosslink_positions_in_protein( return crosslinking_df, messages -def _get_structures_to_validate(structure_metadata_df: pd.DataFrame) -> list[str]: - if "uniprot_accession" in structure_metadata_df.columns: - return structure_metadata_df["uniprot_accession"].tolist() - elif "uniprot_ids" in structure_metadata_df.columns: - value = structure_metadata_df["uniprot_ids"].iloc[0] - if isinstance(value, str): - value = ast.literal_eval(value) - return value - else: - raise ValueError("Metadata must contain 'uniprot_ids' or 'uniprot_accession'.") - - -def get_chain_starts( +def get_chains( cif_df: pd.DataFrame, + valid_ids: dict, protein_id: str, -) -> dict: + id_column_name: str, +) -> list: """ - Returns the starting row positions and chain IDs for all chains - belonging to a given protein in an mmCIF-derived DataFrame. + Returns a list of unique chain IDs belonging to a given protein + in an mmCIF-derived DataFrame. :param cif_df: mmCIF data as a pandas DataFrame. + :param valid_ids: dictionary of valid IDs. :param protein_id: identifier of the protein you want to query. - :return: set of residue numbers where the different chains start indexed by chain_id + :param id_column_name: column name to check against valid_ids. + :return: list of unique chain IDs. 
""" - relevant_df = cif_df[ - cif_df["_atom_site.pdbx_sifts_xref_db_acc"] == protein_id - ].copy() - relevant_df["_atom_site.label_seq_id"] = pd.to_numeric( - relevant_df["_atom_site.label_seq_id"], errors="coerce" - ) - relevant_df = relevant_df.dropna( - subset=["_atom_site.auth_asym_id", "_atom_site.label_seq_id"] - ) - - # Find, for each chain, the first row position in the original DataFrame - # and the corresponding first residue number which is the smallest number of the chain - results = {} - for chain_id, group in relevant_df.groupby("_atom_site.auth_asym_id", sort=False): - results[chain_id] = int(group["_atom_site.label_seq_id"].min()) + relevant_df = cif_df[cif_df[id_column_name].isin(valid_ids[protein_id])] + chain_ids = relevant_df["_atom_site.auth_asym_id"].dropna().unique().tolist() - return results + return chain_ids def expand_crosslinks_to_chain_combinations( @@ -320,7 +299,7 @@ def expand_crosslinks_to_chain_combinations( :param relevant_crosslinks_df: dataframe that contains information on the crosslinks between the proteins - :param chains_per_protein: dictionary that contains for each protein id a dictionary that gives the first residue position for each chain id + :param chains_per_protein: dictionary that contains all chain_ids in a list for each protein id return: crosslinks dataframe with the additional columns of Chain_id1 and Chain_id2 """ expanded_rows = [] @@ -329,8 +308,8 @@ def expand_crosslinks_to_chain_combinations( protein_id1 = crosslink["Protein_id1"] protein_id2 = crosslink["Protein_id2"] - chains_protein1 = chains_per_protein.get(protein_id1, {}) - chains_protein2 = chains_per_protein.get(protein_id2, {}) + chains_protein1 = chains_per_protein[protein_id1] + chains_protein2 = chains_per_protein[protein_id2] chain_ids1 = list(chains_protein1.keys()) chain_ids2 = list(chains_protein2.keys()) @@ -359,12 +338,155 @@ def expand_crosslinks_to_chain_combinations( return pd.DataFrame(expanded_rows).reset_index(drop=True) -def 
validate_with_angstrom_deviation( +def monomer_validation( + crosslinking_df: pd.DataFrame, + structure_metadata_df: pd.DataFrame, + crosslinker_information: dict[str, list[float]], + cif_df: pd.DataFrame, + amino_acid_sequences_df: pd.DataFrame, +) -> dict: + """ + Validates crosslinking data for a monomeric protein structure by checking + distance deviations (in Angstroms). + + :param crosslinking_df: DataFrame containing the full set of crosslinks. + :param structure_metadata_df: DataFrame containing structural metadata. + :param crosslinker_information: Dictionary mapping crosslinker names to their + allowed distance boundaries (e.g., [min_dist, max_dist]). + :param cif_df: DataFrame containing mmCIF information. + :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. + :return: A dictionary containing the validation results and distance metrics. + """ + all_crosslinks_df = crosslinking_df.copy() + protein_id = structure_metadata_df["uniprot_accession"].iloc[0] + # we are only interested in intra-crosslinks of the protein we want to validate + mask = (all_crosslinks_df["Protein_id1"] == protein_id) & ( + all_crosslinks_df["Protein_id2"] == protein_id + ) + valid_ids = {protein_id: [protein_id]} + relevant_crosslinks_df = all_crosslinks_df[mask] + return validate_with_angstrom_deviation( + relevant_crosslinks_df=relevant_crosslinks_df, + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=amino_acid_sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.pdbx_sifts_xref_db_acc", + structures_to_validate=[protein_id], + ) + + +def get_protein_id_from_sequence(amino_acid_sequences_df, target_sequence): + """ + Finds the Protein ID(s) for a given exact protein sequence. + + :param amino_acid_sequences_df: Dataframe that contains all amino acid sequences + :param target_sequence: The protein sequence you want to find the protein id of. + :return: the Protein ID that matches the sequence. 
(Should only be one, therefore we take the first one) + """ + matching_rows = amino_acid_sequences_df[ + amino_acid_sequences_df["Protein Sequence"] == target_sequence + ] + if not matching_rows.empty: + return matching_rows["Protein ID"].iloc[0] + else: + return None + + +def get_valid_ids_per_protein_id_from_job_request( + amino_acid_sequences_df: pd.DataFrame, job_request_df: pd.DataFrame +) -> dict: + """ + Extracts protein sequences from an AlphaFold Server job request and assigns + them their protein ID based on the given amino sequences df. It then checks the count + and collects all ids that will later be used in the cif file to identify the proteins + instead of their protein ids (because there are no protein ids in the cif file given) + + :param amino_acid_sequences_df: Dataframe that contains all amino acid sequences. + :param job_request_df: DataFrame containing the loaded AlphaFold job request JSON, + which must include a 'sequences' column. + :return: A dictionary mapping Protein IDs to a list of their assigned unique integer + chain IDs. 
Example: {'P12345': [1, 2], 'Q67890': [3]} + """ + valid_ids = {} + unique_id = 1 + + sequences_list = job_request_df["sequences"].iloc[0] + if isinstance(sequences_list, str): + sequences_list = ast.literal_eval(sequences_list) + + for item in sequences_list: + if "proteinChain" in item: + seq_string = item["proteinChain"]["sequence"] + count = item["proteinChain"]["count"] + protein_id = get_protein_id_from_sequence( + amino_acid_sequences_df, seq_string + ) + if protein_id is not None: + # Remove the specific isoform/variant suffix because we do not use it in the crosslinking df + protein_id = protein_id.replace("-1", "") + valid_ids[protein_id] = [] + for _ in range(count): + valid_ids[protein_id].append(unique_id) + unique_id += 1 + return valid_ids + + +def multimer_validation( crosslinking_df: pd.DataFrame, structure_metadata_df: pd.DataFrame, crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, + job_request_df: pd.DataFrame, +) -> dict: + """ + Validates crosslinking data for a multimeric protein complex by checking + distance deviations (in Angstroms). + + This function maps sequences from an AlphaFold Server job request to determine + valid chain IDs, filters the crosslinking dataset to include only interactions + between these valid structures, and delegates the structural distance calculation. + + :param crosslinking_df: DataFrame containing the full set of crosslinks. + :param structure_metadata_df: DataFrame containing structural metadata. + (Note: Passed for pipeline consistency, but unused in this step). + :param crosslinker_information: Dictionary mapping crosslinker names to their + allowed distance boundaries (e.g., [min_dist, max_dist]). + :param cif_df: DataFrame containing mmCIF information. + :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. + :param job_request_df: DataFrame containing the loaded AlphaFold job request JSON. 
+ :return: A dictionary containing the validation results and distance metrics. + """ + valid_ids = get_valid_ids_per_protein_id_from_job_request( + amino_acid_sequences_df=amino_acid_sequences_df, job_request_df=job_request_df + ) + structures_to_validate = list(valid_ids.keys()) + all_crosslinks_df = crosslinking_df.copy() + mask = (all_crosslinks_df["Protein_id1"].isin(structures_to_validate)) & ( + all_crosslinks_df["Protein_id2"].isin(structures_to_validate) + ) + relevant_crosslinks_df = all_crosslinks_df[mask] + + return validate_with_angstrom_deviation( + relevant_crosslinks_df=relevant_crosslinks_df, + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=amino_acid_sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.label_entity_id", + structures_to_validate=structures_to_validate, + ) + + +def validate_with_angstrom_deviation( + relevant_crosslinks_df: pd.DataFrame, + crosslinker_information: dict[str, list[float]], + cif_df: pd.DataFrame, + amino_acid_sequences_df: pd.DataFrame, + valid_ids: dict, + id_column_name: str, + structures_to_validate: list, ) -> dict: """ Validates cross-links by comparing the cross-linker lengths with the distances between the linked @@ -372,34 +494,21 @@ def validate_with_angstrom_deviation( so if the distance between the connected amino acids in AlphaFold is less than (cross-linker length + the upper allowed deviation) and more than (cross-linker length - the lower allowed deviation). If one of the bounds is zero only the other bound will be applied. - :param crosslinking_df: DataFrame containing cross-linking data. 
- :param structure_metadata_df: DataFrame containing metadata - :param crosslinker_information: Contains for each Crosslinker: - - length_of_: float - - lower_accepted_deviation_for_: float - - upper_accepted_deviation_for_: float - :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) - :param amino_acid_sequences_df: Dataframe that contains all amino acid sequences - :return: dict (crosslinking_df_result, messages), crosslinking_df_result contains the relevant rows (rows of intra-crosslinks within the - protein to validate) of crosslinking_df and two more columns containing the distances in AlphaFold and whether the crosslink matches the - AlphaFold data or not + :param relevant_crosslinks_df: DataFrame containing the subset of cross-linking data to validate. + :param crosslinker_information: Dictionary mapping crosslinker names to a list of three floats: + [crosslinker_length, upper_accepted_deviation, lower_accepted_deviation]. + :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms). + :param amino_acid_sequences_df: Dataframe that contains all known amino acid sequences. + :param valid_ids: Dictionary mapping protein IDs to their valid chain/entity identifiers in the CIF data. + :param id_column_name: The column name in the cif_df to use for matching against valid_ids. + :param structures_to_validate: List of protein IDs to validate. + :return: A dictionary containing: + - 'crosslinking_result_df': DataFrame containing the validated rows augmented with alphafold distances, + validation booleans, crosslinker positions, and link types (intra/inter). + - 'messages': List of dictionaries containing log levels and warning/info messages. :raises KeyError: If a required crosslinker field is missing in crosslinker_information. :raises ValueError: If peptide sequences cannot be matched to the protein sequence. 
""" - structures_to_validate = _get_structures_to_validate(structure_metadata_df) - all_crosslinks_df = crosslinking_df.copy() - is_multimer = len(structures_to_validate) > 1 - if not is_multimer: - # we are only interested in intra-crosslinks of the protein we want to validate - mask = (all_crosslinks_df.Protein_id1 == structures_to_validate[0]) & ( - all_crosslinks_df.Protein_id2 == structures_to_validate[0] - ) - else: - mask = (all_crosslinks_df["Protein_id1"].isin(structures_to_validate)) & ( - all_crosslinks_df["Protein_id2"].isin(structures_to_validate) - ) - - relevant_crosslinks_df = all_crosslinks_df[mask].copy() # Check if dataframe is empty if relevant_crosslinks_df.empty: @@ -408,9 +517,12 @@ def validate_with_angstrom_deviation( return dict(crosslinking_result_df=pd.DataFrame(), messages=messages) chains_per_protein = {} - for protein_id in set(structures_to_validate): - chains_per_protein[protein_id] = get_chain_starts( - cif_df=cif_df, protein_id=protein_id + for protein_id in structures_to_validate: + chains_per_protein[protein_id] = get_chains( + cif_df=cif_df, + valid_ids=valid_ids, + protein_id=protein_id, + id_column_name=id_column_name, ) relevant_crosslinks_df = expand_crosslinks_to_chain_combinations( @@ -508,11 +620,9 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: def diagrams_of_crosslinking_validation_data( - crosslinking_df: pd.DataFrame, - structure_metadata_df: pd.DataFrame, - crosslinker_information: dict[str, list[float]], - cif_df: pd.DataFrame, - amino_acid_sequences_df: pd.DataFrame, + validated_df: pd.DataFrame, + structures_to_validate: str, + crosslinker_information: pd.DataFrame, ) -> list[Figure]: """ Creates for each crosslinker histogram plots summarizing the distribution of valid and invalid @@ -543,14 +653,6 @@ def diagrams_of_crosslinking_validation_data( bar plot summarizing valid and invalid cross-links across all crosslinkers. 
:raises KeyError: If a required crosslinker entry is missing in crosslinker_information. """ - structures_to_validate = _get_structures_to_validate(structure_metadata_df) - validated_df = validate_with_angstrom_deviation( - crosslinking_df, - structure_metadata_df, - crosslinker_information, - cif_df, - amino_acid_sequences_df, - )["crosslinking_result_df"] validated_df = validated_df.dropna(subset=["valid_crosslink"]) figures = [] @@ -606,9 +708,9 @@ def diagrams_of_crosslinking_validation_data( ) mean_of_predicted_lengths = crosslinker_df["alphafold_distance"].mean() - standard_deviation_predicted_lengths = crosslinker_df[ - "alphafold_distance" - ].std() + standard_deviation_predicted_lengths = crosslinker_df["alphafold_distance"].std( + ddof=0 + ) mean_plus_two_std = ( mean_of_predicted_lengths + 2 * standard_deviation_predicted_lengths ) @@ -709,3 +811,90 @@ def diagrams_of_crosslinking_validation_data( figures.append(bar_plot_over_all_checked_crosslinks) return figures + + +def monomer_diagrams( + crosslinking_df: pd.DataFrame, + structure_metadata_df: pd.DataFrame, + crosslinker_information: dict[str, list[float]], + cif_df: pd.DataFrame, + amino_acid_sequences_df: pd.DataFrame, +) -> list[Figure]: + """ + Generates visual diagrams to evaluate crosslinking validation results + for a monomeric protein structure. + + This function acts as a wrapper that first runs the crosslink validation + step via `monomer_validation`. It then extracts the resulting dataframe + of validated crosslinks and passes it to the diagram generator to create + the final plots. + + :param crosslinking_df: DataFrame containing the full set of crosslinks. + :param structure_metadata_df: DataFrame containing structural metadata; the + first row's 'uniprot_accession' is used as the target. + :param crosslinker_information: Dictionary mapping crosslinker names to a list of + three floats: [length, upper_bound, lower_bound]. 
+ :param cif_df: DataFrame containing parsed mmCIF structural coordinate data. + :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. + :return: A list of Figure objects visualizing the crosslinking validation data. + """ + structures_to_validate = [structure_metadata_df["uniprot_accession"].iloc[0]] + validated_df = monomer_validation( + crosslinking_df, + structure_metadata_df, + crosslinker_information, + cif_df, + amino_acid_sequences_df, + )["crosslinking_result_df"] + return diagrams_of_crosslinking_validation_data( + validated_df=validated_df, + structures_to_validate=structures_to_validate, + crosslinker_information=crosslinker_information, + ) + + +def multimer_diagrams( + crosslinking_df: pd.DataFrame, + structure_metadata_df: pd.DataFrame, + crosslinker_information: dict[str, list[float]], + cif_df: pd.DataFrame, + amino_acid_sequences_df: pd.DataFrame, + job_request_df: pd.DataFrame, +) -> list[Figure]: + """ + Generates visual diagrams to evaluate crosslinking validation results + for a multimeric protein complex. + + This function parses an AlphaFold job request to determine the valid chain + compositions. It then runs `multimer_validation` to filter and validate + the relevant crosslinks, extracting the result to generate structural + distance and validation plots. + + :param crosslinking_df: DataFrame containing the full set of crosslinks. + :param structure_metadata_df: DataFrame containing structural metadata. + :param crosslinker_information: Dictionary mapping crosslinker names to a list of + three floats: [length, upper_bound, lower_bound]. + :param cif_df: DataFrame containing parsed mmCIF structural coordinate data. + :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. + :param job_request_df: DataFrame containing the loaded AlphaFold job request JSON. + :return: A list of Figure objects visualizing the crosslinking validation data. 
+ """ + valid_ids = get_valid_ids_per_protein_id_from_job_request( + amino_acid_sequences_df=amino_acid_sequences_df, job_request_df=job_request_df + ) + structures_to_validate = list(valid_ids.keys()) + + validated_df = multimer_validation( + crosslinking_df, + structure_metadata_df, + crosslinker_information, + cif_df, + amino_acid_sequences_df, + job_request_df, + )["crosslinking_result_df"] + + return diagrams_of_crosslinking_validation_data( + validated_df=validated_df, + structures_to_validate=structures_to_validate, + crosslinker_information=crosslinker_information, + ) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 10394af1f..643ce1901 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -90,8 +90,10 @@ get_detected_modifications, ) from protzilla.data_analysis.crosslinking_validation import ( - validate_with_angstrom_deviation, - diagrams_of_crosslinking_validation_data, + monomer_diagrams, + multimer_diagrams, + monomer_validation, + multimer_validation, ) from backend.protzilla.run import Run from backend.protzilla.methods.importing import ( @@ -2368,9 +2370,6 @@ class CrosslinkingValidationWithAngstromStep(DataAnalysisStep): output_keys = ["crosslinking_result_df"] internal_inputs = {"crosslinker_information"} - plot_method = staticmethod(diagrams_of_crosslinking_validation_data) - calc_method = staticmethod(validate_with_angstrom_deviation) - def _get_crosslinker_names_from_crosslinker_df( self, steps: StepManager ) -> list[str]: @@ -2432,6 +2431,8 @@ class CrosslinkingValidationWithAngstromDeviation( display_name = "Ångström Deviation For Monomer Structures" operation = "Cross Linking Validation" method_description = "Validates cross links within the one protein structure based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. 
(in Ångström)" + calc_method = staticmethod(monomer_validation) + plot_method = staticmethod(monomer_diagrams) def create_form(self): return Form(label="Ångström Deviation - Monomer", input_fields=[]) @@ -2443,6 +2444,8 @@ class CrosslinkingValidationWithAngstromDeviationForMultimer( display_name = "Ångström Deviation For Multimer Structures" operation = "Cross Linking Validation" method_description = "Validates cross links between proteins based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. (in Ångström)" + calc_method = staticmethod(multimer_validation) + plot_method = staticmethod(multimer_diagrams) def create_form(self): return Form( From d633530e8d2ed729ebc3b4e8194ddd34a039f8f5 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 21 Apr 2026 10:31:39 +0200 Subject: [PATCH 177/240] update and add tests --- .../test_crosslinking_validation.py | 280 +++++++++++++++--- 1 file changed, 235 insertions(+), 45 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 38673b813..e8a094aa5 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -12,6 +12,8 @@ get_distance_between_two_amino_acids_in_angstrom, add_protein_crosslink_positions_to_df, diagrams_of_crosslinking_validation_data, + expand_crosslinks_to_chain_combinations, + get_chains, ) from backend.protzilla.constants.colors import PLOT_PRIMARY_COLOR from backend.protzilla.data_analysis.plots import ( @@ -32,7 +34,7 @@ ], ) def test_validate_with_angstrom_deviation(distance, expected): - # Fake AlphaFold Data + # Fake AlphaFold Data with chain IDs cif_df = pd.DataFrame( { "_atom_site.label_atom_id": ["CA", "CA"], @@ -40,6 +42,8 @@ def test_validate_with_angstrom_deviation(distance, expected): 
"_atom_site.Cartn_x": [0, distance], "_atom_site.Cartn_y": [0, 0], "_atom_site.Cartn_z": [0, 0], + "_atom_site.auth_asym_id": ["A", "A"], + "_atom_site.pdbx_sifts_xref_db_acc": ["P12345", "P12345"], } ) @@ -61,12 +65,17 @@ def test_validate_with_angstrom_deviation(distance, expected): ) crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Länge 5 Å ± 1 Å + valid_ids = {"P12345": ["P12345"]} + structures_to_validate = ["P12345"] + result = validate_with_angstrom_deviation( - crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), + relevant_crosslinks_df=crosslinking_df, crosslinker_information=crosslinker_information, - amino_acid_sequences_df=amino_acid_sequences_df, cif_df=cif_df, + amino_acid_sequences_df=amino_acid_sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.pdbx_sifts_xref_db_acc", + structures_to_validate=structures_to_validate, ) df = result["crosslinking_result_df"] @@ -74,6 +83,8 @@ def test_validate_with_angstrom_deviation(distance, expected): assert "alphafold_distance" in df.columns assert "valid_crosslink" in df.columns assert "link_type" in df.columns + assert "Chain_id1" in df.columns + assert "Chain_id2" in df.columns assert df.loc[0, "alphafold_distance"] == distance assert df.loc[0, "valid_crosslink"] == expected assert df.loc[0, "link_type"] == "intra" @@ -112,10 +123,13 @@ def test_get_distance_between_two_amino_acids_in_angstrom(): "_atom_site.Cartn_x": [0, 3], "_atom_site.Cartn_y": [0, 4], "_atom_site.Cartn_z": [0, 0], + "_atom_site.auth_asym_id": ["A", "A"], } ) - dist = get_distance_between_two_amino_acids_in_angstrom(1, 2, "A", "B", cif_df) + dist = get_distance_between_two_amino_acids_in_angstrom( + 1, 2, "A", "B", cif_df, chain_id1="A", chain_id2="A" + ) assert dist == 5.0 @@ -308,19 +322,25 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): "_atom_site.Cartn_x": [float(i) for i in range(1, 6)], "_atom_site.Cartn_y": [0.0] * 5, "_atom_site.Cartn_z": 
[0.0] * 5, + "_atom_site.auth_asym_id": ["A"] * 5, + "_atom_site.label_entity_id": [1, 1, 2, 2, 3], } ) # Very permissive bounds: always valid as long as distance is defined. # Format is [length, upper_deviation, lower_deviation]. crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + valid_ids = {"P1": [1], "P2": [2]} + structures_to_validate = ["P1", "P2"] out = validate_with_angstrom_deviation( - crosslinking_df=crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + relevant_crosslinks_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.label_entity_id", + structures_to_validate=structures_to_validate, ) result_df = out["crosslinking_result_df"] @@ -338,6 +358,8 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): assert "crosslinker_position1" in result_df.columns assert "crosslinker_position2" in result_df.columns assert "link_type" in result_df.columns + assert "Chain_id1" in result_df.columns + assert "Chain_id2" in result_df.columns def test_validate_multimer_no_links_between_structures_returns_empty_and_warning(): @@ -373,16 +395,22 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning "_atom_site.Cartn_x": [float(i) for i in range(1, 6)], "_atom_site.Cartn_y": [0.0] * 5, "_atom_site.Cartn_z": [0.0] * 5, + "_atom_site.auth_asym_id": ["A"] * 5, + "_atom_site.label_entity_id": [1, 1, 2, 3, 3], } ) crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + valid_ids = {"P1": [1], "P2": [2]} + structures_to_validate = ["P1", "P2"] out = validate_with_angstrom_deviation( - crosslinking_df=crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + relevant_crosslinks_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, + valid_ids=valid_ids, + 
id_column_name="_atom_site.label_entity_id", + structures_to_validate=structures_to_validate, ) result_df = out["crosslinking_result_df"] @@ -431,18 +459,24 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali "_atom_site.Cartn_x": [float(i) for i in range(1, 5)], "_atom_site.Cartn_y": [0.0] * 4, "_atom_site.Cartn_z": [0.0] * 4, + "_atom_site.auth_asym_id": ["A"] * 4, + "_atom_site.label_entity_id": [1, 1, 2, 2], } ) # Always-valid bounds so we focus on duplication and distance computation. crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + valid_ids = {"P1": [1], "P2": [2]} + structures_to_validate = ["P1", "P2"] out = validate_with_angstrom_deviation( - crosslinking_df=crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + relevant_crosslinks_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.label_entity_id", + structures_to_validate=structures_to_validate, ) result_df = out["crosslinking_result_df"] @@ -470,6 +504,8 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali # Check link_type column assert "link_type" in result_df.columns assert result_df["link_type"].isin(["intra", "inter"]).all() + # All links should be inter because they are between different chains + assert all(result_df["link_type"] == "inter") # Expect a duplication warning message. 
assert any( @@ -527,11 +563,7 @@ def sample_crosslinker_info(): @patch( "backend.protzilla.data_analysis.crosslinking_validation.add_vertical_line_with_annotation_in_legend" ) -@patch( - "backend.protzilla.data_analysis.crosslinking_validation.validate_with_angstrom_deviation" -) def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_lines( - mock_validate, mock_add_vline, mock_create_bar, mock_create_hist, @@ -539,7 +571,6 @@ def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_line sample_crosslinker_info, ): validated_df = sample_crosslinking_df.copy() - mock_validate.return_value = {"crosslinking_result_df": validated_df} hist_mock = Figure() mock_create_hist.return_value = hist_mock @@ -547,19 +578,15 @@ def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_line mock_create_bar.return_value = bar_mock figures = diagrams_of_crosslinking_validation_data( - crosslinking_df=sample_crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), + validated_df=validated_df, + structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info, - cif_df=pd.DataFrame(), - amino_acid_sequences_df=pd.DataFrame(), ) # 2 histograms per crosslinker + 1 bar plot assert len(figures) == 5 assert all(isinstance(f, Figure) for f in figures) - mock_validate.assert_called_once() - assert ( mock_add_vline.call_count == 8 ) # for both crosslinkers: 1 call for crosslinker length for each histogram and 1 call for bound on deviation for each histogram @@ -596,11 +623,7 @@ def sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std(): @patch( "backend.protzilla.data_analysis.crosslinking_validation.add_vertical_line_with_annotation_in_legend" ) -@patch( - "backend.protzilla.data_analysis.crosslinking_validation.validate_with_angstrom_deviation" -) def test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_lines( - mock_validate, mock_add_vline, 
mock_create_bar, mock_create_hist, @@ -608,7 +631,6 @@ def test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_l sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, ): validated_df = sample_crosslinking_df_with_no_std.copy() - mock_validate.return_value = {"crosslinking_result_df": validated_df} hist_mock = Figure() mock_create_hist.return_value = hist_mock @@ -616,11 +638,9 @@ def test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_l mock_create_bar.return_value = bar_mock figures = diagrams_of_crosslinking_validation_data( - crosslinking_df=sample_crosslinking_df_with_no_std, - structure_metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), + validated_df=validated_df, + structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info_matching_sample_crosslinking_df_with_no_std, - cif_df=pd.DataFrame(), - amino_acid_sequences_df=pd.DataFrame(), ) # 2 histograms per crosslinker + 1 bar plot @@ -662,8 +682,6 @@ def test_diagrams_calls_with_correct_parameters( sample_crosslinker_info_with_one_crosslinker, ): with patch( - "backend.protzilla.data_analysis.crosslinking_validation.validate_with_angstrom_deviation" - ) as mock_validate, patch( "backend.protzilla.data_analysis.crosslinking_validation.create_histograms" ) as mock_hist, patch( "backend.protzilla.data_analysis.crosslinking_validation.add_vertical_line_with_annotation_in_legend" @@ -671,23 +689,15 @@ def test_diagrams_calls_with_correct_parameters( "backend.protzilla.data_analysis.crosslinking_validation.create_bar_plot" ) as mock_bar: - mock_validate.return_value = { - "crosslinking_result_df": sample_crosslinking_df_with_one_crosslinker - } - mock_hist.side_effect = lambda **kwargs: f"hist_{kwargs['heading']}" mock_bar.return_value = "bar_fig" figures = diagrams_of_crosslinking_validation_data( - crosslinking_df=sample_crosslinking_df_with_one_crosslinker, - structure_metadata_df=pd.DataFrame({"uniprot_accession": 
["P12345"]}), + validated_df=sample_crosslinking_df_with_one_crosslinker, + structures_to_validate=["P12345"], crosslinker_information=sample_crosslinker_info_with_one_crosslinker, - cif_df=pd.DataFrame(), - amino_acid_sequences_df=pd.DataFrame(), ) - mock_validate.assert_called_once() - # There should be 2 histogram calls: 2 per crosslinker assert mock_hist.call_count == 2 @@ -782,19 +792,25 @@ def test_validate_multimer_with_invalid_crosslinks(): "_atom_site.Cartn_x": [1.0, 2.0, 3.0, 4.0], "_atom_site.Cartn_y": [0.0, 0.0, 0.0, 0.0], "_atom_site.Cartn_z": [0.0, 0.0, 0.0, 0.0], + "_atom_site.auth_asym_id": ["A"] * 4, + "_atom_site.label_entity_id": [1, 1, 2, 2], } ) # length = 1.5, upper_dev = 0.6, lower_dev = 0.6. # Distances will be [0.0, 0.0, 2.0, 2.0] -> two valid (2.0) and two invalid (0.0). crosslinker_information = {"XL": [1.5, 0.6, 0.6]} + valid_ids = {"P1": [1], "P2": [2]} + structures_to_validate = ["P1", "P2"] out = validate_with_angstrom_deviation( - crosslinking_df=crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_ids": ["['P1', 'P2']"]}), + relevant_crosslinks_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.label_entity_id", + structures_to_validate=structures_to_validate, ) result_df = out["crosslinking_result_df"] @@ -815,3 +831,177 @@ def test_validate_multimer_with_invalid_crosslinks(): ) assert valid_distances == [2.0, 2.0] assert "link_type" in result_df.columns + + +def test_get_chains(): + """Test that get_chains extracts chain IDs correctly from CIF data.""" + cif_df = pd.DataFrame( + { + "_atom_site.label_seq_id": [1, 2, 3, 4, 5], + "_atom_site.auth_asym_id": ["A", "A", "B", "B", "B"], + "_atom_site.label_entity_id": [1, 1, 2, 2, 2], + } + ) + + valid_ids = {"P1": [1], "P2": [2]} + + chains_p1 = get_chains( + cif_df=cif_df, + valid_ids=valid_ids, + protein_id="P1", + 
id_column_name="_atom_site.label_entity_id", + ) + + chains_p2 = get_chains( + cif_df=cif_df, + valid_ids=valid_ids, + protein_id="P2", + id_column_name="_atom_site.label_entity_id", + ) + + assert set(chains_p1) == {"A"} + assert set(chains_p2) == {"B"} + + +def test_expand_crosslinks_to_chain_combinations_homodimer(): + """Test expanding crosslinks for homodimer (same protein twice).""" + crosslinking_df = pd.DataFrame( + [ + ("P1", "P1", "AB", "CD", 0, 0, "XL"), + ], + columns=[ + "Protein_id1", + "Protein_id2", + "Peptide1", + "Peptide2", + "CL_position_within_peptide1", + "CL_position_within_peptide2", + "Crosslinker", + ], + ) + + chains_per_protein = {"P1": {"A": None, "B": None}} + + expanded_df = expand_crosslinks_to_chain_combinations( + crosslinking_df, chains_per_protein + ) + + # For homodimer with 2 chains: combinations with replacement should give us: + # (A,A), (A,B), (B,B) = 3 combinations + assert len(expanded_df) == 3 + assert "Chain_id1" in expanded_df.columns + assert "Chain_id2" in expanded_df.columns + + chain_combos = set( + zip(expanded_df["Chain_id1"].tolist(), expanded_df["Chain_id2"].tolist()) + ) + assert chain_combos == {("A", "A"), ("A", "B"), ("B", "B")} + + +def test_expand_crosslinks_to_chain_combinations_heterodimer(): + """Test expanding crosslinks for heterodimer (different proteins).""" + crosslinking_df = pd.DataFrame( + [ + ("P1", "P2", "AB", "CD", 0, 0, "XL"), + ], + columns=[ + "Protein_id1", + "Protein_id2", + "Peptide1", + "Peptide2", + "CL_position_within_peptide1", + "CL_position_within_peptide2", + "Crosslinker", + ], + ) + + chains_per_protein = {"P1": {"A": None}, "P2": {"C": None, "D": None}} + + expanded_df = expand_crosslinks_to_chain_combinations( + crosslinking_df, chains_per_protein + ) + + # For heterodimer: product of {A} x {C, D} = 2 combinations + assert len(expanded_df) == 2 + + chain_combos = set( + zip(expanded_df["Chain_id1"].tolist(), expanded_df["Chain_id2"].tolist()) + ) + assert chain_combos == 
{("A", "C"), ("A", "D")} + + +def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): + """Test that intra/inter link_type is determined by chain ID, not protein ID.""" + sequences_df = pd.DataFrame( + [ + ("P1-1", "ABAB"), + ], + columns=["Protein ID", "Protein Sequence"], + ) + + # Single protein P1 with two copies in multimer (P1 appears twice as different chains) + crosslinking_df = pd.DataFrame( + [ + ("P1", "P1", "AB", "AB", 0, 0, "XL"), + ], + columns=[ + "Protein_id1", + "Protein_id2", + "Peptide1", + "Peptide2", + "CL_position_within_peptide1", + "CL_position_within_peptide2", + "Crosslinker", + ], + ) + + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA"] * 4, + "_atom_site.label_seq_id": list(range(1, 5)), + "_atom_site.Cartn_x": [float(i) for i in range(1, 5)], + "_atom_site.Cartn_y": [0.0] * 4, + "_atom_site.Cartn_z": [0.0] * 4, + "_atom_site.auth_asym_id": ["A", "A", "B", "B"], + "_atom_site.label_entity_id": [1, 1, 1, 1], # All same protein ID in entity + } + ) + + crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + valid_ids = {"P1": [1]} # One protein ID, but present in chains A and B + structures_to_validate = ["P1"] + + out = validate_with_angstrom_deviation( + relevant_crosslinks_df=crosslinking_df, + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.label_entity_id", + structures_to_validate=structures_to_validate, + ) + + result_df = out["crosslinking_result_df"] + + # Should have 3 combinations: (A,A), (A,B), (B,B) + assert len(result_df) == 3 + + # Check link types based on chain IDs + intra_links = result_df[result_df["link_type"] == "intra"] + inter_links = result_df[result_df["link_type"] == "inter"] + + # (A,A) and (B,B) should be intra (same chain) + assert len(intra_links) == 2 + # (A,B) should be inter (different chains) + assert len(inter_links) == 1 + + # Verify the specific chain combinations + 
intra_combos = set( + zip(intra_links["Chain_id1"].tolist(), intra_links["Chain_id2"].tolist()) + ) + assert intra_combos == {("A", "A"), ("B", "B")} + + inter_combos = set( + zip(inter_links["Chain_id1"].tolist(), inter_links["Chain_id2"].tolist()) + ) + assert inter_combos == {("A", "B")} From 214adba6517edffd8b77f07baf2deedc52eddb95 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 21 Apr 2026 11:01:58 +0200 Subject: [PATCH 178/240] fix two bugs and the tests --- .../data_analysis/crosslinking_validation.py | 39 +++++++--------- .../test_crosslinking_validation.py | 44 +++++++++---------- 2 files changed, 38 insertions(+), 45 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index f5745f660..624979e80 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -308,11 +308,8 @@ def expand_crosslinks_to_chain_combinations( protein_id1 = crosslink["Protein_id1"] protein_id2 = crosslink["Protein_id2"] - chains_protein1 = chains_per_protein[protein_id1] - chains_protein2 = chains_per_protein[protein_id2] - - chain_ids1 = list(chains_protein1.keys()) - chain_ids2 = list(chains_protein2.keys()) + chain_ids1 = chains_per_protein[protein_id1] + chain_ids2 = chains_per_protein[protein_id2] if not chain_ids1 or not chain_ids2: continue @@ -357,16 +354,10 @@ def monomer_validation( :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. :return: A dictionary containing the validation results and distance metrics. 
""" - all_crosslinks_df = crosslinking_df.copy() protein_id = structure_metadata_df["uniprot_accession"].iloc[0] - # we are only interested in intra-crosslinks of the protein we want to validate - mask = (all_crosslinks_df["Protein_id1"] == protein_id) & ( - all_crosslinks_df["Protein_id2"] == protein_id - ) valid_ids = {protein_id: [protein_id]} - relevant_crosslinks_df = all_crosslinks_df[mask] return validate_with_angstrom_deviation( - relevant_crosslinks_df=relevant_crosslinks_df, + crosslinking_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=amino_acid_sequences_df, @@ -462,14 +453,9 @@ def multimer_validation( amino_acid_sequences_df=amino_acid_sequences_df, job_request_df=job_request_df ) structures_to_validate = list(valid_ids.keys()) - all_crosslinks_df = crosslinking_df.copy() - mask = (all_crosslinks_df["Protein_id1"].isin(structures_to_validate)) & ( - all_crosslinks_df["Protein_id2"].isin(structures_to_validate) - ) - relevant_crosslinks_df = all_crosslinks_df[mask] return validate_with_angstrom_deviation( - relevant_crosslinks_df=relevant_crosslinks_df, + crosslinking_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=amino_acid_sequences_df, @@ -480,7 +466,7 @@ def multimer_validation( def validate_with_angstrom_deviation( - relevant_crosslinks_df: pd.DataFrame, + crosslinking_df: pd.DataFrame, crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, @@ -494,7 +480,7 @@ def validate_with_angstrom_deviation( so if the distance between the connected amino acids in AlphaFold is less than (cross-linker length + the upper allowed deviation) and more than (cross-linker length - the lower allowed deviation). If one of the bounds is zero only the other bound will be applied. - :param relevant_crosslinks_df: DataFrame containing the subset of cross-linking data to validate. 
+ :param crosslinking_df: DataFrame containing the cross-linking data to validate. :param crosslinker_information: Dictionary mapping crosslinker names to a list of three floats: [crosslinker_length, upper_accepted_deviation, lower_accepted_deviation]. :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms). @@ -510,6 +496,12 @@ def validate_with_angstrom_deviation( :raises ValueError: If peptide sequences cannot be matched to the protein sequence. """ + all_crosslinks_df = crosslinking_df.copy() + mask = (all_crosslinks_df["Protein_id1"].isin(structures_to_validate)) & ( + all_crosslinks_df["Protein_id2"].isin(structures_to_validate) + ) + relevant_crosslinks_df = all_crosslinks_df[mask] + # Check if dataframe is empty if relevant_crosslinks_df.empty: msg = "There are no cross links between the structures to validate." @@ -708,9 +700,10 @@ def diagrams_of_crosslinking_validation_data( ) mean_of_predicted_lengths = crosslinker_df["alphafold_distance"].mean() - standard_deviation_predicted_lengths = crosslinker_df["alphafold_distance"].std( - ddof=0 - ) + if len(crosslinker_df) == 1: + standard_deviation_predicted_lengths = 0.0 + else: + standard_deviation_predicted_lengths = crosslinker_df["alphafold_distance"].std() mean_plus_two_std = ( mean_of_predicted_lengths + 2 * standard_deviation_predicted_lengths ) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index e8a094aa5..269e93862 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -69,7 +69,7 @@ def test_validate_with_angstrom_deviation(distance, expected): structures_to_validate = ["P12345"] result = validate_with_angstrom_deviation( - relevant_crosslinks_df=crosslinking_df, + crosslinking_df=crosslinking_df, 
crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=amino_acid_sequences_df, @@ -334,7 +334,7 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): structures_to_validate = ["P1", "P2"] out = validate_with_angstrom_deviation( - relevant_crosslinks_df=crosslinking_df, + crosslinking_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -404,7 +404,7 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning structures_to_validate = ["P1", "P2"] out = validate_with_angstrom_deviation( - relevant_crosslinks_df=crosslinking_df, + crosslinking_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -452,15 +452,15 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali ], ) - cif_df = cif_df = pd.DataFrame( + cif_df = pd.DataFrame( { - "_atom_site.label_atom_id": ["CA"] * 4, - "_atom_site.label_seq_id": list(range(1, 5)), - "_atom_site.Cartn_x": [float(i) for i in range(1, 5)], - "_atom_site.Cartn_y": [0.0] * 4, - "_atom_site.Cartn_z": [0.0] * 4, - "_atom_site.auth_asym_id": ["A"] * 4, - "_atom_site.label_entity_id": [1, 1, 2, 2], + "_atom_site.label_atom_id": ["CA"] * 8, + "_atom_site.label_seq_id": [1, 2, 3, 4, 1, 2, 3, 4], + "_atom_site.Cartn_x": [1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0], + "_atom_site.Cartn_y": [0.0] * 8, + "_atom_site.Cartn_z": [0.0] * 8, + "_atom_site.auth_asym_id": ["A", "A", "A", "A", "B", "B", "B", "B"], + "_atom_site.label_entity_id": [1, 1, 1, 1, 2, 2, 2, 2], } ) @@ -470,7 +470,7 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali structures_to_validate = ["P1", "P2"] out = validate_with_angstrom_deviation( - relevant_crosslinks_df=crosslinking_df, + crosslinking_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, 
amino_acid_sequences_df=sequences_df, @@ -804,7 +804,7 @@ def test_validate_multimer_with_invalid_crosslinks(): structures_to_validate = ["P1", "P2"] out = validate_with_angstrom_deviation( - relevant_crosslinks_df=crosslinking_df, + crosslinking_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -934,7 +934,7 @@ def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): """Test that intra/inter link_type is determined by chain ID, not protein ID.""" sequences_df = pd.DataFrame( [ - ("P1-1", "ABAB"), + ("P1-1", "ABCD"), ], columns=["Protein ID", "Protein Sequence"], ) @@ -957,13 +957,13 @@ def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): cif_df = pd.DataFrame( { - "_atom_site.label_atom_id": ["CA"] * 4, - "_atom_site.label_seq_id": list(range(1, 5)), - "_atom_site.Cartn_x": [float(i) for i in range(1, 5)], - "_atom_site.Cartn_y": [0.0] * 4, - "_atom_site.Cartn_z": [0.0] * 4, - "_atom_site.auth_asym_id": ["A", "A", "B", "B"], - "_atom_site.label_entity_id": [1, 1, 1, 1], # All same protein ID in entity + "_atom_site.label_atom_id": ["CA"] * 8, + "_atom_site.label_seq_id": [1, 2, 3, 4, 1, 2, 3, 4], + "_atom_site.Cartn_x": [1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0], + "_atom_site.Cartn_y": [0.0] * 8, + "_atom_site.Cartn_z": [0.0] * 8, + "_atom_site.auth_asym_id": ["A", "A", "A", "A", "B", "B", "B", "B"], + "_atom_site.label_entity_id": [1, 1, 1, 1, 1, 1, 1, 1], } ) @@ -972,7 +972,7 @@ def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): structures_to_validate = ["P1"] out = validate_with_angstrom_deviation( - relevant_crosslinks_df=crosslinking_df, + crosslinking_df=crosslinking_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, From 0bfe813073b9062860605fd9976c54fe0ee8b51c Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 21 
Apr 2026 13:46:44 +0200 Subject: [PATCH 179/240] feat: add first draft for visualization (#268) * feat: add first draft for visualiszation * feat: add visualization_method to steps and a visualization tab in frontend * fix: remove testing code * feat: prepare display of visualization without connection to molstar component * feat: add more functionality for visualization * feat: add more functionality to visualization component * refactor: limit possibilities for visualization types * feat: mvp of visualizations is finally working nowgit statusgit statusgit status * refactor: some minor changes for prettier code * feat: add new dependencies for docker * fix: remove erroneous docker volume * feat: fix docker installations * feat: add some styling * feat: store cif_df instead of url to cif * refactor: store dataframe and convert back to cif for visualization * feat: decode cif string for visualization from cif_df * feat: add visualization method for crosslinking validation * refactor: some code cleanup * feat: add first draft for crosslinker visualization * fix: revert back to visualization without crosslinks due to major issues * feat: add crosslinker visualization * refactor: some code cleanup * feat: add base for color-coded distinction of crosslinks * feat: crosslinker to be displayed in different colors depending on isIntra and isValid * refactor: spread code of new component over several files for better structure * refactor: add broken refactoring of visualization outputs * fix: fix broken visualization output refactoring and remove unused parts of the first visualization output attempt * feat: readd visualization to importing steps * refactor: some code cleanup of what was left from merging and refactoring * feat: adapt for multimer use * feat: connect errors of new component to system's error system * refactor: (hopefully) make frontend linter happy * refactor: create a config file for constants in the new component * refactor: replace magic numbers in 
styling for something more relative * refactor: fix styling * feat: extend visualization of crosslinks so it can be used with multimers in the future * feat: add lazy loading of visualization, to ensure proper usability of runs with visualizations * feat: enable correct multimer inter crosslink display * fix: some code formatting * fix: fix tests * fix: some linter cleanup * feat: better visualization serialisation/deserialisation in the disk operator * fix: change external facing messages to british english * fix: backend formatting * fix: visualization serialisation/deserialisation in the disk operator * refactor: some minor changes for better code * fix: remove code for debugging * refactor: clean use of entry ids * fix: fix tests --------- Co-authored-by: Tarek Massini Co-authored-by: Anna Polensky --- backend/main/urls.py | 5 + backend/main/views.py | 33 + backend/main/views_helper.py | 129 + .../data_analysis/crosslinking_validation.py | 29 +- backend/protzilla/disk_operator.py | 19 + .../alphafold_protein_structure_load.py | 56 +- backend/protzilla/steps.py | 1 + .../test_crosslinking_validation.py | 30 +- .../test_alphafold_protein_structure_load.py | 1 + frontend/package-lock.json | 3687 ++++++++++++++++- frontend/package.json | 4 +- frontend/pnpm-lock.yaml | 3002 ++++++++++++-- .../components/app/run-screen/run-screen.tsx | 100 +- frontend/src/components/core/index.ts | 1 + frontend/src/components/core/shared/index.ts | 1 + .../molstar-viewer/crosslinker-processing.tsx | 244 ++ .../core/shared/molstar-viewer/index.ts | 2 + .../molstar-viewer/molstar-viewer.config.ts | 8 + .../molstar-viewer/molstar-viewer.props.tsx | 6 + .../molstar-viewer/molstar-viewer.service.ts | 64 + .../shared/molstar-viewer/molstar-viewer.tsx | 81 + .../core/shared/molstar-viewer/styles.ts | 250 ++ frontend/src/utils/protzilla-types.ts | 14 + 23 files changed, 7321 insertions(+), 446 deletions(-) create mode 100644 
frontend/src/components/core/shared/molstar-viewer/crosslinker-processing.tsx create mode 100644 frontend/src/components/core/shared/molstar-viewer/index.ts create mode 100644 frontend/src/components/core/shared/molstar-viewer/molstar-viewer.config.ts create mode 100644 frontend/src/components/core/shared/molstar-viewer/molstar-viewer.props.tsx create mode 100644 frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts create mode 100644 frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx create mode 100644 frontend/src/components/core/shared/molstar-viewer/styles.ts diff --git a/backend/main/urls.py b/backend/main/urls.py index 5f1ae3264..51a184285 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -48,6 +48,11 @@ path("api/save_workflow/", views.save_workflow, name="save_workflow"), path("api/get_step_form/", views.get_step_form, name="get_step_form"), path("api/get_step_plots/", views.get_step_plots, name="get_step_plots"), + path( + "api/get_step_visualizations/", + views.get_step_visualizations, + name="get_step_visualizations", + ), path( "api/get_downloads_from_step/", views.get_downloads_from_step, diff --git a/backend/main/views.py b/backend/main/views.py index 7c5d3ce2a..ec5ac6925 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -3,6 +3,7 @@ import traceback from zipfile import ZipFile import re +import traceback import logging from typing import Any @@ -42,6 +43,7 @@ get_displayed_steps, parameters_from_post, sanitize_name, + create_visualization, ) from backend.protzilla.all_steps import get_all_possible_steps @@ -715,6 +717,37 @@ def get_downloads_from_step(request: HttpRequest): ) +def get_step_visualizations(request): + if request.method == "POST": + data = json.loads(request.body) + run_name = data.get("run_name") + step_id = data.get("step_id") + output_key = data.get("output_key") + + run = Run(run_name) + step = run.steps.get_step_by_id(step_id) + visualization_dict = 
step.output.get(output_key) + if visualization_dict is None: + return JsonResponse( + { + "success": False, + "message": "Got no available visualization for the step", + "data": {}, + } + ) + return JsonResponse( + { + "success": True, + "message": "Got the available visualization for the step", + "data": create_visualization(**visualization_dict), + } + ) + else: + return JsonResponse( + {"success": False, "message": "Invalid request method"}, status=405 + ) + + # TODO: Move somewhere else def _step_output_as_serialised_table( label: str, _data: pd.DataFrame | Any, index_delims: tuple[int, int] = (None, None) diff --git a/backend/main/views_helper.py b/backend/main/views_helper.py index 506e087a1..e7f6f7c0b 100644 --- a/backend/main/views_helper.py +++ b/backend/main/views_helper.py @@ -2,6 +2,8 @@ from pathlib import Path import numpy as np +import pandas as pd +from typing import Optional, List, Dict from backend.protzilla.constants.paths import SETTINGS_PATH from backend.protzilla.disk_operator import YamlOperator @@ -176,3 +178,130 @@ def load_yaml_from_file(path: Path) -> str: raise FileNotFoundError(f"File {path} does not exist.") with path.open("r") as f: return f.read() + + +# ------------------------- helper for get_step_visualization: ------------------------- + + +def create_visualization( + cif_df: pd.DataFrame, + structure_entry_id: str, + crosslinking_df: Optional[pd.DataFrame] = None, +) -> dict: + """ + Create visualization data, by packaging a mmCIF string (converted from a CIF DataFrame) with its structure entry ID. + Optionally include crosslinks. + + :param cif_df: DataFrame containing mmCIF atom_site information. + :param structure_entry_id: Protein identifier to include in the mmCIF header. + :param crosslinking_df: Optional DataFrame containing crosslink positions. 
+ :return: Dictionary containing: + - "structureEntryId" (str) + - "cifString" (str) + - "crosslinks" (optional, list of dicts) + """ + try: + cif_string = convert_cif_df_to_mmcif_for_visualization( + cif_df, structure_entry_id + ) + except (ValueError, TypeError): + cif_string = "" + + result = {"structureEntryId": structure_entry_id, "cifString": cif_string} + + if crosslinking_df is not None: + result["crosslinks"] = extract_relevant_crosslink_information(crosslinking_df) + + return result + + +def convert_cif_df_to_mmcif_for_visualization( + cif_df: pd.DataFrame, structure_entry_id: str +) -> str: + """ + Convert a DataFrame containing mmCIF atom_site information back into a mmCIF string. + + :param cif_df: DataFrame with CIF columns + :param structure_entry_id: Optional entry ID for the CIF block + :return: A string representing the mmCIF file + """ + if cif_df is None or cif_df.empty: + raise ValueError("CIF-DataFrame is empty, cannot create mmCIF content.") + + lines = [ + f"data_{structure_entry_id}", + "#", + f"_entry.id {structure_entry_id}", + "#", + "loop_", + ] + + for column in cif_df.columns: + lines.append(column) + + for _, row in cif_df.iterrows(): + row_items = [] + for column in cif_df.columns: + value = row[column] + if value is None: + value_str = "." + else: + value_str = str(value) + if " " in value_str or any(char in value_str for char in "();,"): + value_str = f"'{value_str}'" + row_items.append(value_str) + lines.append(" ".join(row_items)) + + cif_string = "\n".join(lines) + return cif_string + + +def extract_relevant_crosslink_information( + crosslinking_df: pd.DataFrame, +) -> List[Dict[str, int]]: + """ + For each crosslink extract its relevant information from a DataFrame. + This includes information on where the crosslinker binds on both its ends, + such as the chain and the absolute crosslinker position within the chain. + As well as a boolean for its validity and wether it is an intra or inter crosslink. 
+ + :param crosslinking_df: DataFrame with columns + 'crosslinker_position1', + 'crosslinker_position2', + 'chain_id1', + 'chain_id2', + 'valid_crosslink', + 'Is_intra_crosslink', + :return: List of dicts with keys + 'crosslinkerPosition1', + 'crosslinkerPosition2', + 'chainId1', + 'chainId2', + 'isValid', + 'isIntraCrosslink', + """ + crosslinks = [] + for _, row in crosslinking_df.iterrows(): + position1 = row.get("crosslinker_position1") + position2 = row.get("crosslinker_position2") + # When the validation is extended to treat multimeres with more than one chain correctly, + # it should ideally store chain_id1 and chain_id2 into the crosslinking_df. + # Since we already need those chain ids to calculate correct distances in the validation, + # it would be unnecessary to determine those again in the visualization. + # Therefore we use placeholders for now and need to change the following, when the validation is extended: + chain_id1 = "A" # row.get("chain_id1") + chain_id2 = "A" # row.get("chain_id2") + is_valid = row.get("valid_crosslink") + is_intra_crosslink = row.get("Is_intra_crosslink") + if pd.notnull(position1) and pd.notnull(position2) and pd.notnull(is_valid): + crosslinks.append( + { + "crosslinkerPosition1": int(position1), + "crosslinkerPosition2": int(position2), + "chainId1": str(chain_id1), + "chainId2": str(chain_id2), + "isValid": bool(is_valid), + "isIntraCrosslink": bool(is_intra_crosslink), + } + ) + return crosslinks diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index eb4e4f618..00f28cc01 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -13,7 +13,10 @@ create_histograms, create_bar_plot, ) -from protzilla.data_analysis.plots import add_vertical_line_with_annotation_in_legend +from backend.protzilla.data_analysis.plots import ( + 
add_vertical_line_with_annotation_in_legend, +) +from backend.protzilla.steps import OutputItem, OutputType def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: @@ -27,7 +30,7 @@ def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: """ # right now we always return the central C atom # later we might want to return the reactive atom of the amino acid residue of the specific amino acid type - # as soon as we change this, we will need to change the test test_validate_with_angstrom_deviation + # as soon as we change this, we will need to change the test test_validate_with_angstrom_deviation (and the visualization) return "CA" @@ -268,6 +271,13 @@ def _get_structures_to_validate(structure_metadata_df: pd.DataFrame) -> list[str raise ValueError("Metadata must contain 'uniprot_ids' or 'uniprot_accession'.") +def _get_structure_entry_id(structure_metadata_df: pd.DataFrame) -> list[str]: + if "entry_id" in structure_metadata_df.columns: + return structure_metadata_df["entry_id"].iloc[0] + else: + raise ValueError("Metadata must contain 'entry_id'.") + + def validate_with_angstrom_deviation( crosslinking_df: pd.DataFrame, structure_metadata_df: pd.DataFrame, @@ -399,7 +409,20 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: axis=1, ) - return dict(crosslinking_result_df=checked_crosslinks_df, messages=messages) + structure_entry_id = _get_structure_entry_id(structure_metadata_df) + data_for_visualization = { + "structure_entry_id": structure_entry_id, + "cif_df": cif_df, + "crosslinking_df": checked_crosslinks_df, + } + + return dict( + crosslinking_result_df=checked_crosslinks_df, + messages=messages, + visualization=OutputItem( + output_type=OutputType.VISUALIZATION, value=data_for_visualization + ), + ) def diagrams_of_crosslinking_validation_data( diff --git a/backend/protzilla/disk_operator.py b/backend/protzilla/disk_operator.py index 235b99447..675c07367 100644 --- a/backend/protzilla/disk_operator.py +++ 
b/backend/protzilla/disk_operator.py @@ -426,6 +426,12 @@ def _read_outputs(self, _output: dict[str, OutputItem]) -> Output: output_type=OutputType.PNG_BASE64, value=self.base64_operator.read(self.run_dir / path), ) + case OutputType.VISUALIZATION: + path = Path(str(item.value)) + step_output[key] = OutputItem( + output_type=OutputType.VISUALIZATION, + value=self.artifact_operator.read(self.run_dir / path), + ) case _: step_output[key] = item @@ -478,6 +484,19 @@ def _write_output(self, step: Step) -> dict: output_type=OutputType.PNG_BASE64, value=str(file_path.relative_to(self.run_dir)), ) + case OutputType.VISUALIZATION: + file_path = ( + self.artifact_dir + / f"{step.instance_identifier}_{key}_visualization.joblib.gz" + ) + + if self._dump_is_outdated(step, "output"): + self.artifact_operator.write(file_path, item.value) + + output_data[key] = OutputItem( + output_type=OutputType.VISUALIZATION, + value=str(file_path.relative_to(self.run_dir)), + ) case _: output_data[key] = item diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index d56d1226b..32151e750 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -19,6 +19,7 @@ from backend.protzilla.importing.fasta_import import fasta_import from backend.protzilla.networking import download_file_from_url from backend.protzilla.utilities.utilities import copy_file_to_directory +from backend.protzilla.steps import OutputItem, OutputType def get_monomer_metadata_df() -> pd.DataFrame: @@ -413,14 +414,25 @@ def fetch_alphafold_protein_structure( success_msg = f"Successfully loaded AlphaFold data for protein with Protein ID '{uniprot_id}'" logger.info(success_msg) messages.append(dict(level=logging.INFO, msg=success_msg)) + data_for_visualization = { + "structure_entry_id": uniprot_id, + "cif_df": alpha_dfs["cif_df"], + } else: message = 
( f"Could not load AlphaFold data for protein with Protein ID '{uniprot_id}'" ) logger.warning(message) messages.append(dict(level=logging.WARNING, msg=message)) - df_dict["messages"] = messages - return df_dict + data_for_visualization = None + + return dict( + **df_dict, + messages=messages, + visualization=OutputItem( + output_type=OutputType.VISUALIZATION, value=data_for_visualization + ), + ) def get_all_available_entry_ids_of_monomer_metadata() -> list[str]: @@ -690,8 +702,17 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: "amino_acid_sequences_df": amino_acid_sequences_df, } check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) - df_dict["messages"] = messages - return df_dict + data_for_visualization = { + "structure_entry_id": entry_id, + "cif_df": cif_df, + } + return dict( + **df_dict, + messages=messages, + visualization=OutputItem( + output_type=OutputType.VISUALIZATION, value=data_for_visualization + ), + ) def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: @@ -765,8 +786,17 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: "full_data_df": full_data_df, } check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) - df_dict["messages"] = messages - return df_dict + data_for_visualization = { + "structure_entry_id": entry_id, + "cif_df": cif_df, + } + return dict( + **df_dict, + messages=messages, + visualization=OutputItem( + output_type=OutputType.VISUALIZATION, value=data_for_visualization + ), + ) def upload_multimer_prediction( @@ -897,14 +927,24 @@ def upload_multimer_prediction( success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" logger.info(success_msg) messages.append(dict(level=logging.INFO, msg=success_msg)) + data_for_visualization = { + "structure_entry_id": entry_id, + "cif_df": cif_df, + } else: message = f"Could not load AlphaFold data for entry '{entry_id}'" logger.warning(message) 
messages.append(dict(level=logging.WARNING, msg=message)) - df_dict["messages"] = messages + data_for_visualization = None finally: if temp_dir is not None: shutil.rmtree(temp_dir, ignore_errors=True) - return df_dict + return dict( + **df_dict, + messages=messages, + visualization=OutputItem( + output_type=OutputType.VISUALIZATION, value=data_for_visualization + ), + ) diff --git a/backend/protzilla/steps.py b/backend/protzilla/steps.py index a7fb93ee2..eb69b4cb9 100644 --- a/backend/protzilla/steps.py +++ b/backend/protzilla/steps.py @@ -475,6 +475,7 @@ class OutputType(StrEnum): INT = "int" PNG_BASE64 = "png_base64" DOWNLOAD = "download" # right now only JSONs are supported, value should be dict(filename, json content) + VISUALIZATION = "visualization" # for every data type that is not yaml serializable JOBLIB_ARTIFACT = "joblib_artifact" diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 38673b813..7599155c2 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -60,10 +60,14 @@ def test_validate_with_angstrom_deviation(distance, expected): } ) + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_accession": ["P12345"]} + ) + crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Länge 5 Å ± 1 Å result = validate_with_angstrom_deviation( crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_accession": ["P12345"]}), + structure_metadata_df=structure_metadata_df, crosslinker_information=crosslinker_information, amino_acid_sequences_df=amino_acid_sequences_df, cif_df=cif_df, @@ -315,9 +319,13 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): # Format is [length, upper_deviation, lower_deviation]. 
crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_ids": [["P1", "P2"]]} + ) + out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + structure_metadata_df=structure_metadata_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -377,9 +385,13 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning ) crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_ids": [["P1", "P2"]]} + ) + out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + structure_metadata_df=structure_metadata_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -437,9 +449,13 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali # Always-valid bounds so we focus on duplication and distance computation. crosslinker_information = {"XL": [0.0, 0.0, 0.0]} + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_ids": [["P1", "P2"]]} + ) + out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_ids": [["P1", "P2"]]}), + structure_metadata_df=structure_metadata_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, @@ -789,9 +805,13 @@ def test_validate_multimer_with_invalid_crosslinks(): # Distances will be [0.0, 0.0, 2.0, 2.0] -> two valid (2.0) and two invalid (0.0). 
crosslinker_information = {"XL": [1.5, 0.6, 0.6]} + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_ids": [["P1", "P2"]]} + ) + out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, - structure_metadata_df=pd.DataFrame({"uniprot_ids": ["['P1', 'P2']"]}), + structure_metadata_df=structure_metadata_df, crosslinker_information=crosslinker_information, cif_df=cif_df, amino_acid_sequences_df=sequences_df, diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index b0bf84f3e..3a52fecf6 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -133,6 +133,7 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): "plddt_df", "amino_acid_sequences_df", "messages", + "visualization", } diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 0ea3e32fd..1cbe87bc5 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -8,6 +8,7 @@ "name": "frontend", "version": "0.0.0", "dependencies": { + "@dagrejs/dagre": "^2.0.4", "@emotion/react": "^11.14.0", "@emotion/styled": "^11.14.0", "@mui/material": "^6.4.12", @@ -15,15 +16,17 @@ "@mui/x-data-grid": "^7.29.6", "@storybook/addon-actions": "^8.6.14", "@types/node": "^24.3.1", + "@xyflow/react": "^12.10.0", "axios": "^1.9.0", "bootstrap": "^5.3.6", - "corepack": "^0.34.0", + "corepack": "^0.34.6", "fast-deep-equal": "^3.1.3", "file-saver": "^2.0.5", "framer-motion": "^12.17.0", "lodash.merge": "^4.6.2", "mobx": "^6.13.7", "mobx-react-lite": "^4.1.0", + "molstar": "^5.7.0", "moment": "^2.30.1", "plotly.js": "^3.0.1", "plotly.js-dist-min": "^3.0.1", @@ -67,6 +70,7 @@ "jsdom": "^26.1.0", "lint-staged": "^15.5.2", "prettier": "^3.5.3", + "sass-embedded": "^1.98.0", "storybook": "^8.6.14", "typescript": "~5.6.3", 
"typescript-eslint": "^8.34.0", @@ -482,6 +486,13 @@ "node": ">=6.9.0" } }, + "node_modules/@bufbuild/protobuf": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.11.0.tgz", + "integrity": "sha512-sBXGT13cpmPR5BMgHE6UEEfEaShh5Ror6rfN3yEK5si7QVrtZg8LEPQb0VVhiLRUslD2yLnXtnRzG035J/mZXQ==", + "dev": true, + "license": "(Apache-2.0 AND BSD-3-Clause)" + }, "node_modules/@choojs/findup": { "version": "0.2.1", "resolved": "https://registry.npmjs.org/@choojs/findup/-/findup-0.2.1.tgz", @@ -628,6 +639,21 @@ "node": ">=18" } }, + "node_modules/@dagrejs/dagre": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@dagrejs/dagre/-/dagre-2.0.4.tgz", + "integrity": "sha512-J6vCWTNpicHF4zFlZG1cS5DkGzMr9941gddYkakjrg3ZNev4bbqEgLHFTWiFrcJm7UCRu7olO3K6IRDd9gSGhA==", + "license": "MIT", + "dependencies": { + "@dagrejs/graphlib": "3.0.4" + } + }, + "node_modules/@dagrejs/graphlib": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@dagrejs/graphlib/-/graphlib-3.0.4.tgz", + "integrity": "sha512-HxZ7fCvAwTLCWCO0WjDkzAFQze8LdC6iOpKbetDKHIuDfIgMlIzYzqZ4nxwLlclQX+3ZVeZ1K2OuaOE2WWcyOg==", + "license": "MIT" + }, "node_modules/@emnapi/core": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.5.0.tgz", @@ -2159,6 +2185,316 @@ "node": ">=12.4.0" } }, + "node_modules/@parcel/watcher": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher/-/watcher-2.5.6.tgz", + "integrity": "sha512-tmmZ3lQxAe/k/+rNnXQRawJ4NjxO2hqiOLTHvWchtGZULp4RyFeh6aU4XdOYBFe2KE1oShQTv4AblOs2iOrNnQ==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "dependencies": { + "detect-libc": "^2.0.3", + "is-glob": "^4.0.3", + "node-addon-api": "^7.0.0", + "picomatch": "^4.0.3" + }, + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + }, + "optionalDependencies": { + 
"@parcel/watcher-android-arm64": "2.5.6", + "@parcel/watcher-darwin-arm64": "2.5.6", + "@parcel/watcher-darwin-x64": "2.5.6", + "@parcel/watcher-freebsd-x64": "2.5.6", + "@parcel/watcher-linux-arm-glibc": "2.5.6", + "@parcel/watcher-linux-arm-musl": "2.5.6", + "@parcel/watcher-linux-arm64-glibc": "2.5.6", + "@parcel/watcher-linux-arm64-musl": "2.5.6", + "@parcel/watcher-linux-x64-glibc": "2.5.6", + "@parcel/watcher-linux-x64-musl": "2.5.6", + "@parcel/watcher-win32-arm64": "2.5.6", + "@parcel/watcher-win32-ia32": "2.5.6", + "@parcel/watcher-win32-x64": "2.5.6" + } + }, + "node_modules/@parcel/watcher-android-arm64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-android-arm64/-/watcher-android-arm64-2.5.6.tgz", + "integrity": "sha512-YQxSS34tPF/6ZG7r/Ih9xy+kP/WwediEUsqmtf0cuCV5TPPKw/PQHRhueUo6JdeFJaqV3pyjm0GdYjZotbRt/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-darwin-arm64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-darwin-arm64/-/watcher-darwin-arm64-2.5.6.tgz", + "integrity": "sha512-Z2ZdrnwyXvvvdtRHLmM4knydIdU9adO3D4n/0cVipF3rRiwP+3/sfzpAwA/qKFL6i1ModaabkU7IbpeMBgiVEA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-darwin-x64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-darwin-x64/-/watcher-darwin-x64-2.5.6.tgz", + "integrity": "sha512-HgvOf3W9dhithcwOWX9uDZyn1lW9R+7tPZ4sug+NGrGIo4Rk1hAXLEbcH1TQSqxts0NYXXlOWqVpvS1SFS4fRg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + 
"optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-freebsd-x64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-freebsd-x64/-/watcher-freebsd-x64-2.5.6.tgz", + "integrity": "sha512-vJVi8yd/qzJxEKHkeemh7w3YAn6RJCtYlE4HPMoVnCpIXEzSrxErBW5SJBgKLbXU3WdIpkjBTeUNtyBVn8TRng==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm-glibc": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm-glibc/-/watcher-linux-arm-glibc-2.5.6.tgz", + "integrity": "sha512-9JiYfB6h6BgV50CCfasfLf/uvOcJskMSwcdH1PHH9rvS1IrNy8zad6IUVPVUfmXr+u+Km9IxcfMLzgdOudz9EQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm-musl": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm-musl/-/watcher-linux-arm-musl-2.5.6.tgz", + "integrity": "sha512-Ve3gUCG57nuUUSyjBq/MAM0CzArtuIOxsBdQ+ftz6ho8n7s1i9E1Nmk/xmP323r2YL0SONs1EuwqBp2u1k5fxg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm64-glibc": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm64-glibc/-/watcher-linux-arm64-glibc-2.5.6.tgz", + 
"integrity": "sha512-f2g/DT3NhGPdBmMWYoxixqYr3v/UXcmLOYy16Bx0TM20Tchduwr4EaCbmxh1321TABqPGDpS8D/ggOTaljijOA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm64-musl": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm64-musl/-/watcher-linux-arm64-musl-2.5.6.tgz", + "integrity": "sha512-qb6naMDGlbCwdhLj6hgoVKJl2odL34z2sqkC7Z6kzir8b5W65WYDpLB6R06KabvZdgoHI/zxke4b3zR0wAbDTA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-x64-glibc": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-x64-glibc/-/watcher-linux-x64-glibc-2.5.6.tgz", + "integrity": "sha512-kbT5wvNQlx7NaGjzPFu8nVIW1rWqV780O7ZtkjuWaPUgpv2NMFpjYERVi0UYj1msZNyCzGlaCWEtzc+exjMGbQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-x64-musl": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-x64-musl/-/watcher-linux-x64-musl-2.5.6.tgz", + "integrity": "sha512-1JRFeC+h7RdXwldHzTsmdtYR/Ku8SylLgTU/reMuqdVD7CtLwf0VR1FqeprZ0eHQkO0vqsbvFLXUmYm/uNKJBg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + 
"node_modules/@parcel/watcher-win32-arm64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-arm64/-/watcher-win32-arm64-2.5.6.tgz", + "integrity": "sha512-3ukyebjc6eGlw9yRt678DxVF7rjXatWiHvTXqphZLvo7aC5NdEgFufVwjFfY51ijYEWpXbqF5jtrK275z52D4Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-win32-ia32": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-ia32/-/watcher-win32-ia32-2.5.6.tgz", + "integrity": "sha512-k35yLp1ZMwwee3Ez/pxBi5cf4AoBKYXj00CZ80jUz5h8prpiaQsiRPKQMxoLstNuqe2vR4RNPEAEcjEFzhEz/g==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-win32-x64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-x64/-/watcher-win32-x64-2.5.6.tgz", + "integrity": "sha512-hbQlYcCq5dlAX9Qx+kFb0FHue6vbjlf0FrNzSKdYK2APUf7tGfGxQCk2ihEREmbR6ZMc0MVAD5RIX/41gpUzTw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, "node_modules/@pkgjs/parseargs": { "version": "0.11.0", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", @@ -2565,6 +2901,13 @@ "integrity": "sha512-zt6OdqaDoOnJ1ZYsCYGt9YmWzDXl4vQdKTyJev62gFhRGKdx7mcT54V9KIjg+d2wi9EXsPvAPKe7i7WjfVWB8g==", "dev": true }, + "node_modules/@scarf/scarf": { + "version": "1.4.0", + "resolved": 
"https://registry.npmjs.org/@scarf/scarf/-/scarf-1.4.0.tgz", + "integrity": "sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ==", + "hasInstallScript": true, + "license": "Apache-2.0" + }, "node_modules/@storybook/addon-actions": { "version": "8.6.14", "resolved": "https://registry.npmjs.org/@storybook/addon-actions/-/addon-actions-8.6.14.tgz", @@ -3546,6 +3889,12 @@ "tslib": "^2.4.0" } }, + "node_modules/@types/argparse": { + "version": "2.0.17", + "resolved": "https://registry.npmjs.org/@types/argparse/-/argparse-2.0.17.tgz", + "integrity": "sha512-fueJssTf+4dW4HODshEGkIZbkLKHzgu1FvCI4cTc/MKum/534Euo3SrN+ilq8xgyHnOjtmg33/hee8iXLRg1XA==", + "license": "MIT" + }, "node_modules/@types/aria-query": { "version": "5.0.4", "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", @@ -3593,6 +3942,22 @@ "@babel/types": "^7.28.2" } }, + "node_modules/@types/benchmark": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@types/benchmark/-/benchmark-2.1.5.tgz", + "integrity": "sha512-cKio2eFB3v7qmKcvIHLUMw/dIx/8bhWPuzpzRT4unCPRTD8VdA9Zb0afxpcxOqR4PixRS7yT42FqGS8BYL8g1w==", + "license": "MIT" + }, + "node_modules/@types/body-parser": { + "version": "1.19.6", + "resolved": "https://registry.npmjs.org/@types/body-parser/-/body-parser-1.19.6.tgz", + "integrity": "sha512-HLFeCYgz89uk22N5Qg3dvGvsv46B8GLvKKo1zKG4NybA8U2DiEO3w9lqGg29t/tfLRJpJ6iQxnVw4OnB7MoM9g==", + "license": "MIT", + "dependencies": { + "@types/connect": "*", + "@types/node": "*" + } + }, "node_modules/@types/chai": { "version": "5.2.2", "resolved": "https://registry.npmjs.org/@types/chai/-/chai-5.2.2.tgz", @@ -3602,6 +3967,83 @@ "@types/deep-eql": "*" } }, + "node_modules/@types/compression": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/@types/compression/-/compression-1.8.1.tgz", + "integrity": "sha512-kCFuWS0ebDbmxs0AXYn6e2r2nrGAb5KwQhknjSPSPgJcGd8+HVSILlUyFhGqML2gk39HcG7D1ydW9/qpYkN00Q==", + "license": 
"MIT", + "dependencies": { + "@types/express": "*", + "@types/node": "*" + } + }, + "node_modules/@types/connect": { + "version": "3.4.38", + "resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz", + "integrity": "sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==", + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/d3-color": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", + "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", + "license": "MIT" + }, + "node_modules/@types/d3-drag": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/@types/d3-drag/-/d3-drag-3.0.7.tgz", + "integrity": "sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==", + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-interpolate": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", + "integrity": "sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", + "license": "MIT", + "dependencies": { + "@types/d3-color": "*" + } + }, + "node_modules/@types/d3-selection": { + "version": "3.0.11", + "resolved": "https://registry.npmjs.org/@types/d3-selection/-/d3-selection-3.0.11.tgz", + "integrity": "sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==", + "license": "MIT" + }, + "node_modules/@types/d3-transition": { + "version": "3.0.9", + "resolved": "https://registry.npmjs.org/@types/d3-transition/-/d3-transition-3.0.9.tgz", + "integrity": "sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==", + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + 
"node_modules/@types/d3-zoom": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/@types/d3-zoom/-/d3-zoom-3.0.8.tgz", + "integrity": "sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw==", + "license": "MIT", + "dependencies": { + "@types/d3-interpolate": "*", + "@types/d3-selection": "*" + } + }, + "node_modules/@types/debug": { + "version": "4.1.12", + "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", + "integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==", + "license": "MIT", + "dependencies": { + "@types/ms": "*" + } + }, "node_modules/@types/deep-eql": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/@types/deep-eql/-/deep-eql-4.0.2.tgz", @@ -3617,13 +4059,44 @@ "node_modules/@types/estree": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", - "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "dev": true + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==" }, - "node_modules/@types/file-saver": { - "version": "2.0.7", - "resolved": "https://registry.npmjs.org/@types/file-saver/-/file-saver-2.0.7.tgz", - "integrity": "sha512-dNKVfHd/jk0SkR/exKGj2ggkB45MAkzvWCaqLUUgkyjITkGNzH8H+yUwr+BLJUBjZOe9w8X3wgmXhZDRg1ED6A==", + "node_modules/@types/estree-jsx": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/@types/estree-jsx/-/estree-jsx-1.0.5.tgz", + "integrity": "sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==", + "license": "MIT", + "dependencies": { + "@types/estree": "*" + } + }, + "node_modules/@types/express": { + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/@types/express/-/express-5.0.6.tgz", + "integrity": 
"sha512-sKYVuV7Sv9fbPIt/442koC7+IIwK5olP1KWeD88e/idgoJqDm3JV/YUiPwkoKK92ylff2MGxSz1CSjsXelx0YA==", + "license": "MIT", + "dependencies": { + "@types/body-parser": "*", + "@types/express-serve-static-core": "^5.0.0", + "@types/serve-static": "^2" + } + }, + "node_modules/@types/express-serve-static-core": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/@types/express-serve-static-core/-/express-serve-static-core-5.1.1.tgz", + "integrity": "sha512-v4zIMr/cX7/d2BpAEX3KNKL/JrT1s43s96lLvvdTmza1oEvDudCqK9aF/djc/SWgy8Yh0h30TZx5VpzqFCxk5A==", + "license": "MIT", + "dependencies": { + "@types/node": "*", + "@types/qs": "*", + "@types/range-parser": "*", + "@types/send": "*" + } + }, + "node_modules/@types/file-saver": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/@types/file-saver/-/file-saver-2.0.7.tgz", + "integrity": "sha512-dNKVfHd/jk0SkR/exKGj2ggkB45MAkzvWCaqLUUgkyjITkGNzH8H+yUwr+BLJUBjZOe9w8X3wgmXhZDRg1ED6A==", "dev": true }, "node_modules/@types/geojson": { @@ -3639,6 +4112,15 @@ "@types/geojson": "*" } }, + "node_modules/@types/hast": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-3.0.4.tgz", + "integrity": "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==", + "license": "MIT", + "dependencies": { + "@types/unist": "*" + } + }, "node_modules/@types/hoist-non-react-statics": { "version": "3.3.7", "resolved": "https://registry.npmjs.org/@types/hoist-non-react-statics/-/hoist-non-react-statics-3.3.7.tgz", @@ -3650,6 +4132,12 @@ "@types/react": "*" } }, + "node_modules/@types/http-errors": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.5.tgz", + "integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==", + "license": "MIT" + }, "node_modules/@types/json-schema": { "version": "7.0.15", "resolved": 
"https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", @@ -3677,12 +4165,27 @@ "@types/pbf": "*" } }, + "node_modules/@types/mdast": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-4.0.4.tgz", + "integrity": "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==", + "license": "MIT", + "dependencies": { + "@types/unist": "*" + } + }, "node_modules/@types/mdx": { "version": "2.0.13", "resolved": "https://registry.npmjs.org/@types/mdx/-/mdx-2.0.13.tgz", "integrity": "sha512-+OWZQfAYyio6YkJb3HLxDrvnx6SWWDbC0zVPfBRzUk0/nqoDyf6dNxQi3eArPe8rJ473nobTMQ/8Zk+LxJ+Yuw==", "dev": true }, + "node_modules/@types/ms": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz", + "integrity": "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==", + "license": "MIT" + }, "node_modules/@types/node": { "version": "24.3.1", "resolved": "https://registry.npmjs.org/@types/node/-/node-24.3.1.tgz", @@ -3691,6 +4194,16 @@ "undici-types": "~7.10.0" } }, + "node_modules/@types/node-fetch": { + "version": "2.6.13", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz", + "integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==", + "license": "MIT", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.4" + } + }, "node_modules/@types/parse-json": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/@types/parse-json/-/parse-json-4.0.2.tgz", @@ -3721,6 +4234,18 @@ "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.15.tgz", "integrity": "sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==" }, + "node_modules/@types/qs": { + "version": "6.15.0", + "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.15.0.tgz", + "integrity": 
"sha512-JawvT8iBVWpzTrz3EGw9BTQFg3BQNmwERdKE22vlTxawwtbyUSlMppvZYKLZzB5zgACXdXxbD3m1bXaMqP/9ow==", + "license": "MIT" + }, + "node_modules/@types/range-parser": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/@types/range-parser/-/range-parser-1.2.7.tgz", + "integrity": "sha512-hKormJbkJqzQGhziax5PItDUTMAM9uE2XXQmM37dyd4hVM+5aVl7oVxMVUiVQn2oCQFN/LKCZdvSM0pFRqbSmQ==", + "license": "MIT" + }, "node_modules/@types/react": { "version": "18.3.24", "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.24.tgz", @@ -3763,6 +4288,25 @@ "integrity": "sha512-A4STmOXPhMUtHH+S6ymgE2GiBSMqf4oTvcQZMcHzokuTLVYzXTB8ttjcgxOVaAp2lGwEdzZ0J+cRbbeevQj1UQ==", "dev": true }, + "node_modules/@types/send": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@types/send/-/send-1.2.1.tgz", + "integrity": "sha512-arsCikDvlU99zl1g69TcAB3mzZPpxgw0UQnaHeC1Nwb015xp8bknZv5rIfri9xTOcMuaVgvabfIRA7PSZVuZIQ==", + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/serve-static": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@types/serve-static/-/serve-static-2.2.0.tgz", + "integrity": "sha512-8mam4H1NHLtu7nmtalF7eyBH14QyOASmcxHhSfEoRyr0nP/YdoesEtU+uSRvMe96TW/HPTtkoKqQLl53N7UXMQ==", + "license": "MIT", + "dependencies": { + "@types/http-errors": "*", + "@types/node": "*" + } + }, "node_modules/@types/stylis": { "version": "4.2.5", "resolved": "https://registry.npmjs.org/@types/stylis/-/stylis-4.2.5.tgz", @@ -3776,6 +4320,18 @@ "@types/geojson": "*" } }, + "node_modules/@types/swagger-ui-dist": { + "version": "3.30.6", + "resolved": "https://registry.npmjs.org/@types/swagger-ui-dist/-/swagger-ui-dist-3.30.6.tgz", + "integrity": "sha512-FVxN7wjLYRtJsZBscOcOcf8oR++m38vbUFjT33Mr9HBuasX9bRDrJsp7iwixcOtKSHEEa2B7o2+4wEiXqC+Ebw==", + "license": "MIT" + }, + "node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": 
"sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", + "license": "MIT" + }, "node_modules/@types/uuid": { "version": "9.0.8", "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz", @@ -4014,6 +4570,12 @@ "url": "https://opencollective.com/typescript-eslint" } }, + "node_modules/@ungap/structured-clone": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.3.0.tgz", + "integrity": "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==", + "license": "ISC" + }, "node_modules/@unrs/resolver-binding-android-arm-eabi": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-android-arm-eabi/-/resolver-binding-android-arm-eabi-1.11.1.tgz", @@ -4513,11 +5075,90 @@ "url": "https://opencollective.com/vitest" } }, + "node_modules/@xyflow/react": { + "version": "12.10.2", + "resolved": "https://registry.npmjs.org/@xyflow/react/-/react-12.10.2.tgz", + "integrity": "sha512-CgIi6HwlcHXwlkTpr0fxLv/0sRVNZ8IdwKLzzeCscaYBwpvfcH1QFOCeaTCuEn1FQEs/B8CjnTSjhs8udgmBgQ==", + "license": "MIT", + "dependencies": { + "@xyflow/system": "0.0.76", + "classcat": "^5.0.3", + "zustand": "^4.4.0" + }, + "peerDependencies": { + "react": ">=17", + "react-dom": ">=17" + } + }, + "node_modules/@xyflow/system": { + "version": "0.0.76", + "resolved": "https://registry.npmjs.org/@xyflow/system/-/system-0.0.76.tgz", + "integrity": "sha512-hvwvnRS1B3REwVDlWexsq7YQaPZeG3/mKo1jv38UmnpWmxihp14bW6VtEOuHEwJX2FvzFw8k77LyKSk/wiZVNA==", + "license": "MIT", + "dependencies": { + "@types/d3-drag": "^3.0.7", + "@types/d3-interpolate": "^3.0.4", + "@types/d3-selection": "^3.0.10", + "@types/d3-transition": "^3.0.8", + "@types/d3-zoom": "^3.0.8", + "d3-drag": "^3.0.0", + "d3-interpolate": "^3.0.1", + "d3-selection": "^3.0.0", + "d3-zoom": "^3.0.0" + } + }, "node_modules/abs-svg-path": { "version": "0.1.1", "resolved": 
"https://registry.npmjs.org/abs-svg-path/-/abs-svg-path-0.1.1.tgz", "integrity": "sha512-d8XPSGjfyzlXC3Xx891DJRyZfqk5JU0BJrDQcsWomFIV1/BIzPW5HDH5iDdWpqWaav0YVIEzT1RHTwWr0FFshA==" }, + "node_modules/accepts": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-2.0.0.tgz", + "integrity": "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==", + "license": "MIT", + "dependencies": { + "mime-types": "^3.0.0", + "negotiator": "^1.0.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/accepts/node_modules/mime-db": { + "version": "1.54.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", + "integrity": "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/accepts/node_modules/mime-types": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-3.0.2.tgz", + "integrity": "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==", + "license": "MIT", + "dependencies": { + "mime-db": "^1.54.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/accepts/node_modules/negotiator": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-1.0.0.tgz", + "integrity": "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, "node_modules/acorn": { "version": "8.15.0", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", @@ -4606,8 +5247,7 @@ "node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", - "integrity": 
"sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", - "dev": true + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==" }, "node_modules/aria-query": { "version": "5.3.0", @@ -4627,7 +5267,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.2.tgz", "integrity": "sha512-LHE+8BuR7RYGDKvnrmcuSq3tDcKv9OFEXQt/HpbZhY7V6h0zlUXutnAD82GiFx9rdieCMjkvtcsPqBwgUl1Iiw==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "is-array-buffer": "^3.0.5" @@ -4764,6 +5403,28 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/array.prototype.reduce": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/array.prototype.reduce/-/array.prototype.reduce-1.0.8.tgz", + "integrity": "sha512-DwuEqgXFBwbmZSRqt3BpQigWNUoqw9Ml2dTWdF3B2zQlQX4OeUE0zyuzX0fX0IbTvjdkZbcBTU3idgpO78qkTw==", + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.9", + "es-array-method-boxes-properly": "^1.0.0", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "is-string": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/array.prototype.tosorted": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/array.prototype.tosorted/-/array.prototype.tosorted-1.1.4.tgz", @@ -4784,7 +5445,6 @@ "version": "1.0.4", "resolved": "https://registry.npmjs.org/arraybuffer.prototype.slice/-/arraybuffer.prototype.slice-1.0.4.tgz", "integrity": "sha512-BNoCY6SXXPQ7gF2opIP4GBE+Xw7U+pHMYKuzjgCN3GwiaIR09UUeKfheyIry77QtrCBlC0KK0q5/TER/tYh3PQ==", - "dev": true, "dependencies": { "array-buffer-byte-length": "^1.0.1", "call-bind": "^1.0.8", @@ -4825,7 +5485,6 @@ "version": "1.0.0", "resolved": 
"https://registry.npmjs.org/async-function/-/async-function-1.0.0.tgz", "integrity": "sha512-hsU18Ae8CDTR6Kgu9DYf0EbCr/a5iGL0rytQDobUcdpYOKokk8LEjVphnXkDkgpi0wYVsqrXuP0bZxJaTqdgoA==", - "dev": true, "engines": { "node": ">= 0.4" } @@ -4873,6 +5532,16 @@ "npm": ">=6" } }, + "node_modules/bail": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz", + "integrity": "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", @@ -4922,6 +5591,46 @@ "safe-buffer": "^5.1.1" } }, + "node_modules/body-parser": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz", + "integrity": "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==", + "license": "MIT", + "dependencies": { + "bytes": "^3.1.2", + "content-type": "^1.0.5", + "debug": "^4.4.3", + "http-errors": "^2.0.0", + "iconv-lite": "^0.7.0", + "on-finished": "^2.4.1", + "qs": "^6.14.1", + "raw-body": "^3.0.1", + "type-is": "^2.0.1" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/body-parser/node_modules/iconv-lite": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", + "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/bootstrap": { "version": "5.3.8", "resolved": 
"https://registry.npmjs.org/bootstrap/-/bootstrap-5.3.8.tgz", @@ -5003,6 +5712,15 @@ "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==" }, + "node_modules/bytes": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", + "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/cac": { "version": "6.7.14", "resolved": "https://registry.npmjs.org/cac/-/cac-6.7.14.tgz", @@ -5112,6 +5830,16 @@ "element-size": "^1.1.1" } }, + "node_modules/ccount": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", + "integrity": "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/chai": { "version": "5.3.3", "resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz", @@ -5141,6 +5869,46 @@ "node": ">=8" } }, + "node_modules/character-entities": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/character-entities/-/character-entities-2.0.2.tgz", + "integrity": "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/character-entities-html4": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/character-entities-html4/-/character-entities-html4-2.1.0.tgz", + "integrity": "sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + 
"node_modules/character-entities-legacy": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/character-entities-legacy/-/character-entities-legacy-3.0.0.tgz", + "integrity": "sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/character-reference-invalid": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/character-reference-invalid/-/character-reference-invalid-2.0.1.tgz", + "integrity": "sha512-iBZ4F4wRbyORVsu0jPV7gXkOsGYjGHPmAyv+HiHG8gi5PtC9KI2j1+v8/tlibRvjoWX027ypmG/n0HtO5t7unw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/check-error": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz", @@ -5150,6 +5918,23 @@ "node": ">= 16" } }, + "node_modules/chokidar": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz", + "integrity": "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "readdirp": "^4.0.1" + }, + "engines": { + "node": ">= 14.16.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, "node_modules/chromatic": { "version": "11.29.0", "resolved": "https://registry.npmjs.org/chromatic/-/chromatic-11.29.0.tgz", @@ -5178,6 +5963,12 @@ "resolved": "https://registry.npmjs.org/clamp/-/clamp-1.0.1.tgz", "integrity": "sha512-kgMuFyE78OC6Dyu3Dy7vcx4uy97EIbVxJB/B0eJ3bUNAkwdNcxYzgKltnyADiYwsR7SEqkkUPsEUT//OVS6XMA==" }, + "node_modules/classcat": { + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/classcat/-/classcat-5.0.5.tgz", + "integrity": "sha512-JhZUT7JFcQy/EzW605k/ktHtncoo9vnyW/2GspNYwFlN1C/WmjuV/xtS04e9SOkL2sTdw0VAZ2UGCcQ9lR6p6w==", + 
"license": "MIT" + }, "node_modules/cli-cursor": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-5.0.0.tgz", @@ -5336,6 +6127,13 @@ "integrity": "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==", "dev": true }, + "node_modules/colorjs.io": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/colorjs.io/-/colorjs.io-0.5.2.tgz", + "integrity": "sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==", + "dev": true, + "license": "MIT" + }, "node_modules/combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -5347,6 +6145,16 @@ "node": ">= 0.8" } }, + "node_modules/comma-separated-tokens": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz", + "integrity": "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/commander": { "version": "13.1.0", "resolved": "https://registry.npmjs.org/commander/-/commander-13.1.0.tgz", @@ -5356,6 +6164,51 @@ "node": ">=18" } }, + "node_modules/compressible": { + "version": "2.0.18", + "resolved": "https://registry.npmjs.org/compressible/-/compressible-2.0.18.tgz", + "integrity": "sha512-AF3r7P5dWxL8MxyITRMlORQNaOA2IkAFaTr4k7BUumjPtRpGDTZpl0Pb1XCO6JeDCBdp126Cgs9sMxqSjgYyRg==", + "license": "MIT", + "dependencies": { + "mime-db": ">= 1.43.0 < 2" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/compression": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/compression/-/compression-1.8.1.tgz", + "integrity": "sha512-9mAqGPHLakhCLeNyxPkK4xVo746zQ/czLH1Ky+vkitMnWfWZps8r0qXuwhwizagCRttsL4lfG4pIOvaWLpAP0w==", + "license": "MIT", + "dependencies": { + "bytes": 
"3.1.2", + "compressible": "~2.0.18", + "debug": "2.6.9", + "negotiator": "~0.6.4", + "on-headers": "~1.1.0", + "safe-buffer": "5.2.1", + "vary": "~1.1.2" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/compression/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "license": "MIT", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/compression/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "license": "MIT" + }, "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", @@ -5376,6 +6229,28 @@ "typedarray": "^0.0.6" } }, + "node_modules/content-disposition": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.1.tgz", + "integrity": "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/content-type": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", + "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, "node_modules/convert-source-map": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", @@ -5390,15 +6265,25 @@ "node": ">=18" } }, + "node_modules/cookie-signature": { + "version": "1.2.2", + "resolved": 
"https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.2.2.tgz", + "integrity": "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==", + "license": "MIT", + "engines": { + "node": ">=6.6.0" + } + }, "node_modules/core-util-is": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==" }, "node_modules/corepack": { - "version": "0.34.0", - "resolved": "https://registry.npmjs.org/corepack/-/corepack-0.34.0.tgz", - "integrity": "sha512-8D9N/k9hDjoISCDGUzH2wBF0fJD49p3G7ifoEZcc0vhB7Py6r+Mc1SpJ8dvnWY/HMP95K60WkQbN7vgbUgXgpA==", + "version": "0.34.6", + "resolved": "https://registry.npmjs.org/corepack/-/corepack-0.34.6.tgz", + "integrity": "sha512-gvylq9kzJB09mSsiOnKOnhg0YdCWNy2aGaeGbYF4HlyGd/v4moxEonQjJPYI45/K4zP7q1hW9qCVvaYYKK5nkA==", + "license": "MIT", "bin": { "corepack": "dist/corepack.js", "pnpm": "dist/pnpm.js", @@ -5410,6 +6295,23 @@ "node": "^20.10.0 || ^22.11.0 || >=24.0.0" } }, + "node_modules/cors": { + "version": "2.8.6", + "resolved": "https://registry.npmjs.org/cors/-/cors-2.8.6.tgz", + "integrity": "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw==", + "license": "MIT", + "dependencies": { + "object-assign": "^4", + "vary": "^1" + }, + "engines": { + "node": ">= 0.10" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/cosmiconfig": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-7.1.0.tgz", @@ -5580,6 +6482,28 @@ "resolved": "https://registry.npmjs.org/d3-dispatch/-/d3-dispatch-1.0.6.tgz", "integrity": "sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA==" }, + "node_modules/d3-drag": { + "version": "3.0.0", + "resolved": 
"https://registry.npmjs.org/d3-drag/-/d3-drag-3.0.0.tgz", + "integrity": "sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg==", + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-selection": "3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-ease": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", + "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=12" + } + }, "node_modules/d3-force": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/d3-force/-/d3-force-1.2.1.tgz", @@ -5653,6 +6577,15 @@ "resolved": "https://registry.npmjs.org/d3-quadtree/-/d3-quadtree-1.0.7.tgz", "integrity": "sha512-RKPAeXnkC59IDGD0Wu5mANy0Q2V28L+fNe65pOCXVdVuTJS3WPKaJlFHer32Rbh9gIo9qMuJXio8ra4+YmIymA==" }, + "node_modules/d3-selection": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", + "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, "node_modules/d3-shape": { "version": "1.3.7", "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-1.3.7.tgz", @@ -5679,6 +6612,41 @@ "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-1.0.10.tgz", "integrity": "sha512-B1JDm0XDaQC+uvo4DT79H0XmBskgS3l6Ve+1SBCfxgmtIb1AVrPIoqd+nPSv+loMX8szQ0sVUhGngL7D5QPiXw==" }, + "node_modules/d3-transition": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-transition/-/d3-transition-3.0.1.tgz", + "integrity": "sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3", + "d3-dispatch": "1 - 3", + "d3-ease": "1 - 3", + "d3-interpolate": "1 - 3", + "d3-timer": "1 - 
3" + }, + "engines": { + "node": ">=12" + }, + "peerDependencies": { + "d3-selection": "2 - 3" + } + }, + "node_modules/d3-zoom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-zoom/-/d3-zoom-3.0.0.tgz", + "integrity": "sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw==", + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-drag": "2 - 3", + "d3-interpolate": "1 - 3", + "d3-selection": "2 - 3", + "d3-transition": "2 - 3" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/data-urls": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz", @@ -5696,7 +6664,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.2.tgz", "integrity": "sha512-EmKO5V3OLXh1rtK2wgXRansaK1/mtVdTUEiEI0W8RkvgT05kfxaH29PliLnpLP73yYO6142Q72QNa8Wx/A5CqQ==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "es-errors": "^1.3.0", @@ -5713,7 +6680,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/data-view-byte-length/-/data-view-byte-length-1.0.2.tgz", "integrity": "sha512-tuhGbE6CfTM9+5ANGf+oQb72Ky/0+s3xKUpHvShfiz2RxMFgFPjsXuRLBVMtvMs15awe45SRb83D6wH4ew6wlQ==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "es-errors": "^1.3.0", @@ -5730,7 +6696,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/data-view-byte-offset/-/data-view-byte-offset-1.0.1.tgz", "integrity": "sha512-BS8PfmtDGnrgYdOonGZQdLZslWIeCGFP9tpan0hi1Co2Zr2NKADsvGYA8XxuG/4UWgJ6Cjtv+YJnB6MM69QGlQ==", - "dev": true, "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", @@ -5744,9 +6709,10 @@ } }, "node_modules/debug": { - "version": "4.4.1", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz", - "integrity": "sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==", + "version": "4.4.3", + "resolved": 
"https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", "dependencies": { "ms": "^2.1.3" }, @@ -5765,6 +6731,19 @@ "integrity": "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==", "dev": true }, + "node_modules/decode-named-character-reference": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/decode-named-character-reference/-/decode-named-character-reference-1.3.0.tgz", + "integrity": "sha512-GtpQYB283KrPp6nRw50q3U9/VfOutZOe103qlN7BPP6Ad27xYnOIWv4lPzo8HCAL+mMZofJ9KEy30fq6MfaK6Q==", + "license": "MIT", + "dependencies": { + "character-entities": "^2.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/deep-eql": { "version": "5.0.2", "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-5.0.2.tgz", @@ -5808,7 +6787,6 @@ "version": "1.2.1", "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.2.1.tgz", "integrity": "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==", - "dev": true, "dependencies": { "define-data-property": "^1.0.1", "has-property-descriptors": "^1.0.0", @@ -5837,6 +6815,15 @@ "node": ">=0.4.0" } }, + "node_modules/depd": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", + "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/dequal": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", @@ -5850,6 +6837,30 @@ "resolved": "https://registry.npmjs.org/detect-kerning/-/detect-kerning-2.1.2.tgz", "integrity": "sha512-I3JIbrnKPAntNLl1I6TpSQQdQ4AutYzv/sKMFKbepawV/hlH0GmYKhUoOEMd4xqaUHT+Bm0f4127lh5qs1m1tw==" }, + 
"node_modules/detect-libc": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", + "integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==", + "dev": true, + "license": "Apache-2.0", + "optional": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/devlop": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/devlop/-/devlop-1.1.0.tgz", + "integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==", + "license": "MIT", + "dependencies": { + "dequal": "^2.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/doctrine": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-2.1.0.tgz", @@ -5944,6 +6955,12 @@ "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", "dev": true }, + "node_modules/ee-first": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", + "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==", + "license": "MIT" + }, "node_modules/electron-to-chromium": { "version": "1.5.217", "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.217.tgz", @@ -5969,6 +6986,15 @@ "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", "dev": true }, + "node_modules/encodeurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/end-of-stream": { "version": "1.4.5", "resolved": 
"https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", @@ -6013,7 +7039,6 @@ "version": "1.24.0", "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.24.0.tgz", "integrity": "sha512-WSzPgsdLtTcQwm4CROfS5ju2Wa1QQcVeT37jFjYzdFz1r9ahadC8B8/a4qxJxM+09F18iumCdRmlr96ZYkQvEg==", - "dev": true, "dependencies": { "array-buffer-byte-length": "^1.0.2", "arraybuffer.prototype.slice": "^1.0.4", @@ -6077,6 +7102,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/es-array-method-boxes-properly": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/es-array-method-boxes-properly/-/es-array-method-boxes-properly-1.0.0.tgz", + "integrity": "sha512-wd6JXUmyHmt8T5a2xreUwKcGPq6f1f+WwIJkijUqiGcJz1qqnZgP6XIK+QyIWU5lT7imeNxUll48bziG+TSYcA==", + "license": "MIT" + }, "node_modules/es-define-property": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", @@ -6167,7 +7198,6 @@ "version": "1.3.0", "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.3.0.tgz", "integrity": "sha512-w+5mJ3GuFL+NjVtJlvydShqE1eN3h3PbI7/5LAsYJP/2qtuMXjfL2LpHSRqo4b4eSF5K/DH1JXKUAHSB2UW50g==", - "dev": true, "dependencies": { "is-callable": "^1.2.7", "is-date-object": "^1.0.5", @@ -6288,6 +7318,12 @@ "node": ">=6" } }, + "node_modules/escape-html": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", + "integrity": "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==", + "license": "MIT" + }, "node_modules/escape-string-regexp": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", @@ -6806,6 +7842,16 @@ "node": ">=4.0" } }, + "node_modules/estree-util-is-identifier-name": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/estree-util-is-identifier-name/-/estree-util-is-identifier-name-3.0.0.tgz", + 
"integrity": "sha512-hFtqIDZTIUZ9BXLb8y4pYGyk6+wekIivNVTcmvk8NoOh+VeRn5y6cEHzbURrWbfp1fIqdVipilzj+lfaadNZmg==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/estree-walker": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz", @@ -6820,6 +7866,15 @@ "node": ">=0.10.0" } }, + "node_modules/etag": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", + "integrity": "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, "node_modules/event-emitter": { "version": "0.3.5", "resolved": "https://registry.npmjs.org/event-emitter/-/event-emitter-0.3.5.tgz", @@ -6875,6 +7930,83 @@ "node": ">=12.0.0" } }, + "node_modules/express": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz", + "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==", + "license": "MIT", + "dependencies": { + "accepts": "^2.0.0", + "body-parser": "^2.2.1", + "content-disposition": "^1.0.0", + "content-type": "^1.0.5", + "cookie": "^0.7.1", + "cookie-signature": "^1.2.1", + "debug": "^4.4.0", + "depd": "^2.0.0", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "etag": "^1.8.1", + "finalhandler": "^2.1.0", + "fresh": "^2.0.0", + "http-errors": "^2.0.0", + "merge-descriptors": "^2.0.0", + "mime-types": "^3.0.0", + "on-finished": "^2.4.1", + "once": "^1.4.0", + "parseurl": "^1.3.3", + "proxy-addr": "^2.0.7", + "qs": "^6.14.0", + "range-parser": "^1.2.1", + "router": "^2.2.0", + "send": "^1.1.0", + "serve-static": "^2.2.0", + "statuses": "^2.0.1", + "type-is": "^2.0.1", + "vary": "^1.1.2" + }, + "engines": { + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + 
"node_modules/express/node_modules/cookie": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz", + "integrity": "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/express/node_modules/mime-db": { + "version": "1.54.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", + "integrity": "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/express/node_modules/mime-types": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-3.0.2.tgz", + "integrity": "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==", + "license": "MIT", + "dependencies": { + "mime-db": "^1.54.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/ext": { "version": "1.7.0", "resolved": "https://registry.npmjs.org/ext/-/ext-1.7.0.tgz", @@ -6883,6 +8015,12 @@ "type": "^2.7.2" } }, + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", + "license": "MIT" + }, "node_modules/falafel": { "version": "2.2.5", "resolved": "https://registry.npmjs.org/falafel/-/falafel-2.2.5.tgz", @@ -7023,6 +8161,27 @@ "node": ">=8" } }, + "node_modules/finalhandler": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-2.1.1.tgz", + "integrity": "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==", + "license": "MIT", + "dependencies": { + "debug": "^4.4.0", + "encodeurl": 
"^2.0.0", + "escape-html": "^1.0.3", + "on-finished": "^2.4.1", + "parseurl": "^1.3.3", + "statuses": "^2.0.1" + }, + "engines": { + "node": ">= 18.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/find-root": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/find-root/-/find-root-1.1.0.tgz", @@ -7151,6 +8310,22 @@ "node": ">= 6" } }, + "node_modules/forwarded": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", + "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/fp-ts": { + "version": "2.16.11", + "resolved": "https://registry.npmjs.org/fp-ts/-/fp-ts-2.16.11.tgz", + "integrity": "sha512-LaI+KaX2NFkfn1ZGHoKCmcfv7yrZsC3b8NtWsTVQeHkq4F27vI5igUuO53sxqDEa2gNQMHFPmpojDw/1zmUK7w==", + "license": "MIT", + "peer": true + }, "node_modules/framer-motion": { "version": "12.23.12", "resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-12.23.12.tgz", @@ -7177,6 +8352,15 @@ } } }, + "node_modules/fresh": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-2.0.0.tgz", + "integrity": "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/from2": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/from2/-/from2-2.3.0.tgz", @@ -7212,7 +8396,6 @@ "version": "1.1.8", "resolved": "https://registry.npmjs.org/function.prototype.name/-/function.prototype.name-1.1.8.tgz", "integrity": "sha512-e5iwyodOHhbMr/yNrc7fDYG4qlbIvI5gajyzPnb5TCwyhjApznQh1BMFou9b30SevY43gCJKXycoCBjMbsuW0Q==", - "dev": true, "dependencies": { "call-bind": "^1.0.8", "call-bound": "^1.0.3", @@ -7232,7 +8415,6 @@ "version": "1.2.3", "resolved": 
"https://registry.npmjs.org/functions-have-names/-/functions-have-names-1.2.3.tgz", "integrity": "sha512-xckBUXyTIqT97tq2x2AMb+g163b5JFysYk0x4qxNFwbfQkmNZoiRHb6sPzI9/QV33WeuvVYBUIiD4NzNIyqaRQ==", - "dev": true, "funding": { "url": "https://github.com/sponsors/ljharb" } @@ -7319,7 +8501,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/get-symbol-description/-/get-symbol-description-1.1.0.tgz", "integrity": "sha512-w9UMqWwJxHNOvoNzSJ2oPF5wvYcvP7jUvYzhp67yEhTi17ZDBBC1z9pTdGuzjD+EFIqLSYRweZjqfiPzQ06Ebg==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "es-errors": "^1.3.0", @@ -7475,7 +8656,6 @@ "version": "1.0.4", "resolved": "https://registry.npmjs.org/globalthis/-/globalthis-1.0.4.tgz", "integrity": "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ==", - "dev": true, "dependencies": { "define-properties": "^1.2.1", "gopd": "^1.0.1" @@ -7701,11 +8881,16 @@ "resolved": "https://registry.npmjs.org/grid-index/-/grid-index-1.1.0.tgz", "integrity": "sha512-HZRwumpOGUrHyxO5bqKZL0B0GlUpwtCAzZ42sgxUPniu33R1LSFH5yrIcBCHjkctCAh3mtWKcKd9J4vDDdeVHA==" }, + "node_modules/h264-mp4-encoder": { + "version": "1.0.12", + "resolved": "https://registry.npmjs.org/h264-mp4-encoder/-/h264-mp4-encoder-1.0.12.tgz", + "integrity": "sha512-xih3J+Go0o1RqGjhOt6TwXLWWGqLONRPyS8yoMu/RoS/S8WyEv4HuHp1KBsDDl8srZQ3gw9f95JYkCSjCuZbHQ==", + "license": "MIT" + }, "node_modules/has-bigints": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.1.0.tgz", "integrity": "sha512-R3pbpkcIqv2Pm3dUwgjclDRVmWpTJW2DcMzcIhEXEx1oh/CEMObMm3KLmRJOdvhM7o4uQBnwr8pzRK2sJWIqfg==", - "dev": true, "engines": { "node": ">= 0.4" }, @@ -7753,7 +8938,6 @@ "version": "1.2.0", "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.2.0.tgz", "integrity": "sha512-KIL7eQPfHQRC8+XluaIw7BHUwwqL19bQn4hzNgdr+1wXoU0KKj6rufu47lhY7KbJR2C6T6+PfyN0Ea7wkSS+qQ==", - "dev": true, "dependencies": { "dunder-proto": "^1.0.0" }, @@ 
-7800,6 +8984,46 @@ "node": ">= 0.4" } }, + "node_modules/hast-util-to-jsx-runtime": { + "version": "2.3.6", + "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.6.tgz", + "integrity": "sha512-zl6s8LwNyo1P9uw+XJGvZtdFF1GdAkOg8ujOw+4Pyb76874fLps4ueHXDhXWdk6YHQ6OgUtinliG7RsYvCbbBg==", + "license": "MIT", + "dependencies": { + "@types/estree": "^1.0.0", + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "comma-separated-tokens": "^2.0.0", + "devlop": "^1.0.0", + "estree-util-is-identifier-name": "^3.0.0", + "hast-util-whitespace": "^3.0.0", + "mdast-util-mdx-expression": "^2.0.0", + "mdast-util-mdx-jsx": "^3.0.0", + "mdast-util-mdxjs-esm": "^2.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0", + "style-to-js": "^1.0.0", + "unist-util-position": "^5.0.0", + "vfile-message": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-whitespace": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", + "integrity": "sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hoist-non-react-statics": { "version": "3.3.2", "resolved": "https://registry.npmjs.org/hoist-non-react-statics/-/hoist-non-react-statics-3.3.2.tgz", @@ -7825,6 +9049,36 @@ "node": ">=18" } }, + "node_modules/html-url-attributes": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz", + "integrity": "sha512-ol6UPyBWqsrO6EJySPz2O7ZSr856WDrEzM5zMqp+FJJLGMW35cLYmmZnl0vztAZxRUoNZJFTCohfjuIJ8I4QBQ==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": 
"https://opencollective.com/unified" + } + }, + "node_modules/http-errors": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", + "integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==", + "license": "MIT", + "dependencies": { + "depd": "~2.0.0", + "inherits": "~2.0.4", + "setprototypeof": "~1.2.0", + "statuses": "~2.0.2", + "toidentifier": "~1.0.1" + }, + "engines": { + "node": ">= 0.8" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/http-proxy-agent": { "version": "7.0.2", "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", @@ -7914,6 +9168,12 @@ "node": ">= 4" } }, + "node_modules/immutable": { + "version": "5.1.5", + "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.1.5.tgz", + "integrity": "sha512-t7xcm2siw+hlUM68I+UEOK+z84RzmN59as9DZ7P1l0994DKUWV7UXBMQZVxaoMSRQ+PBZbHCOoBt7a2wxOMt+A==", + "license": "MIT" + }, "node_modules/import-fresh": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", @@ -7960,11 +9220,16 @@ "node": "^14.17.0 || ^16.13.0 || >=18.0.0" } }, + "node_modules/inline-style-parser": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/inline-style-parser/-/inline-style-parser-0.2.7.tgz", + "integrity": "sha512-Nb2ctOyNR8DqQoR0OwRG95uNWIC0C1lCgf5Naz5H6Ji72KZ8OcFZLz2P5sNgwlyoJ8Yif11oMuYs5pBQa86csA==", + "license": "MIT" + }, "node_modules/internal-slot": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/internal-slot/-/internal-slot-1.1.0.tgz", "integrity": "sha512-4gd7VpWNQNB4UKKCFFVcp1AVv+FMOgs9NKzjHKusc8jTMhd5eL1NqQqOpE0KzMds804/yHlglp3uxgluOqAPLw==", - "dev": true, "dependencies": { "es-errors": "^1.3.0", "hasown": "^2.0.2", @@ -7974,6 +9239,48 @@ "node": ">= 0.4" } }, + "node_modules/io-ts": { + "version": "2.2.22", + "resolved": 
"https://registry.npmjs.org/io-ts/-/io-ts-2.2.22.tgz", + "integrity": "sha512-FHCCztTkHoV9mdBsHpocLpdTAfh956ZQcIkWQxxS0U5HT53vtrcuYdQneEJKH6xILaLNzXVl2Cvwtoy8XNN0AA==", + "license": "MIT", + "peerDependencies": { + "fp-ts": "^2.5.0" + } + }, + "node_modules/ipaddr.js": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", + "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==", + "license": "MIT", + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/is-alphabetical": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-alphabetical/-/is-alphabetical-2.0.1.tgz", + "integrity": "sha512-FWyyY60MeTNyeSRpkM2Iry0G9hpr7/9kD40mD/cGQEuilcZYS4okz8SN2Q6rLCJ8gbCt6fN+rC+6tMGS99LaxQ==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/is-alphanumerical": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-alphanumerical/-/is-alphanumerical-2.0.1.tgz", + "integrity": "sha512-hmbYhX/9MUMF5uh7tOXyK/n0ZvWpad5caBA17GsC6vyuCqaWliRG5K1qS9inmUhEMaOBIW7/whAnSwveW/LtZw==", + "license": "MIT", + "dependencies": { + "is-alphabetical": "^2.0.0", + "is-decimal": "^2.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/is-arguments": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/is-arguments/-/is-arguments-1.2.0.tgz", @@ -7993,7 +9300,6 @@ "version": "3.0.5", "resolved": "https://registry.npmjs.org/is-array-buffer/-/is-array-buffer-3.0.5.tgz", "integrity": "sha512-DDfANUiiG2wC1qawP66qlTugJeL5HyzMpfr8lLK+jMQirGzNod0B12cFB/9q838Ru27sBwfw78/rdoU7RERz6A==", - "dev": true, "dependencies": { "call-bind": "^1.0.8", "call-bound": "^1.0.3", @@ -8015,7 +9321,6 @@ "version": "2.1.1", "resolved": "https://registry.npmjs.org/is-async-function/-/is-async-function-2.1.1.tgz", "integrity": 
"sha512-9dgM/cZBnNvjzaMYHVoxxfPj2QXt22Ev7SuuPrs+xav0ukGB0S6d4ydZdEiM48kLx5kDV+QBPrpVnFyefL8kkQ==", - "dev": true, "dependencies": { "async-function": "^1.0.0", "call-bound": "^1.0.3", @@ -8034,7 +9339,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/is-bigint/-/is-bigint-1.1.0.tgz", "integrity": "sha512-n4ZT37wG78iz03xPRKJrHTdZbe3IicyucEtdRsV5yglwc3GyUfbAfpSeD0FJ41NbUNSt5wbhqfp1fS+BgnvDFQ==", - "dev": true, "dependencies": { "has-bigints": "^1.0.2" }, @@ -8049,7 +9353,6 @@ "version": "1.2.2", "resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.2.2.tgz", "integrity": "sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "has-tostringtag": "^1.0.2" @@ -8116,7 +9419,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/is-data-view/-/is-data-view-1.0.2.tgz", "integrity": "sha512-RKtWF8pGmS87i2D6gqQu/l7EYRlVdfzemCJN/P3UOs//x1QE7mfhvzHIApBTRf7axvT6DMGwSwBXYCT0nfB9xw==", - "dev": true, "dependencies": { "call-bound": "^1.0.2", "get-intrinsic": "^1.2.6", @@ -8133,7 +9435,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.1.0.tgz", "integrity": "sha512-PwwhEakHVKTdRNVOw+/Gyh0+MzlCl4R6qKvkhuvLtPMggI1WAHt9sOwZxQLSGpUaDnrdyDsomoRgNnCfKNSXXg==", - "dev": true, "dependencies": { "call-bound": "^1.0.2", "has-tostringtag": "^1.0.2" @@ -8145,6 +9446,16 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-decimal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-decimal/-/is-decimal-2.0.1.tgz", + "integrity": "sha512-AAB9hiomQs5DXWcRB1rqsxGUstbRroFOPPVAomNk/3XHR5JyEZChOyTWe2oayKnsSsr/kcGqF+z6yuH6HHpN0A==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/is-docker": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-2.2.1.tgz", @@ 
-8172,7 +9483,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/is-finalizationregistry/-/is-finalizationregistry-1.1.1.tgz", "integrity": "sha512-1pC6N8qWJbWoPtEjgcL2xyhQOP491EQjeUo3qTKcmV8YSDDJrOepfG8pcC7h/QgnQHYSv0mJ3Z/ZWxmatVrysg==", - "dev": true, "dependencies": { "call-bound": "^1.0.3" }, @@ -8243,6 +9553,16 @@ "node": ">=0.10.0" } }, + "node_modules/is-hexadecimal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-hexadecimal/-/is-hexadecimal-2.0.1.tgz", + "integrity": "sha512-DgZQp241c8oO6cA1SbTEWiXeoxV42vlcJxgH+B3hi1AiqqKruZR3ZGF8In3fj4+/y/7rHvlOZLZtgJ/4ttYGZg==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/is-iexplorer": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/is-iexplorer/-/is-iexplorer-1.0.0.tgz", @@ -8255,7 +9575,6 @@ "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz", "integrity": "sha512-1Qed0/Hr2m+YqxnM09CjA2d/i6YZNfF6R2oRAOj36eUdS6qIV/huPJNSEpKbupewFs+ZsJlxsjjPbc0/afW6Lw==", - "dev": true, "engines": { "node": ">= 0.4" }, @@ -8272,7 +9591,6 @@ "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.3.tgz", "integrity": "sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==", - "dev": true, "engines": { "node": ">= 0.4" }, @@ -8293,7 +9611,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/is-number-object/-/is-number-object-1.1.1.tgz", "integrity": "sha512-lZhclumE1G6VYD8VHe35wFaIif+CTy5SJIi5+3y4psDgWu4wPDoBhF8NxUOinEc7pHgiTsT6MaBb92rKhhD+Xw==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "has-tostringtag": "^1.0.2" @@ -8327,6 +9644,12 @@ "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==", "dev": true }, + "node_modules/is-promise": { + "version": "4.0.0", + "resolved": 
"https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz", + "integrity": "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==", + "license": "MIT" + }, "node_modules/is-regex": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.2.1.tgz", @@ -8348,7 +9671,6 @@ "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-set/-/is-set-2.0.3.tgz", "integrity": "sha512-iPAjerrse27/ygGLxw+EBR9agv9Y6uLeYVJMu+QNCoouJ1/1ri0mGrcWpfCqFZuzzx3WjtwxG098X+n4OuRkPg==", - "dev": true, "engines": { "node": ">= 0.4" }, @@ -8360,7 +9682,6 @@ "version": "1.0.4", "resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.4.tgz", "integrity": "sha512-ISWac8drv4ZGfwKl5slpHG9OwPNty4jOWPRIhBpxOoD+hqITiwuipOQ2bNthAzwA3B4fIjO4Nln74N0S9byq8A==", - "dev": true, "dependencies": { "call-bound": "^1.0.3" }, @@ -8387,7 +9708,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.1.1.tgz", "integrity": "sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "has-tostringtag": "^1.0.2" @@ -8413,7 +9733,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.1.1.tgz", "integrity": "sha512-9gGx6GTtCQM73BgmHQXfDmLtfjjTUDSyoxTCbp5WtoixAhfgsDirWIcVQ/IHpvI5Vgd5i/J5F7B9cN/WlVbC/w==", - "dev": true, "dependencies": { "call-bound": "^1.0.2", "has-symbols": "^1.1.0", @@ -8444,7 +9763,6 @@ "version": "2.0.2", "resolved": "https://registry.npmjs.org/is-weakmap/-/is-weakmap-2.0.2.tgz", "integrity": "sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==", - "dev": true, "engines": { "node": ">= 0.4" }, @@ -8456,7 +9774,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/is-weakref/-/is-weakref-1.1.1.tgz", "integrity": 
"sha512-6i9mGWSlqzNMEqpCp93KwRS1uUOodk2OJ6b+sq7ZPDSy2WuI5NFIxp/254TytR8ftefexkWn5xNiHUNpPOfSew==", - "dev": true, "dependencies": { "call-bound": "^1.0.3" }, @@ -8471,7 +9788,6 @@ "version": "2.0.4", "resolved": "https://registry.npmjs.org/is-weakset/-/is-weakset-2.0.4.tgz", "integrity": "sha512-mfcwb6IzQyOKTs84CQMrOwW4gQcaTOAWJ0zzJCl2WSPDrWk/OzDaImWFH3djXhb24g4eudZfLRozAvPGw4d9hQ==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "get-intrinsic": "^1.2.6" @@ -8967,6 +10283,16 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, + "node_modules/longest-streak": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz", + "integrity": "sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/loose-envify": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", @@ -9158,6 +10484,16 @@ "resolved": "https://registry.npmjs.org/tinyqueue/-/tinyqueue-3.0.0.tgz", "integrity": "sha512-gRa9gwYU3ECmQYv3lslts5hxuIa90veaEcxDYuu3QGOIAEM2mOZkVHp48ANJuu1CURtRdHKUBY5Lm1tHV+sD4g==" }, + "node_modules/markdown-table": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", + "integrity": "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/math-intrinsics": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", @@ -9174,58 +10510,924 @@ "node": ">=0.10.0" } }, - "node_modules/memoizerific": { - "version": "1.11.3", - "resolved": "https://registry.npmjs.org/memoizerific/-/memoizerific-1.11.3.tgz", - "integrity": 
"sha512-/EuHYwAPdLtXwAwSZkh/Gutery6pD2KYd44oQLhAvQp/50mpyduZh8Q7PYHXTCJ+wuXxt7oij2LXyIJOOYFPog==", - "dev": true, + "node_modules/mdast-util-find-and-replace": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mdast-util-find-and-replace/-/mdast-util-find-and-replace-3.0.2.tgz", + "integrity": "sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg==", + "license": "MIT", "dependencies": { - "map-or-similar": "^1.5.0" + "@types/mdast": "^4.0.0", + "escape-string-regexp": "^5.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/merge-stream": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", - "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", - "dev": true - }, - "node_modules/merge2": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", - "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", - "dev": true, + "node_modules/mdast-util-find-and-replace/node_modules/escape-string-regexp": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", + "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==", + "license": "MIT", "engines": { - "node": ">= 8" - } - }, - "node_modules/micromatch": { - "version": "4.0.8", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", - "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", - "dev": true, - "dependencies": { - "braces": "^3.0.3", - "picomatch": "^2.3.1" + "node": ">=12" }, - "engines": { - "node": ">=8.6" + "funding": { + 
"url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/micromatch/node_modules/picomatch": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", - "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", - "dev": true, - "engines": { - "node": ">=8.6" + "node_modules/mdast-util-from-markdown": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/mdast-util-from-markdown/-/mdast-util-from-markdown-2.0.3.tgz", + "integrity": "sha512-W4mAWTvSlKvf8L6J+VN9yLSqQ9AOAAvHuoDAmPkz4dHf553m5gVj2ejadHJhoJmcmxEnOv6Pa8XJhpxE93kb8Q==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "@types/unist": "^3.0.0", + "decode-named-character-reference": "^1.0.0", + "devlop": "^1.0.0", + "mdast-util-to-string": "^4.0.0", + "micromark": "^4.0.0", + "micromark-util-decode-numeric-character-reference": "^2.0.0", + "micromark-util-decode-string": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0", + "unist-util-stringify-position": "^4.0.0" }, "funding": { - "url": "https://github.com/sponsors/jonschlinkert" + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "node_modules/mdast-util-gfm": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", + "integrity": "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ==", + "license": "MIT", + "dependencies": { + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-gfm-autolink-literal": "^2.0.0", + "mdast-util-gfm-footnote": "^2.0.0", + "mdast-util-gfm-strikethrough": "^2.0.0", + "mdast-util-gfm-table": "^2.0.0", + "mdast-util-gfm-task-list-item": "^2.0.0", + 
"mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-autolink-literal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-autolink-literal/-/mdast-util-gfm-autolink-literal-2.0.1.tgz", + "integrity": "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "ccount": "^2.0.0", + "devlop": "^1.0.0", + "mdast-util-find-and-replace": "^3.0.0", + "micromark-util-character": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-footnote/-/mdast-util-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-sqpDWlsHn7Ac9GNZQMeUzPQSMzR6Wv0WKRNvQRg0KqHh02fpTz69Qc1QSseNX29bhz1ROIyNyxExfawVKTm1GQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.1.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-strikethrough": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-strikethrough/-/mdast-util-gfm-strikethrough-2.0.0.tgz", + "integrity": "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-table": { + "version": "2.0.0", + "resolved": 
"https://registry.npmjs.org/mdast-util-gfm-table/-/mdast-util-gfm-table-2.0.0.tgz", + "integrity": "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "markdown-table": "^3.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-task-list-item": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-task-list-item/-/mdast-util-gfm-task-list-item-2.0.0.tgz", + "integrity": "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-mdx-expression": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-mdx-expression/-/mdast-util-mdx-expression-2.0.1.tgz", + "integrity": "sha512-J6f+9hUp+ldTZqKRSg7Vw5V6MqjATc+3E4gf3CFNcuZNWD8XdyI6zQ8GqH7f8169MM6P7hMBRDVGnn7oHB9kXQ==", + "license": "MIT", + "dependencies": { + "@types/estree-jsx": "^1.0.0", + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-mdx-jsx": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/mdast-util-mdx-jsx/-/mdast-util-mdx-jsx-3.2.0.tgz", + "integrity": "sha512-lj/z8v0r6ZtsN/cGNNtemmmfoLAFZnjMbNyLzBafjzikOM+glrjNHPlf6lQDOTccj9n5b0PPihEBbhneMyGs1Q==", + "license": "MIT", + "dependencies": { 
+ "@types/estree-jsx": "^1.0.0", + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "@types/unist": "^3.0.0", + "ccount": "^2.0.0", + "devlop": "^1.1.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "parse-entities": "^4.0.0", + "stringify-entities": "^4.0.0", + "unist-util-stringify-position": "^4.0.0", + "vfile-message": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-mdxjs-esm": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-mdxjs-esm/-/mdast-util-mdxjs-esm-2.0.1.tgz", + "integrity": "sha512-EcmOpxsZ96CvlP03NghtH1EsLtr0n9Tm4lPUJUBccV9RwUOneqSycg19n5HGzCf+10LozMRSObtVr3ee1WoHtg==", + "license": "MIT", + "dependencies": { + "@types/estree-jsx": "^1.0.0", + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-phrasing": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-phrasing/-/mdast-util-phrasing-4.1.0.tgz", + "integrity": "sha512-TqICwyvJJpBwvGAMZjj4J2n0X8QWp21b9l0o7eXyVJ25YNWYbJDVIyD1bZXE6WtV6RmKJVYmQAKWa0zWOABz2w==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "unist-util-is": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-to-hast": { + "version": "13.2.1", + "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.1.tgz", + "integrity": "sha512-cctsq2wp5vTsLIcaymblUriiTcZd0CwWtCbLvrOzYCDZoWyMNV8sZ7krj09FSnsiJi3WVsHLM4k6Dq/yaPyCXA==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "@ungap/structured-clone": "^1.0.0", + "devlop": "^1.0.0", + 
"micromark-util-sanitize-uri": "^2.0.0", + "trim-lines": "^3.0.0", + "unist-util-position": "^5.0.0", + "unist-util-visit": "^5.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-to-markdown": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/mdast-util-to-markdown/-/mdast-util-to-markdown-2.1.2.tgz", + "integrity": "sha512-xj68wMTvGXVOKonmog6LwyJKrYXZPvlwabaryTjLh9LuvovB/KAH+kvi8Gjj+7rJjsFi23nkUxRQv1KqSroMqA==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "@types/unist": "^3.0.0", + "longest-streak": "^3.0.0", + "mdast-util-phrasing": "^4.0.0", + "mdast-util-to-string": "^4.0.0", + "micromark-util-classify-character": "^2.0.0", + "micromark-util-decode-string": "^2.0.0", + "unist-util-visit": "^5.0.0", + "zwitch": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-to-string": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-4.0.0.tgz", + "integrity": "sha512-0H44vDimn51F0YwvxSJSm0eCDOJTRlmN0R1yBh4HLj9wiV1Dn0QoXGbvFAWj2hSItVTlCmBF1hqKlIyUBVFLPg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/media-typer": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-1.1.0.tgz", + "integrity": "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/memoizerific": { + "version": "1.11.3", + "resolved": "https://registry.npmjs.org/memoizerific/-/memoizerific-1.11.3.tgz", + "integrity": "sha512-/EuHYwAPdLtXwAwSZkh/Gutery6pD2KYd44oQLhAvQp/50mpyduZh8Q7PYHXTCJ+wuXxt7oij2LXyIJOOYFPog==", + 
"dev": true, + "dependencies": { + "map-or-similar": "^1.5.0" + } + }, + "node_modules/merge-descriptors": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-2.0.0.tgz", + "integrity": "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/merge-stream": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", + "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", + "dev": true + }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/micromark": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.2.tgz", + "integrity": "sha512-zpe98Q6kvavpCr1NPVSCMebCKfD7CA2NqZ+rykeNhONIJBpc1tFKt9hucLGwha3jNTNI8lHpctWJWoimVF4PfA==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "@types/debug": "^4.0.0", + "debug": "^4.0.0", + "decode-named-character-reference": "^1.0.0", + "devlop": "^1.0.0", + "micromark-core-commonmark": "^2.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-chunked": "^2.0.0", + "micromark-util-combine-extensions": "^2.0.0", + "micromark-util-decode-numeric-character-reference": "^2.0.0", + "micromark-util-encode": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0", + 
"micromark-util-resolve-all": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-subtokenize": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-core-commonmark": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/micromark-core-commonmark/-/micromark-core-commonmark-2.0.3.tgz", + "integrity": "sha512-RDBrHEMSxVFLg6xvnXmb1Ayr2WzLAWjeSATAoxwKYJV94TeNavgoIdA0a9ytzDSVzBy2YKFK+emCPOEibLeCrg==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "decode-named-character-reference": "^1.0.0", + "devlop": "^1.0.0", + "micromark-factory-destination": "^2.0.0", + "micromark-factory-label": "^2.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-factory-title": "^2.0.0", + "micromark-factory-whitespace": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-chunked": "^2.0.0", + "micromark-util-classify-character": "^2.0.0", + "micromark-util-html-tag-name": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0", + "micromark-util-resolve-all": "^2.0.0", + "micromark-util-subtokenize": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-extension-gfm": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm/-/micromark-extension-gfm-3.0.0.tgz", + "integrity": "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==", + "license": "MIT", + "dependencies": { + "micromark-extension-gfm-autolink-literal": "^2.0.0", + "micromark-extension-gfm-footnote": "^2.0.0", + "micromark-extension-gfm-strikethrough": "^2.0.0", + "micromark-extension-gfm-table": "^2.0.0", + "micromark-extension-gfm-tagfilter": "^2.0.0", + 
"micromark-extension-gfm-task-list-item": "^2.0.0", + "micromark-util-combine-extensions": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-autolink-literal": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-autolink-literal/-/micromark-extension-gfm-autolink-literal-2.1.0.tgz", + "integrity": "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw==", + "license": "MIT", + "dependencies": { + "micromark-util-character": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-footnote/-/micromark-extension-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-core-commonmark": "^2.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-strikethrough": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-strikethrough/-/micromark-extension-gfm-strikethrough-2.1.0.tgz", + "integrity": "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw==", + "license": "MIT", + "dependencies": { 
+ "devlop": "^1.0.0", + "micromark-util-chunked": "^2.0.0", + "micromark-util-classify-character": "^2.0.0", + "micromark-util-resolve-all": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-table": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-table/-/micromark-extension-gfm-table-2.1.1.tgz", + "integrity": "sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-tagfilter": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-tagfilter/-/micromark-extension-gfm-tagfilter-2.0.0.tgz", + "integrity": "sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg==", + "license": "MIT", + "dependencies": { + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-task-list-item": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-task-list-item/-/micromark-extension-gfm-task-list-item-2.1.0.tgz", + "integrity": "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + 
"type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-factory-destination": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", + "integrity": "sha512-Xe6rDdJlkmbFRExpTOmRj9N3MaWmbAgdpSrBQvCFqhezUn4AHqJHbaEnfbVYYiexVSs//tqOdY/DxhjdCiJnIA==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-factory-label": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-factory-label/-/micromark-factory-label-2.0.1.tgz", + "integrity": "sha512-VFMekyQExqIW7xIChcXn4ok29YE3rnuyveW3wZQWWqF4Nv9Wk5rgJ99KzPvHjkmPXF93FXIbBp6YdW3t71/7Vg==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-factory-space": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-factory-space/-/micromark-factory-space-2.0.1.tgz", + "integrity": "sha512-zRkxjtBxxLd2Sc0d+fbnEunsTj46SWXgXciZmHq0kDYGnck/ZSGj9/wULTV95uoeYiK5hRXP2mJ98Uo4cq/LQg==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-character": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + 
"node_modules/micromark-factory-title": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-factory-title/-/micromark-factory-title-2.0.1.tgz", + "integrity": "sha512-5bZ+3CjhAd9eChYTHsjy6TGxpOFSKgKKJPJxr293jTbfry2KDoWkhBb6TcPVB4NmzaPhMs1Frm9AZH7OD4Cjzw==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-factory-whitespace": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-factory-whitespace/-/micromark-factory-whitespace-2.0.1.tgz", + "integrity": "sha512-Ob0nuZ3PKt/n0hORHyvoD9uZhr+Za8sFoP+OnMcnWK5lngSzALgQYKMr9RJVOWLqQYuyn6ulqGWSXdwf6F80lQ==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-util-character": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/micromark-util-character/-/micromark-util-character-2.1.1.tgz", + "integrity": "sha512-wv8tdUTJ3thSFFFJKtpYKOYiGP2+v96Hvk4Tu8KpCAsTMs6yi+nVmGh1syvSCsaxz45J6Jbw+9DD6g97+NV67Q==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-util-chunked": { 
+ "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-chunked/-/micromark-util-chunked-2.0.1.tgz", + "integrity": "sha512-QUNFEOPELfmvv+4xiNg2sRYeS/P84pTW0TCgP5zc9FpXetHY0ab7SxKyAQCNCc1eK0459uoLI1y5oO5Vc1dbhA==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-symbol": "^2.0.0" + } + }, + "node_modules/micromark-util-classify-character": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-classify-character/-/micromark-util-classify-character-2.0.1.tgz", + "integrity": "sha512-K0kHzM6afW/MbeWYWLjoHQv1sgg2Q9EccHEDzSkxiP/EaagNzCm7T/WMKZ3rjMbvIpvBiZgwR3dKMygtA4mG1Q==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-util-combine-extensions": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-combine-extensions/-/micromark-util-combine-extensions-2.0.1.tgz", + "integrity": "sha512-OnAnH8Ujmy59JcyZw8JSbK9cGpdVY44NKgSM7E9Eh7DiLS2E9RNQf0dONaGDzEG9yjEl5hcqeIsj4hfRkLH/Bg==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-chunked": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-util-decode-numeric-character-reference": { + "version": "2.0.2", + "resolved": 
"https://registry.npmjs.org/micromark-util-decode-numeric-character-reference/-/micromark-util-decode-numeric-character-reference-2.0.2.tgz", + "integrity": "sha512-ccUbYk6CwVdkmCQMyr64dXz42EfHGkPQlBj5p7YVGzq8I7CtjXZJrubAYezf7Rp+bjPseiROqe7G6foFd+lEuw==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-symbol": "^2.0.0" + } + }, + "node_modules/micromark-util-decode-string": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-decode-string/-/micromark-util-decode-string-2.0.1.tgz", + "integrity": "sha512-nDV/77Fj6eH1ynwscYTOsbK7rR//Uj0bZXBwJZRfaLEJ1iGBR6kIfNmlNqaqJf649EP0F3NWNdeJi03elllNUQ==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "decode-named-character-reference": "^1.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-decode-numeric-character-reference": "^2.0.0", + "micromark-util-symbol": "^2.0.0" + } + }, + "node_modules/micromark-util-encode": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-encode/-/micromark-util-encode-2.0.1.tgz", + "integrity": "sha512-c3cVx2y4KqUnwopcO9b/SCdo2O67LwJJ/UyqGfbigahfegL9myoEFoDYZgkT7f36T0bLrM9hZTAaAyH+PCAXjw==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT" + }, + "node_modules/micromark-util-html-tag-name": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-html-tag-name/-/micromark-util-html-tag-name-2.0.1.tgz", + "integrity": 
"sha512-2cNEiYDhCWKI+Gs9T0Tiysk136SnR13hhO8yW6BGNyhOC4qYFnwF1nKfD3HFAIXA5c45RrIG1ub11GiXeYd1xA==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT" + }, + "node_modules/micromark-util-normalize-identifier": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-normalize-identifier/-/micromark-util-normalize-identifier-2.0.1.tgz", + "integrity": "sha512-sxPqmo70LyARJs0w2UclACPUUEqltCkJ6PhKdMIDuJ3gSf/Q+/GIe3WKl0Ijb/GyH9lOpUkRAO2wp0GVkLvS9Q==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-symbol": "^2.0.0" + } + }, + "node_modules/micromark-util-resolve-all": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-resolve-all/-/micromark-util-resolve-all-2.0.1.tgz", + "integrity": "sha512-VdQyxFWFT2/FGJgwQnJYbe1jjQoNTS4RjglmSjTUlpUMa95Htx9NHeYW4rGDJzbjvCsl9eLjMQwGeElsqmzcHg==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-util-sanitize-uri": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-sanitize-uri/-/micromark-util-sanitize-uri-2.0.1.tgz", + "integrity": "sha512-9N9IomZ/YuGGZZmQec1MbgxtlgougxTodVwDzzEouPKo3qFWvymFHWcnDi2vzV1ff6kas9ucW+o3yzJK9YB1AQ==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + 
"micromark-util-character": "^2.0.0", + "micromark-util-encode": "^2.0.0", + "micromark-util-symbol": "^2.0.0" + } + }, + "node_modules/micromark-util-subtokenize": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-util-subtokenize/-/micromark-util-subtokenize-2.1.0.tgz", + "integrity": "sha512-XQLu552iSctvnEcgXw6+Sx75GflAPNED1qx7eBJ+wydBb2KCbRZe+NwvIEEMM83uml1+2WSXpBAcp9IUCgCYWA==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-util-chunked": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-util-symbol": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/micromark-util-symbol/-/micromark-util-symbol-2.0.1.tgz", + "integrity": "sha512-vs5t8Apaud9N28kgCrRUdEed4UJ+wWNvicHLPxCa9ENlYuAY31M0ETy5y1vA33YoNPDFTghEbnh6efaE8h4x0Q==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT" + }, + "node_modules/micromark-util-types": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/micromark-util-types/-/micromark-util-types-2.0.2.tgz", + "integrity": "sha512-Yw0ECSpJoViF1qTU4DC6NwtC4aWGt1EkzaQB8KPPyCRR8z9TWeV0HbEFGTO+ZY1wB22zmxnJqhPyTpOVCpeHTA==", + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT" + }, + "node_modules/micromatch": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", + "integrity": 
"sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", + "dev": true, + "dependencies": { + "braces": "^3.0.3", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/micromatch/node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "dev": true, + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", "engines": { "node": ">= 0.6" @@ -9340,6 +11542,93 @@ } } }, + "node_modules/molstar": { + "version": "5.7.0", + "resolved": "https://registry.npmjs.org/molstar/-/molstar-5.7.0.tgz", + "integrity": "sha512-Bo/QDiEkoRdhyhmFXNBPP2kiNTHZNgJO69AzqO+CrpkSj7JUrYNtBlt49vqa2AJfLUrBNgeeHcfjbxGLmfO7sQ==", + "license": "MIT", + "dependencies": { + "@types/argparse": "^2.0.17", + "@types/benchmark": "^2.1.5", + "@types/compression": "1.8.1", + "@types/express": "^5.0.6", + "@types/node": "^22.19.13", + "@types/node-fetch": "^2.6.13", + "@types/swagger-ui-dist": "3.30.6", + "argparse": "^2.0.1", + "compression": "^1.8.1", + "cors": "^2.8.6", + "express": "^5.2.1", + "h264-mp4-encoder": "^1.0.12", + "immutable": "^5.1.4", + "io-ts": "^2.2.22", + "mutative": "^1.3.0", + "node-fetch": "^2.7.0", + "react-markdown": "^10.1.0", + "remark-gfm": "^4.0.1", + "rxjs": "^7.8.2", + "swagger-ui-dist": "^5.32.0", + "tslib": "^2.8.1", + "util.promisify": "^1.1.3" + }, + "bin": { + "cif2bcif": "lib/commonjs/cli/cif2bcif/index.js", + "cifschema": "lib/commonjs/cli/cifschema/index.js", + "model-server": "lib/commonjs/servers/model/server.js", + 
"model-server-preprocess": "lib/commonjs/servers/model/preprocess.js", + "model-server-query": "lib/commonjs/servers/model/query.js", + "mvs-print-schema": "lib/commonjs/cli/mvs/mvs-print-schema.js", + "mvs-render": "lib/commonjs/cli/mvs/mvs-render.js", + "mvs-validate": "lib/commonjs/cli/mvs/mvs-validate.js", + "volume-server": "lib/commonjs/servers/volume/server.js", + "volume-server-pack": "lib/commonjs/servers/volume/pack.js", + "volume-server-query": "lib/commonjs/servers/volume/query.js" + }, + "engines": { + "node": ">=22.0.0" + }, + "peerDependencies": { + "@google-cloud/storage": "^7.14.0", + "canvas": "^2.11.2", + "gl": "^6.0.2", + "jpeg-js": "^0.4.4", + "pngjs": "^6.0.0", + "react": ">=16.14.0", + "react-dom": ">=16.14.0" + }, + "peerDependenciesMeta": { + "@google-cloud/storage": { + "optional": true + }, + "canvas": { + "optional": true + }, + "gl": { + "optional": true + }, + "jpeg-js": { + "optional": true + }, + "pngjs": { + "optional": true + } + } + }, + "node_modules/molstar/node_modules/@types/node": { + "version": "22.19.15", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.15.tgz", + "integrity": "sha512-F0R/h2+dsy5wJAUe3tAU6oqa2qbWY5TpNfL/RGmo1y38hiyO1w3x2jPtt76wmuaJI4DQnOBu21cNXQ2STIUUWg==", + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/molstar/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "license": "MIT" + }, "node_modules/moment": { "version": "2.30.1", "resolved": "https://registry.npmjs.org/moment/-/moment-2.30.1.tgz", @@ -9399,6 +11688,15 @@ "resolved": "https://registry.npmjs.org/murmurhash-js/-/murmurhash-js-1.0.0.tgz", "integrity": "sha512-TvmkNhkv8yct0SVBSy+o8wYzXjE4Zz3PCesbfs8HiCXXdcTuocApFv11UWlNFWKYsP2okqrhb7JNlSm9InBhIw==" }, + "node_modules/mutative": { + 
"version": "1.3.0", + "resolved": "https://registry.npmjs.org/mutative/-/mutative-1.3.0.tgz", + "integrity": "sha512-8MJj6URmOZAV70dpFe1YnSppRTKC4DsMkXQiBDFayLcDI4ljGokHxmpqaBQuDWa4iAxWaJJ1PS8vAmbntjjKmQ==", + "license": "MIT", + "engines": { + "node": ">=14.0" + } + }, "node_modules/nanoid": { "version": "3.3.11", "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", @@ -9466,6 +11764,15 @@ "ms": "^2.1.1" } }, + "node_modules/negotiator": { + "version": "0.6.4", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.4.tgz", + "integrity": "sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, "node_modules/next-tick": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/next-tick/-/next-tick-1.1.0.tgz", @@ -9481,6 +11788,56 @@ "tslib": "^2.0.3" } }, + "node_modules/node-addon-api": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-7.1.1.tgz", + "integrity": "sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==", + "dev": true, + "license": "MIT", + "optional": true + }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/node-fetch/node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, + 
"node_modules/node-fetch/node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/node-fetch/node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/node-releases": { "version": "2.0.20", "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.20.tgz", @@ -9548,7 +11905,6 @@ "version": "1.13.4", "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz", "integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==", - "dev": true, "engines": { "node": ">= 0.4" }, @@ -9560,7 +11916,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==", - "dev": true, "engines": { "node": ">= 0.4" } @@ -9569,7 +11924,6 @@ "version": "4.1.7", "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.7.tgz", "integrity": "sha512-nK28WOo+QIjBkDduTINE4JkF/UJJKyf2EJxvJKfblDpyg0Q+pkOHNTL0Qwy6NP6FhE/EnzV73BxxqcJaXY9anw==", - "dev": true, "dependencies": { "call-bind": "^1.0.8", "call-bound": "^1.0.3", @@ -9618,6 +11972,27 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/object.getownpropertydescriptors": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/object.getownpropertydescriptors/-/object.getownpropertydescriptors-2.1.9.tgz", + 
"integrity": "sha512-mt8YM6XwsTTovI+kdZdHSxoyF2DI59up034orlC9NfweclcWOt7CVascNNLp6U+bjFVCVCIh9PwS76tDM/rH8g==", + "license": "MIT", + "dependencies": { + "array.prototype.reduce": "^1.0.8", + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.24.0", + "es-object-atoms": "^1.1.1", + "gopd": "^1.2.0", + "safe-array-concat": "^1.1.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/object.groupby": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/object.groupby/-/object.groupby-1.0.3.tgz", @@ -9650,6 +12025,27 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/on-finished": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz", + "integrity": "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==", + "license": "MIT", + "dependencies": { + "ee-first": "1.1.1" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/on-headers": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.1.0.tgz", + "integrity": "sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -9710,7 +12106,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/own-keys/-/own-keys-1.0.1.tgz", "integrity": "sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==", - "dev": true, "dependencies": { "get-intrinsic": "^1.2.6", "object-keys": "^1.1.1", @@ -9775,6 +12170,31 @@ "resolved": "https://registry.npmjs.org/parenthesis/-/parenthesis-3.1.8.tgz", "integrity": "sha512-KF/U8tk54BgQewkJPvB4s/US3VQY68BRDpH638+7O/n58TpnwiwnOtGIOsT2/i+M78s61BBpeC83STB88d8sqw==" }, + 
"node_modules/parse-entities": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/parse-entities/-/parse-entities-4.0.2.tgz", + "integrity": "sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "character-entities-legacy": "^3.0.0", + "character-reference-invalid": "^2.0.0", + "decode-named-character-reference": "^1.0.0", + "is-alphanumerical": "^2.0.0", + "is-decimal": "^2.0.0", + "is-hexadecimal": "^2.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/parse-entities/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, "node_modules/parse-json": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", @@ -9822,6 +12242,15 @@ "url": "https://github.com/inikulin/parse5?sponsor=1" } }, + "node_modules/parseurl": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", + "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/path-exists": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", @@ -9867,6 +12296,16 @@ "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", "dev": true }, + "node_modules/path-to-regexp": { + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-8.3.0.tgz", + "integrity": "sha512-7jdwVIRtsP8MYpdXSwOS0YdD0Du+qOoF/AEPIt88PcCFrZCzx41oxku1jD88hZBwbNUIEfpqvuhjFaMAqMTWnA==", + "license": 
"MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/path-type": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", @@ -10166,11 +12605,34 @@ "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==" }, + "node_modules/property-information": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz", + "integrity": "sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/protocol-buffers-schema": { "version": "3.6.0", "resolved": "https://registry.npmjs.org/protocol-buffers-schema/-/protocol-buffers-schema-3.6.0.tgz", "integrity": "sha512-TdDRD+/QNdrCGCE7v8340QyuXd4kIWIgapsE2+n/SaGiSSbomYl4TjHlvIoCWRpE7wFt02EpB35VVA2ImcBVqw==" }, + "node_modules/proxy-addr": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", + "integrity": "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==", + "license": "MIT", + "dependencies": { + "forwarded": "0.2.0", + "ipaddr.js": "1.9.1" + }, + "engines": { + "node": ">= 0.10" + } + }, "node_modules/proxy-from-env": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", @@ -10185,6 +12647,21 @@ "node": ">=6" } }, + "node_modules/qs": { + "version": "6.15.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.15.0.tgz", + "integrity": "sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ==", + "license": "BSD-3-Clause", + "dependencies": { + "side-channel": "^1.1.0" + }, + "engines": { + "node": 
">=0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/queue-microtask": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", @@ -10218,6 +12695,46 @@ "performance-now": "^2.1.0" } }, + "node_modules/range-parser": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", + "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/raw-body": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-3.0.2.tgz", + "integrity": "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==", + "license": "MIT", + "dependencies": { + "bytes": "~3.1.2", + "http-errors": "~2.0.1", + "iconv-lite": "~0.7.0", + "unpipe": "~1.0.0" + }, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/raw-body/node_modules/iconv-lite": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", + "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/react": { "version": "18.3.1", "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", @@ -10314,6 +12831,33 @@ "resolved": "https://registry.npmjs.org/react-is/-/react-is-19.1.1.tgz", "integrity": "sha512-tr41fA15Vn8p4X9ntI+yCyeGSf1TlYaY5vlTZfQmeLBrFo3psOPX6HhTDnFNL9uj3EhP0KAQ80cugCl4b4BERA==" }, + "node_modules/react-markdown": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-10.1.0.tgz", + "integrity": 
"sha512-qKxVopLT/TyA6BX3Ue5NwabOsAzm0Q7kAPwq6L+wWDwisYs7R8vZ0nRXqq6rkueboxpkjvLGU9fWifiX/ZZFxQ==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "hast-util-to-jsx-runtime": "^2.0.0", + "html-url-attributes": "^3.0.0", + "mdast-util-to-hast": "^13.0.0", + "remark-parse": "^11.0.0", + "remark-rehype": "^11.0.0", + "unified": "^11.0.0", + "unist-util-visit": "^5.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + }, + "peerDependencies": { + "@types/react": ">=18", + "react": ">=18" + } + }, "node_modules/react-plotly.js": { "version": "2.6.0", "resolved": "https://registry.npmjs.org/react-plotly.js/-/react-plotly.js-2.6.0.tgz", @@ -10410,6 +12954,21 @@ "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" }, + "node_modules/readdirp": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.1.2.tgz", + "integrity": "sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==", + "dev": true, + "license": "MIT", + "optional": true, + "engines": { + "node": ">= 14.18.0" + }, + "funding": { + "type": "individual", + "url": "https://paulmillr.com/funding/" + } + }, "node_modules/recast": { "version": "0.23.11", "resolved": "https://registry.npmjs.org/recast/-/recast-0.23.11.tgz", @@ -10462,7 +13021,6 @@ "version": "1.0.10", "resolved": "https://registry.npmjs.org/reflect.getprototypeof/-/reflect.getprototypeof-1.0.10.tgz", "integrity": "sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==", - "dev": true, "dependencies": { "call-bind": "^1.0.8", "define-properties": "^1.2.1", @@ -10484,7 +13042,6 @@ "version": "1.5.4", "resolved": 
"https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.5.4.tgz", "integrity": "sha512-dYqgNSZbDwkaJ2ceRd9ojCGjBq+mOm9LmtXnAnEGyHhN/5R7iDW2TRw3h+o/jCFxus3P2LfWIIiwowAjANm7IA==", - "dev": true, "dependencies": { "call-bind": "^1.0.8", "define-properties": "^1.2.1", @@ -10591,6 +13148,72 @@ "regl-scatter2d": "^3.2.3" } }, + "node_modules/remark-gfm": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/remark-gfm/-/remark-gfm-4.0.1.tgz", + "integrity": "sha512-1quofZ2RQ9EWdeN34S79+KExV1764+wCUGop5CPL1WGdD0ocPpu91lzPGbwWMECpEpd42kJGQwzRfyov9j4yNg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-gfm": "^3.0.0", + "micromark-extension-gfm": "^3.0.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-parse": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", + "integrity": "sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-from-markdown": "^2.0.0", + "micromark-util-types": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-rehype": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/remark-rehype/-/remark-rehype-11.1.2.tgz", + "integrity": "sha512-Dh7l57ianaEoIpzbp0PC9UKAdCSVklD8E5Rpw7ETfbTl3FqcOOgq5q2LVDhgGCkaBv7p24JXikPdvhhmHvKMsw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "mdast-util-to-hast": "^13.0.0", + "unified": "^11.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + 
"node_modules/remark-stringify": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz", + "integrity": "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-to-markdown": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/reselect": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/reselect/-/reselect-5.1.1.tgz", @@ -10732,6 +13355,22 @@ "fsevents": "~2.3.2" } }, + "node_modules/router": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/router/-/router-2.2.0.tgz", + "integrity": "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.4.0", + "depd": "^2.0.0", + "is-promise": "^4.0.0", + "parseurl": "^1.3.3", + "path-to-regexp": "^8.0.0" + }, + "engines": { + "node": ">= 18" + } + }, "node_modules/rrweb-cssom": { "version": "0.8.0", "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.8.0.tgz", @@ -10766,11 +13405,19 @@ "resolved": "https://registry.npmjs.org/rw/-/rw-1.3.3.tgz", "integrity": "sha512-PdhdWy89SiZogBLaw42zdeqtRJ//zFd2PgQavcICDUgJT5oW10QCRKbJ6bg4r0/UY2M6BWd5tkxuGFRvCkgfHQ==" }, + "node_modules/rxjs": { + "version": "7.8.2", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.2.tgz", + "integrity": "sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==", + "license": "Apache-2.0", + "dependencies": { + "tslib": "^2.1.0" + } + }, "node_modules/safe-array-concat": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.3.tgz", "integrity": "sha512-AURm5f0jYEOydBj7VQlVvDrjeFgthDdEF5H1dP+6mNpoXOMo1quQqJ4wvJDyRZ9+pO3kGWoOdmV08cSv2aJV6Q==", - 
"dev": true, "dependencies": { "call-bind": "^1.0.8", "call-bound": "^1.0.2", @@ -10808,7 +13455,6 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/safe-push-apply/-/safe-push-apply-1.0.0.tgz", "integrity": "sha512-iKE9w/Z7xCzUMIZqdBsp6pEQvwuEebH4vdpjcDWnyzaI6yl6O9FHvVpmGelvEHNsoY6wGblkxR6Zty/h00WiSA==", - "dev": true, "dependencies": { "es-errors": "^1.3.0", "isarray": "^2.0.5" @@ -10841,6 +13487,392 @@ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, + "node_modules/sass": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass/-/sass-1.98.0.tgz", + "integrity": "sha512-+4N/u9dZ4PrgzGgPlKnaaRQx64RO0JBKs9sDhQ2pLgN6JQZ25uPQZKQYaBJU48Kd5BxgXoJ4e09Dq7nMcOUW3A==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "chokidar": "^4.0.0", + "immutable": "^5.1.5", + "source-map-js": ">=0.6.2 <2.0.0" + }, + "bin": { + "sass": "sass.js" + }, + "engines": { + "node": ">=14.0.0" + }, + "optionalDependencies": { + "@parcel/watcher": "^2.4.1" + } + }, + "node_modules/sass-embedded": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded/-/sass-embedded-1.98.0.tgz", + "integrity": "sha512-Do7u6iRb6K+lrllcTkB1BXcHwOxcKe3rEfOF/GcCLE2w3WpddakRAosJOHFUR37DpsvimQXEt5abs3NzUjEIqg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@bufbuild/protobuf": "^2.5.0", + "colorjs.io": "^0.5.0", + "immutable": "^5.1.5", + "rxjs": "^7.4.0", + "supports-color": "^8.1.1", + "sync-child-process": "^1.0.2", + "varint": "^6.0.0" + }, + "bin": { + "sass": "dist/bin/sass.js" + }, + "engines": { + "node": ">=16.0.0" + }, + "optionalDependencies": { + "sass-embedded-all-unknown": "1.98.0", + "sass-embedded-android-arm": "1.98.0", + "sass-embedded-android-arm64": "1.98.0", + "sass-embedded-android-riscv64": "1.98.0", + "sass-embedded-android-x64": "1.98.0", + 
"sass-embedded-darwin-arm64": "1.98.0", + "sass-embedded-darwin-x64": "1.98.0", + "sass-embedded-linux-arm": "1.98.0", + "sass-embedded-linux-arm64": "1.98.0", + "sass-embedded-linux-musl-arm": "1.98.0", + "sass-embedded-linux-musl-arm64": "1.98.0", + "sass-embedded-linux-musl-riscv64": "1.98.0", + "sass-embedded-linux-musl-x64": "1.98.0", + "sass-embedded-linux-riscv64": "1.98.0", + "sass-embedded-linux-x64": "1.98.0", + "sass-embedded-unknown-all": "1.98.0", + "sass-embedded-win32-arm64": "1.98.0", + "sass-embedded-win32-x64": "1.98.0" + } + }, + "node_modules/sass-embedded-all-unknown": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-all-unknown/-/sass-embedded-all-unknown-1.98.0.tgz", + "integrity": "sha512-6n4RyK7/1mhdfYvpP3CClS3fGoYqDvRmLClCESS6I7+SAzqjxvGG6u5Fo+cb1nrPNbbilgbM4QKdgcgWHO9NCA==", + "cpu": [ + "!arm", + "!arm64", + "!riscv64", + "!x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "sass": "1.98.0" + } + }, + "node_modules/sass-embedded-android-arm": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-android-arm/-/sass-embedded-android-arm-1.98.0.tgz", + "integrity": "sha512-LjGiMhHgu7VL1n7EJxTCre1x14bUsWd9d3dnkS2rku003IWOI/fxc7OXgaKagoVzok1kv09rzO3vFXJR5ZeONQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-android-arm64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-android-arm64/-/sass-embedded-android-arm64-1.98.0.tgz", + "integrity": "sha512-M9Ra98A6vYJHpwhoC/5EuH1eOshQ9ZyNwC8XifUDSbRl/cGeQceT1NReR9wFj3L7s1pIbmes1vMmaY2np0uAKQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-android-riscv64": { + "version": "1.98.0", + "resolved": 
"https://registry.npmjs.org/sass-embedded-android-riscv64/-/sass-embedded-android-riscv64-1.98.0.tgz", + "integrity": "sha512-WPe+0NbaJIZE1fq/RfCZANMeIgmy83x4f+SvFOG7LhUthHpZWcOcrPTsCKKmN3xMT3iw+4DXvqTYOCYGRL3hcQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-android-x64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-android-x64/-/sass-embedded-android-x64-1.98.0.tgz", + "integrity": "sha512-zrD25dT7OHPEgLWuPEByybnIfx4rnCtfge4clBgjZdZ3lF6E7qNLRBtSBmoFflh6Vg0RlEjJo5VlpnTMBM5MQQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-darwin-arm64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-darwin-arm64/-/sass-embedded-darwin-arm64-1.98.0.tgz", + "integrity": "sha512-cgr1z9rBnCdMf8K+JabIaYd9Rag2OJi5mjq08XJfbJGMZV/TA6hFJCLGkr5/+ZOn4/geTM5/3aSfQ8z5EIJAOg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-darwin-x64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-darwin-x64/-/sass-embedded-darwin-x64-1.98.0.tgz", + "integrity": "sha512-OLBOCs/NPeiMqTdOrMFbVHBQFj19GS3bSVSxIhcCq16ZyhouUkYJEZjxQgzv9SWA2q6Ki8GCqp4k6jMeUY9dcA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-arm": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm/-/sass-embedded-linux-arm-1.98.0.tgz", + "integrity": "sha512-03baQZCxVyEp8v1NWBRlzGYrmVT/LK7ZrHlF1piscGiGxwfdxoLXVuxsylx3qn/dD/4i/rh7Bzk7reK1br9jvQ==", + "cpu": [ + 
"arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-arm64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm64/-/sass-embedded-linux-arm64-1.98.0.tgz", + "integrity": "sha512-axOE3t2MTBwCtkUCbrdM++Gj0gC0fdHJPrgzQ+q1WUmY9NoNMGqflBtk5mBZaWUeha2qYO3FawxCB8lctFwCtw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-arm": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm/-/sass-embedded-linux-musl-arm-1.98.0.tgz", + "integrity": "sha512-OBkjTDPYR4hSaueOGIM6FDpl9nt/VZwbSRpbNu9/eEJcxE8G/vynRugW8KRZmCFjPy8j/jkGBvvS+k9iOqKV3g==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-arm64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm64/-/sass-embedded-linux-musl-arm64-1.98.0.tgz", + "integrity": "sha512-LeqNxQA8y4opjhe68CcFvMzCSrBuJqYVFbwElEj9bagHXQHTp9xVPJRn6VcrC+0VLEDq13HVXMv7RslIuU0zmA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-riscv64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-riscv64/-/sass-embedded-linux-musl-riscv64-1.98.0.tgz", + "integrity": "sha512-7w6hSuOHKt8FZsmjRb3iGSxEzM87fO9+M8nt5JIQYMhHTj5C+JY/vcske0v715HCVj5e1xyTnbGXf8FcASeAIw==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-x64": 
{ + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-x64/-/sass-embedded-linux-musl-x64-1.98.0.tgz", + "integrity": "sha512-QikNyDEJOVqPmxyCFkci8ZdCwEssdItfjQFJB+D+Uy5HFqcS5Lv3d3GxWNX/h1dSb23RPyQdQc267ok5SbEyJw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-riscv64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-riscv64/-/sass-embedded-linux-riscv64-1.98.0.tgz", + "integrity": "sha512-E7fNytc/v4xFBQKzgzBddV/jretA4ULAPO6XmtBiQu4zZBdBozuSxsQLe2+XXeb0X4S2GIl72V7IPABdqke/vA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-x64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-x64/-/sass-embedded-linux-x64-1.98.0.tgz", + "integrity": "sha512-VsvP0t/uw00mMNPv3vwyYKUrFbqzxQHnRMO+bHdAMjvLw4NFf6mscpym9Bzf+NXwi1ZNKnB6DtXjmcpcvqFqYg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-unknown-all": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-unknown-all/-/sass-embedded-unknown-all-1.98.0.tgz", + "integrity": "sha512-C4MMzcAo3oEDQnW7L8SBgB9F2Fq5qHPnaYTZRMOH3Mp/7kM4OooBInXpCiiFjLnjY95hzP4KyctVx0uYR6MYlQ==", + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "!android", + "!darwin", + "!linux", + "!win32" + ], + "dependencies": { + "sass": "1.98.0" + } + }, + "node_modules/sass-embedded-win32-arm64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-win32-arm64/-/sass-embedded-win32-arm64-1.98.0.tgz", + "integrity": 
"sha512-nP/10xbAiPbhQkMr3zQfXE4TuOxPzWRQe1Hgbi90jv2R4TbzbqQTuZVOaJf7KOAN4L2Bo6XCTRjK5XkVnwZuwQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-win32-x64": { + "version": "1.98.0", + "resolved": "https://registry.npmjs.org/sass-embedded-win32-x64/-/sass-embedded-win32-x64-1.98.0.tgz", + "integrity": "sha512-/lbrVsfbcbdZQ5SJCWcV0NVPd6YRs+FtAnfedp4WbCkO/ZO7Zt/58MvI4X2BVpRY/Nt5ZBo1/7v2gYcQ+J4svQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded/node_modules/supports-color": { + "version": "8.1.1", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz", + "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/supports-color?sponsor=1" + } + }, "node_modules/sax": { "version": "1.4.1", "resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz", @@ -10875,6 +13907,76 @@ "semver": "bin/semver.js" } }, + "node_modules/send": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/send/-/send-1.2.1.tgz", + "integrity": "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.4.3", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "etag": "^1.8.1", + "fresh": "^2.0.0", + "http-errors": "^2.0.1", + "mime-types": "^3.0.2", + "ms": "^2.1.3", + "on-finished": "^2.4.1", + "range-parser": "^1.2.1", + "statuses": "^2.0.2" + }, + "engines": { + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } 
+ }, + "node_modules/send/node_modules/mime-db": { + "version": "1.54.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", + "integrity": "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/send/node_modules/mime-types": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-3.0.2.tgz", + "integrity": "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==", + "license": "MIT", + "dependencies": { + "mime-db": "^1.54.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/serve-static": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-2.2.1.tgz", + "integrity": "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==", + "license": "MIT", + "dependencies": { + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "parseurl": "^1.3.3", + "send": "^1.2.0" + }, + "engines": { + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/set-cookie-parser": { "version": "2.7.1", "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.7.1.tgz", @@ -10900,7 +14002,6 @@ "version": "2.0.2", "resolved": "https://registry.npmjs.org/set-function-name/-/set-function-name-2.0.2.tgz", "integrity": "sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==", - "dev": true, "dependencies": { "define-data-property": "^1.1.4", "es-errors": "^1.3.0", @@ -10915,7 +14016,6 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/set-proto/-/set-proto-1.0.0.tgz", "integrity": 
"sha512-RJRdvCo6IAnPdsvP/7m6bsQqNnn1FCBX5ZNtFL98MmFF/4xAIJTIg1YbHW5DC2W5SKZanrC6i4HsJqlajw/dZw==", - "dev": true, "dependencies": { "dunder-proto": "^1.0.1", "es-errors": "^1.3.0", @@ -10925,6 +14025,12 @@ "node": ">= 0.4" } }, + "node_modules/setprototypeof": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", + "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==", + "license": "ISC" + }, "node_modules/shallow-copy": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/shallow-copy/-/shallow-copy-0.0.1.tgz", @@ -10960,7 +14066,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz", "integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==", - "dev": true, "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3", @@ -10979,7 +14084,6 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz", "integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==", - "dev": true, "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3" @@ -10995,7 +14099,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz", "integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==", - "dev": true, "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", @@ -11013,7 +14116,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz", "integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==", - "dev": true, "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", @@ -11105,6 +14207,16 @@ "node": ">=0.10.0" } }, + 
"node_modules/space-separated-tokens": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz", + "integrity": "sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/stable-hash": { "version": "0.0.5", "resolved": "https://registry.npmjs.org/stable-hash/-/stable-hash-0.0.5.tgz", @@ -11133,6 +14245,15 @@ "escodegen": "^2.1.0" } }, + "node_modules/statuses": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz", + "integrity": "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, "node_modules/std-env": { "version": "3.9.0", "resolved": "https://registry.npmjs.org/std-env/-/std-env-3.9.0.tgz", @@ -11143,7 +14264,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/stop-iteration-iterator/-/stop-iteration-iterator-1.1.0.tgz", "integrity": "sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==", - "dev": true, "dependencies": { "es-errors": "^1.3.0", "internal-slot": "^1.1.0" @@ -11333,7 +14453,6 @@ "version": "1.2.10", "resolved": "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.10.tgz", "integrity": "sha512-Rs66F0P/1kedk5lyYyH9uBzuiI/kNRmwJAR9quK6VOtIpZ2G+hMZd+HQbbv25MgCA6gEffoMZYxlTod4WcdrKA==", - "dev": true, "dependencies": { "call-bind": "^1.0.8", "call-bound": "^1.0.2", @@ -11354,7 +14473,6 @@ "version": "1.0.9", "resolved": "https://registry.npmjs.org/string.prototype.trimend/-/string.prototype.trimend-1.0.9.tgz", "integrity": "sha512-G7Ok5C6E/j4SGfyLCloXTrngQIQU3PWtXGst3yM7Bea9FRURf1S42ZHlZZtsNque2FN2PoUhfZXYLNWwEr4dLQ==", - "dev": true, "dependencies": { "call-bind": "^1.0.8", "call-bound": 
"^1.0.2", @@ -11372,7 +14490,6 @@ "version": "1.0.8", "resolved": "https://registry.npmjs.org/string.prototype.trimstart/-/string.prototype.trimstart-1.0.8.tgz", "integrity": "sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==", - "dev": true, "dependencies": { "call-bind": "^1.0.7", "define-properties": "^1.2.1", @@ -11385,6 +14502,20 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/stringify-entities": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/stringify-entities/-/stringify-entities-4.0.4.tgz", + "integrity": "sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==", + "license": "MIT", + "dependencies": { + "character-entities-html4": "^2.0.0", + "character-entities-legacy": "^3.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/strip-ansi": { "version": "7.1.2", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.2.tgz", @@ -11496,6 +14627,24 @@ "resolved": "https://registry.npmjs.org/strongly-connected-components/-/strongly-connected-components-1.0.1.tgz", "integrity": "sha512-i0TFx4wPcO0FwX+4RkLJi1MxmcTv90jNZgxMu9XRnMXMeFUY1VJlIoXpZunPUvUUqbCT1pg5PEkFqqpcaElNaA==" }, + "node_modules/style-to-js": { + "version": "1.1.21", + "resolved": "https://registry.npmjs.org/style-to-js/-/style-to-js-1.1.21.tgz", + "integrity": "sha512-RjQetxJrrUJLQPHbLku6U/ocGtzyjbJMP9lCNK7Ag0CNh690nSH8woqWH9u16nMjYBAok+i7JO1NP2pOy8IsPQ==", + "license": "MIT", + "dependencies": { + "style-to-object": "1.0.14" + } + }, + "node_modules/style-to-object": { + "version": "1.0.14", + "resolved": "https://registry.npmjs.org/style-to-object/-/style-to-object-1.0.14.tgz", + "integrity": "sha512-LIN7rULI0jBscWQYaSswptyderlarFkjQ+t79nzty8tcIAceVomEVlLzH5VP4Cmsv6MtKhs7qaAiwlcp+Mgaxw==", + "license": "MIT", + "dependencies": { + "inline-style-parser": "0.2.7" + } + }, 
"node_modules/styled-components": { "version": "6.1.19", "resolved": "https://registry.npmjs.org/styled-components/-/styled-components-6.1.19.tgz", @@ -11639,12 +14788,44 @@ "svg-path-bounds": "^1.0.1" } }, + "node_modules/swagger-ui-dist": { + "version": "5.32.0", + "resolved": "https://registry.npmjs.org/swagger-ui-dist/-/swagger-ui-dist-5.32.0.tgz", + "integrity": "sha512-nKZB0OuDvacB0s/lC2gbge+RigYvGRGpLLMWMFxaTUwfM+CfndVk9Th2IaTinqXiz6Mn26GK2zriCpv6/+5m3Q==", + "license": "Apache-2.0", + "dependencies": { + "@scarf/scarf": "=1.4.0" + } + }, "node_modules/symbol-tree": { "version": "3.2.4", "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==", "dev": true }, + "node_modules/sync-child-process": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/sync-child-process/-/sync-child-process-1.0.2.tgz", + "integrity": "sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==", + "dev": true, + "license": "MIT", + "dependencies": { + "sync-message-port": "^1.0.0" + }, + "engines": { + "node": ">=16.0.0" + } + }, + "node_modules/sync-message-port": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/sync-message-port/-/sync-message-port-1.2.0.tgz", + "integrity": "sha512-gAQ9qrUN/UCypHtGFbbe7Rc/f9bzO88IwrG8TDo/aMKAApKyD6E3W4Cm0EfhfBb6Z6SKt59tTCTfD+n1xmAvMg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=16.0.0" + } + }, "node_modules/through2": { "version": "2.0.5", "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.5.tgz", @@ -11767,6 +14948,15 @@ "node": ">=8.0" } }, + "node_modules/toidentifier": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", + "integrity": "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==", + "license": "MIT", + 
"engines": { + "node": ">=0.6" + } + }, "node_modules/topojson-client": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/topojson-client/-/topojson-client-3.1.0.tgz", @@ -11809,6 +14999,26 @@ "node": ">=18" } }, + "node_modules/trim-lines": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/trim-lines/-/trim-lines-3.0.1.tgz", + "integrity": "sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/trough": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/trough/-/trough-2.2.0.tgz", + "integrity": "sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/ts-api-utils": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.1.0.tgz", @@ -11904,11 +15114,49 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/type-is": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz", + "integrity": "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==", + "license": "MIT", + "dependencies": { + "content-type": "^1.0.5", + "media-typer": "^1.1.0", + "mime-types": "^3.0.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/type-is/node_modules/mime-db": { + "version": "1.54.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", + "integrity": "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/type-is/node_modules/mime-types": { + "version": "3.0.2", + "resolved": 
"https://registry.npmjs.org/mime-types/-/mime-types-3.0.2.tgz", + "integrity": "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==", + "license": "MIT", + "dependencies": { + "mime-db": "^1.54.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/typed-array-buffer": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.3.tgz", "integrity": "sha512-nAYYwfY3qnzX30IkA6AQZjVbtK6duGontcQm1WSG1MD94YLqK0515GNApXkoxKOWMusVssAHWLh9SeaoefYFGw==", - "dev": true, "dependencies": { "call-bound": "^1.0.3", "es-errors": "^1.3.0", @@ -11922,7 +15170,6 @@ "version": "1.0.3", "resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.3.tgz", "integrity": "sha512-BaXgOuIxz8n8pIq3e7Atg/7s+DpiYrxn4vdot3w9KbnBhcRQq6o3xemQdIfynqSeXeDrF32x+WvfzmOjPiY9lg==", - "dev": true, "dependencies": { "call-bind": "^1.0.8", "for-each": "^0.3.3", @@ -11941,7 +15188,6 @@ "version": "1.0.4", "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.4.tgz", "integrity": "sha512-bTlAFB/FBYMcuX81gbL4OcpH5PmlFHqlCCpAl8AlEzMz5k53oNDvN8p1PNOWLEmI2x4orp3raOFB51tv9X+MFQ==", - "dev": true, "dependencies": { "available-typed-arrays": "^1.0.7", "call-bind": "^1.0.8", @@ -11962,7 +15208,6 @@ "version": "1.0.7", "resolved": "https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.7.tgz", "integrity": "sha512-3KS2b+kL7fsuk/eJZ7EQdnEmQoaho/r6KUef7hxvltNA5DR8NAUM+8wJMbJyZ4G9/7i3v5zPBIMN5aybAh2/Jg==", - "dev": true, "dependencies": { "call-bind": "^1.0.7", "for-each": "^0.3.3", @@ -12032,7 +15277,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", "integrity": "sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==", - "dev": true, 
"dependencies": { "call-bound": "^1.0.3", "has-bigints": "^1.0.2", @@ -12051,6 +15295,105 @@ "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.10.0.tgz", "integrity": "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag==" }, + "node_modules/unified": { + "version": "11.0.5", + "resolved": "https://registry.npmjs.org/unified/-/unified-11.0.5.tgz", + "integrity": "sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "bail": "^2.0.0", + "devlop": "^1.0.0", + "extend": "^3.0.0", + "is-plain-obj": "^4.0.0", + "trough": "^2.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unified/node_modules/is-plain-obj": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", + "integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/unist-util-is": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-6.0.1.tgz", + "integrity": "sha512-LsiILbtBETkDz8I9p1dQ0uyRUWuaQzd/cuEeS1hoRSyW5E5XGmTzlwY1OrNzzakGowI9Dr/I8HVaw4hTtnxy8g==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-position": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/unist-util-position/-/unist-util-position-5.0.0.tgz", + "integrity": "sha512-fucsC7HjXvkB5R3kTCO7kUjRdrS0BJt3M/FPxmHMBOm8JQi2BsHAHFsy27E0EolP8rp0NzXsJ+jNPyDWvOJZPA==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0" 
+ }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-stringify-position": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", + "integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-visit": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-5.1.0.tgz", + "integrity": "sha512-m+vIdyeCOpdr/QeQCu2EzxX/ohgS8KbnPDgFni4dQsfSCtpz8UqDyY5GjRru8PDKuYn7Fq19j1CQ+nJSsGKOzg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-visit-parents": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-6.0.2.tgz", + "integrity": "sha512-goh1s1TBrqSqukSc8wrjwWhL0hiJxgA8m4kFxGlQ+8FYQ3C/m11FcTs4YYem7V664AhHVvgoQLk890Ssdsr2IQ==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/universalify": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz", @@ -12060,6 +15403,15 @@ "node": ">= 10.0.0" } }, + "node_modules/unpipe": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", + "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==", + "license": "MIT", + 
"engines": { + "node": ">= 0.8" + } + }, "node_modules/unplugin": { "version": "1.16.1", "resolved": "https://registry.npmjs.org/unplugin/-/unplugin-1.16.1.tgz", @@ -12181,6 +15533,32 @@ "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" }, + "node_modules/util.promisify": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/util.promisify/-/util.promisify-1.1.3.tgz", + "integrity": "sha512-GIEaZ6o86fj09Wtf0VfZ5XP7tmd4t3jM5aZCgmBi231D0DB1AEBa3Aa6MP48DMsAIi96WkpWLimIWVwOjbDMOw==", + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-data-property": "^1.1.4", + "define-properties": "^1.2.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "for-each": "^0.3.3", + "get-intrinsic": "^1.2.6", + "has-proto": "^1.2.0", + "has-symbols": "^1.1.0", + "object.getownpropertydescriptors": "^2.1.8", + "safe-array-concat": "^1.1.3" + }, + "engines": { + "node": ">= 0.8" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/uuid": { "version": "11.1.0", "resolved": "https://registry.npmjs.org/uuid/-/uuid-11.1.0.tgz", @@ -12193,6 +15571,50 @@ "uuid": "dist/esm/bin/uuid" } }, + "node_modules/varint": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz", + "integrity": "sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==", + "dev": true, + "license": "MIT" + }, + "node_modules/vary": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", + "integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/vfile": { + "version": "6.0.3", + "resolved": 
"https://registry.npmjs.org/vfile/-/vfile-6.0.3.tgz", + "integrity": "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "vfile-message": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-message": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz", + "integrity": "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-stringify-position": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/vite": { "version": "6.3.6", "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.6.tgz", @@ -12609,7 +16031,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/which-boxed-primitive/-/which-boxed-primitive-1.1.1.tgz", "integrity": "sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==", - "dev": true, "dependencies": { "is-bigint": "^1.1.0", "is-boolean-object": "^1.2.1", @@ -12628,7 +16049,6 @@ "version": "1.2.1", "resolved": "https://registry.npmjs.org/which-builtin-type/-/which-builtin-type-1.2.1.tgz", "integrity": "sha512-6iBczoX+kDQ7a3+YJBnh3T+KZRxM/iYNPXicqk66/Qfm1b93iu+yOImkg0zHbj5LNOcNv1TEADiZ0xa34B4q6Q==", - "dev": true, "dependencies": { "call-bound": "^1.0.2", "function.prototype.name": "^1.1.6", @@ -12655,7 +16075,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/which-collection/-/which-collection-1.0.2.tgz", "integrity": "sha512-K4jVyjnBdgvc86Y6BkaLZEN933SwYOuBFkdmBu9ZfkcAbdVbpITnDmjvZ/aQjRXQrv5EPkTnD1s39GiiqbngCw==", - "dev": true, "dependencies": { "is-map": "^2.0.3", "is-set": "^2.0.3", @@ -12887,6 +16306,44 @@ "funding": { "url": 
"https://github.com/sponsors/sindresorhus" } + }, + "node_modules/zustand": { + "version": "4.5.7", + "resolved": "https://registry.npmjs.org/zustand/-/zustand-4.5.7.tgz", + "integrity": "sha512-CHOUy7mu3lbD6o6LJLfllpjkzhHXSBlX8B9+qPddUsIfeF5S/UZ5q0kmCsnRqT1UHFQZchNFDDzMbQsuesHWlw==", + "license": "MIT", + "dependencies": { + "use-sync-external-store": "^1.2.2" + }, + "engines": { + "node": ">=12.7.0" + }, + "peerDependencies": { + "@types/react": ">=16.8", + "immer": ">=9.0.6", + "react": ">=16.8" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "immer": { + "optional": true + }, + "react": { + "optional": true + } + } + }, + "node_modules/zwitch": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz", + "integrity": "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } } } } diff --git a/frontend/package.json b/frontend/package.json index 4cb0aa4bc..a887a3416 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -31,13 +31,14 @@ "@xyflow/react": "^12.10.0", "axios": "^1.9.0", "bootstrap": "^5.3.6", - "corepack": "^0.34.0", + "corepack": "^0.34.6", "fast-deep-equal": "^3.1.3", "file-saver": "^2.0.5", "framer-motion": "^12.17.0", "lodash.merge": "^4.6.2", "mobx": "^6.13.7", "mobx-react-lite": "^4.1.0", + "molstar": "^5.7.0", "moment": "^2.30.1", "plotly.js": "^3.0.1", "plotly.js-dist-min": "^3.0.1", @@ -81,6 +82,7 @@ "jsdom": "^26.1.0", "lint-staged": "^15.5.2", "prettier": "^3.5.3", + "sass-embedded": "^1.98.0", "storybook": "^8.6.14", "typescript": "~5.6.3", "typescript-eslint": "^8.34.0", diff --git a/frontend/pnpm-lock.yaml b/frontend/pnpm-lock.yaml index 02417ae58..34a5cd65e 100644 --- a/frontend/pnpm-lock.yaml +++ b/frontend/pnpm-lock.yaml @@ -33,7 +33,7 @@ importers: version: 24.3.1 "@xyflow/react": specifier: ^12.10.0 - version: 
12.10.0(@types/react@18.3.24)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + version: 12.10.2(@types/react@18.3.24)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) axios: specifier: ^1.9.0 version: 1.11.0 @@ -41,8 +41,8 @@ importers: specifier: ^5.3.6 version: 5.3.8(@popperjs/core@2.11.8) corepack: - specifier: ^0.34.0 - version: 0.34.0 + specifier: ^0.34.6 + version: 0.34.6 fast-deep-equal: specifier: ^3.1.3 version: 3.1.3 @@ -61,6 +61,9 @@ importers: mobx-react-lite: specifier: ^4.1.0 version: 4.1.0(mobx@6.13.7)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + molstar: + specifier: ^5.7.0 + version: 5.7.0(@types/react@18.3.24)(fp-ts@2.16.11)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) moment: specifier: ^2.30.1 version: 2.30.1 @@ -118,7 +121,7 @@ importers: version: 8.6.14(@storybook/test@8.6.14(storybook@8.6.14(prettier@3.6.2)))(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(storybook@8.6.14(prettier@3.6.2))(typescript@5.6.3) "@storybook/react-vite": specifier: ^8.6.14 - version: 8.6.14(@storybook/test@8.6.14(storybook@8.6.14(prettier@3.6.2)))(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(rollup@4.50.1)(storybook@8.6.14(prettier@3.6.2))(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)) + version: 8.6.14(@storybook/test@8.6.14(storybook@8.6.14(prettier@3.6.2)))(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(rollup@4.50.1)(storybook@8.6.14(prettier@3.6.2))(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)) "@storybook/test": specifier: ^8.6.14 version: 8.6.14(storybook@8.6.14(prettier@3.6.2)) @@ -148,7 +151,7 @@ importers: version: 8.43.0(eslint@9.35.0)(typescript@5.6.3) "@vitejs/plugin-react": specifier: ^4.5.2 - version: 4.7.0(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)) + version: 4.7.0(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)) eslint: specifier: ^9.28.0 version: 9.35.0 @@ -185,6 +188,9 
@@ importers: prettier: specifier: ^3.5.3 version: 3.6.2 + sass-embedded: + specifier: ^1.98.0 + version: 1.98.0 storybook: specifier: ^8.6.14 version: 8.6.14(prettier@3.6.2) @@ -196,16 +202,16 @@ importers: version: 8.43.0(eslint@9.35.0)(typescript@5.6.3) vite: specifier: ^6.3.5 - version: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + version: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) vite-plugin-svgr: specifier: ^4.3.0 - version: 4.5.0(rollup@4.50.1)(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)) + version: 4.5.0(rollup@4.50.1)(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)) vite-tsconfig-paths: specifier: ^5.1.4 - version: 5.1.4(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)) + version: 5.1.4(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)) vitest: specifier: ^3.2.3 - version: 3.2.4(@types/node@24.3.1)(jsdom@26.1.0)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + version: 3.2.4(@types/debug@4.1.12)(@types/node@24.3.1)(jsdom@26.1.0)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) packages: "@adobe/css-tools@4.4.4": @@ -431,6 +437,12 @@ packages: } engines: { node: ">=6.9.0" } + "@bufbuild/protobuf@2.11.0": + resolution: + { + integrity: sha512-sBXGT13cpmPR5BMgHE6UEEfEaShh5Ror6rfN3yEK5si7QVrtZg8LEPQb0VVhiLRUslD2yLnXtnRzG035J/mZXQ==, + } + "@choojs/findup@0.2.1": resolution: { @@ -1663,6 +1675,12 @@ packages: integrity: sha512-zt6OdqaDoOnJ1ZYsCYGt9YmWzDXl4vQdKTyJev62gFhRGKdx7mcT54V9KIjg+d2wi9EXsPvAPKe7i7WjfVWB8g==, } + "@scarf/scarf@1.4.0": + resolution: + { + integrity: sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ==, + } + "@storybook/addon-actions@8.6.14": resolution: { @@ -2097,6 +2115,12 @@ packages: integrity: 
sha512-VyyPYFlOMNylG45GoAe0xDoLwWuowvf92F9kySqzYh8vmYm7D2u4iUJKa1tOUpS70Ku13ASrOkS4ScXFsTaCNQ==, } + "@types/argparse@2.0.17": + resolution: + { + integrity: sha512-fueJssTf+4dW4HODshEGkIZbkLKHzgu1FvCI4cTc/MKum/534Euo3SrN+ilq8xgyHnOjtmg33/hee8iXLRg1XA==, + } + "@types/aria-query@5.0.4": resolution: { @@ -2127,12 +2151,36 @@ packages: integrity: sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==, } + "@types/benchmark@2.1.5": + resolution: + { + integrity: sha512-cKio2eFB3v7qmKcvIHLUMw/dIx/8bhWPuzpzRT4unCPRTD8VdA9Zb0afxpcxOqR4PixRS7yT42FqGS8BYL8g1w==, + } + + "@types/body-parser@1.19.6": + resolution: + { + integrity: sha512-HLFeCYgz89uk22N5Qg3dvGvsv46B8GLvKKo1zKG4NybA8U2DiEO3w9lqGg29t/tfLRJpJ6iQxnVw4OnB7MoM9g==, + } + "@types/chai@5.2.2": resolution: { integrity: sha512-8kB30R7Hwqf40JPiKhVzodJs2Qc1ZJ5zuT3uzw5Hq/dhNCl3G3l83jfpdI1e20BP348+fV7VIL/+FxaXkqBmWg==, } + "@types/compression@1.8.1": + resolution: + { + integrity: sha512-kCFuWS0ebDbmxs0AXYn6e2r2nrGAb5KwQhknjSPSPgJcGd8+HVSILlUyFhGqML2gk39HcG7D1ydW9/qpYkN00Q==, + } + + "@types/connect@3.4.38": + resolution: + { + integrity: sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==, + } + "@types/d3-color@3.1.3": resolution: { @@ -2169,6 +2217,12 @@ packages: integrity: sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw==, } + "@types/debug@4.1.12": + resolution: + { + integrity: sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==, + } + "@types/deep-eql@4.0.2": resolution: { @@ -2181,12 +2235,30 @@ packages: integrity: sha512-eOIHzCUSH7SMfonMG1LsC2f8vxBFtho6NGBznK41R84YzPuvSBzrhEps33IsQiOW9+VL6NQ9DbjQJznk/S4uRA==, } + "@types/estree-jsx@1.0.5": + resolution: + { + integrity: sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==, + } + "@types/estree@1.0.8": resolution: { integrity: 
sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==, } + "@types/express-serve-static-core@5.1.1": + resolution: + { + integrity: sha512-v4zIMr/cX7/d2BpAEX3KNKL/JrT1s43s96lLvvdTmza1oEvDudCqK9aF/djc/SWgy8Yh0h30TZx5VpzqFCxk5A==, + } + + "@types/express@5.0.6": + resolution: + { + integrity: sha512-sKYVuV7Sv9fbPIt/442koC7+IIwK5olP1KWeD88e/idgoJqDm3JV/YUiPwkoKK92ylff2MGxSz1CSjsXelx0YA==, + } + "@types/file-saver@2.0.7": resolution: { @@ -2205,6 +2277,12 @@ packages: integrity: sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg==, } + "@types/hast@3.0.4": + resolution: + { + integrity: sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==, + } + "@types/hoist-non-react-statics@3.3.7": resolution: { @@ -2213,6 +2291,12 @@ packages: peerDependencies: "@types/react": "*" + "@types/http-errors@2.0.5": + resolution: + { + integrity: sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==, + } + "@types/json-schema@7.0.15": resolution: { @@ -2237,12 +2321,36 @@ packages: integrity: sha512-bpd8dRn9pr6xKvuEBQup8pwQfD4VUyqO/2deGjfpe6AwC8YRlyEipvefyRJUSiCJTZuCb8Pl1ciVV5ekqJ96Bg==, } + "@types/mdast@4.0.4": + resolution: + { + integrity: sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==, + } + "@types/mdx@2.0.13": resolution: { integrity: sha512-+OWZQfAYyio6YkJb3HLxDrvnx6SWWDbC0zVPfBRzUk0/nqoDyf6dNxQi3eArPe8rJ473nobTMQ/8Zk+LxJ+Yuw==, } + "@types/ms@2.1.0": + resolution: + { + integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==, + } + + "@types/node-fetch@2.6.13": + resolution: + { + integrity: sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==, + } + + "@types/node@22.19.15": + resolution: + { + integrity: 
sha512-F0R/h2+dsy5wJAUe3tAU6oqa2qbWY5TpNfL/RGmo1y38hiyO1w3x2jPtt76wmuaJI4DQnOBu21cNXQ2STIUUWg==, + } + "@types/node@24.3.1": resolution: { @@ -2279,6 +2387,18 @@ packages: integrity: sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==, } + "@types/qs@6.15.0": + resolution: + { + integrity: sha512-JawvT8iBVWpzTrz3EGw9BTQFg3BQNmwERdKE22vlTxawwtbyUSlMppvZYKLZzB5zgACXdXxbD3m1bXaMqP/9ow==, + } + + "@types/range-parser@1.2.7": + resolution: + { + integrity: sha512-hKormJbkJqzQGhziax5PItDUTMAM9uE2XXQmM37dyd4hVM+5aVl7oVxMVUiVQn2oCQFN/LKCZdvSM0pFRqbSmQ==, + } + "@types/react-dom@18.3.7": resolution: { @@ -2313,6 +2433,18 @@ packages: integrity: sha512-A4STmOXPhMUtHH+S6ymgE2GiBSMqf4oTvcQZMcHzokuTLVYzXTB8ttjcgxOVaAp2lGwEdzZ0J+cRbbeevQj1UQ==, } + "@types/send@1.2.1": + resolution: + { + integrity: sha512-arsCikDvlU99zl1g69TcAB3mzZPpxgw0UQnaHeC1Nwb015xp8bknZv5rIfri9xTOcMuaVgvabfIRA7PSZVuZIQ==, + } + + "@types/serve-static@2.2.0": + resolution: + { + integrity: sha512-8mam4H1NHLtu7nmtalF7eyBH14QyOASmcxHhSfEoRyr0nP/YdoesEtU+uSRvMe96TW/HPTtkoKqQLl53N7UXMQ==, + } + "@types/stylis@4.2.5": resolution: { @@ -2325,6 +2457,24 @@ packages: integrity: sha512-Z0pOY34GDFl3Q6hUFYf3HkTwKEE02e7QgtJppBt+beEAxnyOpJua+voGFvxINBHa06GwLFFym7gRPY2SiKIfIA==, } + "@types/swagger-ui-dist@3.30.6": + resolution: + { + integrity: sha512-FVxN7wjLYRtJsZBscOcOcf8oR++m38vbUFjT33Mr9HBuasX9bRDrJsp7iwixcOtKSHEEa2B7o2+4wEiXqC+Ebw==, + } + + "@types/unist@2.0.11": + resolution: + { + integrity: sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==, + } + + "@types/unist@3.0.3": + resolution: + { + integrity: sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==, + } + "@types/uuid@9.0.8": resolution: { @@ -2420,6 +2570,12 @@ packages: } engines: { node: ^18.18.0 || ^20.9.0 || >=21.1.0 } + "@ungap/structured-clone@1.3.0": + resolution: + { + integrity: 
sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==, + } + "@unrs/resolver-binding-android-arm-eabi@1.11.1": resolution: { @@ -2667,19 +2823,19 @@ packages: integrity: sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==, } - "@xyflow/react@12.10.0": + "@xyflow/react@12.10.2": resolution: { - integrity: sha512-eOtz3whDMWrB4KWVatIBrKuxECHqip6PfA8fTpaS2RUGVpiEAe+nqDKsLqkViVWxDGreq0lWX71Xth/SPAzXiw==, + integrity: sha512-CgIi6HwlcHXwlkTpr0fxLv/0sRVNZ8IdwKLzzeCscaYBwpvfcH1QFOCeaTCuEn1FQEs/B8CjnTSjhs8udgmBgQ==, } peerDependencies: react: ">=17" react-dom: ">=17" - "@xyflow/system@0.0.74": + "@xyflow/system@0.0.76": resolution: { - integrity: sha512-7v7B/PkiVrkdZzSbL+inGAo6tkR/WQHHG0/jhSvLQToCsfa8YubOGmBYd1s08tpKpihdHDZFwzQZeR69QSBb4Q==, + integrity: sha512-hvwvnRS1B3REwVDlWexsq7YQaPZeG3/mKo1jv38UmnpWmxihp14bW6VtEOuHEwJX2FvzFw8k77LyKSk/wiZVNA==, } abs-svg-path@0.1.1: @@ -2688,6 +2844,13 @@ packages: integrity: sha512-d8XPSGjfyzlXC3Xx891DJRyZfqk5JU0BJrDQcsWomFIV1/BIzPW5HDH5iDdWpqWaav0YVIEzT1RHTwWr0FFshA==, } + accepts@2.0.0: + resolution: + { + integrity: sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==, + } + engines: { node: ">= 0.6" } + acorn-jsx@5.3.2: resolution: { @@ -2859,6 +3022,13 @@ packages: } engines: { node: ">= 0.4" } + array.prototype.reduce@1.0.8: + resolution: + { + integrity: sha512-DwuEqgXFBwbmZSRqt3BpQigWNUoqw9Ml2dTWdF3B2zQlQX4OeUE0zyuzX0fX0IbTvjdkZbcBTU3idgpO78qkTw==, + } + engines: { node: ">= 0.4" } + array.prototype.tosorted@1.1.4: resolution: { @@ -2920,6 +3090,12 @@ packages: } engines: { node: ">=10", npm: ">=6" } + bail@2.0.2: + resolution: + { + integrity: sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==, + } + balanced-match@1.0.2: resolution: { @@ -2964,6 +3140,13 @@ packages: integrity: 
sha512-6Pesp1w0DEX1N550i/uGV/TqucVL4AM/pgThFSN/Qq9si1/DF9aIHs1BxD8V/QU0HoeHO6cQRTAuYnLPKq1e4g==, } + body-parser@2.2.2: + resolution: + { + integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==, + } + engines: { node: ">=18" } + bootstrap@5.3.8: resolution: { @@ -3011,6 +3194,13 @@ packages: integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==, } + bytes@3.1.2: + resolution: + { + integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==, + } + engines: { node: ">= 0.8" } + cac@6.7.14: resolution: { @@ -3071,6 +3261,12 @@ packages: integrity: sha512-onIcjRpz69/Hx5bB5HGbYKUF2uC6QT6Gp+pfpGm3A7mPfcluSLV5v4Zu+oflDUwLdUw0rLIBhUbi0v8hM4FJQQ==, } + ccount@2.0.1: + resolution: + { + integrity: sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==, + } + chai@5.3.3: resolution: { @@ -3099,6 +3295,30 @@ packages: } engines: { node: ^12.17.0 || ^14.13 || >=16.0.0 } + character-entities-html4@2.1.0: + resolution: + { + integrity: sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA==, + } + + character-entities-legacy@3.0.0: + resolution: + { + integrity: sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ==, + } + + character-entities@2.0.2: + resolution: + { + integrity: sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==, + } + + character-reference-invalid@2.0.1: + resolution: + { + integrity: sha512-iBZ4F4wRbyORVsu0jPV7gXkOsGYjGHPmAyv+HiHG8gi5PtC9KI2j1+v8/tlibRvjoWX027ypmG/n0HtO5t7unw==, + } + check-error@2.1.1: resolution: { @@ -3228,6 +3448,12 @@ packages: integrity: sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==, } + colorjs.io@0.5.2: + resolution: + { + integrity: 
sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==, + } + combined-stream@1.0.8: resolution: { @@ -3235,6 +3461,12 @@ packages: } engines: { node: ">= 0.8" } + comma-separated-tokens@2.0.3: + resolution: + { + integrity: sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==, + } + commander@13.1.0: resolution: { @@ -3248,6 +3480,20 @@ packages: integrity: sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==, } + compressible@2.0.18: + resolution: + { + integrity: sha512-AF3r7P5dWxL8MxyITRMlORQNaOA2IkAFaTr4k7BUumjPtRpGDTZpl0Pb1XCO6JeDCBdp126Cgs9sMxqSjgYyRg==, + } + engines: { node: ">= 0.6" } + + compression@1.8.1: + resolution: + { + integrity: sha512-9mAqGPHLakhCLeNyxPkK4xVo746zQ/czLH1Ky+vkitMnWfWZps8r0qXuwhwizagCRttsL4lfG4pIOvaWLpAP0w==, + } + engines: { node: ">= 0.8.0" } + concat-map@0.0.1: resolution: { @@ -3261,6 +3507,20 @@ packages: } engines: { "0": node >= 0.8 } + content-disposition@1.0.1: + resolution: + { + integrity: sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q==, + } + engines: { node: ">=18" } + + content-type@1.0.5: + resolution: + { + integrity: sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==, + } + engines: { node: ">= 0.6" } + convert-source-map@1.9.0: resolution: { @@ -3273,6 +3533,20 @@ packages: integrity: sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==, } + cookie-signature@1.2.2: + resolution: + { + integrity: sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==, + } + engines: { node: ">=6.6.0" } + + cookie@0.7.2: + resolution: + { + integrity: sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==, + } + engines: { node: ">= 0.6" } + cookie@1.0.2: resolution: { @@ -3286,14 +3560,21 @@ 
packages: integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==, } - corepack@0.34.0: + corepack@0.34.6: resolution: { - integrity: sha512-8D9N/k9hDjoISCDGUzH2wBF0fJD49p3G7ifoEZcc0vhB7Py6r+Mc1SpJ8dvnWY/HMP95K60WkQbN7vgbUgXgpA==, + integrity: sha512-gvylq9kzJB09mSsiOnKOnhg0YdCWNy2aGaeGbYF4HlyGd/v4moxEonQjJPYI45/K4zP7q1hW9qCVvaYYKK5nkA==, } engines: { node: ^20.10.0 || ^22.11.0 || >=24.0.0 } hasBin: true + cors@2.8.6: + resolution: + { + integrity: sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw==, + } + engines: { node: ">= 0.10" } + cosmiconfig@7.1.0: resolution: { @@ -3611,12 +3892,30 @@ packages: supports-color: optional: true + debug@4.4.3: + resolution: + { + integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==, + } + engines: { node: ">=6.0" } + peerDependencies: + supports-color: "*" + peerDependenciesMeta: + supports-color: + optional: true + decimal.js@10.6.0: resolution: { integrity: sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==, } + decode-named-character-reference@1.3.0: + resolution: + { + integrity: sha512-GtpQYB283KrPp6nRw50q3U9/VfOutZOe103qlN7BPP6Ad27xYnOIWv4lPzo8HCAL+mMZofJ9KEy30fq6MfaK6Q==, + } + deep-eql@5.0.2: resolution: { @@ -3664,6 +3963,13 @@ packages: } engines: { node: ">=0.4.0" } + depd@2.0.0: + resolution: + { + integrity: sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==, + } + engines: { node: ">= 0.8" } + dequal@2.0.3: resolution: { @@ -3685,6 +3991,12 @@ packages: engines: { node: ">=0.10" } hasBin: true + devlop@1.1.0: + resolution: + { + integrity: sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==, + } + doctrine@2.1.0: resolution: { @@ -3773,6 +4085,12 @@ packages: integrity: 
sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==, } + ee-first@1.1.1: + resolution: + { + integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==, + } + electron-to-chromium@1.5.217: resolution: { @@ -3809,6 +4127,13 @@ packages: integrity: sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==, } + encodeurl@2.0.0: + resolution: + { + integrity: sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==, + } + engines: { node: ">= 0.8" } + end-of-stream@1.4.5: resolution: { @@ -3849,6 +4174,12 @@ packages: } engines: { node: ">= 0.4" } + es-array-method-boxes-properly@1.0.0: + resolution: + { + integrity: sha512-wd6JXUmyHmt8T5a2xreUwKcGPq6f1f+WwIJkijUqiGcJz1qqnZgP6XIK+QyIWU5lT7imeNxUll48bziG+TSYcA==, + } + es-define-property@1.0.1: resolution: { @@ -3953,6 +4284,12 @@ packages: } engines: { node: ">=6" } + escape-html@1.0.3: + resolution: + { + integrity: sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==, + } + escape-string-regexp@4.0.0: resolution: { @@ -3960,6 +4297,13 @@ packages: } engines: { node: ">=10" } + escape-string-regexp@5.0.0: + resolution: + { + integrity: sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==, + } + engines: { node: ">=12" } + escodegen@2.1.0: resolution: { @@ -4139,6 +4483,12 @@ packages: } engines: { node: ">=4.0" } + estree-util-is-identifier-name@3.0.0: + resolution: + { + integrity: sha512-hFtqIDZTIUZ9BXLb8y4pYGyk6+wekIivNVTcmvk8NoOh+VeRn5y6cEHzbURrWbfp1fIqdVipilzj+lfaadNZmg==, + } + estree-walker@2.0.2: resolution: { @@ -4158,6 +4508,13 @@ packages: } engines: { node: ">=0.10.0" } + etag@1.8.1: + resolution: + { + integrity: sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==, + } + engines: { node: ">= 0.6" } + 
event-emitter@0.3.5: resolution: { @@ -4191,12 +4548,25 @@ packages: } engines: { node: ">=12.0.0" } + express@5.2.1: + resolution: + { + integrity: sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==, + } + engines: { node: ">= 18" } + ext@1.7.0: resolution: { integrity: sha512-6hxeJYaL110a9b5TEJSj0gojyHQAmA2ch5Os+ySCiA1QGdS697XWY1pzsrSjqA9LDEEgdB/KypIlR59RcLuHYw==, } + extend@3.0.2: + resolution: + { + integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==, + } + falafel@2.2.5: resolution: { @@ -4280,6 +4650,13 @@ packages: } engines: { node: ">=8" } + finalhandler@2.1.1: + resolution: + { + integrity: sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==, + } + engines: { node: ">= 18.0.0" } + find-root@1.1.0: resolution: { @@ -4357,12 +4734,25 @@ packages: } engines: { node: ">= 6" } - framer-motion@12.23.12: + forwarded@0.2.0: resolution: { - integrity: sha512-6e78rdVtnBvlEVgu6eFEAgG9v3wLnYEboM8I5O5EXvfKC8gxGQB8wXJdhkMy10iVcn05jl6CNw7/HTsTCfwcWg==, + integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==, } - peerDependencies: + engines: { node: ">= 0.6" } + + fp-ts@2.16.11: + resolution: + { + integrity: sha512-LaI+KaX2NFkfn1ZGHoKCmcfv7yrZsC3b8NtWsTVQeHkq4F27vI5igUuO53sxqDEa2gNQMHFPmpojDw/1zmUK7w==, + } + + framer-motion@12.23.12: + resolution: + { + integrity: sha512-6e78rdVtnBvlEVgu6eFEAgG9v3wLnYEboM8I5O5EXvfKC8gxGQB8wXJdhkMy10iVcn05jl6CNw7/HTsTCfwcWg==, + } + peerDependencies: "@emotion/is-prop-valid": "*" react: ^18.0.0 || ^19.0.0 react-dom: ^18.0.0 || ^19.0.0 @@ -4374,6 +4764,13 @@ packages: react-dom: optional: true + fresh@2.0.0: + resolution: + { + integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==, + } + engines: { node: ">= 0.8" } + from2@2.3.0: resolution: { @@ -4675,6 +5072,12 @@ packages: integrity: 
sha512-HZRwumpOGUrHyxO5bqKZL0B0GlUpwtCAzZ42sgxUPniu33R1LSFH5yrIcBCHjkctCAh3mtWKcKd9J4vDDdeVHA==, } + h264-mp4-encoder@1.0.12: + resolution: + { + integrity: sha512-xih3J+Go0o1RqGjhOt6TwXLWWGqLONRPyS8yoMu/RoS/S8WyEv4HuHp1KBsDDl8srZQ3gw9f95JYkCSjCuZbHQ==, + } + has-bigints@1.1.0: resolution: { @@ -4735,6 +5138,18 @@ packages: } engines: { node: ">= 0.4" } + hast-util-to-jsx-runtime@2.3.6: + resolution: + { + integrity: sha512-zl6s8LwNyo1P9uw+XJGvZtdFF1GdAkOg8ujOw+4Pyb76874fLps4ueHXDhXWdk6YHQ6OgUtinliG7RsYvCbbBg==, + } + + hast-util-whitespace@3.0.0: + resolution: + { + integrity: sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw==, + } + hoist-non-react-statics@3.3.2: resolution: { @@ -4748,6 +5163,19 @@ packages: } engines: { node: ">=18" } + html-url-attributes@3.0.1: + resolution: + { + integrity: sha512-ol6UPyBWqsrO6EJySPz2O7ZSr856WDrEzM5zMqp+FJJLGMW35cLYmmZnl0vztAZxRUoNZJFTCohfjuIJ8I4QBQ==, + } + + http-errors@2.0.1: + resolution: + { + integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==, + } + engines: { node: ">= 0.8" } + http-proxy-agent@7.0.2: resolution: { @@ -4791,6 +5219,13 @@ packages: } engines: { node: ">=0.10.0" } + iconv-lite@0.7.2: + resolution: + { + integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==, + } + engines: { node: ">=0.10.0" } + ieee754@1.2.1: resolution: { @@ -4811,10 +5246,10 @@ packages: } engines: { node: ">= 4" } - immutable@5.1.3: + immutable@5.1.5: resolution: { - integrity: sha512-+chQdDfvscSF1SJqv2gn4SRO2ZyS3xL3r7IW/wWEEzrzLisnOlKiQu5ytC/BVNcS15C39WT2Hg/bjKjDMcu+zg==, + integrity: sha512-t7xcm2siw+hlUM68I+UEOK+z84RzmN59as9DZ7P1l0994DKUWV7UXBMQZVxaoMSRQ+PBZbHCOoBt7a2wxOMt+A==, } import-fresh@3.3.1: @@ -4851,6 +5286,12 @@ packages: } engines: { node: ^14.17.0 || ^16.13.0 || >=18.0.0 } + inline-style-parser@0.2.7: + resolution: + { + integrity: 
sha512-Nb2ctOyNR8DqQoR0OwRG95uNWIC0C1lCgf5Naz5H6Ji72KZ8OcFZLz2P5sNgwlyoJ8Yif11oMuYs5pBQa86csA==, + } + internal-slot@1.1.0: resolution: { @@ -4858,6 +5299,33 @@ packages: } engines: { node: ">= 0.4" } + io-ts@2.2.22: + resolution: + { + integrity: sha512-FHCCztTkHoV9mdBsHpocLpdTAfh956ZQcIkWQxxS0U5HT53vtrcuYdQneEJKH6xILaLNzXVl2Cvwtoy8XNN0AA==, + } + peerDependencies: + fp-ts: ^2.5.0 + + ipaddr.js@1.9.1: + resolution: + { + integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==, + } + engines: { node: ">= 0.10" } + + is-alphabetical@2.0.1: + resolution: + { + integrity: sha512-FWyyY60MeTNyeSRpkM2Iry0G9hpr7/9kD40mD/cGQEuilcZYS4okz8SN2Q6rLCJ8gbCt6fN+rC+6tMGS99LaxQ==, + } + + is-alphanumerical@2.0.1: + resolution: + { + integrity: sha512-hmbYhX/9MUMF5uh7tOXyK/n0ZvWpad5caBA17GsC6vyuCqaWliRG5K1qS9inmUhEMaOBIW7/whAnSwveW/LtZw==, + } + is-arguments@1.2.0: resolution: { @@ -4939,6 +5407,12 @@ packages: } engines: { node: ">= 0.4" } + is-decimal@2.0.1: + resolution: + { + integrity: sha512-AAB9hiomQs5DXWcRB1rqsxGUstbRroFOPPVAomNk/3XHR5JyEZChOyTWe2oayKnsSsr/kcGqF+z6yuH6HHpN0A==, + } + is-docker@2.2.1: resolution: { @@ -5010,6 +5484,12 @@ packages: } engines: { node: ">=0.10.0" } + is-hexadecimal@2.0.1: + resolution: + { + integrity: sha512-DgZQp241c8oO6cA1SbTEWiXeoxV42vlcJxgH+B3hi1AiqqKruZR3ZGF8In3fj4+/y/7rHvlOZLZtgJ/4ttYGZg==, + } + is-iexplorer@1.0.0: resolution: { @@ -5065,12 +5545,25 @@ packages: } engines: { node: ">=0.10.0" } + is-plain-obj@4.1.0: + resolution: + { + integrity: sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==, + } + engines: { node: ">=12" } + is-potential-custom-element-name@1.0.1: resolution: { integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==, } + is-promise@4.0.0: + resolution: + { + integrity: sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==, + } + 
is-regex@1.2.1: resolution: { @@ -5394,6 +5887,12 @@ packages: } engines: { node: ">=18" } + longest-streak@3.1.0: + resolution: + { + integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==, + } + loose-envify@1.4.0: resolution: { @@ -5471,6 +5970,12 @@ packages: } engines: { node: ">=16.14.0", npm: ">=8.1.0" } + markdown-table@3.0.4: + resolution: + { + integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==, + } + math-intrinsics@1.1.0: resolution: { @@ -5485,364 +5990,731 @@ packages: } engines: { node: ">=0.10.0" } - memoizerific@1.11.3: + mdast-util-find-and-replace@3.0.2: resolution: { - integrity: sha512-/EuHYwAPdLtXwAwSZkh/Gutery6pD2KYd44oQLhAvQp/50mpyduZh8Q7PYHXTCJ+wuXxt7oij2LXyIJOOYFPog==, + integrity: sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg==, } - merge-stream@2.0.0: + mdast-util-from-markdown@2.0.3: resolution: { - integrity: sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==, + integrity: sha512-W4mAWTvSlKvf8L6J+VN9yLSqQ9AOAAvHuoDAmPkz4dHf553m5gVj2ejadHJhoJmcmxEnOv6Pa8XJhpxE93kb8Q==, } - merge2@1.4.1: + mdast-util-gfm-autolink-literal@2.0.1: resolution: { - integrity: sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==, + integrity: sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ==, } - engines: { node: ">= 8" } - micromatch@4.0.8: + mdast-util-gfm-footnote@2.1.0: resolution: { - integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==, + integrity: sha512-sqpDWlsHn7Ac9GNZQMeUzPQSMzR6Wv0WKRNvQRg0KqHh02fpTz69Qc1QSseNX29bhz1ROIyNyxExfawVKTm1GQ==, } - engines: { node: ">=8.6" } - mime-db@1.52.0: + mdast-util-gfm-strikethrough@2.0.0: resolution: { - integrity: 
sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==, + integrity: sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg==, } - engines: { node: ">= 0.6" } - mime-types@2.1.35: + mdast-util-gfm-table@2.0.0: resolution: { - integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==, + integrity: sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg==, } - engines: { node: ">= 0.6" } - mimic-fn@4.0.0: + mdast-util-gfm-task-list-item@2.0.0: resolution: { - integrity: sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw==, + integrity: sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ==, } - engines: { node: ">=12" } - mimic-function@5.0.1: + mdast-util-gfm@3.1.0: resolution: { - integrity: sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==, + integrity: sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ==, } - engines: { node: ">=18" } - min-indent@1.0.1: + mdast-util-mdx-expression@2.0.1: resolution: { - integrity: sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==, + integrity: sha512-J6f+9hUp+ldTZqKRSg7Vw5V6MqjATc+3E4gf3CFNcuZNWD8XdyI6zQ8GqH7f8169MM6P7hMBRDVGnn7oHB9kXQ==, } - engines: { node: ">=4" } - minimatch@3.1.2: + mdast-util-mdx-jsx@3.2.0: resolution: { - integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==, + integrity: sha512-lj/z8v0r6ZtsN/cGNNtemmmfoLAFZnjMbNyLzBafjzikOM+glrjNHPlf6lQDOTccj9n5b0PPihEBbhneMyGs1Q==, } - minimatch@9.0.5: + mdast-util-mdxjs-esm@2.0.1: resolution: { - integrity: sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==, + integrity: 
sha512-EcmOpxsZ96CvlP03NghtH1EsLtr0n9Tm4lPUJUBccV9RwUOneqSycg19n5HGzCf+10LozMRSObtVr3ee1WoHtg==, } - engines: { node: ">=16 || 14 >=14.17" } - minimist@1.2.8: + mdast-util-phrasing@4.1.0: resolution: { - integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==, + integrity: sha512-TqICwyvJJpBwvGAMZjj4J2n0X8QWp21b9l0o7eXyVJ25YNWYbJDVIyD1bZXE6WtV6RmKJVYmQAKWa0zWOABz2w==, } - minipass@7.1.2: + mdast-util-to-hast@13.2.1: resolution: { - integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==, + integrity: sha512-cctsq2wp5vTsLIcaymblUriiTcZd0CwWtCbLvrOzYCDZoWyMNV8sZ7krj09FSnsiJi3WVsHLM4k6Dq/yaPyCXA==, } - engines: { node: ">=16 || 14 >=14.17" } - mobx-react-lite@4.1.0: + mdast-util-to-markdown@2.1.2: resolution: { - integrity: sha512-QEP10dpHHBeQNv1pks3WnHRCem2Zp636lq54M2nKO2Sarr13pL4u6diQXf65yzXUn0mkk18SyIDCm9UOJYTi1w==, + integrity: sha512-xj68wMTvGXVOKonmog6LwyJKrYXZPvlwabaryTjLh9LuvovB/KAH+kvi8Gjj+7rJjsFi23nkUxRQv1KqSroMqA==, } - peerDependencies: - mobx: ^6.9.0 - react: ^16.8.0 || ^17 || ^18 || ^19 - react-dom: "*" - react-native: "*" - peerDependenciesMeta: - react-dom: - optional: true - react-native: - optional: true - mobx@6.13.7: + mdast-util-to-string@4.0.0: resolution: { - integrity: sha512-aChaVU/DO5aRPmk1GX8L+whocagUUpBQqoPtJk+cm7UOXUk87J4PeWCh6nNmTTIfEhiR9DI/+FnA8dln/hTK7g==, + integrity: sha512-0H44vDimn51F0YwvxSJSm0eCDOJTRlmN0R1yBh4HLj9wiV1Dn0QoXGbvFAWj2hSItVTlCmBF1hqKlIyUBVFLPg==, } - moment@2.30.1: + media-typer@1.1.0: resolution: { - integrity: sha512-uEmtNhbDOrWPFS+hdjFCBfy9f2YoyzRpwcl+DqpC6taX21FzsTLQVbMV/W7PzNSX6x/bhC1zA3c2UQ5NzH6how==, + integrity: sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==, } + engines: { node: ">= 0.8" } - motion-dom@12.23.12: + memoizerific@1.11.3: resolution: { - integrity: sha512-RcR4fvMCTESQBD/uKQe49D5RUeDOokkGRmz4ceaJKDBgHYtZtntC/s2vLvY38gqGaytinij/yi3hMcWVcEF5Kw==, + 
integrity: sha512-/EuHYwAPdLtXwAwSZkh/Gutery6pD2KYd44oQLhAvQp/50mpyduZh8Q7PYHXTCJ+wuXxt7oij2LXyIJOOYFPog==, } - motion-utils@12.23.6: + merge-descriptors@2.0.0: resolution: { - integrity: sha512-eAWoPgr4eFEOFfg2WjIsMoqJTW6Z8MTUCgn/GZ3VRpClWBdnbjryiA3ZSNLyxCTmCQx4RmYX6jX1iWHbenUPNQ==, + integrity: sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==, } + engines: { node: ">=18" } - mouse-change@1.4.0: + merge-stream@2.0.0: resolution: { - integrity: sha512-vpN0s+zLL2ykyyUDh+fayu9Xkor5v/zRD9jhSqjRS1cJTGS0+oakVZzNm5n19JvvEj0you+MXlYTpNxUDQUjkQ==, + integrity: sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==, } - mouse-event-offset@3.0.2: + merge2@1.4.1: resolution: { - integrity: sha512-s9sqOs5B1Ykox3Xo8b3Ss2IQju4UwlW6LSR+Q5FXWpprJ5fzMLefIIItr3PH8RwzfGy6gxs/4GAmiNuZScE25w==, + integrity: sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==, } + engines: { node: ">= 8" } - mouse-event@1.0.5: + micromark-core-commonmark@2.0.3: resolution: { - integrity: sha512-ItUxtL2IkeSKSp9cyaX2JLUuKk2uMoxBg4bbOWVd29+CskYJR9BGsUqtXenNzKbnDshvupjUewDIYVrOB6NmGw==, + integrity: sha512-RDBrHEMSxVFLg6xvnXmb1Ayr2WzLAWjeSATAoxwKYJV94TeNavgoIdA0a9ytzDSVzBy2YKFK+emCPOEibLeCrg==, } - mouse-wheel@1.2.0: + micromark-extension-gfm-autolink-literal@2.1.0: resolution: { - integrity: sha512-+OfYBiUOCTWcTECES49neZwL5AoGkXE+lFjIvzwNCnYRlso+EnfvovcBxGoyQ0yQt806eSPjS675K0EwWknXmw==, + integrity: sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw==, } - ms@2.0.0: + micromark-extension-gfm-footnote@2.1.0: resolution: { - integrity: sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==, + integrity: sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==, } - ms@2.1.3: + micromark-extension-gfm-strikethrough@2.1.0: resolution: { - integrity: 
sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==, + integrity: sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw==, } - murmurhash-js@1.0.0: + micromark-extension-gfm-table@2.1.1: resolution: { - integrity: sha512-TvmkNhkv8yct0SVBSy+o8wYzXjE4Zz3PCesbfs8HiCXXdcTuocApFv11UWlNFWKYsP2okqrhb7JNlSm9InBhIw==, + integrity: sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg==, } - nanoid@3.3.11: + micromark-extension-gfm-tagfilter@2.0.0: resolution: { - integrity: sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==, + integrity: sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg==, } - engines: { node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1 } - hasBin: true - napi-postinstall@0.3.3: + micromark-extension-gfm-task-list-item@2.1.0: resolution: { - integrity: sha512-uTp172LLXSxuSYHv/kou+f6KW3SMppU9ivthaVTXian9sOt3XM/zHYHpRZiLgQoxeWfYUnslNWQHF1+G71xcow==, + integrity: sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw==, } - engines: { node: ^12.20.0 || ^14.18.0 || >=16.0.0 } - hasBin: true - native-promise-only@0.8.1: + micromark-extension-gfm@3.0.0: resolution: { - integrity: sha512-zkVhZUA3y8mbz652WrL5x0fB0ehrBkulWT3TomAQ9iDtyXZvzKeEA6GPxAItBYeNYl5yngKRX612qHOhvMkDeg==, + integrity: sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==, } - natural-compare@1.4.0: + micromark-factory-destination@2.0.1: resolution: { - integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==, + integrity: sha512-Xe6rDdJlkmbFRExpTOmRj9N3MaWmbAgdpSrBQvCFqhezUn4AHqJHbaEnfbVYYiexVSs//tqOdY/DxhjdCiJnIA==, } - needle@2.9.1: + micromark-factory-label@2.0.1: resolution: { - integrity: 
sha512-6R9fqJ5Zcmf+uYaFgdIHmLwNldn5HbK8L5ybn7Uz+ylX/rnOsSp1AHcvQSrCaFN+qNM1wpymHqD7mVasEOlHGQ==, + integrity: sha512-VFMekyQExqIW7xIChcXn4ok29YE3rnuyveW3wZQWWqF4Nv9Wk5rgJ99KzPvHjkmPXF93FXIbBp6YdW3t71/7Vg==, } - engines: { node: ">= 4.4.x" } - hasBin: true - next-tick@1.1.0: + micromark-factory-space@2.0.1: resolution: { - integrity: sha512-CXdUiJembsNjuToQvxayPZF9Vqht7hewsvy2sOWafLvi2awflj9mOC6bHIg50orX8IJvWKY9wYQ/zB2kogPslQ==, + integrity: sha512-zRkxjtBxxLd2Sc0d+fbnEunsTj46SWXgXciZmHq0kDYGnck/ZSGj9/wULTV95uoeYiK5hRXP2mJ98Uo4cq/LQg==, } - no-case@3.0.4: + micromark-factory-title@2.0.1: resolution: { - integrity: sha512-fgAN3jGAh+RoxUGZHTSOLJIqUc2wmoBwGR4tbpNAKmmovFoWq0OdRkb0VkldReO2a2iBT/OEulG9XSUc10r3zg==, + integrity: sha512-5bZ+3CjhAd9eChYTHsjy6TGxpOFSKgKKJPJxr293jTbfry2KDoWkhBb6TcPVB4NmzaPhMs1Frm9AZH7OD4Cjzw==, } - node-addon-api@7.1.1: + micromark-factory-whitespace@2.0.1: resolution: { - integrity: sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==, + integrity: sha512-Ob0nuZ3PKt/n0hORHyvoD9uZhr+Za8sFoP+OnMcnWK5lngSzALgQYKMr9RJVOWLqQYuyn6ulqGWSXdwf6F80lQ==, } - node-releases@2.0.20: + micromark-util-character@2.1.1: resolution: { - integrity: sha512-7gK6zSXEH6neM212JgfYFXe+GmZQM+fia5SsusuBIUgnPheLFBmIPhtFoAQRj8/7wASYQnbDlHPVwY0BefoFgA==, + integrity: sha512-wv8tdUTJ3thSFFFJKtpYKOYiGP2+v96Hvk4Tu8KpCAsTMs6yi+nVmGh1syvSCsaxz45J6Jbw+9DD6g97+NV67Q==, } - normalize-svg-path@0.1.0: + micromark-util-chunked@2.0.1: resolution: { - integrity: sha512-1/kmYej2iedi5+ROxkRESL/pI02pkg0OBnaR4hJkSIX6+ORzepwbuUXfrdZaPjysTsJInj0Rj5NuX027+dMBvA==, + integrity: sha512-QUNFEOPELfmvv+4xiNg2sRYeS/P84pTW0TCgP5zc9FpXetHY0ab7SxKyAQCNCc1eK0459uoLI1y5oO5Vc1dbhA==, } - normalize-svg-path@1.1.0: + micromark-util-classify-character@2.0.1: resolution: { - integrity: sha512-r9KHKG2UUeB5LoTouwDzBy2VxXlHsiM6fyLQvnJa0S5hrhzqElH/CH7TUGhT1fVvIYBIKf3OpY4YJ4CK+iaqHg==, + integrity: 
sha512-K0kHzM6afW/MbeWYWLjoHQv1sgg2Q9EccHEDzSkxiP/EaagNzCm7T/WMKZ3rjMbvIpvBiZgwR3dKMygtA4mG1Q==, } - npm-run-path@5.3.0: + micromark-util-combine-extensions@2.0.1: resolution: { - integrity: sha512-ppwTtiJZq0O/ai0z7yfudtBpWIoxM8yE6nHi1X47eFR2EWORqfbu6CnPlNsjeN683eT0qG6H/Pyf9fCcvjnnnQ==, + integrity: sha512-OnAnH8Ujmy59JcyZw8JSbK9cGpdVY44NKgSM7E9Eh7DiLS2E9RNQf0dONaGDzEG9yjEl5hcqeIsj4hfRkLH/Bg==, } - engines: { node: ^12.20.0 || ^14.13.1 || >=16.0.0 } - number-is-integer@1.0.1: + micromark-util-decode-numeric-character-reference@2.0.2: resolution: { - integrity: sha512-Dq3iuiFBkrbmuQjGFFF3zckXNCQoSD37/SdSbgcBailUx6knDvDwb5CympBgcoWHy36sfS12u74MHYkXyHq6bg==, + integrity: sha512-ccUbYk6CwVdkmCQMyr64dXz42EfHGkPQlBj5p7YVGzq8I7CtjXZJrubAYezf7Rp+bjPseiROqe7G6foFd+lEuw==, } - engines: { node: ">=0.10.0" } - nwsapi@2.2.22: + micromark-util-decode-string@2.0.1: resolution: { - integrity: sha512-ujSMe1OWVn55euT1ihwCI1ZcAaAU3nxUiDwfDQldc51ZXaB9m2AyOn6/jh1BLe2t/G8xd6uKG1UBF2aZJeg2SQ==, + integrity: sha512-nDV/77Fj6eH1ynwscYTOsbK7rR//Uj0bZXBwJZRfaLEJ1iGBR6kIfNmlNqaqJf649EP0F3NWNdeJi03elllNUQ==, } - object-assign@4.1.1: + micromark-util-encode@2.0.1: resolution: { - integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==, + integrity: sha512-c3cVx2y4KqUnwopcO9b/SCdo2O67LwJJ/UyqGfbigahfegL9myoEFoDYZgkT7f36T0bLrM9hZTAaAyH+PCAXjw==, } - engines: { node: ">=0.10.0" } - object-inspect@1.13.4: + micromark-util-html-tag-name@2.0.1: resolution: { - integrity: sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==, + integrity: sha512-2cNEiYDhCWKI+Gs9T0Tiysk136SnR13hhO8yW6BGNyhOC4qYFnwF1nKfD3HFAIXA5c45RrIG1ub11GiXeYd1xA==, } - engines: { node: ">= 0.4" } - object-keys@1.1.1: + micromark-util-normalize-identifier@2.0.1: resolution: { - integrity: sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==, + integrity: 
sha512-sxPqmo70LyARJs0w2UclACPUUEqltCkJ6PhKdMIDuJ3gSf/Q+/GIe3WKl0Ijb/GyH9lOpUkRAO2wp0GVkLvS9Q==, } - engines: { node: ">= 0.4" } - object.assign@4.1.7: + micromark-util-resolve-all@2.0.1: resolution: { - integrity: sha512-nK28WOo+QIjBkDduTINE4JkF/UJJKyf2EJxvJKfblDpyg0Q+pkOHNTL0Qwy6NP6FhE/EnzV73BxxqcJaXY9anw==, + integrity: sha512-VdQyxFWFT2/FGJgwQnJYbe1jjQoNTS4RjglmSjTUlpUMa95Htx9NHeYW4rGDJzbjvCsl9eLjMQwGeElsqmzcHg==, } - engines: { node: ">= 0.4" } - object.entries@1.1.9: + micromark-util-sanitize-uri@2.0.1: resolution: { - integrity: sha512-8u/hfXFRBD1O0hPUjioLhoWFHRmt6tKA4/vZPyckBr18l1KE9uHrFaFaUi8MDRTpi4uak2goyPTSNJLXX2k2Hw==, + integrity: sha512-9N9IomZ/YuGGZZmQec1MbgxtlgougxTodVwDzzEouPKo3qFWvymFHWcnDi2vzV1ff6kas9ucW+o3yzJK9YB1AQ==, } - engines: { node: ">= 0.4" } - object.fromentries@2.0.8: + micromark-util-subtokenize@2.1.0: resolution: { - integrity: sha512-k6E21FzySsSK5a21KRADBd/NGneRegFO5pLHfdQLpRDETUNJueLXs3WCzyQ3tFRDYgbq3KHGXfTbi2bs8WQ6rQ==, + integrity: sha512-XQLu552iSctvnEcgXw6+Sx75GflAPNED1qx7eBJ+wydBb2KCbRZe+NwvIEEMM83uml1+2WSXpBAcp9IUCgCYWA==, } - engines: { node: ">= 0.4" } - object.groupby@1.0.3: + micromark-util-symbol@2.0.1: resolution: { - integrity: sha512-+Lhy3TQTuzXI5hevh8sBGqbmurHbbIjAi0Z4S63nthVLmLxfbj4T54a4CfZrXIrt9iP4mVAPYMo/v99taj3wjQ==, + integrity: sha512-vs5t8Apaud9N28kgCrRUdEed4UJ+wWNvicHLPxCa9ENlYuAY31M0ETy5y1vA33YoNPDFTghEbnh6efaE8h4x0Q==, } - engines: { node: ">= 0.4" } - object.values@1.2.1: + micromark-util-types@2.0.2: resolution: { - integrity: sha512-gXah6aZrcUxjWg2zR2MwouP2eHlCBzdV4pygudehaKXSGW4v2AsRQUK+lwwXhii6KFZcunEnmSUoYp5CXibxtA==, + integrity: sha512-Yw0ECSpJoViF1qTU4DC6NwtC4aWGt1EkzaQB8KPPyCRR8z9TWeV0HbEFGTO+ZY1wB22zmxnJqhPyTpOVCpeHTA==, } - engines: { node: ">= 0.4" } - once@1.3.3: + micromark@4.0.2: resolution: { - integrity: sha512-6vaNInhu+CHxtONf3zw3vq4SP2DOQhjBvIa3rNcG0+P7eKWlYH6Peu7rHizSloRU2EwMz6GraLieis9Ac9+p1w==, + integrity: 
sha512-zpe98Q6kvavpCr1NPVSCMebCKfD7CA2NqZ+rykeNhONIJBpc1tFKt9hucLGwha3jNTNI8lHpctWJWoimVF4PfA==, } - once@1.4.0: + micromatch@4.0.8: resolution: { - integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==, + integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==, } + engines: { node: ">=8.6" } - onetime@6.0.0: + mime-db@1.52.0: resolution: { - integrity: sha512-1FlR+gjXK7X+AsAHso35MnyN5KqGwJRi/31ft6x0M194ht7S+rWAvd7PHss9xSKMzE0asv1pyIHaJYq+BbacAQ==, + integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==, } - engines: { node: ">=12" } + engines: { node: ">= 0.6" } - onetime@7.0.0: + mime-db@1.54.0: resolution: { - integrity: sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==, + integrity: sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==, } - engines: { node: ">=18" } + engines: { node: ">= 0.6" } - open@8.4.2: + mime-types@2.1.35: resolution: { - integrity: sha512-7x81NCL719oNbsq/3mh+hVrAWmFuEYUqrq/Iw3kUzH8ReypT9QQ0BLoJS7/G9k6N81XjW4qHWtjWwe/9eLy1EQ==, + integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==, } - engines: { node: ">=12" } + engines: { node: ">= 0.6" } - optionator@0.9.4: + mime-types@3.0.2: resolution: { - integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==, + integrity: sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==, } - engines: { node: ">= 0.8.0" } + engines: { node: ">=18" } - own-keys@1.0.1: + mimic-fn@4.0.0: + resolution: + { + integrity: sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw==, + } + engines: { node: ">=12" } + + mimic-function@5.0.1: + resolution: + { + integrity: 
sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==, + } + engines: { node: ">=18" } + + min-indent@1.0.1: + resolution: + { + integrity: sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==, + } + engines: { node: ">=4" } + + minimatch@3.1.2: + resolution: + { + integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==, + } + + minimatch@9.0.5: + resolution: + { + integrity: sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==, + } + engines: { node: ">=16 || 14 >=14.17" } + + minimist@1.2.8: + resolution: + { + integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==, + } + + minipass@7.1.2: + resolution: + { + integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==, + } + engines: { node: ">=16 || 14 >=14.17" } + + mobx-react-lite@4.1.0: + resolution: + { + integrity: sha512-QEP10dpHHBeQNv1pks3WnHRCem2Zp636lq54M2nKO2Sarr13pL4u6diQXf65yzXUn0mkk18SyIDCm9UOJYTi1w==, + } + peerDependencies: + mobx: ^6.9.0 + react: ^16.8.0 || ^17 || ^18 || ^19 + react-dom: "*" + react-native: "*" + peerDependenciesMeta: + react-dom: + optional: true + react-native: + optional: true + + mobx@6.13.7: + resolution: + { + integrity: sha512-aChaVU/DO5aRPmk1GX8L+whocagUUpBQqoPtJk+cm7UOXUk87J4PeWCh6nNmTTIfEhiR9DI/+FnA8dln/hTK7g==, + } + + molstar@5.7.0: + resolution: + { + integrity: sha512-Bo/QDiEkoRdhyhmFXNBPP2kiNTHZNgJO69AzqO+CrpkSj7JUrYNtBlt49vqa2AJfLUrBNgeeHcfjbxGLmfO7sQ==, + } + engines: { node: ">=22.0.0" } + hasBin: true + peerDependencies: + "@google-cloud/storage": ^7.14.0 + canvas: ^2.11.2 + gl: ^6.0.2 + jpeg-js: ^0.4.4 + pngjs: ^6.0.0 + react: ">=16.14.0" + react-dom: ">=16.14.0" + peerDependenciesMeta: + "@google-cloud/storage": + optional: true + canvas: + optional: true + gl: + optional: true + 
jpeg-js: + optional: true + pngjs: + optional: true + + moment@2.30.1: + resolution: + { + integrity: sha512-uEmtNhbDOrWPFS+hdjFCBfy9f2YoyzRpwcl+DqpC6taX21FzsTLQVbMV/W7PzNSX6x/bhC1zA3c2UQ5NzH6how==, + } + + motion-dom@12.23.12: + resolution: + { + integrity: sha512-RcR4fvMCTESQBD/uKQe49D5RUeDOokkGRmz4ceaJKDBgHYtZtntC/s2vLvY38gqGaytinij/yi3hMcWVcEF5Kw==, + } + + motion-utils@12.23.6: + resolution: + { + integrity: sha512-eAWoPgr4eFEOFfg2WjIsMoqJTW6Z8MTUCgn/GZ3VRpClWBdnbjryiA3ZSNLyxCTmCQx4RmYX6jX1iWHbenUPNQ==, + } + + mouse-change@1.4.0: + resolution: + { + integrity: sha512-vpN0s+zLL2ykyyUDh+fayu9Xkor5v/zRD9jhSqjRS1cJTGS0+oakVZzNm5n19JvvEj0you+MXlYTpNxUDQUjkQ==, + } + + mouse-event-offset@3.0.2: + resolution: + { + integrity: sha512-s9sqOs5B1Ykox3Xo8b3Ss2IQju4UwlW6LSR+Q5FXWpprJ5fzMLefIIItr3PH8RwzfGy6gxs/4GAmiNuZScE25w==, + } + + mouse-event@1.0.5: + resolution: + { + integrity: sha512-ItUxtL2IkeSKSp9cyaX2JLUuKk2uMoxBg4bbOWVd29+CskYJR9BGsUqtXenNzKbnDshvupjUewDIYVrOB6NmGw==, + } + + mouse-wheel@1.2.0: + resolution: + { + integrity: sha512-+OfYBiUOCTWcTECES49neZwL5AoGkXE+lFjIvzwNCnYRlso+EnfvovcBxGoyQ0yQt806eSPjS675K0EwWknXmw==, + } + + ms@2.0.0: + resolution: + { + integrity: sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==, + } + + ms@2.1.3: + resolution: + { + integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==, + } + + murmurhash-js@1.0.0: + resolution: + { + integrity: sha512-TvmkNhkv8yct0SVBSy+o8wYzXjE4Zz3PCesbfs8HiCXXdcTuocApFv11UWlNFWKYsP2okqrhb7JNlSm9InBhIw==, + } + + mutative@1.3.0: + resolution: + { + integrity: sha512-8MJj6URmOZAV70dpFe1YnSppRTKC4DsMkXQiBDFayLcDI4ljGokHxmpqaBQuDWa4iAxWaJJ1PS8vAmbntjjKmQ==, + } + engines: { node: ">=14.0" } + + nanoid@3.3.11: + resolution: + { + integrity: sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==, + } + engines: { node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1 } + hasBin: 
true + + napi-postinstall@0.3.3: + resolution: + { + integrity: sha512-uTp172LLXSxuSYHv/kou+f6KW3SMppU9ivthaVTXian9sOt3XM/zHYHpRZiLgQoxeWfYUnslNWQHF1+G71xcow==, + } + engines: { node: ^12.20.0 || ^14.18.0 || >=16.0.0 } + hasBin: true + + native-promise-only@0.8.1: + resolution: + { + integrity: sha512-zkVhZUA3y8mbz652WrL5x0fB0ehrBkulWT3TomAQ9iDtyXZvzKeEA6GPxAItBYeNYl5yngKRX612qHOhvMkDeg==, + } + + natural-compare@1.4.0: + resolution: + { + integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==, + } + + needle@2.9.1: + resolution: + { + integrity: sha512-6R9fqJ5Zcmf+uYaFgdIHmLwNldn5HbK8L5ybn7Uz+ylX/rnOsSp1AHcvQSrCaFN+qNM1wpymHqD7mVasEOlHGQ==, + } + engines: { node: ">= 4.4.x" } + hasBin: true + + negotiator@0.6.4: + resolution: + { + integrity: sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==, + } + engines: { node: ">= 0.6" } + + negotiator@1.0.0: + resolution: + { + integrity: sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==, + } + engines: { node: ">= 0.6" } + + next-tick@1.1.0: + resolution: + { + integrity: sha512-CXdUiJembsNjuToQvxayPZF9Vqht7hewsvy2sOWafLvi2awflj9mOC6bHIg50orX8IJvWKY9wYQ/zB2kogPslQ==, + } + + no-case@3.0.4: + resolution: + { + integrity: sha512-fgAN3jGAh+RoxUGZHTSOLJIqUc2wmoBwGR4tbpNAKmmovFoWq0OdRkb0VkldReO2a2iBT/OEulG9XSUc10r3zg==, + } + + node-addon-api@7.1.1: + resolution: + { + integrity: sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==, + } + + node-fetch@2.7.0: + resolution: + { + integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==, + } + engines: { node: 4.x || >=6.0.0 } + peerDependencies: + encoding: ^0.1.0 + peerDependenciesMeta: + encoding: + optional: true + + node-releases@2.0.20: + resolution: + { + integrity: 
sha512-7gK6zSXEH6neM212JgfYFXe+GmZQM+fia5SsusuBIUgnPheLFBmIPhtFoAQRj8/7wASYQnbDlHPVwY0BefoFgA==, + } + + normalize-svg-path@0.1.0: + resolution: + { + integrity: sha512-1/kmYej2iedi5+ROxkRESL/pI02pkg0OBnaR4hJkSIX6+ORzepwbuUXfrdZaPjysTsJInj0Rj5NuX027+dMBvA==, + } + + normalize-svg-path@1.1.0: + resolution: + { + integrity: sha512-r9KHKG2UUeB5LoTouwDzBy2VxXlHsiM6fyLQvnJa0S5hrhzqElH/CH7TUGhT1fVvIYBIKf3OpY4YJ4CK+iaqHg==, + } + + npm-run-path@5.3.0: + resolution: + { + integrity: sha512-ppwTtiJZq0O/ai0z7yfudtBpWIoxM8yE6nHi1X47eFR2EWORqfbu6CnPlNsjeN683eT0qG6H/Pyf9fCcvjnnnQ==, + } + engines: { node: ^12.20.0 || ^14.13.1 || >=16.0.0 } + + number-is-integer@1.0.1: + resolution: + { + integrity: sha512-Dq3iuiFBkrbmuQjGFFF3zckXNCQoSD37/SdSbgcBailUx6knDvDwb5CympBgcoWHy36sfS12u74MHYkXyHq6bg==, + } + engines: { node: ">=0.10.0" } + + nwsapi@2.2.22: + resolution: + { + integrity: sha512-ujSMe1OWVn55euT1ihwCI1ZcAaAU3nxUiDwfDQldc51ZXaB9m2AyOn6/jh1BLe2t/G8xd6uKG1UBF2aZJeg2SQ==, + } + + object-assign@4.1.1: + resolution: + { + integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==, + } + engines: { node: ">=0.10.0" } + + object-inspect@1.13.4: + resolution: + { + integrity: sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==, + } + engines: { node: ">= 0.4" } + + object-keys@1.1.1: + resolution: + { + integrity: sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==, + } + engines: { node: ">= 0.4" } + + object.assign@4.1.7: + resolution: + { + integrity: sha512-nK28WOo+QIjBkDduTINE4JkF/UJJKyf2EJxvJKfblDpyg0Q+pkOHNTL0Qwy6NP6FhE/EnzV73BxxqcJaXY9anw==, + } + engines: { node: ">= 0.4" } + + object.entries@1.1.9: + resolution: + { + integrity: sha512-8u/hfXFRBD1O0hPUjioLhoWFHRmt6tKA4/vZPyckBr18l1KE9uHrFaFaUi8MDRTpi4uak2goyPTSNJLXX2k2Hw==, + } + engines: { node: ">= 0.4" } + + object.fromentries@2.0.8: + resolution: + { + integrity: 
sha512-k6E21FzySsSK5a21KRADBd/NGneRegFO5pLHfdQLpRDETUNJueLXs3WCzyQ3tFRDYgbq3KHGXfTbi2bs8WQ6rQ==, + } + engines: { node: ">= 0.4" } + + object.getownpropertydescriptors@2.1.9: + resolution: + { + integrity: sha512-mt8YM6XwsTTovI+kdZdHSxoyF2DI59up034orlC9NfweclcWOt7CVascNNLp6U+bjFVCVCIh9PwS76tDM/rH8g==, + } + engines: { node: ">= 0.4" } + + object.groupby@1.0.3: + resolution: + { + integrity: sha512-+Lhy3TQTuzXI5hevh8sBGqbmurHbbIjAi0Z4S63nthVLmLxfbj4T54a4CfZrXIrt9iP4mVAPYMo/v99taj3wjQ==, + } + engines: { node: ">= 0.4" } + + object.values@1.2.1: + resolution: + { + integrity: sha512-gXah6aZrcUxjWg2zR2MwouP2eHlCBzdV4pygudehaKXSGW4v2AsRQUK+lwwXhii6KFZcunEnmSUoYp5CXibxtA==, + } + engines: { node: ">= 0.4" } + + on-finished@2.4.1: + resolution: + { + integrity: sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==, + } + engines: { node: ">= 0.8" } + + on-headers@1.1.0: + resolution: + { + integrity: sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==, + } + engines: { node: ">= 0.8" } + + once@1.3.3: + resolution: + { + integrity: sha512-6vaNInhu+CHxtONf3zw3vq4SP2DOQhjBvIa3rNcG0+P7eKWlYH6Peu7rHizSloRU2EwMz6GraLieis9Ac9+p1w==, + } + + once@1.4.0: + resolution: + { + integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==, + } + + onetime@6.0.0: + resolution: + { + integrity: sha512-1FlR+gjXK7X+AsAHso35MnyN5KqGwJRi/31ft6x0M194ht7S+rWAvd7PHss9xSKMzE0asv1pyIHaJYq+BbacAQ==, + } + engines: { node: ">=12" } + + onetime@7.0.0: + resolution: + { + integrity: sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==, + } + engines: { node: ">=18" } + + open@8.4.2: + resolution: + { + integrity: sha512-7x81NCL719oNbsq/3mh+hVrAWmFuEYUqrq/Iw3kUzH8ReypT9QQ0BLoJS7/G9k6N81XjW4qHWtjWwe/9eLy1EQ==, + } + engines: { node: ">=12" } + + optionator@0.9.4: + resolution: + { + integrity: 
sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==, + } + engines: { node: ">= 0.8.0" } + + own-keys@1.0.1: resolution: { integrity: sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==, @@ -5882,6 +6754,12 @@ packages: integrity: sha512-KF/U8tk54BgQewkJPvB4s/US3VQY68BRDpH638+7O/n58TpnwiwnOtGIOsT2/i+M78s61BBpeC83STB88d8sqw==, } + parse-entities@4.0.2: + resolution: + { + integrity: sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw==, + } + parse-json@5.2.0: resolution: { @@ -5913,6 +6791,13 @@ packages: integrity: sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==, } + parseurl@1.3.3: + resolution: + { + integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==, + } + engines: { node: ">= 0.8" } + path-exists@4.0.0: resolution: { @@ -5947,6 +6832,12 @@ packages: } engines: { node: ">=16 || 14 >=14.18" } + path-to-regexp@8.3.0: + resolution: + { + integrity: sha512-7jdwVIRtsP8MYpdXSwOS0YdD0Du+qOoF/AEPIt88PcCFrZCzx41oxku1jD88hZBwbNUIEfpqvuhjFaMAqMTWnA==, + } + path-type@4.0.0: resolution: { @@ -6132,12 +7023,25 @@ packages: integrity: sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==, } + property-information@7.1.0: + resolution: + { + integrity: sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==, + } + protocol-buffers-schema@3.6.0: resolution: { integrity: sha512-TdDRD+/QNdrCGCE7v8340QyuXd4kIWIgapsE2+n/SaGiSSbomYl4TjHlvIoCWRpE7wFt02EpB35VVA2ImcBVqw==, } + proxy-addr@2.0.7: + resolution: + { + integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==, + } + engines: { node: ">= 0.10" } + proxy-from-env@1.1.0: resolution: { @@ -6151,6 +7055,13 @@ packages: } engines: { node: ">=6" } + qs@6.15.0: + 
resolution: + { + integrity: sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ==, + } + engines: { node: ">=0.6" } + queue-microtask@1.2.3: resolution: { @@ -6175,6 +7086,20 @@ packages: integrity: sha512-Sq4CW4QhwOHE8ucn6J34MqtZCeWFP2aQSmrlroYgqAV1PjStIhJXxYuTgUIfkEk7zTLjmIjLmU5q+fbD1NnOJA==, } + range-parser@1.2.1: + resolution: + { + integrity: sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==, + } + engines: { node: ">= 0.6" } + + raw-body@3.0.2: + resolution: + { + integrity: sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==, + } + engines: { node: ">= 0.10" } + react-confetti@6.4.0: resolution: { @@ -6233,6 +7158,15 @@ packages: integrity: sha512-tr41fA15Vn8p4X9ntI+yCyeGSf1TlYaY5vlTZfQmeLBrFo3psOPX6HhTDnFNL9uj3EhP0KAQ80cugCl4b4BERA==, } + react-markdown@10.1.0: + resolution: + { + integrity: sha512-qKxVopLT/TyA6BX3Ue5NwabOsAzm0Q7kAPwq6L+wWDwisYs7R8vZ0nRXqq6rkueboxpkjvLGU9fWifiX/ZZFxQ==, + } + peerDependencies: + "@types/react": ">=18" + react: ">=18" + react-plotly.js@2.6.0: resolution: { @@ -6365,6 +7299,30 @@ packages: integrity: sha512-+IOGrxl3FZ8ZM9ixCWQZzFRiRn7Rzn9bu3iFHwg/yz4tlOUQgbO4PHLgG+1ZT60zcIV8tief6Qrmyl8qcoJP0g==, } + remark-gfm@4.0.1: + resolution: + { + integrity: sha512-1quofZ2RQ9EWdeN34S79+KExV1764+wCUGop5CPL1WGdD0ocPpu91lzPGbwWMECpEpd42kJGQwzRfyov9j4yNg==, + } + + remark-parse@11.0.0: + resolution: + { + integrity: sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA==, + } + + remark-rehype@11.1.2: + resolution: + { + integrity: sha512-Dh7l57ianaEoIpzbp0PC9UKAdCSVklD8E5Rpw7ETfbTl3FqcOOgq5q2LVDhgGCkaBv7p24JXikPdvhhmHvKMsw==, + } + + remark-stringify@11.0.0: + resolution: + { + integrity: sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==, + } + reselect@5.1.1: resolution: { @@ -6418,94 +7376,273 @@ packages: } engines: { 
node: ">=18" } - reusify@1.1.0: + reusify@1.1.0: + resolution: + { + integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==, + } + engines: { iojs: ">=1.0.0", node: ">=0.10.0" } + + rfdc@1.4.1: + resolution: + { + integrity: sha512-q1b3N5QkRUWUl7iyylaaj3kOpIT0N2i9MqIEQXP73GVsN9cw3fdx8X63cEmWhJGi2PPCF23Ijp7ktmd39rawIA==, + } + + right-now@1.0.0: + resolution: + { + integrity: sha512-DA8+YS+sMIVpbsuKgy+Z67L9Lxb1p05mNxRpDPNksPDEFir4vmBlUtuN9jkTGn9YMMdlBuK7XQgFiz6ws+yhSg==, + } + + rollup@4.50.1: + resolution: + { + integrity: sha512-78E9voJHwnXQMiQdiqswVLZwJIzdBKJ1GdI5Zx6XwoFKUIk09/sSrr+05QFzvYb8q6Y9pPV45zzDuYa3907TZA==, + } + engines: { node: ">=18.0.0", npm: ">=8.0.0" } + hasBin: true + + router@2.2.0: + resolution: + { + integrity: sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==, + } + engines: { node: ">= 18" } + + rrweb-cssom@0.8.0: + resolution: + { + integrity: sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw==, + } + + run-parallel@1.2.0: + resolution: + { + integrity: sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==, + } + + rw@1.3.3: + resolution: + { + integrity: sha512-PdhdWy89SiZogBLaw42zdeqtRJ//zFd2PgQavcICDUgJT5oW10QCRKbJ6bg4r0/UY2M6BWd5tkxuGFRvCkgfHQ==, + } + + rxjs@7.8.2: + resolution: + { + integrity: sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==, + } + + safe-array-concat@1.1.3: + resolution: + { + integrity: sha512-AURm5f0jYEOydBj7VQlVvDrjeFgthDdEF5H1dP+6mNpoXOMo1quQqJ4wvJDyRZ9+pO3kGWoOdmV08cSv2aJV6Q==, + } + engines: { node: ">=0.4" } + + safe-buffer@5.1.2: + resolution: + { + integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==, + } + + safe-buffer@5.2.1: + resolution: + { + integrity: 
sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==, + } + + safe-push-apply@1.0.0: + resolution: + { + integrity: sha512-iKE9w/Z7xCzUMIZqdBsp6pEQvwuEebH4vdpjcDWnyzaI6yl6O9FHvVpmGelvEHNsoY6wGblkxR6Zty/h00WiSA==, + } + engines: { node: ">= 0.4" } + + safe-regex-test@1.1.0: + resolution: + { + integrity: sha512-x/+Cz4YrimQxQccJf5mKEbIa1NzeCRNI5Ecl/ekmlYaampdNLPalVyIcCZNNH3MvmqBugV5TMYZXv0ljslUlaw==, + } + engines: { node: ">= 0.4" } + + safer-buffer@2.1.2: + resolution: + { + integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==, + } + + sass-embedded-all-unknown@1.98.0: + resolution: + { + integrity: sha512-6n4RyK7/1mhdfYvpP3CClS3fGoYqDvRmLClCESS6I7+SAzqjxvGG6u5Fo+cb1nrPNbbilgbM4QKdgcgWHO9NCA==, + } + cpu: ["!arm", "!arm64", "!riscv64", "!x64"] + + sass-embedded-android-arm64@1.98.0: + resolution: + { + integrity: sha512-M9Ra98A6vYJHpwhoC/5EuH1eOshQ9ZyNwC8XifUDSbRl/cGeQceT1NReR9wFj3L7s1pIbmes1vMmaY2np0uAKQ==, + } + engines: { node: ">=14.0.0" } + cpu: [arm64] + os: [android] + + sass-embedded-android-arm@1.98.0: + resolution: + { + integrity: sha512-LjGiMhHgu7VL1n7EJxTCre1x14bUsWd9d3dnkS2rku003IWOI/fxc7OXgaKagoVzok1kv09rzO3vFXJR5ZeONQ==, + } + engines: { node: ">=14.0.0" } + cpu: [arm] + os: [android] + + sass-embedded-android-riscv64@1.98.0: + resolution: + { + integrity: sha512-WPe+0NbaJIZE1fq/RfCZANMeIgmy83x4f+SvFOG7LhUthHpZWcOcrPTsCKKmN3xMT3iw+4DXvqTYOCYGRL3hcQ==, + } + engines: { node: ">=14.0.0" } + cpu: [riscv64] + os: [android] + + sass-embedded-android-x64@1.98.0: + resolution: + { + integrity: sha512-zrD25dT7OHPEgLWuPEByybnIfx4rnCtfge4clBgjZdZ3lF6E7qNLRBtSBmoFflh6Vg0RlEjJo5VlpnTMBM5MQQ==, + } + engines: { node: ">=14.0.0" } + cpu: [x64] + os: [android] + + sass-embedded-darwin-arm64@1.98.0: + resolution: + { + integrity: sha512-cgr1z9rBnCdMf8K+JabIaYd9Rag2OJi5mjq08XJfbJGMZV/TA6hFJCLGkr5/+ZOn4/geTM5/3aSfQ8z5EIJAOg==, + } + engines: { node: ">=14.0.0" } + cpu: 
[arm64] + os: [darwin] + + sass-embedded-darwin-x64@1.98.0: resolution: { - integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==, + integrity: sha512-OLBOCs/NPeiMqTdOrMFbVHBQFj19GS3bSVSxIhcCq16ZyhouUkYJEZjxQgzv9SWA2q6Ki8GCqp4k6jMeUY9dcA==, } - engines: { iojs: ">=1.0.0", node: ">=0.10.0" } + engines: { node: ">=14.0.0" } + cpu: [x64] + os: [darwin] - rfdc@1.4.1: + sass-embedded-linux-arm64@1.98.0: resolution: { - integrity: sha512-q1b3N5QkRUWUl7iyylaaj3kOpIT0N2i9MqIEQXP73GVsN9cw3fdx8X63cEmWhJGi2PPCF23Ijp7ktmd39rawIA==, + integrity: sha512-axOE3t2MTBwCtkUCbrdM++Gj0gC0fdHJPrgzQ+q1WUmY9NoNMGqflBtk5mBZaWUeha2qYO3FawxCB8lctFwCtw==, } + engines: { node: ">=14.0.0" } + cpu: [arm64] + os: [linux] - right-now@1.0.0: + sass-embedded-linux-arm@1.98.0: resolution: { - integrity: sha512-DA8+YS+sMIVpbsuKgy+Z67L9Lxb1p05mNxRpDPNksPDEFir4vmBlUtuN9jkTGn9YMMdlBuK7XQgFiz6ws+yhSg==, + integrity: sha512-03baQZCxVyEp8v1NWBRlzGYrmVT/LK7ZrHlF1piscGiGxwfdxoLXVuxsylx3qn/dD/4i/rh7Bzk7reK1br9jvQ==, } + engines: { node: ">=14.0.0" } + cpu: [arm] + os: [linux] - rollup@4.50.1: + sass-embedded-linux-musl-arm64@1.98.0: resolution: { - integrity: sha512-78E9voJHwnXQMiQdiqswVLZwJIzdBKJ1GdI5Zx6XwoFKUIk09/sSrr+05QFzvYb8q6Y9pPV45zzDuYa3907TZA==, + integrity: sha512-LeqNxQA8y4opjhe68CcFvMzCSrBuJqYVFbwElEj9bagHXQHTp9xVPJRn6VcrC+0VLEDq13HVXMv7RslIuU0zmA==, } - engines: { node: ">=18.0.0", npm: ">=8.0.0" } - hasBin: true + engines: { node: ">=14.0.0" } + cpu: [arm64] + os: [linux] - rrweb-cssom@0.8.0: + sass-embedded-linux-musl-arm@1.98.0: resolution: { - integrity: sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw==, + integrity: sha512-OBkjTDPYR4hSaueOGIM6FDpl9nt/VZwbSRpbNu9/eEJcxE8G/vynRugW8KRZmCFjPy8j/jkGBvvS+k9iOqKV3g==, } + engines: { node: ">=14.0.0" } + cpu: [arm] + os: [linux] - run-parallel@1.2.0: + sass-embedded-linux-musl-riscv64@1.98.0: resolution: { - integrity: 
sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==, + integrity: sha512-7w6hSuOHKt8FZsmjRb3iGSxEzM87fO9+M8nt5JIQYMhHTj5C+JY/vcske0v715HCVj5e1xyTnbGXf8FcASeAIw==, } + engines: { node: ">=14.0.0" } + cpu: [riscv64] + os: [linux] - rw@1.3.3: + sass-embedded-linux-musl-x64@1.98.0: resolution: { - integrity: sha512-PdhdWy89SiZogBLaw42zdeqtRJ//zFd2PgQavcICDUgJT5oW10QCRKbJ6bg4r0/UY2M6BWd5tkxuGFRvCkgfHQ==, + integrity: sha512-QikNyDEJOVqPmxyCFkci8ZdCwEssdItfjQFJB+D+Uy5HFqcS5Lv3d3GxWNX/h1dSb23RPyQdQc267ok5SbEyJw==, } + engines: { node: ">=14.0.0" } + cpu: [x64] + os: [linux] - safe-array-concat@1.1.3: + sass-embedded-linux-riscv64@1.98.0: resolution: { - integrity: sha512-AURm5f0jYEOydBj7VQlVvDrjeFgthDdEF5H1dP+6mNpoXOMo1quQqJ4wvJDyRZ9+pO3kGWoOdmV08cSv2aJV6Q==, + integrity: sha512-E7fNytc/v4xFBQKzgzBddV/jretA4ULAPO6XmtBiQu4zZBdBozuSxsQLe2+XXeb0X4S2GIl72V7IPABdqke/vA==, } - engines: { node: ">=0.4" } + engines: { node: ">=14.0.0" } + cpu: [riscv64] + os: [linux] - safe-buffer@5.1.2: + sass-embedded-linux-x64@1.98.0: resolution: { - integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==, + integrity: sha512-VsvP0t/uw00mMNPv3vwyYKUrFbqzxQHnRMO+bHdAMjvLw4NFf6mscpym9Bzf+NXwi1ZNKnB6DtXjmcpcvqFqYg==, } + engines: { node: ">=14.0.0" } + cpu: [x64] + os: [linux] - safe-buffer@5.2.1: + sass-embedded-unknown-all@1.98.0: resolution: { - integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==, + integrity: sha512-C4MMzcAo3oEDQnW7L8SBgB9F2Fq5qHPnaYTZRMOH3Mp/7kM4OooBInXpCiiFjLnjY95hzP4KyctVx0uYR6MYlQ==, } + os: ["!android", "!darwin", "!linux", "!win32"] - safe-push-apply@1.0.0: + sass-embedded-win32-arm64@1.98.0: resolution: { - integrity: sha512-iKE9w/Z7xCzUMIZqdBsp6pEQvwuEebH4vdpjcDWnyzaI6yl6O9FHvVpmGelvEHNsoY6wGblkxR6Zty/h00WiSA==, + integrity: sha512-nP/10xbAiPbhQkMr3zQfXE4TuOxPzWRQe1Hgbi90jv2R4TbzbqQTuZVOaJf7KOAN4L2Bo6XCTRjK5XkVnwZuwQ==, 
} - engines: { node: ">= 0.4" } + engines: { node: ">=14.0.0" } + cpu: [arm64] + os: [win32] - safe-regex-test@1.1.0: + sass-embedded-win32-x64@1.98.0: resolution: { - integrity: sha512-x/+Cz4YrimQxQccJf5mKEbIa1NzeCRNI5Ecl/ekmlYaampdNLPalVyIcCZNNH3MvmqBugV5TMYZXv0ljslUlaw==, + integrity: sha512-/lbrVsfbcbdZQ5SJCWcV0NVPd6YRs+FtAnfedp4WbCkO/ZO7Zt/58MvI4X2BVpRY/Nt5ZBo1/7v2gYcQ+J4svQ==, } - engines: { node: ">= 0.4" } + engines: { node: ">=14.0.0" } + cpu: [x64] + os: [win32] - safer-buffer@2.1.2: + sass-embedded@1.98.0: resolution: { - integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==, + integrity: sha512-Do7u6iRb6K+lrllcTkB1BXcHwOxcKe3rEfOF/GcCLE2w3WpddakRAosJOHFUR37DpsvimQXEt5abs3NzUjEIqg==, } + engines: { node: ">=16.0.0" } + hasBin: true - sass@1.89.2: + sass@1.98.0: resolution: { - integrity: sha512-xCmtksBKd/jdJ9Bt9p7nPKiuqrlBMBuuGkQlkhZjjQk3Ty48lv93k5Dq6OPkKt4XwxDJ7tvlfrTa1MPA9bf+QA==, + integrity: sha512-+4N/u9dZ4PrgzGgPlKnaaRQx64RO0JBKs9sDhQ2pLgN6JQZ25uPQZKQYaBJU48Kd5BxgXoJ4e09Dq7nMcOUW3A==, } engines: { node: ">=14.0.0" } hasBin: true @@ -6544,6 +7681,20 @@ packages: engines: { node: ">=10" } hasBin: true + send@1.2.1: + resolution: + { + integrity: sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==, + } + engines: { node: ">= 18" } + + serve-static@2.2.1: + resolution: + { + integrity: sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==, + } + engines: { node: ">= 18" } + set-cookie-parser@2.7.1: resolution: { @@ -6571,6 +7722,12 @@ packages: } engines: { node: ">= 0.4" } + setprototypeof@1.2.0: + resolution: + { + integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==, + } + shallow-copy@0.0.1: resolution: { @@ -6691,6 +7848,12 @@ packages: } engines: { node: ">=0.10.0" } + space-separated-tokens@2.0.2: + resolution: + { + integrity: 
sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==, + } + stable-hash@0.0.5: resolution: { @@ -6715,6 +7878,13 @@ packages: integrity: sha512-MgWpQ/ZjGieSVB3eOJVs4OA2LT/q1vx98KPCTTQPzq/aLr0YUXTsgryTXr4SLfR0ZfUUCiedM9n/ABeDIyy4mA==, } + statuses@2.0.2: + resolution: + { + integrity: sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==, + } + engines: { node: ">= 0.8" } + std-env@3.9.0: resolution: { @@ -6832,6 +8002,12 @@ packages: integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==, } + stringify-entities@4.0.4: + resolution: + { + integrity: sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==, + } + strip-ansi@6.0.1: resolution: { @@ -6893,6 +8069,18 @@ packages: integrity: sha512-i0TFx4wPcO0FwX+4RkLJi1MxmcTv90jNZgxMu9XRnMXMeFUY1VJlIoXpZunPUvUUqbCT1pg5PEkFqqpcaElNaA==, } + style-to-js@1.1.21: + resolution: + { + integrity: sha512-RjQetxJrrUJLQPHbLku6U/ocGtzyjbJMP9lCNK7Ag0CNh690nSH8woqWH9u16nMjYBAok+i7JO1NP2pOy8IsPQ==, + } + + style-to-object@1.0.14: + resolution: + { + integrity: sha512-LIN7rULI0jBscWQYaSswptyderlarFkjQ+t79nzty8tcIAceVomEVlLzH5VP4Cmsv6MtKhs7qaAiwlcp+Mgaxw==, + } + styled-components@6.1.19: resolution: { @@ -6940,6 +8128,13 @@ packages: } engines: { node: ">=8" } + supports-color@8.1.1: + resolution: + { + integrity: sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==, + } + engines: { node: ">=10" } + supports-preserve-symlinks-flag@1.0.0: resolution: { @@ -6971,12 +8166,32 @@ packages: integrity: sha512-vJJjVq/R5lSr2KLfVXVAStktfcfa1pNFjFOgyJnzZFXlO/fDZ5DmM8FpnSKKzLPfEYTVeXuVBTHF296TpxuJVg==, } + swagger-ui-dist@5.32.0: + resolution: + { + integrity: sha512-nKZB0OuDvacB0s/lC2gbge+RigYvGRGpLLMWMFxaTUwfM+CfndVk9Th2IaTinqXiz6Mn26GK2zriCpv6/+5m3Q==, + } + symbol-tree@3.2.4: resolution: { integrity: 
sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==, } + sync-child-process@1.0.2: + resolution: + { + integrity: sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==, + } + engines: { node: ">=16.0.0" } + + sync-message-port@1.2.0: + resolution: + { + integrity: sha512-gAQ9qrUN/UCypHtGFbbe7Rc/f9bzO88IwrG8TDo/aMKAApKyD6E3W4Cm0EfhfBb6Z6SKt59tTCTfD+n1xmAvMg==, + } + engines: { node: ">=16.0.0" } + terser@5.42.0: resolution: { @@ -7107,6 +8322,13 @@ packages: } engines: { node: ">=8.0" } + toidentifier@1.0.1: + resolution: + { + integrity: sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==, + } + engines: { node: ">=0.6" } + topojson-client@3.1.0: resolution: { @@ -7121,6 +8343,12 @@ packages: } engines: { node: ">=16" } + tr46@0.0.3: + resolution: + { + integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==, + } + tr46@5.1.1: resolution: { @@ -7128,6 +8356,18 @@ packages: } engines: { node: ">=18" } + trim-lines@3.0.1: + resolution: + { + integrity: sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==, + } + + trough@2.2.0: + resolution: + { + integrity: sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw==, + } + ts-api-utils@2.1.0: resolution: { @@ -7202,6 +8442,13 @@ packages: } engines: { node: ">=12.20" } + type-is@2.0.1: + resolution: + { + integrity: sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==, + } + engines: { node: ">= 0.6" } + type@2.7.3: resolution: { @@ -7273,12 +8520,54 @@ packages: } engines: { node: ">= 0.4" } + undici-types@6.21.0: + resolution: + { + integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==, + } + undici-types@7.10.0: resolution: { integrity: 
sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag==, } + unified@11.0.5: + resolution: + { + integrity: sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==, + } + + unist-util-is@6.0.1: + resolution: + { + integrity: sha512-LsiILbtBETkDz8I9p1dQ0uyRUWuaQzd/cuEeS1hoRSyW5E5XGmTzlwY1OrNzzakGowI9Dr/I8HVaw4hTtnxy8g==, + } + + unist-util-position@5.0.0: + resolution: + { + integrity: sha512-fucsC7HjXvkB5R3kTCO7kUjRdrS0BJt3M/FPxmHMBOm8JQi2BsHAHFsy27E0EolP8rp0NzXsJ+jNPyDWvOJZPA==, + } + + unist-util-stringify-position@4.0.0: + resolution: + { + integrity: sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==, + } + + unist-util-visit-parents@6.0.2: + resolution: + { + integrity: sha512-goh1s1TBrqSqukSc8wrjwWhL0hiJxgA8m4kFxGlQ+8FYQ3C/m11FcTs4YYem7V664AhHVvgoQLk890Ssdsr2IQ==, + } + + unist-util-visit@5.1.0: + resolution: + { + integrity: sha512-m+vIdyeCOpdr/QeQCu2EzxX/ohgS8KbnPDgFni4dQsfSCtpz8UqDyY5GjRru8PDKuYn7Fq19j1CQ+nJSsGKOzg==, + } + universalify@2.0.1: resolution: { @@ -7286,6 +8575,13 @@ packages: } engines: { node: ">= 10.0.0" } + unpipe@1.0.0: + resolution: + { + integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==, + } + engines: { node: ">= 0.8" } + unplugin@1.16.1: resolution: { @@ -7340,6 +8636,13 @@ packages: integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==, } + util.promisify@1.1.3: + resolution: + { + integrity: sha512-GIEaZ6o86fj09Wtf0VfZ5XP7tmd4t3jM5aZCgmBi231D0DB1AEBa3Aa6MP48DMsAIi96WkpWLimIWVwOjbDMOw==, + } + engines: { node: ">= 0.8" } + util@0.12.5: resolution: { @@ -7360,6 +8663,31 @@ packages: } hasBin: true + varint@6.0.0: + resolution: + { + integrity: sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==, + } + + vary@1.1.2: + resolution: + { + integrity: 
sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==, + } + engines: { node: ">= 0.8" } + + vfile-message@4.0.3: + resolution: + { + integrity: sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw==, + } + + vfile@6.0.3: + resolution: + { + integrity: sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==, + } + vite-node@3.2.4: resolution: { @@ -7486,6 +8814,12 @@ packages: integrity: sha512-q/fGIivtqTT7PEoF07axFIlHNk/XCPaYpq64btnepopSWvKNFkoORlQYgqDigBIuGA1ExnFd/GnSUnBNEPQY7Q==, } + webidl-conversions@3.0.1: + resolution: + { + integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==, + } + webidl-conversions@7.0.0: resolution: { @@ -7520,6 +8854,12 @@ packages: } engines: { node: ">=18" } + whatwg-url@5.0.0: + resolution: + { + integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==, + } + which-boxed-primitive@1.1.1: resolution: { @@ -7700,6 +9040,12 @@ packages: react: optional: true + zwitch@2.0.4: + resolution: + { + integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==, + } + snapshots: "@adobe/css-tools@4.4.4": {} @@ -7883,6 +9229,8 @@ snapshots: "@babel/helper-string-parser": 7.27.1 "@babel/helper-validator-identifier": 7.27.1 + "@bufbuild/protobuf@2.11.0": {} + "@choojs/findup@0.2.1": dependencies: commander: 2.20.3 @@ -8175,12 +9523,12 @@ snapshots: wrap-ansi: 8.1.0 wrap-ansi-cjs: wrap-ansi@7.0.0 - "@joshwooding/vite-plugin-react-docgen-typescript@0.5.0(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1))": + "@joshwooding/vite-plugin-react-docgen-typescript@0.5.0(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1))": dependencies: glob: 10.4.5 magic-string: 0.27.0 
react-docgen-typescript: 2.4.0(typescript@5.6.3) - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) optionalDependencies: typescript: 5.6.3 @@ -8606,6 +9954,8 @@ snapshots: "@rtsao/scc@1.1.0": {} + "@scarf/scarf@1.4.0": {} + "@storybook/addon-actions@8.6.14(storybook@8.6.14(prettier@3.6.2))": dependencies: "@storybook/global": 5.0.0 @@ -8706,13 +10056,13 @@ snapshots: react: 18.3.1 react-dom: 18.3.1(react@18.3.1) - "@storybook/builder-vite@8.6.14(storybook@8.6.14(prettier@3.6.2))(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1))": + "@storybook/builder-vite@8.6.14(storybook@8.6.14(prettier@3.6.2))(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1))": dependencies: "@storybook/csf-plugin": 8.6.14(storybook@8.6.14(prettier@3.6.2)) browser-assert: 1.2.1 storybook: 8.6.14(prettier@3.6.2) ts-dedent: 2.2.0 - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) "@storybook/components@8.6.14(storybook@8.6.14(prettier@3.6.2))": dependencies: @@ -8775,11 +10125,11 @@ snapshots: react-dom: 18.3.1(react@18.3.1) storybook: 8.6.14(prettier@3.6.2) - "@storybook/react-vite@8.6.14(@storybook/test@8.6.14(storybook@8.6.14(prettier@3.6.2)))(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(rollup@4.50.1)(storybook@8.6.14(prettier@3.6.2))(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1))": + "@storybook/react-vite@8.6.14(@storybook/test@8.6.14(storybook@8.6.14(prettier@3.6.2)))(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(rollup@4.50.1)(storybook@8.6.14(prettier@3.6.2))(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1))": dependencies: - "@joshwooding/vite-plugin-react-docgen-typescript": 
0.5.0(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)) + "@joshwooding/vite-plugin-react-docgen-typescript": 0.5.0(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)) "@rollup/pluginutils": 5.3.0(rollup@4.50.1) - "@storybook/builder-vite": 8.6.14(storybook@8.6.14(prettier@3.6.2))(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)) + "@storybook/builder-vite": 8.6.14(storybook@8.6.14(prettier@3.6.2))(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)) "@storybook/react": 8.6.14(@storybook/test@8.6.14(storybook@8.6.14(prettier@3.6.2)))(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(storybook@8.6.14(prettier@3.6.2))(typescript@5.6.3) find-up: 5.0.0 magic-string: 0.30.19 @@ -8789,7 +10139,7 @@ snapshots: resolve: 1.22.10 storybook: 8.6.14(prettier@3.6.2) tsconfig-paths: 4.2.0 - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) optionalDependencies: "@storybook/test": 8.6.14(storybook@8.6.14(prettier@3.6.2)) transitivePeerDependencies: @@ -8968,6 +10318,8 @@ snapshots: tslib: 2.8.1 optional: true + "@types/argparse@2.0.17": {} + "@types/aria-query@5.0.4": {} "@types/babel__core@7.20.5": @@ -8991,10 +10343,26 @@ snapshots: dependencies: "@babel/types": 7.28.4 + "@types/benchmark@2.1.5": {} + + "@types/body-parser@1.19.6": + dependencies: + "@types/connect": 3.4.38 + "@types/node": 24.3.1 + "@types/chai@5.2.2": dependencies: "@types/deep-eql": 4.0.2 + "@types/compression@1.8.1": + dependencies: + "@types/express": 5.0.6 + "@types/node": 24.3.1 + + "@types/connect@3.4.38": + dependencies: + "@types/node": 24.3.1 + "@types/d3-color@3.1.3": {} "@types/d3-drag@3.0.7": @@ -9016,12 +10384,33 @@ snapshots: "@types/d3-interpolate": 3.0.4 "@types/d3-selection": 3.0.11 + "@types/debug@4.1.12": + 
dependencies: + "@types/ms": 2.1.0 + "@types/deep-eql@4.0.2": {} "@types/doctrine@0.0.9": {} + "@types/estree-jsx@1.0.5": + dependencies: + "@types/estree": 1.0.8 + "@types/estree@1.0.8": {} + "@types/express-serve-static-core@5.1.1": + dependencies: + "@types/node": 24.3.1 + "@types/qs": 6.15.0 + "@types/range-parser": 1.2.7 + "@types/send": 1.2.1 + + "@types/express@5.0.6": + dependencies: + "@types/body-parser": 1.19.6 + "@types/express-serve-static-core": 5.1.1 + "@types/serve-static": 2.2.0 + "@types/file-saver@2.0.7": {} "@types/geojson-vt@3.2.5": @@ -9030,11 +10419,17 @@ snapshots: "@types/geojson@7946.0.16": {} + "@types/hast@3.0.4": + dependencies: + "@types/unist": 3.0.3 + "@types/hoist-non-react-statics@3.3.7(@types/react@18.3.24)": dependencies: "@types/react": 18.3.24 hoist-non-react-statics: 3.3.2 + "@types/http-errors@2.0.5": {} + "@types/json-schema@7.0.15": {} "@types/json5@0.0.29": {} @@ -9047,8 +10442,23 @@ snapshots: "@types/mapbox__point-geometry": 0.1.4 "@types/pbf": 3.0.5 + "@types/mdast@4.0.4": + dependencies: + "@types/unist": 3.0.3 + "@types/mdx@2.0.13": {} + "@types/ms@2.1.0": {} + + "@types/node-fetch@2.6.13": + dependencies: + "@types/node": 24.3.1 + form-data: 4.0.4 + + "@types/node@22.19.15": + dependencies: + undici-types: 6.21.0 + "@types/node@24.3.1": dependencies: undici-types: 7.10.0 @@ -9065,6 +10475,10 @@ snapshots: "@types/prop-types@15.7.15": {} + "@types/qs@6.15.0": {} + + "@types/range-parser@1.2.7": {} + "@types/react-dom@18.3.7(@types/react@18.3.24)": dependencies: "@types/react": 18.3.24 @@ -9085,12 +10499,27 @@ snapshots: "@types/resolve@1.20.6": {} + "@types/send@1.2.1": + dependencies: + "@types/node": 24.3.1 + + "@types/serve-static@2.2.0": + dependencies: + "@types/http-errors": 2.0.5 + "@types/node": 24.3.1 + "@types/stylis@4.2.5": {} "@types/supercluster@7.1.3": dependencies: "@types/geojson": 7946.0.16 + "@types/swagger-ui-dist@3.30.6": {} + + "@types/unist@2.0.11": {} + + "@types/unist@3.0.3": {} + 
"@types/uuid@9.0.8": {} "@typescript-eslint/eslint-plugin@8.43.0(@typescript-eslint/parser@8.43.0(eslint@9.35.0)(typescript@5.6.3))(eslint@9.35.0)(typescript@5.6.3)": @@ -9186,6 +10615,8 @@ snapshots: "@typescript-eslint/types": 8.43.0 eslint-visitor-keys: 4.2.1 + "@ungap/structured-clone@1.3.0": {} + "@unrs/resolver-binding-android-arm-eabi@1.11.1": optional: true @@ -9245,7 +10676,7 @@ snapshots: "@unrs/resolver-binding-win32-x64-msvc@1.11.1": optional: true - "@vitejs/plugin-react@4.7.0(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1))": + "@vitejs/plugin-react@4.7.0(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1))": dependencies: "@babel/core": 7.28.4 "@babel/plugin-transform-react-jsx-self": 7.27.1(@babel/core@7.28.4) @@ -9253,7 +10684,7 @@ snapshots: "@rolldown/pluginutils": 1.0.0-beta.27 "@types/babel__core": 7.20.5 react-refresh: 0.17.0 - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) transitivePeerDependencies: - supports-color @@ -9272,13 +10703,13 @@ snapshots: chai: 5.3.3 tinyrainbow: 2.0.0 - "@vitest/mocker@3.2.4(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1))": + "@vitest/mocker@3.2.4(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1))": dependencies: "@vitest/spy": 3.2.4 estree-walker: 3.0.3 magic-string: 0.30.19 optionalDependencies: - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) "@vitest/pretty-format@2.0.5": dependencies: @@ -9331,9 +10762,9 @@ snapshots: loupe: 3.2.1 tinyrainbow: 2.0.0 - "@xyflow/react@12.10.0(@types/react@18.3.24)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)": + "@xyflow/react@12.10.2(@types/react@18.3.24)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)": 
dependencies: - "@xyflow/system": 0.0.74 + "@xyflow/system": 0.0.76 classcat: 5.0.5 react: 18.3.1 react-dom: 18.3.1(react@18.3.1) @@ -9342,7 +10773,7 @@ snapshots: - "@types/react" - immer - "@xyflow/system@0.0.74": + "@xyflow/system@0.0.76": dependencies: "@types/d3-drag": 3.0.7 "@types/d3-interpolate": 3.0.4 @@ -9356,6 +10787,11 @@ snapshots: abs-svg-path@0.1.1: {} + accepts@2.0.0: + dependencies: + mime-types: 3.0.2 + negotiator: 1.0.0 + acorn-jsx@5.3.2(acorn@8.15.0): dependencies: acorn: 8.15.0 @@ -9458,6 +10894,17 @@ snapshots: es-abstract: 1.24.0 es-shim-unscopables: 1.1.0 + array.prototype.reduce@1.0.8: + dependencies: + call-bind: 1.0.8 + call-bound: 1.0.4 + define-properties: 1.2.1 + es-abstract: 1.24.0 + es-array-method-boxes-properly: 1.0.0 + es-errors: 1.3.0 + es-object-atoms: 1.1.1 + is-string: 1.1.1 + array.prototype.tosorted@1.1.4: dependencies: call-bind: 1.0.8 @@ -9504,6 +10951,8 @@ snapshots: cosmiconfig: 7.1.0 resolve: 1.22.10 + bail@2.0.2: {} + balanced-match@1.0.2: {} base64-arraybuffer@1.0.2: {} @@ -9523,6 +10972,20 @@ snapshots: readable-stream: 2.3.8 safe-buffer: 5.2.1 + body-parser@2.2.2: + dependencies: + bytes: 3.1.2 + content-type: 1.0.5 + debug: 4.4.3 + http-errors: 2.0.1 + iconv-lite: 0.7.2 + on-finished: 2.4.1 + qs: 6.15.0 + raw-body: 3.0.2 + type-is: 2.0.1 + transitivePeerDependencies: + - supports-color + bootstrap@5.3.8(@popperjs/core@2.11.8): dependencies: "@popperjs/core": 2.11.8 @@ -9551,6 +11014,8 @@ snapshots: buffer-from@1.1.2: {} + bytes@3.1.2: {} + cac@6.7.14: {} call-bind-apply-helpers@1.0.2: @@ -9582,6 +11047,8 @@ snapshots: dependencies: element-size: 1.1.1 + ccount@2.0.1: {} + chai@5.3.3: dependencies: assertion-error: 2.0.1 @@ -9602,6 +11069,14 @@ snapshots: chalk@5.6.2: {} + character-entities-html4@2.1.0: {} + + character-entities-legacy@3.0.0: {} + + character-entities@2.0.2: {} + + character-reference-invalid@2.0.1: {} + check-error@2.1.1: {} chokidar@4.0.3: @@ -9668,14 +11143,34 @@ snapshots: colorette@2.0.20: {} 
+ colorjs.io@0.5.2: {} + combined-stream@1.0.8: dependencies: delayed-stream: 1.0.0 + comma-separated-tokens@2.0.3: {} + commander@13.1.0: {} commander@2.20.3: {} + compressible@2.0.18: + dependencies: + mime-db: 1.52.0 + + compression@1.8.1: + dependencies: + bytes: 3.1.2 + compressible: 2.0.18 + debug: 2.6.9 + negotiator: 0.6.4 + on-headers: 1.1.0 + safe-buffer: 5.2.1 + vary: 1.1.2 + transitivePeerDependencies: + - supports-color + concat-map@0.0.1: {} concat-stream@1.6.2: @@ -9685,15 +11180,28 @@ snapshots: readable-stream: 2.3.8 typedarray: 0.0.6 + content-disposition@1.0.1: {} + + content-type@1.0.5: {} + convert-source-map@1.9.0: {} convert-source-map@2.0.0: {} + cookie-signature@1.2.2: {} + + cookie@0.7.2: {} + cookie@1.0.2: {} core-util-is@1.0.3: {} - corepack@0.34.0: {} + corepack@0.34.6: {} + + cors@2.8.6: + dependencies: + object-assign: 4.1.1 + vary: 1.1.2 cosmiconfig@7.1.0: dependencies: @@ -9879,8 +11387,16 @@ snapshots: dependencies: ms: 2.1.3 + debug@4.4.3: + dependencies: + ms: 2.1.3 + decimal.js@10.6.0: {} + decode-named-character-reference@1.3.0: + dependencies: + character-entities: 2.0.2 + deep-eql@5.0.2: {} deep-is@0.1.4: {} @@ -9903,6 +11419,8 @@ snapshots: delayed-stream@1.0.0: {} + depd@2.0.0: {} + dequal@2.0.3: {} detect-kerning@2.1.2: {} @@ -9910,6 +11428,10 @@ snapshots: detect-libc@1.0.3: optional: true + devlop@1.1.0: + dependencies: + dequal: 2.0.3 + doctrine@2.1.0: dependencies: esutils: 2.0.3 @@ -9960,6 +11482,8 @@ snapshots: eastasianwidth@0.2.0: {} + ee-first@1.1.1: {} + electron-to-chromium@1.5.217: {} element-size@1.1.1: {} @@ -9974,6 +11498,8 @@ snapshots: emoji-regex@9.2.2: {} + encodeurl@2.0.0: {} + end-of-stream@1.4.5: dependencies: once: 1.4.0 @@ -10045,6 +11571,8 @@ snapshots: unbox-primitive: 1.1.0 which-typed-array: 1.1.19 + es-array-method-boxes-properly@1.0.0: {} + es-define-property@1.0.1: {} es-errors@1.3.0: {} @@ -10154,8 +11682,12 @@ snapshots: escalade@3.2.0: {} + escape-html@1.0.3: {} + 
escape-string-regexp@4.0.0: {} + escape-string-regexp@5.0.0: {} + escodegen@2.1.0: dependencies: esprima: 4.0.1 @@ -10341,6 +11873,8 @@ snapshots: estraverse@5.3.0: {} + estree-util-is-identifier-name@3.0.0: {} + estree-walker@2.0.2: {} estree-walker@3.0.3: @@ -10349,6 +11883,8 @@ snapshots: esutils@2.0.3: {} + etag@1.8.1: {} + event-emitter@0.3.5: dependencies: d: 1.0.2 @@ -10372,10 +11908,45 @@ snapshots: expect-type@1.2.2: {} + express@5.2.1: + dependencies: + accepts: 2.0.0 + body-parser: 2.2.2 + content-disposition: 1.0.1 + content-type: 1.0.5 + cookie: 0.7.2 + cookie-signature: 1.2.2 + debug: 4.4.1 + depd: 2.0.0 + encodeurl: 2.0.0 + escape-html: 1.0.3 + etag: 1.8.1 + finalhandler: 2.1.1 + fresh: 2.0.0 + http-errors: 2.0.1 + merge-descriptors: 2.0.0 + mime-types: 3.0.2 + on-finished: 2.4.1 + once: 1.4.0 + parseurl: 1.3.3 + proxy-addr: 2.0.7 + qs: 6.15.0 + range-parser: 1.2.1 + router: 2.2.0 + send: 1.2.1 + serve-static: 2.2.1 + statuses: 2.0.2 + type-is: 2.0.1 + vary: 1.1.2 + transitivePeerDependencies: + - supports-color + ext@1.7.0: dependencies: type: 2.7.3 + extend@3.0.2: {} + falafel@2.2.5: dependencies: acorn: 7.4.1 @@ -10419,6 +11990,17 @@ snapshots: dependencies: to-regex-range: 5.0.1 + finalhandler@2.1.1: + dependencies: + debug: 4.4.1 + encodeurl: 2.0.0 + escape-html: 1.0.3 + on-finished: 2.4.1 + parseurl: 1.3.3 + statuses: 2.0.2 + transitivePeerDependencies: + - supports-color + find-root@1.1.0: {} find-up@5.0.0: @@ -10464,6 +12046,10 @@ snapshots: hasown: 2.0.2 mime-types: 2.1.35 + forwarded@0.2.0: {} + + fp-ts@2.16.11: {} + framer-motion@12.23.12(@emotion/is-prop-valid@1.4.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1): dependencies: motion-dom: 12.23.12 @@ -10474,6 +12060,8 @@ snapshots: react: 18.3.1 react-dom: 18.3.1(react@18.3.1) + fresh@2.0.0: {} + from2@2.3.0: dependencies: inherits: 2.0.4 @@ -10695,6 +12283,8 @@ snapshots: grid-index@1.1.0: {} + h264-mp4-encoder@1.0.12: {} + has-bigints@1.1.0: {} has-flag@4.0.0: {} @@ -10725,6 +12315,30 
@@ snapshots: dependencies: function-bind: 1.1.2 + hast-util-to-jsx-runtime@2.3.6: + dependencies: + "@types/estree": 1.0.8 + "@types/hast": 3.0.4 + "@types/unist": 3.0.3 + comma-separated-tokens: 2.0.3 + devlop: 1.1.0 + estree-util-is-identifier-name: 3.0.0 + hast-util-whitespace: 3.0.0 + mdast-util-mdx-expression: 2.0.1 + mdast-util-mdx-jsx: 3.2.0 + mdast-util-mdxjs-esm: 2.0.1 + property-information: 7.1.0 + space-separated-tokens: 2.0.2 + style-to-js: 1.1.21 + unist-util-position: 5.0.0 + vfile-message: 4.0.3 + transitivePeerDependencies: + - supports-color + + hast-util-whitespace@3.0.0: + dependencies: + "@types/hast": 3.0.4 + hoist-non-react-statics@3.3.2: dependencies: react-is: 16.13.1 @@ -10733,6 +12347,16 @@ snapshots: dependencies: whatwg-encoding: 3.1.1 + html-url-attributes@3.0.1: {} + + http-errors@2.0.1: + dependencies: + depd: 2.0.0 + inherits: 2.0.4 + setprototypeof: 1.2.0 + statuses: 2.0.2 + toidentifier: 1.0.1 + http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.4 @@ -10759,14 +12383,17 @@ snapshots: dependencies: safer-buffer: 2.1.2 + iconv-lite@0.7.2: + dependencies: + safer-buffer: 2.1.2 + ieee754@1.2.1: {} ignore@5.3.2: {} ignore@7.0.5: {} - immutable@5.1.3: - optional: true + immutable@5.1.5: {} import-fresh@3.3.1: dependencies: @@ -10781,12 +12408,27 @@ snapshots: ini@4.1.3: {} + inline-style-parser@0.2.7: {} + internal-slot@1.1.0: dependencies: es-errors: 1.3.0 hasown: 2.0.2 side-channel: 1.1.0 + io-ts@2.2.22(fp-ts@2.16.11): + dependencies: + fp-ts: 2.16.11 + + ipaddr.js@1.9.1: {} + + is-alphabetical@2.0.1: {} + + is-alphanumerical@2.0.1: + dependencies: + is-alphabetical: 2.0.1 + is-decimal: 2.0.1 + is-arguments@1.2.0: dependencies: call-bound: 1.0.4 @@ -10840,6 +12482,8 @@ snapshots: call-bound: 1.0.4 has-tostringtag: 1.0.2 + is-decimal@2.0.1: {} + is-docker@2.2.1: {} is-extglob@2.1.1: {} @@ -10871,6 +12515,8 @@ snapshots: dependencies: is-extglob: 2.1.1 + is-hexadecimal@2.0.1: {} + is-iexplorer@1.0.0: {} is-map@2.0.3: {} @@ -10890,8 
+12536,12 @@ snapshots: is-plain-obj@1.1.0: {} + is-plain-obj@4.1.0: {} + is-potential-custom-element-name@1.0.1: {} + is-promise@4.0.0: {} + is-regex@1.2.1: dependencies: call-bound: 1.0.4 @@ -11093,6 +12743,8 @@ snapshots: strip-ansi: 7.1.2 wrap-ansi: 9.0.2 + longest-streak@3.1.0: {} + loose-envify@1.4.0: dependencies: js-tokens: 4.0.0 @@ -11101,95 +12753,445 @@ snapshots: lower-case@2.0.2: dependencies: - tslib: 2.8.1 + tslib: 2.8.1 + + lru-cache@10.4.3: {} + + lru-cache@5.1.1: + dependencies: + yallist: 3.1.1 + + lz-string@1.5.0: {} + + magic-string@0.27.0: + dependencies: + "@jridgewell/sourcemap-codec": 1.5.5 + + magic-string@0.30.19: + dependencies: + "@jridgewell/sourcemap-codec": 1.5.5 + + map-limit@0.0.1: + dependencies: + once: 1.3.3 + + map-or-similar@1.5.0: {} + + mapbox-gl@1.13.3: + dependencies: + "@mapbox/geojson-rewind": 0.5.2 + "@mapbox/geojson-types": 1.0.2 + "@mapbox/jsonlint-lines-primitives": 2.0.2 + "@mapbox/mapbox-gl-supported": 1.5.0(mapbox-gl@1.13.3) + "@mapbox/point-geometry": 0.1.0 + "@mapbox/tiny-sdf": 1.2.5 + "@mapbox/unitbezier": 0.0.0 + "@mapbox/vector-tile": 1.3.1 + "@mapbox/whoots-js": 3.1.0 + csscolorparser: 1.0.3 + earcut: 2.2.4 + geojson-vt: 3.2.1 + gl-matrix: 3.4.4 + grid-index: 1.1.0 + murmurhash-js: 1.0.0 + pbf: 3.3.0 + potpack: 1.0.2 + quickselect: 2.0.0 + rw: 1.3.3 + supercluster: 7.1.5 + tinyqueue: 2.0.3 + vt-pbf: 3.1.3 + + maplibre-gl@4.7.1: + dependencies: + "@mapbox/geojson-rewind": 0.5.2 + "@mapbox/jsonlint-lines-primitives": 2.0.2 + "@mapbox/point-geometry": 0.1.0 + "@mapbox/tiny-sdf": 2.0.7 + "@mapbox/unitbezier": 0.0.1 + "@mapbox/vector-tile": 1.3.1 + "@mapbox/whoots-js": 3.1.0 + "@maplibre/maplibre-gl-style-spec": 20.4.0 + "@types/geojson": 7946.0.16 + "@types/geojson-vt": 3.2.5 + "@types/mapbox__point-geometry": 0.1.4 + "@types/mapbox__vector-tile": 1.3.4 + "@types/pbf": 3.0.5 + "@types/supercluster": 7.1.3 + earcut: 3.0.2 + geojson-vt: 4.0.2 + gl-matrix: 3.4.4 + global-prefix: 4.0.0 + kdbush: 4.0.2 + 
murmurhash-js: 1.0.0 + pbf: 3.3.0 + potpack: 2.1.0 + quickselect: 3.0.0 + supercluster: 8.0.1 + tinyqueue: 3.0.0 + vt-pbf: 3.1.3 + + markdown-table@3.0.4: {} + + math-intrinsics@1.1.0: {} + + math-log2@1.0.1: {} + + mdast-util-find-and-replace@3.0.2: + dependencies: + "@types/mdast": 4.0.4 + escape-string-regexp: 5.0.0 + unist-util-is: 6.0.1 + unist-util-visit-parents: 6.0.2 + + mdast-util-from-markdown@2.0.3: + dependencies: + "@types/mdast": 4.0.4 + "@types/unist": 3.0.3 + decode-named-character-reference: 1.3.0 + devlop: 1.1.0 + mdast-util-to-string: 4.0.0 + micromark: 4.0.2 + micromark-util-decode-numeric-character-reference: 2.0.2 + micromark-util-decode-string: 2.0.1 + micromark-util-normalize-identifier: 2.0.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + unist-util-stringify-position: 4.0.0 + transitivePeerDependencies: + - supports-color + + mdast-util-gfm-autolink-literal@2.0.1: + dependencies: + "@types/mdast": 4.0.4 + ccount: 2.0.1 + devlop: 1.1.0 + mdast-util-find-and-replace: 3.0.2 + micromark-util-character: 2.1.1 + + mdast-util-gfm-footnote@2.1.0: + dependencies: + "@types/mdast": 4.0.4 + devlop: 1.1.0 + mdast-util-from-markdown: 2.0.3 + mdast-util-to-markdown: 2.1.2 + micromark-util-normalize-identifier: 2.0.1 + transitivePeerDependencies: + - supports-color + + mdast-util-gfm-strikethrough@2.0.0: + dependencies: + "@types/mdast": 4.0.4 + mdast-util-from-markdown: 2.0.3 + mdast-util-to-markdown: 2.1.2 + transitivePeerDependencies: + - supports-color + + mdast-util-gfm-table@2.0.0: + dependencies: + "@types/mdast": 4.0.4 + devlop: 1.1.0 + markdown-table: 3.0.4 + mdast-util-from-markdown: 2.0.3 + mdast-util-to-markdown: 2.1.2 + transitivePeerDependencies: + - supports-color + + mdast-util-gfm-task-list-item@2.0.0: + dependencies: + "@types/mdast": 4.0.4 + devlop: 1.1.0 + mdast-util-from-markdown: 2.0.3 + mdast-util-to-markdown: 2.1.2 + transitivePeerDependencies: + - supports-color + + mdast-util-gfm@3.1.0: + dependencies: + 
mdast-util-from-markdown: 2.0.3 + mdast-util-gfm-autolink-literal: 2.0.1 + mdast-util-gfm-footnote: 2.1.0 + mdast-util-gfm-strikethrough: 2.0.0 + mdast-util-gfm-table: 2.0.0 + mdast-util-gfm-task-list-item: 2.0.0 + mdast-util-to-markdown: 2.1.2 + transitivePeerDependencies: + - supports-color + + mdast-util-mdx-expression@2.0.1: + dependencies: + "@types/estree-jsx": 1.0.5 + "@types/hast": 3.0.4 + "@types/mdast": 4.0.4 + devlop: 1.1.0 + mdast-util-from-markdown: 2.0.3 + mdast-util-to-markdown: 2.1.2 + transitivePeerDependencies: + - supports-color + + mdast-util-mdx-jsx@3.2.0: + dependencies: + "@types/estree-jsx": 1.0.5 + "@types/hast": 3.0.4 + "@types/mdast": 4.0.4 + "@types/unist": 3.0.3 + ccount: 2.0.1 + devlop: 1.1.0 + mdast-util-from-markdown: 2.0.3 + mdast-util-to-markdown: 2.1.2 + parse-entities: 4.0.2 + stringify-entities: 4.0.4 + unist-util-stringify-position: 4.0.0 + vfile-message: 4.0.3 + transitivePeerDependencies: + - supports-color + + mdast-util-mdxjs-esm@2.0.1: + dependencies: + "@types/estree-jsx": 1.0.5 + "@types/hast": 3.0.4 + "@types/mdast": 4.0.4 + devlop: 1.1.0 + mdast-util-from-markdown: 2.0.3 + mdast-util-to-markdown: 2.1.2 + transitivePeerDependencies: + - supports-color + + mdast-util-phrasing@4.1.0: + dependencies: + "@types/mdast": 4.0.4 + unist-util-is: 6.0.1 + + mdast-util-to-hast@13.2.1: + dependencies: + "@types/hast": 3.0.4 + "@types/mdast": 4.0.4 + "@ungap/structured-clone": 1.3.0 + devlop: 1.1.0 + micromark-util-sanitize-uri: 2.0.1 + trim-lines: 3.0.1 + unist-util-position: 5.0.0 + unist-util-visit: 5.1.0 + vfile: 6.0.3 + + mdast-util-to-markdown@2.1.2: + dependencies: + "@types/mdast": 4.0.4 + "@types/unist": 3.0.3 + longest-streak: 3.1.0 + mdast-util-phrasing: 4.1.0 + mdast-util-to-string: 4.0.0 + micromark-util-classify-character: 2.0.1 + micromark-util-decode-string: 2.0.1 + unist-util-visit: 5.1.0 + zwitch: 2.0.4 + + mdast-util-to-string@4.0.0: + dependencies: + "@types/mdast": 4.0.4 + + media-typer@1.1.0: {} + + 
memoizerific@1.11.3: + dependencies: + map-or-similar: 1.5.0 + + merge-descriptors@2.0.0: {} + + merge-stream@2.0.0: {} + + merge2@1.4.1: {} + + micromark-core-commonmark@2.0.3: + dependencies: + decode-named-character-reference: 1.3.0 + devlop: 1.1.0 + micromark-factory-destination: 2.0.1 + micromark-factory-label: 2.0.1 + micromark-factory-space: 2.0.1 + micromark-factory-title: 2.0.1 + micromark-factory-whitespace: 2.0.1 + micromark-util-character: 2.1.1 + micromark-util-chunked: 2.0.1 + micromark-util-classify-character: 2.0.1 + micromark-util-html-tag-name: 2.0.1 + micromark-util-normalize-identifier: 2.0.1 + micromark-util-resolve-all: 2.0.1 + micromark-util-subtokenize: 2.1.0 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + + micromark-extension-gfm-autolink-literal@2.1.0: + dependencies: + micromark-util-character: 2.1.1 + micromark-util-sanitize-uri: 2.0.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + + micromark-extension-gfm-footnote@2.1.0: + dependencies: + devlop: 1.1.0 + micromark-core-commonmark: 2.0.3 + micromark-factory-space: 2.0.1 + micromark-util-character: 2.1.1 + micromark-util-normalize-identifier: 2.0.1 + micromark-util-sanitize-uri: 2.0.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + + micromark-extension-gfm-strikethrough@2.1.0: + dependencies: + devlop: 1.1.0 + micromark-util-chunked: 2.0.1 + micromark-util-classify-character: 2.0.1 + micromark-util-resolve-all: 2.0.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + + micromark-extension-gfm-table@2.1.1: + dependencies: + devlop: 1.1.0 + micromark-factory-space: 2.0.1 + micromark-util-character: 2.1.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + + micromark-extension-gfm-tagfilter@2.0.0: + dependencies: + micromark-util-types: 2.0.2 + + micromark-extension-gfm-task-list-item@2.1.0: + dependencies: + devlop: 1.1.0 + micromark-factory-space: 2.0.1 + micromark-util-character: 2.1.1 + micromark-util-symbol: 2.0.1 
+ micromark-util-types: 2.0.2 + + micromark-extension-gfm@3.0.0: + dependencies: + micromark-extension-gfm-autolink-literal: 2.1.0 + micromark-extension-gfm-footnote: 2.1.0 + micromark-extension-gfm-strikethrough: 2.1.0 + micromark-extension-gfm-table: 2.1.1 + micromark-extension-gfm-tagfilter: 2.0.0 + micromark-extension-gfm-task-list-item: 2.1.0 + micromark-util-combine-extensions: 2.0.1 + micromark-util-types: 2.0.2 + + micromark-factory-destination@2.0.1: + dependencies: + micromark-util-character: 2.1.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + + micromark-factory-label@2.0.1: + dependencies: + devlop: 1.1.0 + micromark-util-character: 2.1.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + + micromark-factory-space@2.0.1: + dependencies: + micromark-util-character: 2.1.1 + micromark-util-types: 2.0.2 - lru-cache@10.4.3: {} + micromark-factory-title@2.0.1: + dependencies: + micromark-factory-space: 2.0.1 + micromark-util-character: 2.1.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 - lru-cache@5.1.1: + micromark-factory-whitespace@2.0.1: dependencies: - yallist: 3.1.1 + micromark-factory-space: 2.0.1 + micromark-util-character: 2.1.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 - lz-string@1.5.0: {} + micromark-util-character@2.1.1: + dependencies: + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 - magic-string@0.27.0: + micromark-util-chunked@2.0.1: dependencies: - "@jridgewell/sourcemap-codec": 1.5.5 + micromark-util-symbol: 2.0.1 - magic-string@0.30.19: + micromark-util-classify-character@2.0.1: dependencies: - "@jridgewell/sourcemap-codec": 1.5.5 + micromark-util-character: 2.1.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 - map-limit@0.0.1: + micromark-util-combine-extensions@2.0.1: dependencies: - once: 1.3.3 + micromark-util-chunked: 2.0.1 + micromark-util-types: 2.0.2 - map-or-similar@1.5.0: {} + micromark-util-decode-numeric-character-reference@2.0.2: + 
dependencies: + micromark-util-symbol: 2.0.1 - mapbox-gl@1.13.3: + micromark-util-decode-string@2.0.1: dependencies: - "@mapbox/geojson-rewind": 0.5.2 - "@mapbox/geojson-types": 1.0.2 - "@mapbox/jsonlint-lines-primitives": 2.0.2 - "@mapbox/mapbox-gl-supported": 1.5.0(mapbox-gl@1.13.3) - "@mapbox/point-geometry": 0.1.0 - "@mapbox/tiny-sdf": 1.2.5 - "@mapbox/unitbezier": 0.0.0 - "@mapbox/vector-tile": 1.3.1 - "@mapbox/whoots-js": 3.1.0 - csscolorparser: 1.0.3 - earcut: 2.2.4 - geojson-vt: 3.2.1 - gl-matrix: 3.4.4 - grid-index: 1.1.0 - murmurhash-js: 1.0.0 - pbf: 3.3.0 - potpack: 1.0.2 - quickselect: 2.0.0 - rw: 1.3.3 - supercluster: 7.1.5 - tinyqueue: 2.0.3 - vt-pbf: 3.1.3 + decode-named-character-reference: 1.3.0 + micromark-util-character: 2.1.1 + micromark-util-decode-numeric-character-reference: 2.0.2 + micromark-util-symbol: 2.0.1 - maplibre-gl@4.7.1: + micromark-util-encode@2.0.1: {} + + micromark-util-html-tag-name@2.0.1: {} + + micromark-util-normalize-identifier@2.0.1: dependencies: - "@mapbox/geojson-rewind": 0.5.2 - "@mapbox/jsonlint-lines-primitives": 2.0.2 - "@mapbox/point-geometry": 0.1.0 - "@mapbox/tiny-sdf": 2.0.7 - "@mapbox/unitbezier": 0.0.1 - "@mapbox/vector-tile": 1.3.1 - "@mapbox/whoots-js": 3.1.0 - "@maplibre/maplibre-gl-style-spec": 20.4.0 - "@types/geojson": 7946.0.16 - "@types/geojson-vt": 3.2.5 - "@types/mapbox__point-geometry": 0.1.4 - "@types/mapbox__vector-tile": 1.3.4 - "@types/pbf": 3.0.5 - "@types/supercluster": 7.1.3 - earcut: 3.0.2 - geojson-vt: 4.0.2 - gl-matrix: 3.4.4 - global-prefix: 4.0.0 - kdbush: 4.0.2 - murmurhash-js: 1.0.0 - pbf: 3.3.0 - potpack: 2.1.0 - quickselect: 3.0.0 - supercluster: 8.0.1 - tinyqueue: 3.0.0 - vt-pbf: 3.1.3 + micromark-util-symbol: 2.0.1 - math-intrinsics@1.1.0: {} + micromark-util-resolve-all@2.0.1: + dependencies: + micromark-util-types: 2.0.2 - math-log2@1.0.1: {} + micromark-util-sanitize-uri@2.0.1: + dependencies: + micromark-util-character: 2.1.1 + micromark-util-encode: 2.0.1 + 
micromark-util-symbol: 2.0.1 - memoizerific@1.11.3: + micromark-util-subtokenize@2.1.0: dependencies: - map-or-similar: 1.5.0 + devlop: 1.1.0 + micromark-util-chunked: 2.0.1 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 - merge-stream@2.0.0: {} + micromark-util-symbol@2.0.1: {} - merge2@1.4.1: {} + micromark-util-types@2.0.2: {} + + micromark@4.0.2: + dependencies: + "@types/debug": 4.1.12 + debug: 4.4.1 + decode-named-character-reference: 1.3.0 + devlop: 1.1.0 + micromark-core-commonmark: 2.0.3 + micromark-factory-space: 2.0.1 + micromark-util-character: 2.1.1 + micromark-util-chunked: 2.0.1 + micromark-util-combine-extensions: 2.0.1 + micromark-util-decode-numeric-character-reference: 2.0.2 + micromark-util-encode: 2.0.1 + micromark-util-normalize-identifier: 2.0.1 + micromark-util-resolve-all: 2.0.1 + micromark-util-sanitize-uri: 2.0.1 + micromark-util-subtokenize: 2.1.0 + micromark-util-symbol: 2.0.1 + micromark-util-types: 2.0.2 + transitivePeerDependencies: + - supports-color micromatch@4.0.8: dependencies: @@ -11198,10 +13200,16 @@ snapshots: mime-db@1.52.0: {} + mime-db@1.54.0: {} + mime-types@2.1.35: dependencies: mime-db: 1.52.0 + mime-types@3.0.2: + dependencies: + mime-db: 1.54.0 + mimic-fn@4.0.0: {} mimic-function@5.0.1: {} @@ -11230,6 +13238,38 @@ snapshots: mobx@6.13.7: {} + molstar@5.7.0(@types/react@18.3.24)(fp-ts@2.16.11)(react-dom@18.3.1(react@18.3.1))(react@18.3.1): + dependencies: + "@types/argparse": 2.0.17 + "@types/benchmark": 2.1.5 + "@types/compression": 1.8.1 + "@types/express": 5.0.6 + "@types/node": 22.19.15 + "@types/node-fetch": 2.6.13 + "@types/swagger-ui-dist": 3.30.6 + argparse: 2.0.1 + compression: 1.8.1 + cors: 2.8.6 + express: 5.2.1 + h264-mp4-encoder: 1.0.12 + immutable: 5.1.5 + io-ts: 2.2.22(fp-ts@2.16.11) + mutative: 1.3.0 + node-fetch: 2.7.0 + react: 18.3.1 + react-dom: 18.3.1(react@18.3.1) + react-markdown: 10.1.0(@types/react@18.3.24)(react@18.3.1) + remark-gfm: 4.0.1 + rxjs: 7.8.2 + swagger-ui-dist: 5.32.0 
+ tslib: 2.8.1 + util.promisify: 1.1.3 + transitivePeerDependencies: + - "@types/react" + - encoding + - fp-ts + - supports-color + moment@2.30.1: {} motion-dom@12.23.12: @@ -11258,6 +13298,8 @@ snapshots: murmurhash-js@1.0.0: {} + mutative@1.3.0: {} + nanoid@3.3.11: {} napi-postinstall@0.3.3: {} @@ -11274,6 +13316,10 @@ snapshots: transitivePeerDependencies: - supports-color + negotiator@0.6.4: {} + + negotiator@1.0.0: {} + next-tick@1.1.0: {} no-case@3.0.4: @@ -11284,6 +13330,10 @@ snapshots: node-addon-api@7.1.1: optional: true + node-fetch@2.7.0: + dependencies: + whatwg-url: 5.0.0 + node-releases@2.0.20: {} normalize-svg-path@0.1.0: {} @@ -11331,6 +13381,16 @@ snapshots: es-abstract: 1.24.0 es-object-atoms: 1.1.1 + object.getownpropertydescriptors@2.1.9: + dependencies: + array.prototype.reduce: 1.0.8 + call-bind: 1.0.8 + define-properties: 1.2.1 + es-abstract: 1.24.0 + es-object-atoms: 1.1.1 + gopd: 1.2.0 + safe-array-concat: 1.1.3 + object.groupby@1.0.3: dependencies: call-bind: 1.0.8 @@ -11344,6 +13404,12 @@ snapshots: define-properties: 1.2.1 es-object-atoms: 1.1.1 + on-finished@2.4.1: + dependencies: + ee-first: 1.1.1 + + on-headers@1.1.0: {} + once@1.3.3: dependencies: wrappy: 1.0.2 @@ -11397,6 +13463,16 @@ snapshots: parenthesis@3.1.8: {} + parse-entities@4.0.2: + dependencies: + "@types/unist": 2.0.11 + character-entities-legacy: 3.0.0 + character-reference-invalid: 2.0.1 + decode-named-character-reference: 1.3.0 + is-alphanumerical: 2.0.1 + is-decimal: 2.0.1 + is-hexadecimal: 2.0.1 + parse-json@5.2.0: dependencies: "@babel/code-frame": 7.27.1 @@ -11416,6 +13492,8 @@ snapshots: dependencies: entities: 6.0.1 + parseurl@1.3.3: {} + path-exists@4.0.0: {} path-key@3.1.1: {} @@ -11429,6 +13507,8 @@ snapshots: lru-cache: 10.4.3 minipass: 7.1.2 + path-to-regexp@8.3.0: {} + path-type@4.0.0: {} pathe@2.0.3: {} @@ -11566,12 +13646,23 @@ snapshots: object-assign: 4.1.1 react-is: 16.13.1 + property-information@7.1.0: {} + protocol-buffers-schema@3.6.0: {} + 
proxy-addr@2.0.7: + dependencies: + forwarded: 0.2.0 + ipaddr.js: 1.9.1 + proxy-from-env@1.1.0: {} punycode@2.3.1: {} + qs@6.15.0: + dependencies: + side-channel: 1.1.0 + queue-microtask@1.2.3: {} quickselect@2.0.0: {} @@ -11582,6 +13673,15 @@ snapshots: dependencies: performance-now: 2.1.0 + range-parser@1.2.1: {} + + raw-body@3.0.2: + dependencies: + bytes: 3.1.2 + http-errors: 2.0.1 + iconv-lite: 0.7.2 + unpipe: 1.0.0 + react-confetti@6.4.0(react@18.3.1): dependencies: react: 18.3.1 @@ -11623,6 +13723,24 @@ snapshots: react-is@19.1.1: {} + react-markdown@10.1.0(@types/react@18.3.24)(react@18.3.1): + dependencies: + "@types/hast": 3.0.4 + "@types/mdast": 4.0.4 + "@types/react": 18.3.24 + devlop: 1.1.0 + hast-util-to-jsx-runtime: 2.3.6 + html-url-attributes: 3.0.1 + mdast-util-to-hast: 13.2.1 + react: 18.3.1 + remark-parse: 11.0.0 + remark-rehype: 11.1.2 + unified: 11.0.5 + unist-util-visit: 5.1.0 + vfile: 6.0.3 + transitivePeerDependencies: + - supports-color + react-plotly.js@2.6.0(plotly.js@3.1.0(mapbox-gl@1.13.3))(react@18.3.1): dependencies: plotly.js: 3.1.0(mapbox-gl@1.13.3) @@ -11766,6 +13884,40 @@ snapshots: regl@2.1.1: {} + remark-gfm@4.0.1: + dependencies: + "@types/mdast": 4.0.4 + mdast-util-gfm: 3.1.0 + micromark-extension-gfm: 3.0.0 + remark-parse: 11.0.0 + remark-stringify: 11.0.0 + unified: 11.0.5 + transitivePeerDependencies: + - supports-color + + remark-parse@11.0.0: + dependencies: + "@types/mdast": 4.0.4 + mdast-util-from-markdown: 2.0.3 + micromark-util-types: 2.0.2 + unified: 11.0.5 + transitivePeerDependencies: + - supports-color + + remark-rehype@11.1.2: + dependencies: + "@types/hast": 3.0.4 + "@types/mdast": 4.0.4 + mdast-util-to-hast: 13.2.1 + unified: 11.0.5 + vfile: 6.0.3 + + remark-stringify@11.0.0: + dependencies: + "@types/mdast": 4.0.4 + mdast-util-to-markdown: 2.1.2 + unified: 11.0.5 + reselect@5.1.1: {} resolve-from@4.0.0: {} @@ -11828,6 +13980,16 @@ snapshots: "@rollup/rollup-win32-x64-msvc": 4.50.1 fsevents: 2.3.3 + 
router@2.2.0: + dependencies: + debug: 4.4.1 + depd: 2.0.0 + is-promise: 4.0.0 + parseurl: 1.3.3 + path-to-regexp: 8.3.0 + transitivePeerDependencies: + - supports-color + rrweb-cssom@0.8.0: {} run-parallel@1.2.0: @@ -11836,6 +13998,10 @@ snapshots: rw@1.3.3: {} + rxjs@7.8.2: + dependencies: + tslib: 2.8.1 + safe-array-concat@1.1.3: dependencies: call-bind: 1.0.8 @@ -11861,10 +14027,97 @@ snapshots: safer-buffer@2.1.2: {} - sass@1.89.2: + sass-embedded-all-unknown@1.98.0: + dependencies: + sass: 1.98.0 + optional: true + + sass-embedded-android-arm64@1.98.0: + optional: true + + sass-embedded-android-arm@1.98.0: + optional: true + + sass-embedded-android-riscv64@1.98.0: + optional: true + + sass-embedded-android-x64@1.98.0: + optional: true + + sass-embedded-darwin-arm64@1.98.0: + optional: true + + sass-embedded-darwin-x64@1.98.0: + optional: true + + sass-embedded-linux-arm64@1.98.0: + optional: true + + sass-embedded-linux-arm@1.98.0: + optional: true + + sass-embedded-linux-musl-arm64@1.98.0: + optional: true + + sass-embedded-linux-musl-arm@1.98.0: + optional: true + + sass-embedded-linux-musl-riscv64@1.98.0: + optional: true + + sass-embedded-linux-musl-x64@1.98.0: + optional: true + + sass-embedded-linux-riscv64@1.98.0: + optional: true + + sass-embedded-linux-x64@1.98.0: + optional: true + + sass-embedded-unknown-all@1.98.0: + dependencies: + sass: 1.98.0 + optional: true + + sass-embedded-win32-arm64@1.98.0: + optional: true + + sass-embedded-win32-x64@1.98.0: + optional: true + + sass-embedded@1.98.0: + dependencies: + "@bufbuild/protobuf": 2.11.0 + colorjs.io: 0.5.2 + immutable: 5.1.5 + rxjs: 7.8.2 + supports-color: 8.1.1 + sync-child-process: 1.0.2 + varint: 6.0.0 + optionalDependencies: + sass-embedded-all-unknown: 1.98.0 + sass-embedded-android-arm: 1.98.0 + sass-embedded-android-arm64: 1.98.0 + sass-embedded-android-riscv64: 1.98.0 + sass-embedded-android-x64: 1.98.0 + sass-embedded-darwin-arm64: 1.98.0 + sass-embedded-darwin-x64: 1.98.0 + 
sass-embedded-linux-arm: 1.98.0 + sass-embedded-linux-arm64: 1.98.0 + sass-embedded-linux-musl-arm: 1.98.0 + sass-embedded-linux-musl-arm64: 1.98.0 + sass-embedded-linux-musl-riscv64: 1.98.0 + sass-embedded-linux-musl-x64: 1.98.0 + sass-embedded-linux-riscv64: 1.98.0 + sass-embedded-linux-x64: 1.98.0 + sass-embedded-unknown-all: 1.98.0 + sass-embedded-win32-arm64: 1.98.0 + sass-embedded-win32-x64: 1.98.0 + + sass@1.98.0: dependencies: chokidar: 4.0.3 - immutable: 5.1.3 + immutable: 5.1.5 source-map-js: 1.2.1 optionalDependencies: "@parcel/watcher": 2.5.1 @@ -11884,6 +14137,31 @@ snapshots: semver@7.7.2: {} + send@1.2.1: + dependencies: + debug: 4.4.3 + encodeurl: 2.0.0 + escape-html: 1.0.3 + etag: 1.8.1 + fresh: 2.0.0 + http-errors: 2.0.1 + mime-types: 3.0.2 + ms: 2.1.3 + on-finished: 2.4.1 + range-parser: 1.2.1 + statuses: 2.0.2 + transitivePeerDependencies: + - supports-color + + serve-static@2.2.1: + dependencies: + encodeurl: 2.0.0 + escape-html: 1.0.3 + parseurl: 1.3.3 + send: 1.2.1 + transitivePeerDependencies: + - supports-color + set-cookie-parser@2.7.1: {} set-function-length@1.2.2: @@ -11908,6 +14186,8 @@ snapshots: es-errors: 1.3.0 es-object-atoms: 1.1.1 + setprototypeof@1.2.0: {} + shallow-copy@0.0.1: {} shallowequal@1.1.0: {} @@ -11979,6 +14259,8 @@ snapshots: source-map@0.6.1: {} + space-separated-tokens@2.0.2: {} + stable-hash@0.0.5: {} stack-trace@0.0.9: {} @@ -11989,6 +14271,8 @@ snapshots: dependencies: escodegen: 2.1.0 + statuses@2.0.2: {} + std-env@3.9.0: {} stop-iteration-iterator@1.1.0: @@ -12088,6 +14372,11 @@ snapshots: dependencies: safe-buffer: 5.1.2 + stringify-entities@4.0.4: + dependencies: + character-entities-html4: 2.1.0 + character-entities-legacy: 3.0.0 + strip-ansi@6.0.1: dependencies: ansi-regex: 5.0.1 @@ -12116,6 +14405,14 @@ snapshots: strongly-connected-components@1.0.1: {} + style-to-js@1.1.21: + dependencies: + style-to-object: 1.0.14 + + style-to-object@1.0.14: + dependencies: + inline-style-parser: 0.2.7 + 
styled-components@6.1.19(react-dom@18.3.1(react@18.3.1))(react@18.3.1): dependencies: "@emotion/is-prop-valid": 1.2.2 @@ -12148,6 +14445,10 @@ snapshots: dependencies: has-flag: 4.0.0 + supports-color@8.1.1: + dependencies: + has-flag: 4.0.0 + supports-preserve-symlinks-flag@1.0.0: {} svg-arc-to-cubic-bezier@3.2.0: {} @@ -12169,8 +14470,18 @@ snapshots: parse-svg-path: 0.1.2 svg-path-bounds: 1.0.2 + swagger-ui-dist@5.32.0: + dependencies: + "@scarf/scarf": 1.4.0 + symbol-tree@3.2.4: {} + sync-child-process@1.0.2: + dependencies: + sync-message-port: 1.2.0 + + sync-message-port@1.2.0: {} + terser@5.42.0: dependencies: "@jridgewell/source-map": 0.3.11 @@ -12232,6 +14543,8 @@ snapshots: dependencies: is-number: 7.0.0 + toidentifier@1.0.1: {} + topojson-client@3.1.0: dependencies: commander: 2.20.3 @@ -12240,10 +14553,16 @@ snapshots: dependencies: tldts: 6.1.86 + tr46@0.0.3: {} + tr46@5.1.1: dependencies: punycode: 2.3.1 + trim-lines@3.0.1: {} + + trough@2.2.0: {} + ts-api-utils@2.1.0(typescript@5.6.3): dependencies: typescript: 5.6.3 @@ -12279,6 +14598,12 @@ snapshots: type-fest@2.19.0: {} + type-is@2.0.1: + dependencies: + content-type: 1.0.5 + media-typer: 1.1.0 + mime-types: 3.0.2 + type@2.7.3: {} typed-array-buffer@1.0.3: @@ -12341,10 +14666,47 @@ snapshots: has-symbols: 1.1.0 which-boxed-primitive: 1.1.1 + undici-types@6.21.0: {} + undici-types@7.10.0: {} + unified@11.0.5: + dependencies: + "@types/unist": 3.0.3 + bail: 2.0.2 + devlop: 1.1.0 + extend: 3.0.2 + is-plain-obj: 4.1.0 + trough: 2.2.0 + vfile: 6.0.3 + + unist-util-is@6.0.1: + dependencies: + "@types/unist": 3.0.3 + + unist-util-position@5.0.0: + dependencies: + "@types/unist": 3.0.3 + + unist-util-stringify-position@4.0.0: + dependencies: + "@types/unist": 3.0.3 + + unist-util-visit-parents@6.0.2: + dependencies: + "@types/unist": 3.0.3 + unist-util-is: 6.0.1 + + unist-util-visit@5.1.0: + dependencies: + "@types/unist": 3.0.3 + unist-util-is: 6.0.1 + unist-util-visit-parents: 6.0.2 + 
universalify@2.0.1: {} + unpipe@1.0.0: {} + unplugin@1.16.1: dependencies: acorn: 8.15.0 @@ -12394,6 +14756,21 @@ snapshots: util-deprecate@1.0.2: {} + util.promisify@1.1.3: + dependencies: + call-bind: 1.0.8 + call-bound: 1.0.4 + define-data-property: 1.1.4 + define-properties: 1.2.1 + es-errors: 1.3.0 + es-object-atoms: 1.1.1 + for-each: 0.3.5 + get-intrinsic: 1.3.0 + has-proto: 1.2.0 + has-symbols: 1.1.0 + object.getownpropertydescriptors: 2.1.9 + safe-array-concat: 1.1.3 + util@0.12.5: dependencies: inherits: 2.0.4 @@ -12406,13 +14783,27 @@ snapshots: uuid@9.0.1: {} - vite-node@3.2.4(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1): + varint@6.0.0: {} + + vary@1.1.2: {} + + vfile-message@4.0.3: + dependencies: + "@types/unist": 3.0.3 + unist-util-stringify-position: 4.0.0 + + vfile@6.0.3: + dependencies: + "@types/unist": 3.0.3 + vfile-message: 4.0.3 + + vite-node@3.2.4(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1): dependencies: cac: 6.7.14 debug: 4.4.1 es-module-lexer: 1.7.0 pathe: 2.0.3 - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) transitivePeerDependencies: - "@types/node" - jiti @@ -12427,29 +14818,29 @@ snapshots: - tsx - yaml - vite-plugin-svgr@4.5.0(rollup@4.50.1)(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)): + vite-plugin-svgr@4.5.0(rollup@4.50.1)(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)): dependencies: "@rollup/pluginutils": 5.3.0(rollup@4.50.1) "@svgr/core": 8.1.0(typescript@5.6.3) "@svgr/plugin-jsx": 8.1.0(@svgr/core@8.1.0(typescript@5.6.3)) - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) transitivePeerDependencies: - rollup - supports-color - 
typescript - vite-tsconfig-paths@5.1.4(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)): + vite-tsconfig-paths@5.1.4(typescript@5.6.3)(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)): dependencies: debug: 4.4.1 globrex: 0.1.2 tsconfck: 3.1.6(typescript@5.6.3) optionalDependencies: - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) transitivePeerDependencies: - supports-color - typescript - vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1): + vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1): dependencies: esbuild: 0.25.9 fdir: 6.5.0(picomatch@4.0.3) @@ -12460,15 +14851,16 @@ snapshots: optionalDependencies: "@types/node": 24.3.1 fsevents: 2.3.3 - sass: 1.89.2 + sass: 1.98.0 + sass-embedded: 1.98.0 terser: 5.42.0 yaml: 2.8.1 - vitest@3.2.4(@types/node@24.3.1)(jsdom@26.1.0)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1): + vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.3.1)(jsdom@26.1.0)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1): dependencies: "@types/chai": 5.2.2 "@vitest/expect": 3.2.4 - "@vitest/mocker": 3.2.4(vite@6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1)) + "@vitest/mocker": 3.2.4(vite@6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1)) "@vitest/pretty-format": 3.2.4 "@vitest/runner": 3.2.4 "@vitest/snapshot": 3.2.4 @@ -12486,10 +14878,11 @@ snapshots: tinyglobby: 0.2.15 tinypool: 1.1.1 tinyrainbow: 2.0.0 - vite: 6.3.6(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) - vite-node: 3.2.4(@types/node@24.3.1)(sass@1.89.2)(terser@5.42.0)(yaml@2.8.1) + vite: 6.3.6(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) + vite-node: 
3.2.4(@types/node@24.3.1)(sass-embedded@1.98.0)(sass@1.98.0)(terser@5.42.0)(yaml@2.8.1) why-is-node-running: 2.3.0 optionalDependencies: + "@types/debug": 4.1.12 "@types/node": 24.3.1 jsdom: 26.1.0 transitivePeerDependencies: @@ -12522,6 +14915,8 @@ snapshots: dependencies: get-canvas-context: 1.0.2 + webidl-conversions@3.0.1: {} + webidl-conversions@7.0.0: {} webpack-virtual-modules@0.6.2: {} @@ -12537,6 +14932,11 @@ snapshots: tr46: 5.1.1 webidl-conversions: 7.0.0 + whatwg-url@5.0.0: + dependencies: + tr46: 0.0.3 + webidl-conversions: 3.0.1 + which-boxed-primitive@1.1.1: dependencies: is-bigint: 1.1.0 @@ -12641,3 +15041,5 @@ snapshots: optionalDependencies: "@types/react": 18.3.24 react: 18.3.1 + + zwitch@2.0.4: {} diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index 8a45908af..cee1c0d28 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -4,6 +4,7 @@ import { DataTable, FlexColumn, FlexRow, + MolstarViewer, PlotComponent, SecondaryButton, SectionTitle, @@ -12,6 +13,7 @@ import { import { useToggleableState } from "@protzilla/hooks"; import { spacing } from "@protzilla/theme"; import { + ApiResponse, callApiWithParameters, Download, emptyRunData, @@ -20,6 +22,7 @@ import { StepID, StepOutputInfo, SwitchComponent, + Visualization, } from "@protzilla/utils"; import { Figure } from "plotly.js"; import React, { useCallback, useEffect, useState } from "react"; @@ -27,6 +30,7 @@ import { Col } from "react-grid-system"; import { useLocation, useNavigate } from "react-router-dom"; import { styled } from "styled-components"; +import { CrosslinkerInformation } from "../../core/shared/molstar-viewer/crosslinker-processing"; import { H3 } from "../../core/shared/text"; const StyledNavbar = styled(Navbar)` @@ -88,19 +92,21 @@ interface UseStepOutputsParams { runName: string; stepId?: string; transform: (output: TOutput, 
response: TResponse) => TResult; + enabled?: boolean; } -export function useCertainStepOutputs({ +function useCertainStepOutputs({ available_outputs, endpoint, runName, stepId, transform, + enabled = true, }: UseStepOutputsParams): TResult[] { const [data, setData] = useState([]); useEffect(() => { - if (!stepId || available_outputs.length === 0) { + if (!enabled || !stepId || available_outputs.length === 0) { setData([]); return; } @@ -126,7 +132,7 @@ export function useCertainStepOutputs { const runName = location.state?.runName; const [runData, setRunData] = useState(emptyRunData); + const [selectedOutputTab, setSelectedOutputTab] = useState(""); const [plots, setPlots] = useState(); const [selectedPlot, setSelectedPlot] = useState
({ data: [], layout: {} }); const [availableTables, setAvailableTables] = useState(); + const [hasLoadedVisualizations, setHasLoadedVisualizations] = useState(false); + useEffect(() => { + if (selectedOutputTab === "Visualisations") { + setHasLoadedVisualizations(true); + } + }, [selectedOutputTab]); + + const [availableVisualizations, setAvailableVisualizations] = useState([]); + const transformVisualization = useCallback( + (_output: StepOutputInfo, response: ApiResponse) => ({ + structureEntryId: response.data.structureEntryId, + cifString: response.data.cifString, + crosslinks: response.data.crosslinks, + }), + [], + ); + const visualizations = useCertainStepOutputs< + StepOutputInfo, + ApiResponse, + { structureEntryId: string; cifString: string; crosslinks?: CrosslinkerInformation[] } + >({ + available_outputs: availableVisualizations, + endpoint: "get_step_visualizations/", + runName: runName, + stepId: runData.current_step_id, + transform: transformVisualization, + enabled: hasLoadedVisualizations, + }); + const [availableDownloads, setAvailableDownloads] = useState([]); const transformDownload = useCallback( - (output: StepOutputInfo, response: Download) => ({ + (output: StepOutputInfo, response: ApiResponse) => ({ title: output.label, - data: response.data, + data: response.data.data, }), [], ); const downloads = useCertainStepOutputs< StepOutputInfo, - Download, + ApiResponse, { title: string; data: Record } >({ available_outputs: availableDownloads, @@ -165,16 +201,16 @@ export const RunScreen: React.FC = () => { // Static PNGs sent as base64 const [availableImages, setAvailableImages] = useState([]); const transformImage = useCallback( - (output: StepOutputInfo, response: Image) => ({ + (output: StepOutputInfo, response: ApiResponse) => ({ title: output.label, alt: output.label, - data: "data:image/png;base64," + response.data, + data: "data:image/png;base64," + response.data.data, }), [], ); const images = useCertainStepOutputs< StepOutputInfo, -
Image, + ApiResponse, { title: string; alt: string; data: string } >({ available_outputs: availableImages, @@ -225,6 +261,7 @@ export const RunScreen: React.FC = () => { setAvailableDownloads([]); setPlots(undefined); setAvailableImages([]); + setAvailableVisualizations([]); void getRunData(); void getStepPlots(); @@ -270,15 +307,18 @@ export const RunScreen: React.FC = () => { const tableOutputs = []; const imageOutputs = []; const downloadOutputs = []; + const visualizationOutputs = []; for (const output of response.outputs) { if (output.output_type === "dataframe" || output.output_type === "list") tableOutputs.push(output); else if (output.output_type === "png_base64") imageOutputs.push(output); else if (output.output_type === "download") downloadOutputs.push(output); + else if (output.output_type === "visualization") visualizationOutputs.push(output); } setAvailableTables(tableOutputs); setAvailableImages(imageOutputs); setAvailableDownloads(downloadOutputs); + setAvailableVisualizations(visualizationOutputs); } }, [runName]); @@ -295,6 +335,7 @@ export const RunScreen: React.FC = () => { setAvailableImages([]); setPlots(undefined); setAvailableDownloads([]); + setAvailableVisualizations([]); void getRunData(); void getStepPlots(); void getCurrentStepOutputLabels(); @@ -341,6 +382,20 @@ export const RunScreen: React.FC = () => { ); + const visualizationComponent = ( + + {visualizations.length > 0 ? 
( + visualizations.map((viz) => ( + + + + )) + ) : ( + + )} + + ); + const singleTableComponent = (tableLabel: string) => ( @@ -359,10 +414,7 @@ export const RunScreen: React.FC = () => { }))} /> ) : ( - + )} ); @@ -414,7 +466,7 @@ export const RunScreen: React.FC = () => { )), ) ) : ( - + )} ); @@ -437,8 +489,20 @@ export const RunScreen: React.FC = () => { availableTables && availableTables.length > 0 && { name: "Tables", value: tableComponent }, availableImages.length > 0 && { name: "Images", value: imageComponent }, availableDownloads.length > 0 && { name: "Downloads", value: downloadComponent }, + availableVisualizations.length > 0 && { name: "Visualisations", value: visualizationComponent }, ].filter(Boolean) as { name: string; value: React.ReactNode }[]; + useEffect(() => { + if (components.length > 0 && !selectedOutputTab) { + setSelectedOutputTab(components[0].name); + } + }, [components, selectedOutputTab]); + + useEffect(() => { + setHasLoadedVisualizations(false); + setSelectedOutputTab(""); + }, [runData.current_step_id]); + return (
{ styleProps={{ height: "calc(100% - 3em)" }} components={components} hasCardTitle={false} + selection={selectedOutputTab} + callback={(component) => { + setSelectedOutputTab(component.name); + + if (component.name === "Visualisations" && !hasLoadedVisualizations) { + setHasLoadedVisualizations(true); + } + }} /> ) : ( diff --git a/frontend/src/components/core/index.ts b/frontend/src/components/core/index.ts index b2800ef1a..6747fa8a3 100644 --- a/frontend/src/components/core/index.ts +++ b/frontend/src/components/core/index.ts @@ -24,6 +24,7 @@ export * from "./shared/input-fields/info-field"; export * from "./shared/input-fields/header-info-field"; export * from "./shared/modal"; export * from "./shared/plot"; +export * from "./shared/molstar-viewer"; export * from "./shared/section-title"; export * from "./shared/switch"; export * from "./shared/text"; diff --git a/frontend/src/components/core/shared/index.ts b/frontend/src/components/core/shared/index.ts index bd6cfa757..2fd442629 100644 --- a/frontend/src/components/core/shared/index.ts +++ b/frontend/src/components/core/shared/index.ts @@ -21,6 +21,7 @@ export * from "./input-fields/info-field"; export * from "./input-fields/header-info-field"; export * from "./modal"; export * from "./plot"; +export * from "./molstar-viewer"; export * from "./section-title"; export * from "./switch"; export * from "./text"; diff --git a/frontend/src/components/core/shared/molstar-viewer/crosslinker-processing.tsx b/frontend/src/components/core/shared/molstar-viewer/crosslinker-processing.tsx new file mode 100644 index 000000000..0a43dc49a --- /dev/null +++ b/frontend/src/components/core/shared/molstar-viewer/crosslinker-processing.tsx @@ -0,0 +1,244 @@ +export interface CrosslinkerInformation { + crosslinkerPosition1: number; + crosslinkerPosition2: number; + chainId1: string; + chainId2: string; + isValid: boolean; + isIntraCrosslink: boolean; + reactiveAtom1?: string; + reactiveAtom2?: string; +} + +interface 
CrosslinkerAtom { + x: number; + y: number; + z: number; + seqPos: number; + atomId: string; +} + +interface AtomSiteIndices { + atomIdIdx: number; + seqIdIdx: number; + chainIdIdx: number; + xCoordIdx: number; + yCoordIdx: number; + zCoordIdx: number; +} + +export enum CrosslinkerType { + ValidIntra = "valid-intra-crosslink", + InvalidIntra = "invalid-intra-crosslink", + ValidInter = "valid-inter-crosslink", + InvalidInter = "invalid-inter-crosslink", +} + +export function generateCrosslinkCIF( + cifString: string, + crosslinks: CrosslinkerInformation[], +): { crosslinkerCifText: string; crosslinkerGroups: Record } { + const atomLines: string[] = []; + const connectionLines: string[] = []; + let connectionId = 1; + + const crosslinkGroups: Record = Object.values(CrosslinkerType).reduce( + (crosslinkGroups, type) => { + crosslinkGroups[type] = []; + return crosslinkGroups; + }, + {} as Record, + ); + + for (const crosslink of crosslinks) { + const [atom1, atom2] = getCrosslinkerAtoms(cifString, crosslink); + + const atom1Id = `XL${String(connectionId)}A`; + const atom2Id = `XL${String(connectionId)}B`; + + const crosslinkType = getCrosslinkerType(crosslink); + crosslinkGroups[crosslinkType].push(atom1Id, atom2Id); + + // chainId = CL, to enable inter-crosslinks, because connections can only exist within the same chain + // compId (indicating the residue), is unimportant for this representation and can therefore be a placeholder + + const atom1Line = [ + `ATOM ${String(connectionId * 2 - 1)} ${atom1Id} ${atom1Id}`, + `LIN CL ${String(atom1.seqPos)}`, + `${String(atom1.x)} ${String(atom1.y)} ${String(atom1.z)} 1.0 0.0`, + ].join(" "); + + const atom2Line = [ + `ATOM ${String(connectionId * 2)} ${atom2Id} ${atom2Id}`, + `LIN CL ${String(atom2.seqPos)}`, + `${String(atom2.x)} ${String(atom2.y)} ${String(atom2.z)} 1.0 0.0`, + ].join(" "); + + atomLines.push(atom1Line); + atomLines.push(atom2Line); + + const connectionLine = [ + `${String(connectionId)} misc ${atom1Id}`, 
+ `X CL ${String(atom1.seqPos)} ${atom2Id}`, + `X CL ${String(atom2.seqPos)}`, + ].join(" "); + + connectionLines.push(connectionLine); + + connectionId++; + } + + const crosslinkCifText = ` + data_crosslink + + loop_ + _atom_site.group_PDB + _atom_site.id + _atom_site.type_symbol + _atom_site.label_atom_id + _atom_site.label_comp_id + _atom_site.label_asym_id + _atom_site.label_seq_id + _atom_site.Cartn_x + _atom_site.Cartn_y + _atom_site.Cartn_z + _atom_site.occupancy + _atom_site.B_iso_or_equiv + ${atomLines.join("\n")} + + loop_ + _struct_conn.id + _struct_conn.conn_type_id + _struct_conn.ptnr1_label_atom_id + _struct_conn.ptnr1_label_comp_id + _struct_conn.ptnr1_label_asym_id + _struct_conn.ptnr1_label_seq_id + _struct_conn.ptnr2_label_atom_id + _struct_conn.ptnr2_label_comp_id + _struct_conn.ptnr2_label_asym_id + _struct_conn.ptnr2_label_seq_id + ${connectionLines.join("\n")} + `; + + return { crosslinkerCifText: crosslinkCifText, crosslinkerGroups: crosslinkGroups }; +} + +// ------------------------- internal helpers: ------------------------- + +function getReactiveAtom(reactiveAtom?: string): string { + // right now we always return the central C atom + // later we might want to return the reactive atom of the amino acid residue of the specific amino acid type + // then we just have to define a reactiveAtom + if (!reactiveAtom) return "CA"; + const mapping: Record = { + K: "NZ", + S: "OG", + T: "OG1", + }; + return mapping[reactiveAtom] || "CA"; +} + +function findAtomCoordinatesInCif( + cifString: string, + cifIndices: ReturnType, + crosslinkerAtomId: string, + crosslinkerSeqPos: number, + crosslinkerChainId: string, +): CrosslinkerAtom { + const lines = cifString.split(/\r?\n/); + + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith("#")) continue; + const tokens = trimmed.split(/\s+/); + if (tokens[0] !== "ATOM" && tokens[0] !== "HETATM") continue; + + const lineAtomId = tokens[cifIndices.atomIdIdx]; + const 
lineSeqPos = parseInt(tokens[cifIndices.seqIdIdx], 10); + const lineChainId = tokens[cifIndices.chainIdIdx]; + + if ( + lineAtomId === crosslinkerAtomId && + lineSeqPos === crosslinkerSeqPos && + lineChainId === crosslinkerChainId + ) { + return { + x: parseFloat(tokens[cifIndices.xCoordIdx]), + y: parseFloat(tokens[cifIndices.yCoordIdx]), + z: parseFloat(tokens[cifIndices.zCoordIdx]), + seqPos: crosslinkerSeqPos, + atomId: crosslinkerAtomId, + }; + } + } + + throw new Error(`No atom found for seq=${String(crosslinkerSeqPos)}, atom=${crosslinkerAtomId}`); +} + +function getCrosslinkerAtoms( + cifString: string, + crosslink: CrosslinkerInformation, +): [CrosslinkerAtom, CrosslinkerAtom] { + const cifIndices = getCifAtomSiteIndices(cifString); + + const reactiveAtom1 = getReactiveAtom(crosslink.reactiveAtom1); + const atom1 = findAtomCoordinatesInCif( + cifString, + cifIndices, + reactiveAtom1, + crosslink.crosslinkerPosition1, + crosslink.chainId1, + ); + + const reactiveAtom2 = getReactiveAtom(crosslink.reactiveAtom2); + const atom2 = findAtomCoordinatesInCif( + cifString, + cifIndices, + reactiveAtom2, + crosslink.crosslinkerPosition2, + crosslink.chainId2, + ); + + return [atom1, atom2]; +} + +function getCifAtomSiteIndices(cifString: string): AtomSiteIndices { + const lines = cifString.split(/\r?\n/); + const atomSiteLines = lines.filter((line) => line.startsWith("_atom_site.")); + + const indices: Record = {}; + atomSiteLines.forEach((line, idx) => { + const colName = line.trim(); + indices[colName] = idx; + }); + + const atomIdColNames = ["_atom_site.label_atom_id", "_atom_site.auth_atom_id"]; + const seqIdColNames = ["_atom_site.label_seq_id", "_atom_site.auth_seq_id"]; + const chainIdColNames = ["_atom_site.label_asym_id", "_atom_site.auth_asym_id"]; + const xCoordColNames = ["_atom_site.Cartn_x"]; + const yCoordColNames = ["_atom_site.Cartn_y"]; + const zCoordColNames = ["_atom_site.Cartn_z"]; + + function findFirst(names: string[]): number { + for (const 
name of names) { + if (name in indices) return indices[name]; + } + throw new Error(`None of the column names found: ${names.join(", ")}`); + } + + return { + atomIdIdx: findFirst(atomIdColNames), + seqIdIdx: findFirst(seqIdColNames), + chainIdIdx: findFirst(chainIdColNames), + xCoordIdx: findFirst(xCoordColNames), + yCoordIdx: findFirst(yCoordColNames), + zCoordIdx: findFirst(zCoordColNames), + }; +} + +function getCrosslinkerType(crosslink: CrosslinkerInformation): CrosslinkerType { + if (crosslink.isIntraCrosslink) { + return crosslink.isValid ? CrosslinkerType.ValidIntra : CrosslinkerType.InvalidIntra; + } else { + return crosslink.isValid ? CrosslinkerType.ValidInter : CrosslinkerType.InvalidInter; + } +} diff --git a/frontend/src/components/core/shared/molstar-viewer/index.ts b/frontend/src/components/core/shared/molstar-viewer/index.ts new file mode 100644 index 000000000..45fc7b6db --- /dev/null +++ b/frontend/src/components/core/shared/molstar-viewer/index.ts @@ -0,0 +1,2 @@ +export { default as MolstarViewer } from "./molstar-viewer"; +export * from "./molstar-viewer.props"; diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.config.ts b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.config.ts new file mode 100644 index 000000000..2f151f051 --- /dev/null +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.config.ts @@ -0,0 +1,8 @@ +import { CrosslinkerType } from "./crosslinker-processing"; + +export const CROSSLINKER_COLORS: Record = { + [CrosslinkerType.ValidIntra]: 0xe03e00, // bright orange-red + [CrosslinkerType.InvalidIntra]: 0xfca311, // pale yellow-orange + [CrosslinkerType.ValidInter]: 0x8a2be2, // bright purple + [CrosslinkerType.InvalidInter]: 0xd8b4ff, // pale violet +}; diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.props.tsx b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.props.tsx new file mode 100644 index 
000000000..abf936f4d --- /dev/null +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.props.tsx @@ -0,0 +1,6 @@ +import { CrosslinkerInformation } from "./crosslinker-processing"; + +export interface MolstarViewerProps { + cifText: string; + crosslinks?: CrosslinkerInformation[]; +} diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts new file mode 100644 index 000000000..93b450c89 --- /dev/null +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts @@ -0,0 +1,64 @@ +import { useNotification } from "@protzilla/app"; +import { PluginUIContext } from "molstar/lib/mol-plugin-ui/context"; +import { MolScriptBuilder as MS } from "molstar/lib/mol-script/language/builder"; + +import { + CrosslinkerInformation, + CrosslinkerType, + generateCrosslinkCIF, +} from "./crosslinker-processing"; +import { CROSSLINKER_COLORS } from "./molstar-viewer.config"; + +export async function addCrosslinks( + plugin: PluginUIContext, + cifText: string, + crosslinks: CrosslinkerInformation[], +) { + const { crosslinkerCifText: crosslinkerCifText, crosslinkerGroups: crosslinkerGroups } = + generateCrosslinkCIF(cifText, crosslinks); + + const lineData = await plugin.builders.data.rawData({ + data: crosslinkerCifText, + label: "line", + }); + const lineTrajectory = await plugin.builders.structure.parseTrajectory(lineData, "mmcif"); + const lineModel = await plugin.builders.structure.createModel(lineTrajectory); + const lineStructure = await plugin.builders.structure.createStructure(lineModel); + + for (const type of Object.values(CrosslinkerType)) { + const atomIds = crosslinkerGroups[type]; + + const expression = MS.struct.generator.atomGroups({ + "atom-test": MS.core.set.has([MS.set(...atomIds), MS.ammp("label_atom_id")]), + }); + + const component = await plugin.builders.structure.tryCreateComponentFromExpression( + 
lineStructure, + expression, + type, + ); + + if (component) { + await plugin.builders.structure.representation.addRepresentation(component, { + type: "line", + color: "uniform", + colorParams: { value: CROSSLINKER_COLORS[type] }, + }); + } + } +} + +export function handleError( + error: unknown, + errorTitle: string, + notify: ReturnType, +) { + const errorMessage = + typeof error === "string" ? error : error instanceof Error ? error.message : "Unknown error"; + notify({ + title: errorTitle, + message: errorMessage, + type: "error", + isClosingAutomatically: true, + }); +} diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx new file mode 100644 index 000000000..34ee45713 --- /dev/null +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx @@ -0,0 +1,81 @@ +import { useNotification } from "@protzilla/app"; +import { SectionTitle } from "@protzilla/core"; +import { createPluginUI } from "molstar/lib/mol-plugin-ui"; +import { PluginUIContext } from "molstar/lib/mol-plugin-ui/context"; +import { renderReact18 } from "molstar/lib/mol-plugin-ui/react18"; +import React, { useEffect, useRef, useState } from "react"; + +import { MolstarViewerProps } from "./molstar-viewer.props"; +import { addCrosslinks, handleError } from "./molstar-viewer.service"; +import { CanvasWrapper, Container } from "./styles"; +import "molstar/lib/mol-plugin-ui/skin/base/base.scss"; + +const MolstarViewer: React.FC = ({ cifText, crosslinks }) => { + const containerRef = useRef(null); + const [isLoading, setIsLoading] = useState(true); + const notify = useNotification(); + + useEffect(() => { + const container = containerRef.current; + if (!container) return; + + let plugin: PluginUIContext | null = null; + + const init = async () => { + try { + setIsLoading(true); + + if (!cifText) { + throw new Error("No CIF data provided"); + } + + // initialize Molstar + plugin = await 
createPluginUI({ + target: container, + render: renderReact18, + }); + + // load structure + const data = await plugin.builders.data.rawData({ + data: cifText, + label: "structure", + }); + const trajectory = await plugin.builders.structure.parseTrajectory(data, "mmcif"); + await plugin.builders.structure.hierarchy.applyPreset(trajectory, "default"); + + // add crosslinks to structure, if available + if (crosslinks !== undefined) { + await addCrosslinks(plugin, cifText, crosslinks); + } + + setIsLoading(false); + } catch (error: unknown) { + handleError(error, "MolstarViewer Error:", notify); + setIsLoading(false); + } + }; + + void init(); + + return () => { + if (plugin !== null) { + try { + plugin.dispose(); + } catch (disposeError) { + handleError(disposeError, "Error disposing Molstar plugin:", notify); + } + } + }; + }, [cifText, crosslinks, notify]); + + return ( + + {isLoading && ( + + )} + + + ); +}; + +export default MolstarViewer; diff --git a/frontend/src/components/core/shared/molstar-viewer/styles.ts b/frontend/src/components/core/shared/molstar-viewer/styles.ts new file mode 100644 index 000000000..e49e66c5e --- /dev/null +++ b/frontend/src/components/core/shared/molstar-viewer/styles.ts @@ -0,0 +1,250 @@ +import { color } from "@protzilla/theme"; +import { styled } from "styled-components"; + +export const Container = styled.div` + width: 100%; + height: 100vh; + position: relative; + display: flex; + flex-direction: column; + gap: 1rem; +`; + +const molstarTheme = { + primary: color("protzillaDarkBlue"), + surface: color("protzillaLightGray"), + hover: color("secondaryHover"), + border: color("protzillaGray"), + lightText: color("onPrimary"), + darkText: color("text"), + success: color("green"), + error: color("caution"), +}; + +const headers = ` + .msp-plugin .msp-sequence-select > select, + .msp-plugin .msp-control-group-header > button, + .msp-plugin .msp-control-group-header div, + .msp-plugin .msp-sequence, + .msp-plugin .msp-log-entry-info, + 
.msp-plugin .msp-log-entry-warning, + .msp-plugin .msp-section-header, + .msp-plugin .msp-sequence-select, + .msp-plugin ::-webkit-scrollbar-thumb, + .msp-plugin .msp-slider-base-handle +`; + +const controls = ` + .msp-plugin .msp-control-row button, + .msp-plugin .msp-btn, + .msp-plugin .msp-btn-link-toggle-off, + .msp-plugin .msp-btn-link-toggle-off:active, + .msp-plugin .msp-btn-link-toggle-off:focus, + .msp-plugin .msp-log .msp-log-entry, + .msp-plugin ::-webkit-scrollbar-track, + .msp-plugin .msp-semi-transparent-background +`; + +const lightSurfaces = ` + .msp-plugin .msp-form-control, + .msp-plugin .msp-control-row select, + .msp-plugin .msp-control-row input[type=text], + .msp-plugin .msp-btn-link-toggle-on, + .msp-plugin .msp-log li, + .msp-plugin, + .msp-plugin .msp-sequence-wrapper-non-empty, + .msp-plugin .msp-control-row, + .msp-plugin .msp-control-row > div, + .msp-plugin .msp-help-text, + .msp-plugin .msp-flex-row, + .msp-plugin .msp-state-image-row, + .msp-plugin .msp-image-preview, + .msp-plugin .msp-left-panel-controls-buttons, + .msp-plugin .msp-layout-right, + .msp-plugin .msp-layout-left, + .msp-plugin .msp-highlight-info +`; + +const layoutBlocks = ` + .msp-plugin .msp-log, + .msp-plugin .msp-viewport, + .msp-plugin .msp-layout-right, + .msp-plugin .msp-layout-left, + .msp-plugin .msp-slider-base-rail, + .msp-plugin + .msp-viewport-top-left-controls + .msp-animation-viewport-controls + .msp-animation-viewport-controls-select, + .msp-plugin .msp-viewport-controls-panel +`; + +const elementsWithDarkText = ` + .msp-plugin .msp-viewport-controls-buttons .msp-btn-link-toggle-off, + .msp-plugin-content, + .msp-plugin .msp-log, + .msp-plugin .msp-log .msp-log-timestamp, + .msp-plugin .msp-btn-link-toggle-on, + .msp-plugin .msp-sequence-wrapper .msp-sequence-number, + .msp-plugin .msp-control-row > span.msp-control-row-label, + .msp-plugin .msp-control-row > button.msp-control-button-label, + .msp-plugin .msp-help-text > div, + .msp-plugin
.msp-btn-action, + .msp-plugin .msp-btn-action:active, + .msp-plugin .msp-btn-action:focus, + .msp-plugin .msp-25-lower-contrast-text, + .msp-plugin .msp-highlight-info, + .msp-plugin .msp-form-control:hover, + .msp-plugin .msp-control-row select:hover, + .msp-plugin .msp-control-row button:hover, + .msp-plugin .msp-control-row input[type=text]:hover, + .msp-plugin .msp-btn:hover, + .msp-plugin .msp-btn-link-toggle-off, + .msp-plugin .msp-btn-link-toggle-off:active, + .msp-plugin .msp-btn-link-toggle-off:focus, + ::placeholder +`; + +const elementsWithLightText = ` + .msp-plugin .msp-sequence-select, + .msp-plugin .msp-control-group-header > button, + .msp-plugin .msp-control-group-header div, + .msp-plugin .msp-section-header +`; + +const hoverElements = ` + .msp-plugin .msp-btn-link-toggle-off:hover, + .msp-plugin .msp-control-group-expander .msp-icon, + .msp-plugin .msp-form-control:hover, + .msp-plugin .msp-control-row select:hover, + .msp-plugin .msp-control-row button:hover, + .msp-plugin .msp-control-row input[type=text]:hover, + .msp-plugin .msp-btn:hover, + .msp-plugin .msp-help:hover span +`; + +export const CanvasWrapper = styled.div` + flex: 1; + position: relative; + top: 12vh; + + && { + /* ================= BACKGROUNDS ================= */ + + ${headers} { + background: ${molstarTheme.primary} !important; + } + + ${controls} { + background: ${molstarTheme.surface}; + } + + ${lightSurfaces} { + background: ${molstarTheme.surface} !important; + } + + ${layoutBlocks} { + background: ${molstarTheme.border} !important; + } + + /* ================= TEXT COLORS ================= */ + + ${elementsWithDarkText} { + color: ${molstarTheme.primary} !important; + } + + ${elementsWithLightText} { + color: ${molstarTheme.lightText} !important; + } + + .msp-plugin .msp-sequence-wrapper .msp-sequence-present { + color: ${molstarTheme.darkText} !important; + } + + /* ================= HOVER ================= */ + + ${hoverElements} { + background: 
${molstarTheme.hover} !important; + outline: 1px solid ${molstarTheme.border} !important; + } + + /* ================= BORDERS ================= */ + + ::-webkit-scrollbar-thumb { + border: 4px solid ${molstarTheme.primary}; + } + + .msp-plugin .msp-select-toggle::after { + border-top-color: ${molstarTheme.primary} !important; + } + + .msp-plugin .msp-accent-offset, + .msp-plugin .msp-state-list > li > button:first-child { + border-left-color: ${molstarTheme.primary} !important; + } + + .msp-plugin .msp-transform-header-brand-purple, + .msp-plugin .msp-transform-header-brand-blue { + border-bottom-color: ${molstarTheme.primary} !important; + } + + .msp-plugin .msp-slider-base-handle { + border: 4px solid ${molstarTheme.surface} !important; + } + + .msp-plugin .msp-layout-standard-outside .msp-layout-left { + border-top-color: ${molstarTheme.surface} !important; + } + + .msp-plugin .msp-layout-standard, + .msp-plugin .msp-layout-standard-outside .msp-layout-top, + .msp-plugin .msp-layout-standard-outside .msp-layout-bottom { + border: 1px solid ${molstarTheme.border} !important; + } + + .msp-plugin .msp-layout-standard-outside .msp-layout-left, + .msp-plugin .msp-layout-standard-outside .msp-layout-right { + border-top: 1px solid ${molstarTheme.border} !important; + } + + .msp-plugin .msp-layout-standard-outside .msp-layout-right { + border-left: 1px solid ${molstarTheme.border} !important; + } + + .msp-plugin .msp-log li:not(:last-child), + .msp-plugin .msp-layout-standard-outside .msp-layout-bottom { + border-bottom: 1px solid ${molstarTheme.border} !important ; + } + + .msp-plugin .msp-form-control:hover, + .msp-plugin .msp-control-row select:hover, + .msp-plugin .msp-control-row button:hover, + .msp-plugin .msp-control-row input[type="text"]:hover, + .msp-plugin .msp-btn:hover { + outline: 1px solid ${molstarTheme.border}!important; + } + + /* ================= SPECIAL ================= */ + + .msp-plugin .msp-transform-header-brand svg { + stroke: 
${molstarTheme.primary} !important; + } + + .msp-svg-text, + .msp-plugin .msp-transform-header-brand svg { + fill: ${molstarTheme.primary} !important; + } + + /* ================= SIGNAL ================= */ + + .msp-plugin .msp-btn-commit-on, + .msp-plugin .msp-btn-commit-on:active, + .msp-plugin .msp-btn-commit-on:focus, + .msp-plugin .msp-log-entry-message { + color: ${molstarTheme.success} !important; + } + + .msp-plugin .msp-log-entry-error { + background: ${molstarTheme.error} !important; + } + } +`; diff --git a/frontend/src/utils/protzilla-types.ts b/frontend/src/utils/protzilla-types.ts index 19af495fe..4317559bd 100644 --- a/frontend/src/utils/protzilla-types.ts +++ b/frontend/src/utils/protzilla-types.ts @@ -1,6 +1,8 @@ import { GridValidRowModel } from "@mui/x-data-grid"; import type { Edge } from "@xyflow/react"; +import { CrosslinkerInformation } from "../components/core/shared/molstar-viewer/crosslinker-processing.tsx"; + export interface UIStateProps { isDisabled?: boolean; } @@ -17,6 +19,12 @@ export interface StepOutputInfo { display_name: string; } +export interface ApiResponse { + success: boolean; + message: string; + data: T; +} + export interface Image { title: string; alt: string; @@ -27,6 +35,12 @@ export interface Download { data: Record; } +export interface Visualization { + structureEntryId: string; + cifString: string; + crosslinks?: CrosslinkerInformation[]; +} + // We assume these are the only data types we receive for tables export type TableRecord = Record; From f7ce624845fe87adeb838bbf4439dc5f86ccfdbd Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 21 Apr 2026 13:52:25 +0200 Subject: [PATCH 180/240] format with black --- backend/protzilla/data_analysis/crosslinking_validation.py | 4 +++- .../protzilla/data_analysis/test_crosslinking_validation.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py 
b/backend/protzilla/data_analysis/crosslinking_validation.py index 624979e80..a580465de 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -703,7 +703,9 @@ def diagrams_of_crosslinking_validation_data( if len(crosslinker_df) == 1: standard_deviation_predicted_lengths = 0.0 else: - standard_deviation_predicted_lengths = crosslinker_df["alphafold_distance"].std() + standard_deviation_predicted_lengths = crosslinker_df[ + "alphafold_distance" + ].std() mean_plus_two_std = ( mean_of_predicted_lengths + 2 * standard_deviation_predicted_lengths ) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 269e93862..7e97dad18 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -934,7 +934,7 @@ def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): """Test that intra/inter link_type is determined by chain ID, not protein ID.""" sequences_df = pd.DataFrame( [ - ("P1-1", "ABCD"), + ("P1-1", "ABCD"), ], columns=["Protein ID", "Protein Sequence"], ) From 729550e735fb557be0bf462b3019c6fc316dedaf Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Tue, 21 Apr 2026 15:30:54 +0200 Subject: [PATCH 181/240] fix bug regarding panda types --- .../data_analysis/crosslinking_validation.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index a580465de..00ce210c1 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -13,6 +13,7 @@ create_histograms, create_bar_plot, ) +from backend.protzilla.constants.protzilla_logging import logger from 
protzilla.data_analysis.plots import add_vertical_line_with_annotation_in_legend @@ -283,9 +284,12 @@ def get_chains( :param id_column_name: column name to check against valid_ids. :return: list of unique chain IDs. """ - relevant_df = cif_df[cif_df[id_column_name].isin(valid_ids[protein_id])] + target_ids = valid_ids.get(protein_id, []) + if not target_ids: + return [] + target_ids_as_strings = [str(i) for i in target_ids] + relevant_df = cif_df[cif_df[id_column_name].astype(str).isin(target_ids_as_strings)] chain_ids = relevant_df["_atom_site.auth_asym_id"].dropna().unique().tolist() - return chain_ids @@ -506,6 +510,7 @@ def validate_with_angstrom_deviation( if relevant_crosslinks_df.empty: msg = "There are no cross links between the structures to validate." messages = [dict(level=logging.WARNING, msg=msg)] + logger.warning(msg) return dict(crosslinking_result_df=pd.DataFrame(), messages=messages) chains_per_protein = {} @@ -522,6 +527,12 @@ def validate_with_angstrom_deviation( chains_per_protein=chains_per_protein, ) + if relevant_crosslinks_df.empty: + msg = "There are no cross links between the structures to validate." + messages = [dict(level=logging.WARNING, msg=msg)] + logger.warning(msg) + return dict(crosslinking_result_df=pd.DataFrame(), messages=messages) + relevant_crosslinks_df, messages = add_protein_crosslink_positions_to_df( relevant_crosslinks_df, amino_acid_sequences_df ) @@ -645,6 +656,8 @@ def diagrams_of_crosslinking_validation_data( bar plot summarizing valid and invalid cross-links across all crosslinkers. :raises KeyError: If a required crosslinker entry is missing in crosslinker_information. 
""" + if validated_df.empty: + return {} validated_df = validated_df.dropna(subset=["valid_crosslink"]) figures = [] From 495f18813f308779f93257ee241a14f56f705029 Mon Sep 17 00:00:00 2001 From: 3dot141592 Date: Tue, 21 Apr 2026 15:22:42 +0200 Subject: [PATCH 182/240] add missing CL Icons --- .../app/run-screen/node-editor/StepNode.tsx | 21 ++++++++++-- .../core/shared/icon/icons/handle_cif.svg | 17 ++++++++++ .../shared/icon/icons/handle_confidence.svg | 6 ++++ .../shared/icon/icons/handle_crosslinking.svg | 16 ++++++++++ .../shared/icon/icons/handle_debug_data.svg | 16 ++++++++++ .../shared/icon/icons/handle_full_data.svg | 29 +++++++++++++++++ .../core/shared/icon/icons/handle_pae.svg | 32 +++++++++++++++++++ .../core/shared/icon/icons/handle_plddt.svg | 17 ++++++++++ .../icon/icons/handle_structure_metadata.svg | 18 +++++++++++ .../core/shared/icon/icons/index.ts | 8 +++++ .../shared/icon/icons/notusedicons/cif_02.svg | 23 +++++++++++++ .../shared/icon/icons/notusedicons/pae_02.svg | 18 +++++++++++ .../notusedicons/structure_metadata_02.svg | 31 ++++++++++++++++++ 13 files changed, 250 insertions(+), 2 deletions(-) create mode 100644 frontend/src/components/core/shared/icon/icons/handle_cif.svg create mode 100644 frontend/src/components/core/shared/icon/icons/handle_confidence.svg create mode 100644 frontend/src/components/core/shared/icon/icons/handle_crosslinking.svg create mode 100644 frontend/src/components/core/shared/icon/icons/handle_debug_data.svg create mode 100644 frontend/src/components/core/shared/icon/icons/handle_full_data.svg create mode 100644 frontend/src/components/core/shared/icon/icons/handle_pae.svg create mode 100644 frontend/src/components/core/shared/icon/icons/handle_plddt.svg create mode 100644 frontend/src/components/core/shared/icon/icons/handle_structure_metadata.svg create mode 100644 frontend/src/components/core/shared/icon/icons/notusedicons/cif_02.svg create mode 100644 
frontend/src/components/core/shared/icon/icons/notusedicons/pae_02.svg create mode 100644 frontend/src/components/core/shared/icon/icons/notusedicons/structure_metadata_02.svg diff --git a/frontend/src/components/app/run-screen/node-editor/StepNode.tsx b/frontend/src/components/app/run-screen/node-editor/StepNode.tsx index ef921675f..a83c617fd 100644 --- a/frontend/src/components/app/run-screen/node-editor/StepNode.tsx +++ b/frontend/src/components/app/run-screen/node-editor/StepNode.tsx @@ -7,12 +7,20 @@ import type React from "react"; import { styled } from "styled-components"; import { + handleCifIcon, + handleConfidenceIcon, + handleCrosslinkingIcon, + handleDebugDataIcon, handleDnaIcon, + handleFullDataIcon, handleMetadataIcon, + handlePaeIcon, handlePeptidesIcon, + handlePlddtIcon, handleProteinIcon, handlePsmIcon, handleSequencesIcon, + handleStructureMetadataIcon, } from "../../../core/shared/icon/icons"; type HandleDirection = "Input" | "Output" | "None"; @@ -36,12 +44,21 @@ export type StepNodeType = Node; type HandleIcon = React.ComponentType>; const DATA_TYPE_ICON_MAP: Partial> = { + amino_acid_sequences_df: handleSequencesIcon, + cif_df: handleCifIcon, + confidence_df: handleConfidenceIcon, + crosslinking_df: handleCrosslinkingIcon, + debug_data: handleDebugDataIcon, fasta_df: handleSequencesIcon, + full_data_df: handleFullDataIcon, + gene_mapping_df: handleDnaIcon, + metadata_df: handleMetadataIcon, + pae_df: handlePaeIcon, peptide_df: handlePeptidesIcon, + plddt_df: handlePlddtIcon, protein_df: handleProteinIcon, - metadata_df: handleMetadataIcon, - gene_mapping_df: handleDnaIcon, psm_df: handlePsmIcon, + structure_metadata_df: handleStructureMetadataIcon, }; const HANDLE_ICON_SIZE = 26; diff --git a/frontend/src/components/core/shared/icon/icons/handle_cif.svg b/frontend/src/components/core/shared/icon/icons/handle_cif.svg new file mode 100644 index 000000000..25b1e2525 --- /dev/null +++ 
b/frontend/src/components/core/shared/icon/icons/handle_cif.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/handle_confidence.svg b/frontend/src/components/core/shared/icon/icons/handle_confidence.svg new file mode 100644 index 000000000..ba278382c --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/handle_confidence.svg @@ -0,0 +1,6 @@ + + + + + + diff --git a/frontend/src/components/core/shared/icon/icons/handle_crosslinking.svg b/frontend/src/components/core/shared/icon/icons/handle_crosslinking.svg new file mode 100644 index 000000000..c933facf8 --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/handle_crosslinking.svg @@ -0,0 +1,16 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/handle_debug_data.svg b/frontend/src/components/core/shared/icon/icons/handle_debug_data.svg new file mode 100644 index 000000000..95303fb47 --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/handle_debug_data.svg @@ -0,0 +1,16 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/handle_full_data.svg b/frontend/src/components/core/shared/icon/icons/handle_full_data.svg new file mode 100644 index 000000000..5685643e8 --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/handle_full_data.svg @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/handle_pae.svg b/frontend/src/components/core/shared/icon/icons/handle_pae.svg new file mode 100644 index 000000000..b996dab84 --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/handle_pae.svg @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/handle_plddt.svg 
b/frontend/src/components/core/shared/icon/icons/handle_plddt.svg new file mode 100644 index 000000000..c17e33485 --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/handle_plddt.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/handle_structure_metadata.svg b/frontend/src/components/core/shared/icon/icons/handle_structure_metadata.svg new file mode 100644 index 000000000..23b148920 --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/handle_structure_metadata.svg @@ -0,0 +1,18 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/index.ts b/frontend/src/components/core/shared/icon/icons/index.ts index 6c4b438bd..b69cb5b6f 100644 --- a/frontend/src/components/core/shared/icon/icons/index.ts +++ b/frontend/src/components/core/shared/icon/icons/index.ts @@ -22,12 +22,20 @@ export { default as edit } from "./edit-icon.svg?react"; export { default as eye } from "./eye.svg?react"; export { default as failed } from "./failed.svg?react"; export { default as github } from "./github.svg?react"; +export { default as handleCifIcon } from "./handle_cif.svg?react"; +export { default as handleConfidenceIcon } from "./handle_confidence.svg?react"; +export { default as handleCrosslinkingIcon } from "./handle_crosslinking.svg?react"; +export { default as handleDebugDataIcon } from "./handle_debug_data.svg?react"; export { default as handleDnaIcon } from "./handle_dna.svg?react"; +export { default as handleFullDataIcon } from "./handle_full_data.svg?react"; export { default as handleMetadataIcon } from "./handle_metadata.svg?react"; +export { default as handlePaeIcon } from "./handle_pae.svg?react"; +export { default as handlePlddtIcon } from "./handle_plddt.svg?react"; export { default as handleSequencesIcon } from "./handle_sequences.svg?react"; export { default as handlePeptidesIcon } from 
"./handle_peptides.svg?react"; export { default as handleProteinIcon } from "./handle_protein.svg?react"; export { default as handlePsmIcon } from "./handle_psm.svg?react"; +export { default as handleStructureMetadataIcon } from "./handle_structure_metadata.svg?react"; export { default as help } from "./help.svg?react"; export { default as home } from "./home.svg?react"; export { default as importing } from "./importing.svg?react"; diff --git a/frontend/src/components/core/shared/icon/icons/notusedicons/cif_02.svg b/frontend/src/components/core/shared/icon/icons/notusedicons/cif_02.svg new file mode 100644 index 000000000..1f9fb8cfe --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/notusedicons/cif_02.svg @@ -0,0 +1,23 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/notusedicons/pae_02.svg b/frontend/src/components/core/shared/icon/icons/notusedicons/pae_02.svg new file mode 100644 index 000000000..e123a22de --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/notusedicons/pae_02.svg @@ -0,0 +1,18 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/core/shared/icon/icons/notusedicons/structure_metadata_02.svg b/frontend/src/components/core/shared/icon/icons/notusedicons/structure_metadata_02.svg new file mode 100644 index 000000000..9e07b5291 --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/notusedicons/structure_metadata_02.svg @@ -0,0 +1,31 @@ + + + + + + + + + + + \ No newline at end of file From dc0fb58b271610fecae37eac870c12f115a150d7 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 24 Apr 2026 09:13:43 +0200 Subject: [PATCH 183/240] fix bug regarding chain mapping --- backend/protzilla/data_analysis/crosslinking_validation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py 
b/backend/protzilla/data_analysis/crosslinking_validation.py index 00ce210c1..e9e780167 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -420,9 +420,8 @@ def get_valid_ids_per_protein_id_from_job_request( if protein_id is not None: # Remove the specific isoform/variant suffix because we do not use it in the crosslinking df protein_id = protein_id.replace("-1", "") - valid_ids[protein_id] = [] for _ in range(count): - valid_ids[protein_id].append(unique_id) + valid_ids.setdefault(protein_id, []).append(unique_id) unique_id += 1 return valid_ids From f3bd7174aa8ad10cb1af5a563e1864c9b3725997 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 24 Apr 2026 11:51:05 +0200 Subject: [PATCH 184/240] fix tests and chain ids naming in visualization --- backend/main/views_helper.py | 12 ++++++------ .../data_analysis/test_crosslinking_validation.py | 5 +++++ .../test_alphafold_protein_structure_load.py | 3 +++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/backend/main/views_helper.py b/backend/main/views_helper.py index e7f6f7c0b..3d674657a 100644 --- a/backend/main/views_helper.py +++ b/backend/main/views_helper.py @@ -268,15 +268,15 @@ def extract_relevant_crosslink_information( :param crosslinking_df: DataFrame with columns 'crosslinker_position1', 'crosslinker_position2', - 'chain_id1', - 'chain_id2', + 'Chain_id1', + 'Chain_id2', 'valid_crosslink', 'Is_intra_crosslink', :return: List of dicts with keys 'crosslinkerPosition1', 'crosslinkerPosition2', - 'chainId1', - 'chainId2', + 'ChainId1', + 'ChainId2', 'isValid', 'isIntraCrosslink', """ @@ -289,8 +289,8 @@ def extract_relevant_crosslink_information( # Since we already need those chain ids to calculate correct distances in the validation, # it would be unnecessary to determine those again in the visualization. 
# Therefore we use placeholders for now and need to change the following, when the validation is extended: - chain_id1 = "A" # row.get("chain_id1") - chain_id2 = "A" # row.get("chain_id2") + chain_id1 = row.get("Chain_id1") + chain_id2 = row.get("Chain_id2") is_valid = row.get("valid_crosslink") is_intra_crosslink = row.get("Is_intra_crosslink") if pd.notnull(position1) and pd.notnull(position2) and pd.notnull(is_valid): diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 955c36bb7..eaa9e6fd8 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -992,6 +992,10 @@ def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): } ) + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_ids": ["ABCD"],} + ) + crosslinker_information = {"XL": [0.0, 0.0, 0.0]} valid_ids = {"P1": [1]} # One protein ID, but present in chains A and B structures_to_validate = ["P1"] @@ -999,6 +1003,7 @@ def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): out = validate_with_angstrom_deviation( crosslinking_df=crosslinking_df, crosslinker_information=crosslinker_information, + structure_metadata_df=structure_metadata_df, cif_df=cif_df, amino_acid_sequences_df=sequences_df, valid_ids=valid_ids, diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 0f62f69d2..570c7e0b4 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -967,8 +967,11 @@ def test_get_multimer_structure_dfs_json_fallback_warns(tmp_path, monkeypatch): j1 = prot_dir / "j1.json" j2 = prot_dir / "j2.json" + j3 = prot_dir / "j3s.json" 
j1.write_text(json.dumps({"something": 1})) j2.write_text(json.dumps({"other": 2})) + j3.write_text(json.dumps({"other": 3})) + out = get_multimer_structure_dfs("M2") assert any(m.get("level") == logging.WARNING for m in out["messages"]) From c740b278a8214f6f2a5dd612e958cb00a26712f9 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 24 Apr 2026 16:10:54 +0200 Subject: [PATCH 185/240] format with black --- .../protzilla/data_analysis/test_crosslinking_validation.py | 5 ++++- .../importing/test_alphafold_protein_structure_load.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index eaa9e6fd8..5a66cd8a7 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -993,7 +993,10 @@ def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): ) structure_metadata_df = pd.DataFrame( - {"entry_id": ["test"], "uniprot_ids": ["ABCD"],} + { + "entry_id": ["test"], + "uniprot_ids": ["ABCD"], + } ) crosslinker_information = {"XL": [0.0, 0.0, 0.0]} diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 570c7e0b4..8c3e3d69f 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -972,7 +972,6 @@ def test_get_multimer_structure_dfs_json_fallback_warns(tmp_path, monkeypatch): j2.write_text(json.dumps({"other": 2})) j3.write_text(json.dumps({"other": 3})) - out = get_multimer_structure_dfs("M2") assert any(m.get("level") == logging.WARNING for m in out["messages"]) assert any( From b88b58da870b4b3a93fd019c140a08620f5d2300 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Fri, 24 Apr 
2026 16:40:05 +0200 Subject: [PATCH 186/240] fix multimer import test --- .../test_alphafold_protein_structure_load.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 8c3e3d69f..940bbcea2 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -967,14 +967,15 @@ def test_get_multimer_structure_dfs_json_fallback_warns(tmp_path, monkeypatch): j1 = prot_dir / "j1.json" j2 = prot_dir / "j2.json" - j3 = prot_dir / "j3s.json" - j1.write_text(json.dumps({"something": 1})) - j2.write_text(json.dumps({"other": 2})) - j3.write_text(json.dumps({"other": 3})) - - out = get_multimer_structure_dfs("M2") - assert any(m.get("level") == logging.WARNING for m in out["messages"]) - assert any( - "Could not detect confidence scores" in str(m.get("msg", "")) - for m in out["messages"] + j3 = prot_dir / "j3.json" + j1.write_text(json.dumps({"wrong_key": 1})) + j2.write_text(json.dumps({"pae": 2})) + j3.write_text(json.dumps({"sequences": 3})) + + with pytest.raises(RuntimeError) as exc_info: + get_multimer_structure_dfs("M2") + + assert "Failed to read JSON files in" in str(exc_info.value) + assert "Could not detect confidence scores/full data/job request" in str( + exc_info.value ) From e6c7f6097aef92d770a7ec50828bda6545267104 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Sun, 26 Apr 2026 11:58:15 +0200 Subject: [PATCH 187/240] feat: show whole plot title and add padding to show proper axes labeling --- backend/protzilla/data_preprocessing/plots.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index 68d241d94..48ec2abae 100644 --- 
a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -1,4 +1,5 @@ import math +import textwrap import numpy as np import pandas as pd @@ -279,10 +280,15 @@ def create_histograms( if visual_transformation == "log10": fig.update_layout(xaxis=generate_tics(0, max_value, True)) - fig.update_layout(title={"text": f"{heading}"}) + wrapped_title = "
".join(textwrap.wrap(heading, width=60)) + fig.update_layout( + title={"text": f"{wrapped_title}"} + ) fig.update_xaxes(title=x_title) fig.update_yaxes(title=y_title, rangemode="tozero") + fig.update_layout(margin_pad=20) + # Disable toggling of the visibility of the traces by clicking on the legend fig.update_layout(legend=dict(itemclick=False, itemdoubleclick=False)) return fig From 9f6f119b49d2f84ebd151b669d41f3f550c346cf Mon Sep 17 00:00:00 2001 From: 3dot141592 Date: Mon, 27 Apr 2026 12:06:00 +0200 Subject: [PATCH 188/240] Fix import --- backend/protzilla/methods/data_analysis.py | 2 +- backend/protzilla/methods/importing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 10394af1f..9f72bc441 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -89,7 +89,7 @@ create_overview_ptm_visualization, get_detected_modifications, ) -from protzilla.data_analysis.crosslinking_validation import ( +from backend.protzilla.data_analysis.crosslinking_validation import ( validate_with_angstrom_deviation, diagrams_of_crosslinking_validation_data, ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 90f65adc5..b1a25c92d 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -45,7 +45,7 @@ FeatureOrientationType, ) from backend.protzilla.constants.intensity_types import IntensityType, IntensityNameType -from protzilla.importing.query_generation import generate_alphafold_query_json +from backend.protzilla.importing.query_generation import generate_alphafold_query_json class ImportingStep(Step, ABC): From 301491a4d8efbb6a12799bf188df532150ae3cda Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 27 Apr 2026 15:39:54 +0200 Subject: [PATCH 189/240] fix: show tick for crosslinker length and allowed deviation --- 
.../data_analysis/crosslinking_validation.py | 87 +++++++++++++++++-- backend/protzilla/data_preprocessing/plots.py | 6 +- 2 files changed, 84 insertions(+), 9 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 32535d895..fa32abf22 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,12 +1,14 @@ import itertools import ast import math +from pipes import stepkinds import pandas as pd import numpy as np import re import logging +from pandas.io.stata import stata_epoch from plotly.graph_objects import Figure from backend.protzilla.data_preprocessing.plots import ( @@ -18,6 +20,7 @@ add_vertical_line_with_annotation_in_legend, ) from backend.protzilla.steps import OutputItem, OutputType +from backend.protzilla.data_preprocessing.plots_helper import millify def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: @@ -647,6 +650,43 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: ) +def _get_tick_values_with_lines(fig, min_value, max_value): + """ + Generates tick values and labels for a Plotly figure's x-axis, ensuring that + the x-positions of all vertical lines in the figure are included as additional ticks. + + Regular ticks are spaced evenly based on the range between min_value and max_value. + Vertical line positions that fall within the range and are not already covered by a regular tick + are appended and labeled with their rounded value. + + :param fig: Plotly Figure object whose shapes are inspected for vertical lines. + :param min_value: Lower bound of the x-axis range. + :param max_value: Upper bound of the x-axis range. + :return: Dictionary with tickmode, tickvals, and ticktext suitable for use in update_xaxes. 
+ """ + line_x_values = [ + shape.x0 + for shape in fig.layout.shapes + if shape.type == "line" and shape.x0 == shape.x1 + ] + + step_size = pow(10, math.floor(np.log10(max_value - min_value))) + first_step = math.ceil(min_value / step_size) * step_size + last_step = math.ceil(max_value / step_size) * step_size + 3 * step_size + tick_values = list(np.arange(first_step, last_step, step_size)) + tick_text = list(np.vectorize(lambda x: millify(x))(tick_values)) + + for x in line_x_values: + if x not in tick_values and min_value <= x <= max_value: + tick_values.append(x) + tick_text.append(str(round(x, 2))) + + paired = sorted(zip(tick_values, tick_text)) + tick_values, tick_text = zip(*paired) + + return dict(tickmode="array", tickvals=list(tick_values), ticktext=list(tick_text)) + + def diagrams_of_crosslinking_validation_data( validated_df: pd.DataFrame, structures_to_validate: str, @@ -716,6 +756,31 @@ def diagrams_of_crosslinking_validation_data( accepted_deviation_upper_bound, accepted_deviation_lower_bound, ) = crosslinker_information[crosslinker] + # make sure that the crosslinker length is always shown + hist_min = math.floor( + min( + crosslinker_length, + np.nanmin( + [ + df_valid["alphafold_distance"].min(), + df_invalid["alphafold_distance"].min(), + ] + ), + ) + - 1 + ) + hist_max = math.ceil( + max( + crosslinker_length, + np.nanmax( + [ + df_valid["alphafold_distance"].max(), + df_invalid["alphafold_distance"].max(), + ] + ), + ) + + 1 + ) histogram = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, @@ -728,12 +793,14 @@ def diagrams_of_crosslinking_validation_data( visual_transformation="linear", relevant_column_a="alphafold_distance", relevant_column_b="alphafold_distance", + min_value=hist_min, + max_value=hist_max, one_bin_per_int=True, ) add_vertical_line_with_annotation_in_legend( fig=histogram, dash="solid", - annotation=f"{crosslinker} length", + annotation=f"{crosslinker} length: {crosslinker_length}Å", 
x_value=crosslinker_length, ) @@ -750,6 +817,9 @@ def diagrams_of_crosslinking_validation_data( mean_minus_two_std = max( 0, mean_of_predicted_lengths - 2 * standard_deviation_predicted_lengths ) + # make sure that the crosslinker length is always shown + hist_2std_min = math.floor(min(crosslinker_length, mean_minus_two_std) - 1) + hist_2std_max = math.ceil(max(crosslinker_length, mean_plus_two_std) + 1) histogram_two_standard_deviations = create_histograms( dataframe_a=df_valid, @@ -763,14 +833,14 @@ def diagrams_of_crosslinking_validation_data( visual_transformation="linear", relevant_column_a="alphafold_distance", relevant_column_b="alphafold_distance", - min_value=mean_minus_two_std, - max_value=mean_plus_two_std, + min_value=hist_2std_min, + max_value=hist_2std_max, one_bin_per_int=True, ) add_vertical_line_with_annotation_in_legend( fig=histogram_two_standard_deviations, dash="solid", - annotation=f"{crosslinker} length", + annotation=f"{crosslinker} length: {crosslinker_length}Å", x_value=crosslinker_length, ) @@ -810,7 +880,14 @@ def diagrams_of_crosslinking_validation_data( annotation=f"allowed deviation lower bound", x_value=crosslinker_length - accepted_deviation_lower_bound, ) - + histogram.update_xaxes( + **_get_tick_values_with_lines(histogram, hist_min, hist_max) + ) + histogram_two_standard_deviations.update_xaxes( + **_get_tick_values_with_lines( + histogram_two_standard_deviations, hist_2std_min, hist_2std_max + ) + ) figures.append(histogram_two_standard_deviations) figures.append(histogram) diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index 48ec2abae..c255c4a3c 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -280,10 +280,8 @@ def create_histograms( if visual_transformation == "log10": fig.update_layout(xaxis=generate_tics(0, max_value, True)) - wrapped_title = "
".join(textwrap.wrap(heading, width=60)) - fig.update_layout( - title={"text": f"{wrapped_title}"} - ) + wrapped_title = "
".join(textwrap.wrap(heading, width=50)) + fig.update_layout(title={"text": f"{wrapped_title}"}) fig.update_xaxes(title=x_title) fig.update_yaxes(title=y_title, rangemode="tozero") From 5a96fb411b7a63103c9f38c5ce21a716b9778e4d Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 27 Apr 2026 15:49:36 +0200 Subject: [PATCH 190/240] fix: fix wrong use of ranges for 2std cl diagram --- .../data_analysis/crosslinking_validation.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index fa32abf22..2665c2835 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -759,7 +759,7 @@ def diagrams_of_crosslinking_validation_data( # make sure that the crosslinker length is always shown hist_min = math.floor( min( - crosslinker_length, + crosslinker_length - accepted_deviation_lower_bound, np.nanmin( [ df_valid["alphafold_distance"].min(), @@ -771,7 +771,7 @@ def diagrams_of_crosslinking_validation_data( ) hist_max = math.ceil( max( - crosslinker_length, + crosslinker_length + accepted_deviation_upper_bound, np.nanmax( [ df_valid["alphafold_distance"].max(), @@ -817,9 +817,6 @@ def diagrams_of_crosslinking_validation_data( mean_minus_two_std = max( 0, mean_of_predicted_lengths - 2 * standard_deviation_predicted_lengths ) - # make sure that the crosslinker length is always shown - hist_2std_min = math.floor(min(crosslinker_length, mean_minus_two_std) - 1) - hist_2std_max = math.ceil(max(crosslinker_length, mean_plus_two_std) + 1) histogram_two_standard_deviations = create_histograms( dataframe_a=df_valid, @@ -833,8 +830,8 @@ def diagrams_of_crosslinking_validation_data( visual_transformation="linear", relevant_column_a="alphafold_distance", relevant_column_b="alphafold_distance", - min_value=hist_2std_min, - max_value=hist_2std_max, + 
min_value=mean_minus_two_std, + max_value=mean_plus_two_std, one_bin_per_int=True, ) add_vertical_line_with_annotation_in_legend( @@ -885,7 +882,7 @@ def diagrams_of_crosslinking_validation_data( ) histogram_two_standard_deviations.update_xaxes( **_get_tick_values_with_lines( - histogram_two_standard_deviations, hist_2std_min, hist_2std_max + histogram_two_standard_deviations, mean_minus_two_std, mean_plus_two_std ) ) figures.append(histogram_two_standard_deviations) From 85283830a81f985f722bc1d8738da9ec25c57a6b Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Wed, 29 Apr 2026 10:44:14 +0200 Subject: [PATCH 191/240] fix: change modify-form to reset ALL input fields after successful calculation --- backend/protzilla/methods/importing.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index cf4af4b60..cea5ea22f 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -53,14 +53,9 @@ class ImportingStep(Step, ABC): def modify_form(self, run: Run): if run.steps.current_step.calculation_status == "complete": - self.form.input_fields[self.index_of_file_input()].value = None - - def index_of_file_input(self): - """ - Returns the index of the FileInput that should be reset by modify_form. This method - must be overridden if the FileInput is not index 0. 
- """ - return 0 + for field in self.form.input_fields: + if isinstance(field, FileInput): + field.value = None class ArbitraryCSVImport(ImportingStep): From ce08a47436cde1f40a0f861a8e10e25e4e246425 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 29 Apr 2026 11:30:44 +0200 Subject: [PATCH 192/240] Add accepted file endings in forms for cl imports --- backend/protzilla/methods/importing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index cea5ea22f..b17ebee6b 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -479,6 +479,7 @@ def create_form(self): name="file_path", label="Crosslinking Data file (.xlsx or .csv)", value=None, + accept=".xlsx,.csv", ), TextField( name="organism_ids", @@ -567,26 +568,31 @@ def create_form(self): name="amino_acid_sequences", label="Amino acid sequences of proteins in the prediction (required)", value=None, + accept=".fasta,.fa,.faa", ), FileInput( name="cif_file", label="CIF file (required)", value=None, + accept=".cif,.mmcif", ), FileInput( name="confidence_file", label="Confidence summary json file (required)", value=None, + accept=".json", ), FileInput( name="full_data_file", label="Full data json file (required)", value=None, + accept=".json", ), FileInput( name="job_request_file", label="Job request json file (required)", value=None, + accept=".json", ), CheckboxField( name="persist_upload", From b4ba1451335aa05200e4599144ed0724adaf9512 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 30 Apr 2026 14:20:51 +0200 Subject: [PATCH 193/240] feat: add broken histogram with partly logarithmic scaling for crosslinking validation --- .../data_analysis/crosslinking_validation.py | 4 + backend/protzilla/data_analysis/plots.py | 3 +- backend/protzilla/data_preprocessing/plots.py | 159 ++++++++++++------ 3 files changed, 114 insertions(+), 52 deletions(-) diff --git 
a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 2665c2835..c7ac545e8 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -796,12 +796,14 @@ def diagrams_of_crosslinking_validation_data( min_value=hist_min, max_value=hist_max, one_bin_per_int=True, + split_x_axis_at= crosslinker_length if accepted_deviation_upper_bound is None else crosslinker_length+accepted_deviation_upper_bound ) add_vertical_line_with_annotation_in_legend( fig=histogram, dash="solid", annotation=f"{crosslinker} length: {crosslinker_length}Å", x_value=crosslinker_length, + column=1 ) mean_of_predicted_lengths = crosslinker_df["alphafold_distance"].mean() @@ -847,6 +849,7 @@ def diagrams_of_crosslinking_validation_data( dash="dash", annotation=f"allowed deviation upper bound", x_value=crosslinker_length + accepted_deviation_upper_bound, + column=1 ) if ( math.floor(mean_minus_two_std) @@ -865,6 +868,7 @@ def diagrams_of_crosslinking_validation_data( dash="dash", annotation=f"allowed deviation lower bound", x_value=crosslinker_length - accepted_deviation_lower_bound, + column=1 ) if ( math.floor(mean_minus_two_std) diff --git a/backend/protzilla/data_analysis/plots.py b/backend/protzilla/data_analysis/plots.py index 8d012b0f0..05209e234 100644 --- a/backend/protzilla/data_analysis/plots.py +++ b/backend/protzilla/data_analysis/plots.py @@ -588,6 +588,7 @@ def add_vertical_line_with_annotation_in_legend( annotation: str, x_value: float, color: str = PLOT_PRIMARY_COLOR, + column: int = None ) -> None: """ Adds a vertical line to a Plotly figure and includes a corresponding entry in the legend @@ -601,7 +602,7 @@ def add_vertical_line_with_annotation_in_legend( :return: None """ # add vertical line - fig.add_vline(x=x_value, line_color=color, line_dash=dash, line_width=2) + fig.add_vline(x=x_value, line_color=color, line_dash=dash, 
line_width=2, col=column) # add annotation of the line to the legend fig.add_trace( go.Scatter( diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index c255c4a3c..a49aa325b 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -177,6 +177,7 @@ def create_histograms( min_value: float = None, max_value: float = None, one_bin_per_int: bool = False, + split_x_axis_at: float = None ) -> Figure: """ A function to create a histogram for visualisation @@ -227,63 +228,119 @@ def create_histograms( if max_value is None: max_value = np.nanmax([values_a.max(), values_b.max()]) - if one_bin_per_int: - min_value = math.floor(min_value) - max_value = math.ceil(max_value) - binsize_a = 1 - binsize_b = 1 - else: - number_of_bins = 100 - if len(values_a) > 0: - binsize_a = ( - values_a.max(skipna=True) - values_a.min(skipna=True) - ) / number_of_bins - else: - binsize_a = 1 # default value of 1 in case that values_a is empty - if len(values_b) > 0: - binsize_b = ( - values_b.max(skipna=True) - values_b.min(skipna=True) - ) / number_of_bins - else: - binsize_b = 1 # default value of 1 in case that values_b is empty - - if overlay and len(values_a) > 0 and len(values_b) > 0: - binsize_a = binsize_b = max(binsize_a, binsize_b) + # Logic for Split Axis (Linear -> Log) + if split_x_axis_at is not None and visual_transformation == "linear": + fig = make_subplots( + rows=1, cols=2, + shared_yaxes=True, + horizontal_spacing=0.02, + column_widths=[0.5, 0.5] + ) - trace0 = go.Histogram( - x=values_a, - marker_color=PLOT_PRIMARY_COLOR, - name=name_a, - xbins=dict(start=min_value, end=max_value, size=binsize_a), - ) - trace1 = go.Histogram( - x=values_b, - marker_color=PLOT_SECONDARY_COLOR, - name=name_b, - xbins=dict(start=min_value, end=max_value, size=binsize_b), - ) - if not overlay: - fig = make_subplots(rows=1, cols=2) - fig.add_trace(trace0, 1, 1) - fig.add_trace(trace1, 
1, 2) - if visual_transformation == "log10": - fig.update_layout( - xaxis=generate_tics(0, max_value, True), - xaxis2=generate_tics(0, max_value, True), - ) - else: - fig = go.Figure() - fig.add_trace(trace0) - fig.add_trace(trace1) + def add_split_traces(values, name, color, show_legend): + # Split data + v_lin = values[values <= split_x_axis_at] + v_log = values[values > split_x_axis_at] + + # Trace for linear part + fig.add_trace(go.Histogram( + x=v_lin, name=name, marker_color=color, + xbins=dict(start=min_value, end=split_x_axis_at), + legendgroup=name, showlegend=show_legend + ), row=1, col=1) + + # Trace for log part + fig.add_trace(go.Histogram( + x=v_log, name=name, marker_color=color, + xbins=dict(start=split_x_axis_at, end=max_value), + legendgroup=name, showlegend=False + ), row=1, col=2) + + + add_split_traces(values_a, name_a, PLOT_PRIMARY_COLOR, True) + add_split_traces(values_b, name_b, PLOT_SECONDARY_COLOR, True) + + fig.update_xaxes(title_text=f"{x_title} (Linear)", range=[min_value, split_x_axis_at], row=1, col=1, + showline=True, + mirror=False, + zeroline=False + ) + fig.update_xaxes( + title_text=f"{x_title} (Log)", + type="log", + # Use log10 of the values for the range array! 
+ range=[np.log10(split_x_axis_at), np.log10(max_value)], + row=1, col=2, + showline=True, + mirror=False, + zeroline=False + ) fig.update_layout(barmode="overlay") fig.update_traces(opacity=0.75) - if visual_transformation == "log10": - fig.update_layout(xaxis=generate_tics(0, max_value, True)) + + # Add the // break marks + fig.add_shape(type="line", xref="paper", yref="paper", x0=0.5, y0=-0.02, x1=0.52, y1=0.02, + line=dict(width=2)) + fig.add_shape(type="line", xref="paper", yref="paper", x0=0.48, y0=-0.02, x1=0.5, y1=0.02, + line=dict(width=2)) + else: + if one_bin_per_int: + min_value = math.floor(min_value) + max_value = math.ceil(max_value) + binsize_a = 1 + binsize_b = 1 + else: + number_of_bins = 100 + if len(values_a) > 0: + binsize_a = ( + values_a.max(skipna=True) - values_a.min(skipna=True) + ) / number_of_bins + else: + binsize_a = 1 # default value of 1 in case that values_a is empty + if len(values_b) > 0: + binsize_b = ( + values_b.max(skipna=True) - values_b.min(skipna=True) + ) / number_of_bins + else: + binsize_b = 1 # default value of 1 in case that values_b is empty + + if overlay and len(values_a) > 0 and len(values_b) > 0: + binsize_a = binsize_b = max(binsize_a, binsize_b) + + trace0 = go.Histogram( + x=values_a, + marker_color=PLOT_PRIMARY_COLOR, + name=name_a, + xbins=dict(start=min_value, end=max_value, size=binsize_a), + ) + trace1 = go.Histogram( + x=values_b, + marker_color=PLOT_SECONDARY_COLOR, + name=name_b, + xbins=dict(start=min_value, end=max_value, size=binsize_b), + ) + if not overlay: + fig = make_subplots(rows=1, cols=2) + fig.add_trace(trace0, 1, 1) + fig.add_trace(trace1, 1, 2) + if visual_transformation == "log10": + fig.update_layout( + xaxis=generate_tics(0, max_value, True), + xaxis2=generate_tics(0, max_value, True), + ) + else: + fig = go.Figure() + fig.add_trace(trace0) + fig.add_trace(trace1) + fig.update_layout(barmode="overlay") + fig.update_traces(opacity=0.75) + if visual_transformation == "log10": + 
fig.update_layout(xaxis=generate_tics(0, max_value, True)) + fig.update_xaxes(title=x_title) + fig.update_yaxes(title=y_title, rangemode="tozero") wrapped_title = "
".join(textwrap.wrap(heading, width=50)) fig.update_layout(title={"text": f"{wrapped_title}"}) - fig.update_xaxes(title=x_title) - fig.update_yaxes(title=y_title, rangemode="tozero") fig.update_layout(margin_pad=20) From 8cd789b9cc91ca8d0b07ed51686891cf6c9591eb Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 30 Apr 2026 14:58:52 +0200 Subject: [PATCH 194/240] remove unnecessary repetitive add_field function --- backend/protzilla/form.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/backend/protzilla/form.py b/backend/protzilla/form.py index 80b05ecb5..9e418979f 100644 --- a/backend/protzilla/form.py +++ b/backend/protzilla/form.py @@ -242,10 +242,6 @@ def add_field(self, new_field: InputField) -> None: if new_field.name in self._value_buffer: new_field.value = self._value_buffer.pop(new_field.name) - def add_field(self, new_field: InputField): - self.input_fields.append(new_field) - self._field_map[new_field.name] = new_field - def __getitem__(self, fieldname: str) -> InputField: "to do form[fieldname] to get the field object" From b08f95703018d36e900c13b02fb2f350c253e545 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 30 Apr 2026 15:01:47 +0200 Subject: [PATCH 195/240] Add default value handling for cl and a defaults_operator for future default handling for other purposes --- backend/main/urls.py | 15 ++ backend/main/views_settings.py | 53 ++++ backend/protzilla/disk_operator.py | 45 ++++ backend/protzilla/methods/data_analysis.py | 34 ++- .../other-settings/cl-default-settings.tsx | 254 ++++++++++++++++++ .../app/settings/other-settings/index.ts | 1 + .../src/components/app/settings/settings.tsx | 11 + 7 files changed, 404 insertions(+), 9 deletions(-) create mode 100644 frontend/src/components/app/settings/other-settings/cl-default-settings.tsx diff --git a/backend/main/urls.py b/backend/main/urls.py index 51a184285..79ffe4303 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -115,6 +115,21 @@ 
views_settings.delete_multimer_structure, name="delete_multimer_structure", ), + path( + "api/get_cl_defaults", + views_settings.get_cl_defaults, + name="get_cl_defaults", + ), + path( + "api/update_cl_default", + views_settings.update_cl_default, + name="update_cl_default", + ), + path( + "api/delete_cl_default", + views_settings.delete_cl_default, + name="delete_cl_default", + ), path( "api/load_ptm_settings", views_settings.load_ptm_settings, diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 63ede4254..920d471af 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -32,6 +32,7 @@ uniprot_databases, ) from backend.protzilla.disk_operator import YamlOperator +from backend.protzilla.disk_operator import DefaultsOperator from backend.main.views_helper import load_yaml_from_file from backend.protzilla.constants.paths import ( CUSTOM_PLOT_SETTINGS_FILE_STEM, @@ -578,6 +579,58 @@ def delete_multimer_structure(request): ) +def get_cl_defaults(request): + default_operator = DefaultsOperator() + defaults = default_operator.get_all_defaults() + return JsonResponse(defaults, safe=False) + + +def update_cl_default(request): + if request.method == "POST": + data = json.loads(request.body) + cl_name = data.get("cl_name") + cl_length = data.get("cl_length") if data.get("cl_length") != "" else 0 + cl_upper_deviation = ( + data.get("cl_upper_deviation") if data.get("cl_length") != "" else 0 + ) + cl_lower_deviation = ( + data.get("cl_lower_deviation") if data.get("cl_length") != "" else 0 + ) + + cl_default_dict = { + "cl_length": cl_length, + "cl_upper_deviation": cl_upper_deviation, + "cl_lower_deviation": cl_lower_deviation, + } + try: + defaults_operator = DefaultsOperator() + defaults_operator.write_default(name=cl_name, value=cl_default_dict) + return JsonResponse( + { + "success": True, + "message": (f"Default values updated successfully. 
"), + }, + status=200, + ) + except Exception: + return JsonResponse( + {"success": False, "message": "Default values could not be updated."}, + status=405, + ) + else: + return JsonResponse( + {"success": False, "message": "Invalid request method"}, status=405 + ) + + +def delete_cl_default(request): + if request.method == "POST": + data = json.loads(request.body) + cl_name = data.get("cl_name") + defaults_operator = DefaultsOperator() + defaults_operator.delete_default(cl_name) + + # <--- Databases ---> diff --git a/backend/protzilla/disk_operator.py b/backend/protzilla/disk_operator.py index d2957bc83..41b9a2b4c 100644 --- a/backend/protzilla/disk_operator.py +++ b/backend/protzilla/disk_operator.py @@ -160,6 +160,50 @@ def write(file_path: Path, base64_string: bytes): file.write(data) +class DefaultsOperator: + def __init__(self): + self.yaml_operator = YamlOperator() + + def read_default(self, name: str) -> any: + with ErrorHandler(): + if not self.defaults_file.exists(): + return None + defaults = self.yaml_operator.read(self.defaults_file) or {} + return defaults.get(name) + + def write_default(self, name: str, value: any) -> None: + with ErrorHandler(): + if not self.defaults_file.parent.exists(): + self.defaults_file.parent.mkdir(parents=True, exist_ok=True) + defaults = {} + if self.defaults_file.exists(): + defaults = self.yaml_operator.read(self.defaults_file) or {} + defaults[name] = value + self.yaml_operator.write(self.defaults_file, defaults) + + def delete_default(self, name: str) -> None: + with ErrorHandler(): + if not self.defaults_file.exists(): + return + defaults = self.yaml_operator.read(self.defaults_file) or {} + if name in defaults: + del defaults[name] + self.yaml_operator.write(self.defaults_file, defaults) + + def get_all_defaults(self): + """Reads all default values from disk. 
Returns an empty dict if the file doesn't exist.""" + with ErrorHandler(): + if not self.defaults_file.exists(): + return {} + + defaults = self.yaml_operator.read(self.defaults_file) + return defaults or {} + + @property + def defaults_file(self) -> Path: + return paths.USER_DATA_PATH / "defaults.yaml" + + RUN_FILE = "run.yaml" @@ -190,6 +234,7 @@ def __init__(self, run_name: str, workflow_name: str): self.dataframe_operator = DataFrameOperator() self.artifact_operator = ArtifactOperator() self.base64_operator = Base64Operator() + self.defaults = DefaultsOperator() def read_run(self, file: Path | None = None) -> StepManager: with ErrorHandler(): diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 3a1b9d361..762cd29d6 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -95,13 +95,6 @@ monomer_validation, multimer_validation, ) -from backend.protzilla.run import Run -from backend.protzilla.methods.importing import ( - ImportMonomerStructurePredictionFromDisk, - AlphaFoldPredictionLoad, - ImportMultimerStructurePredictionFromDisk, - UploadMultimerPredictions, -) class TTestType(Enum): @@ -2384,20 +2377,32 @@ def create_crosslink_input_fields(self, form: Form, run: Run): for crosslinker in crosslinkers: field_name = f"{crosslinker}_length" if field_name not in form: + cl_defaults = run.disk_operator.defaults.read_default(crosslinker) + if cl_defaults: + length_default = cl_defaults["cl_length"] + upper_deviation_default = cl_defaults["cl_upper_deviation"] + lower_deviation_default = cl_defaults["cl_lower_deviation"] + else: + length_default = 0 + upper_deviation_default = 0 + lower_deviation_default = 0 crosslinker_length_field = FloatField( name=field_name, label=f"Length of {crosslinker} in Ångström", min=0, + value=length_default, ) upper_bound_length_deviation_field = FloatField( name=f"{crosslinker}_upper_accepted_deviation", label=f"Upper bound on the 
accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", min=0, + value=upper_deviation_default, ) lower_bound_length_deviation_field = FloatField( name=f"{crosslinker}_lower_accepted_deviation", label=f"Lower bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", min=0, + value=lower_deviation_default, ) form.add_field(crosslinker_length_field) form.add_field(upper_bound_length_deviation_field) @@ -2435,7 +2440,14 @@ class CrosslinkingValidationWithAngstromDeviation( plot_method = staticmethod(monomer_diagrams) def create_form(self): - return Form(label="Ångström Deviation - Monomer", input_fields=[]) + return Form( + label="Ångström Deviation - Monomer", + input_fields=[ + InfoField( + label="Set default cross-link lengths and their upper/lower deviations in settings under 'Cross-Links Defaults'.", + ) + ], + ) class CrosslinkingValidationWithAngstromDeviationForMultimer( @@ -2450,5 +2462,9 @@ class CrosslinkingValidationWithAngstromDeviationForMultimer( def create_form(self): return Form( label="Ångström Deviation - Multimer", - input_fields=[], + input_fields=[ + InfoField( + label="Set default cross-link lengths and their upper/lower deviations in settings under 'Cross-Links Defaults'.", + ) + ], ) diff --git a/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx b/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx new file mode 100644 index 000000000..382e790d7 --- /dev/null +++ b/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx @@ -0,0 +1,254 @@ +import { useNotification } from "@protzilla/app"; +import { DeleteModal, Form, SecondaryButton, SectionTitle, Text } from "@protzilla/core"; +import { useToggleableState } from "@protzilla/hooks"; +import { spacing } from "@protzilla/theme"; +import { callApi, callApiWithParameters } from "@protzilla/utils"; +import { useEffect, useState } from "react"; +import { styled } from 
"styled-components"; + +const CrosslinkDefaultTitle = styled(SectionTitle)` + padding-top: ${spacing("large")}; + padding-bottom: ${spacing("small")}; +`; + +const CrosslinkDefaultList = styled.div` + display: flex; + flex-direction: column; + gap: ${spacing("verySmall")}; +`; + +interface CrosslinkDefaultProps { + cl_name: string; + cl_length: number; + cl_upper_deviation: number; + cl_lower_deviation: number; + handleDelete?: () => void; +} + +type ApiCrosslinkDefaults = Record; + +const CrosslinkDefaultContainer = styled.div` + display: flex; + flex-direction: row; + align-items: center; + justify-content: space-between; + padding-left: ${spacing("listIndentation")}; + padding-top: ${spacing("verySmall")}; + padding-bottom: ${spacing("verySmall")}; +`; + +const CrosslinkDefaultInfo = styled.div` + display: flex; + justify-content: space-between; + align-content: center; + flex-direction: column; + width: 90%; +`; + +const CrosslinkDefaultEntry = ({ + cl_name, + cl_length, + cl_upper_deviation, + cl_lower_deviation, + handleDelete, +}: CrosslinkDefaultProps) => { + return ( + + + + + + + + ); +}; + +export const CrosslinkDefaultUpload = () => { + const notify = useNotification(); + const [crosslinkDefaultList, setCrosslinkDefaultList] = useState([]); + const [isDeleteModalOpen, openDeleteModal, closeDeleteModal] = useToggleableState(false); + const [selectedCrosslinkDefault, setSelectedCrosslinkDefault] = useState(""); + + const fetchCrosslinkDefaults = async () => { + const crosslinkDefaults = (await callApi("get_cl_defaults")) as ApiCrosslinkDefaults | null; + + if (crosslinkDefaults) { + const transformedList: CrosslinkDefaultProps[] = Object.entries(crosslinkDefaults).map( + ([name, properties]) => ({ + cl_name: name, + cl_length: properties.cl_length, + cl_upper_deviation: properties.cl_upper_deviation, + cl_lower_deviation: properties.cl_lower_deviation, + }) + ); + setCrosslinkDefaultList(transformedList); + } + }; + + useEffect(() => { + void 
fetchCrosslinkDefaults(); + }, []); + + const handleAddCrosslinkDefault = async ( + cl_name: string, + cl_length: number, + cl_upper_deviation: number, + cl_lower_deviation: number, + ) => { + const response = await callApiWithParameters("update_cl_default", { + cl_name: cl_name, + cl_length: cl_length, + cl_upper_deviation: cl_upper_deviation, + cl_lower_deviation: cl_lower_deviation, + }); + if (response?.success) { + notify({ + title: "Crosslink default update", + message: response.message as string, + type: "success", + isClosingAutomatically: true, + }); + } else { + notify({ + title: "Crosslink default update failed", + message: response.message ?? "Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + void fetchCrosslinkDefaults(); + }; + + const onDeleteCrosslinkDefault = (cl_name: string) => { + openDeleteModal(); + setSelectedCrosslinkDefault(cl_name); + }; + + const handleDeleteCrosslinkDefault = async (cl_name: string) => { + const response = await callApiWithParameters("delete_cl_default", { + cl_name: cl_name, + }); + if (response?.success) { + notify({ + title: "Cross Link default deleted", + message: response.message as string, + type: "success", + isClosingAutomatically: true, + }); + } else { + notify({ + title: "Cross Link default deletion failed", + message: response?.message ?? "Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + void fetchCrosslinkDefaults(); + closeDeleteModal(); + }; + + return ( +
+ + + + { + void handleAddCrosslinkDefault( + data.cl_name as string, + data.cl_length as number, + data.cl_upper_deviation as number, + data.cl_lower_deviation as number, + ); + }} + /> + + {crosslinkDefaultList.length === 0 ? ( + + ) : ( + + {crosslinkDefaultList.map((ps) => ( + { + onDeleteCrosslinkDefault(ps.cl_name); + }} + /> + ))} + + )} + void handleDeleteCrosslinkDefault(selectedCrosslinkDefault)} + title={ + `Defaults for ` + + `"${selectedCrosslinkDefault}" will permanently be deleted. Would you like to proceed?` + } + /> +
+ ); +}; diff --git a/frontend/src/components/app/settings/other-settings/index.ts b/frontend/src/components/app/settings/other-settings/index.ts index 3d7daed5b..c55241e68 100644 --- a/frontend/src/components/app/settings/other-settings/index.ts +++ b/frontend/src/components/app/settings/other-settings/index.ts @@ -4,3 +4,4 @@ export * from "./github"; export * from "./ptm-vis-settings"; export * from "./monomer-structure-upload"; export * from "./multimer-structure-upload"; +export * from "./cl-default-settings"; diff --git a/frontend/src/components/app/settings/settings.tsx b/frontend/src/components/app/settings/settings.tsx index 329933309..ca79b2bc3 100644 --- a/frontend/src/components/app/settings/settings.tsx +++ b/frontend/src/components/app/settings/settings.tsx @@ -4,6 +4,7 @@ import { useState } from "react"; import { styled } from "styled-components"; import { + CrosslinkDefaultUpload, DatabaseSettings, GitHub, MonomerStructureUpload, @@ -134,6 +135,15 @@ export const Settings: React.FC = ({ handleSwitchSection("multimer-structure-upload"); }} /> + { + handleSwitchSection("crosslink-defaults"); + }} + /> = ({ {selectedSetting === "ptm-vis" && } {selectedSetting === "monomer-structure-upload" && } {selectedSetting === "multimer-structure-upload" && } + {selectedSetting === "crosslink-defaults" && } {selectedSetting === "github" && } Date: Thu, 30 Apr 2026 16:24:20 +0200 Subject: [PATCH 196/240] fix bug in setting of cl defaults --- backend/main/views_settings.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 920d471af..d9559eabf 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -591,10 +591,14 @@ def update_cl_default(request): cl_name = data.get("cl_name") cl_length = data.get("cl_length") if data.get("cl_length") != "" else 0 cl_upper_deviation = ( - data.get("cl_upper_deviation") if 
data.get("cl_length") != "" else 0 + data.get("cl_upper_deviation") + if data.get("cl_upper_deviation") != "" + else 0 ) cl_lower_deviation = ( - data.get("cl_lower_deviation") if data.get("cl_length") != "" else 0 + data.get("cl_lower_deviation") + if data.get("cl_lower_deviation") != "" + else 0 ) cl_default_dict = { @@ -625,10 +629,26 @@ def update_cl_default(request): def delete_cl_default(request): if request.method == "POST": - data = json.loads(request.body) - cl_name = data.get("cl_name") - defaults_operator = DefaultsOperator() - defaults_operator.delete_default(cl_name) + try: + data = json.loads(request.body) + cl_name = data.get("cl_name") + defaults_operator = DefaultsOperator() + defaults_operator.delete_default(cl_name) + return JsonResponse( + { + "success": True, + "message": "Default values deleted successfully.", + }, + status=200, + ) + except Exception: + return JsonResponse( + {"success": False, "message": "Error occured while deleting."}, + status=405, + ) + return JsonResponse( + {"success": False, "message": "Invalid request method"}, status=405 + ) # <--- Databases ---> From 570ff5ba2f33dbd87d6cbbaea9c149c9d3fe22b5 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 30 Apr 2026 16:24:49 +0200 Subject: [PATCH 197/240] add CL icon for cl default settings --- frontend/src/components/app/settings/settings.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/components/app/settings/settings.tsx b/frontend/src/components/app/settings/settings.tsx index ca79b2bc3..7792d46ef 100644 --- a/frontend/src/components/app/settings/settings.tsx +++ b/frontend/src/components/app/settings/settings.tsx @@ -138,7 +138,7 @@ export const Settings: React.FC = ({ { handleSwitchSection("crosslink-defaults"); From 141ccb4ee84ae2c5e92995fb086013965e1ddc6b Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 30 Apr 2026 16:25:29 +0200 Subject: [PATCH 198/240] add tests for cl default handling --- backend/tests/main/test_views_settings.py 
| 122 ++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/backend/tests/main/test_views_settings.py b/backend/tests/main/test_views_settings.py index e69de29bb..2d8b49d8c 100644 --- a/backend/tests/main/test_views_settings.py +++ b/backend/tests/main/test_views_settings.py @@ -0,0 +1,122 @@ +import json +import pytest +from unittest import mock +from django.http import JsonResponse + +from backend.main.views_settings import ( + get_cl_defaults, + update_cl_default, + delete_cl_default, +) + +import os +import django + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend.main.settings") +if not django.apps.apps.ready: + django.setup() + +PATCH_PATH = "backend.main.views_settings.DefaultsOperator" + +def test_get_cl_defaults(monkeypatch): + request = mock.Mock() + request.method = "GET" + + mock_defaults_operator = mock.Mock() + mock_defaults_operator.get_all_defaults.return_value = { + "DSSO": {"cl_length": 10.3, "cl_upper_deviation": 1.0, "cl_lower_deviation": 1.0} + } + monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) + + response = get_cl_defaults(request) + + assert response.status_code == 200 + response_data = json.loads(response.content.decode("utf-8")) + assert response_data == { + "DSSO": {"cl_length": 10.3, "cl_upper_deviation": 1.0, "cl_lower_deviation": 1.0} + } + + +def test_update_cl_default_success(monkeypatch): + payload = { + "cl_name": "DSSO", + "cl_length": 10.3, + "cl_upper_deviation": 1.0, + "cl_lower_deviation": 1.2 + } + + request = mock.Mock() + request.method = "POST" + request.body = json.dumps(payload).encode("utf-8") + + mock_defaults_operator = mock.Mock() + monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) + + response = update_cl_default(request) + + mock_defaults_operator.write_default.assert_called_once_with( + name="DSSO", + value={ + "cl_length": 10.3, + "cl_upper_deviation": 1.0, + "cl_lower_deviation": 1.2, + } + ) + assert response.status_code == 200 + response_data = 
json.loads(response.content.decode("utf-8")) + assert response_data["success"] is True + + +def test_update_cl_default_exception(monkeypatch): + payload = {"cl_name": "DSSO", "cl_length": 10.3} + + request = mock.Mock() + request.method = "POST" + request.body = json.dumps(payload).encode("utf-8") + + mock_defaults_operator = mock.Mock() + mock_defaults_operator.write_default.side_effect = Exception("Disk write failed") + monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) + + response = update_cl_default(request) + + assert response.status_code == 405 + response_data = json.loads(response.content.decode("utf-8")) + assert response_data["success"] is False + + +def test_delete_cl_default_success(monkeypatch): + payload = {"cl_name": "DSSO"} + + request = mock.Mock() + request.method = "POST" + request.body = json.dumps(payload).encode("utf-8") + + mock_defaults_operator = mock.Mock() + monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) + + response = delete_cl_default(request) + + mock_defaults_operator.delete_default.assert_called_once_with("DSSO") + + assert response.status_code == 200 + response_data = json.loads(response.content.decode("utf-8")) + assert response_data["success"] is True + + +def test_delete_cl_default_exception(monkeypatch): + payload = {"cl_name": "DSSO"} + + request = mock.Mock() + request.method = "POST" + request.body = json.dumps(payload).encode("utf-8") + + mock_defaults_operator = mock.Mock() + mock_defaults_operator.delete_default.side_effect = Exception("Disk delete failed") + monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) + + response = delete_cl_default(request) + + assert response.status_code == 405 + response_data = json.loads(response.content.decode("utf-8")) + assert response_data["success"] is False \ No newline at end of file From 3e30b02c85caa3040aae2b90accd92f76531fef1 Mon Sep 17 00:00:00 2001 From: jorisfu Date: Thu, 30 Apr 2026 18:12:11 +0200 Subject: [PATCH 199/240] (AI) rework 
histogram for proper log scale + manual creation using bar plot --- .../data_analysis/crosslinking_validation.py | 289 +++++++++++++++++- 1 file changed, 282 insertions(+), 7 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index c7ac545e8..e776be0f6 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -9,6 +9,7 @@ import logging from pandas.io.stata import stata_epoch +import plotly.graph_objects as go from plotly.graph_objects import Figure from backend.protzilla.data_preprocessing.plots import ( @@ -22,6 +23,17 @@ from backend.protzilla.steps import OutputItem, OutputType from backend.protzilla.data_preprocessing.plots_helper import millify +import textwrap + +from plotly.subplots import make_subplots + +from backend.protzilla.data_preprocessing.plots_helper import generate_tics +from backend.protzilla.utilities.utilities import default_intensity_column +from backend.protzilla.constants.colors import ( + PLOT_COLOR_SEQUENCE, + PLOT_PRIMARY_COLOR, + PLOT_SECONDARY_COLOR, +) def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: """ @@ -781,7 +793,7 @@ def diagrams_of_crosslinking_validation_data( ) + 1 ) - histogram = create_histograms( + histogram = create_cl_validation_histogram( dataframe_a=df_valid, dataframe_b=df_invalid, name_a=f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", @@ -790,11 +802,8 @@ def diagrams_of_crosslinking_validation_data( x_title="Distance (Å)", y_title="Count", overlay=True, - visual_transformation="linear", relevant_column_a="alphafold_distance", relevant_column_b="alphafold_distance", - min_value=hist_min, - max_value=hist_max, one_bin_per_int=True, split_x_axis_at= crosslinker_length if accepted_deviation_upper_bound is None else crosslinker_length+accepted_deviation_upper_bound ) @@ -881,9 +890,9 @@ def 
diagrams_of_crosslinking_validation_data( annotation=f"allowed deviation lower bound", x_value=crosslinker_length - accepted_deviation_lower_bound, ) - histogram.update_xaxes( - **_get_tick_values_with_lines(histogram, hist_min, hist_max) - ) + # histogram.update_xaxes( + # **_get_tick_values_with_lines(histogram, hist_min, hist_max) + # ) histogram_two_standard_deviations.update_xaxes( **_get_tick_values_with_lines( histogram_two_standard_deviations, mean_minus_two_std, mean_plus_two_std @@ -1009,3 +1018,269 @@ def multimer_diagrams( structures_to_validate=structures_to_validate, crosslinker_information=crosslinker_information, ) + + +# Warning: 100% AI generated +def create_cl_validation_histogram( + dataframe_a: pd.DataFrame, + dataframe_b: pd.DataFrame, + name_a: str = "", + name_b: str = "", + heading: str = "", + y_title: str = "", + x_title: str = "", + overlay: bool = False, + relevant_column_a: str = None, + relevant_column_b: str = None, + one_bin_per_int: bool = False, + split_x_axis_at: float = None +): + """ + A function to create a histogram for visualisation + of distributions. Assumes that you are comparing two dataframes + (for example before and after filtering/normalisation) and creates + a visualisation for each one. + """ + + # It is good practice to drop NaNs before calculating bins/histograms + values_a = dataframe_a[relevant_column_a].dropna() + values_b = dataframe_b[relevant_column_b].dropna() + + min_value = np.nanmin([values_a.min(), values_b.min()]) + max_value = np.nanmax([values_a.max(), values_b.max()]) + + fig = make_subplots( + rows=1, cols=2, + shared_yaxes=True, + horizontal_spacing=0.1, + column_widths=[0.5, 0.5] + ) + + # --- Pre-calculate shared bins for BOTH datasets --- + # 1. Linear Bins + lin_start = math.floor(min_value) + lin_end = math.ceil(split_x_axis_at) + if lin_end <= lin_start: + lin_end = lin_start + 1 + lin_bins = np.arange(lin_start, lin_end + 1, 1) + + # 2. 
Log Bins + # We must ensure max_value > split_x_axis_at to avoid math domain errors + safe_max = max(max_value, split_x_axis_at * 1.01) + log_start = np.log10(split_x_axis_at) + log_end = np.log10(safe_max) + log_bins_transformed = np.arange(log_start, log_end + 0.1, 0.1) + + # Pre-compute the actual linear numbers of the log bins for the hover template + log_bins_linear = 10 ** log_bins_transformed + + def add_split_traces(values, name, color, show_legend): + # Split data + v_lin = values[values <= split_x_axis_at] + v_log_raw = values[values > split_x_axis_at] + v_log_transformed = np.log10(v_log_raw) + + # --- Calculate Histogram for Linear Part --- + counts_lin, _ = np.histogram(v_lin, bins=lin_bins) + # Pair up the left and right edges for the hover box + customdata_lin = np.stack((lin_bins[:-1], lin_bins[1:]), axis=-1) + + fig.add_trace(go.Bar( + x=lin_bins[:-1], + y=counts_lin, + width=1, # Match the bin size in np.arange + offset=0, # Force bars to start exactly at the bin edge + name=name, + marker_color=color, + legendgroup=name, + showlegend=show_legend, + customdata=customdata_lin, + hovertemplate="%{data.name}
Range: %{customdata[0]:g} to %{customdata[1]:g}
Count: %{y}" + ), row=1, col=1) + + # --- Calculate Histogram for Log Part --- + counts_log, _ = np.histogram(v_log_transformed, bins=log_bins_transformed) + customdata_log = np.stack((log_bins_linear[:-1], log_bins_linear[1:]), axis=-1) + + fig.add_trace(go.Bar( + x=log_bins_transformed[:-1], + y=counts_log, + width=0.1, # Match the bin size in np.arange + offset=0, + name=name, + marker_color=color, + legendgroup=name, + showlegend=False, # Legend handled by linear part + customdata=customdata_log, + # Format numbers cleanly with commas using `,.0f` or `g` + hovertemplate="%{data.name}
Range: %{customdata[0]:,.0f} to %{customdata[1]:,.0f}
Count: %{y}" + ), row=1, col=2) + + # Assuming PLOT_PRIMARY_COLOR and PLOT_SECONDARY_COLOR are defined globally in your script + # We will use strings here as placeholders just in case + primary_color = getattr(globals(), 'PLOT_PRIMARY_COLOR', '#636EFA') + secondary_color = getattr(globals(), 'PLOT_SECONDARY_COLOR', '#EF553B') + + add_split_traces(values_a, name_a, primary_color, True) + add_split_traces(values_b, name_b, secondary_color, True) + + # Update Axes Formatting + fig.update_xaxes( + title_text=f"{x_title} (Linear)", + range=[math.floor(min_value), split_x_axis_at], + row=1, col=1, + showline=True, + mirror=False, + zeroline=False + ) + + max_log = math.ceil(np.log10(max_value)) + start_log = math.floor(np.log10(split_x_axis_at)) + + split_str = f"{split_x_axis_at:g}" + + # Always include the split origin as the first tick + tick_vals = [np.log10(split_x_axis_at)] + tick_text = [split_str] + + # Calculate how many exponents we need to cover the maximum delta + max_delta = max_value - split_x_axis_at + if max_delta > 0: + max_i = int(math.ceil(np.log10(max_delta))) + for i in range(1, max_i + 1): + val = split_x_axis_at + 10**i + # Add the exact log position for the tick, and the formatted text + tick_vals.append(np.log10(val)) + # tick_text.append(f"{split_str}+1e{i}") + tick_text.append(f"{(split_x_axis_at + 10**i):.2f}") + + fig.update_xaxes( + title_text=f"{x_title} (Log)", + tickvals=tick_vals, + ticktext=tick_text, + row=1, col=2, + showline=True, + mirror=False, + zeroline=False + ) # Note: barmode="overlay" works perfectly with go.Bar as well + fig.update_layout(barmode="overlay", yaxis_title=y_title) + fig.update_traces(opacity=0.75) + + wrapped_title = "
".join(textwrap.wrap(heading, width=50)) + fig.update_layout(title={"text": f"{wrapped_title}"}) + + fig.update_layout(margin_pad=10) + + # Disable toggling of the visibility of the traces by clicking on the legend + fig.update_layout(legend=dict(itemclick=False, itemdoubleclick=False, xanchor="left", x=1.05)) + + return fig + +def create_cl_validation_histogram_notslop( + dataframe_a: pd.DataFrame, + dataframe_b: pd.DataFrame, + name_a: str = "", + name_b: str = "", + heading: str = "", + y_title: str = "", + x_title: str = "", + overlay: bool = False, + relevant_column_a: str = None, + relevant_column_b: str = None, + one_bin_per_int: bool = False, + split_x_axis_at: float = None +) -> Figure: + """ + A function to create a histogram for visualisation + of distributions. Assumes that you are comparing two dataframes + (for example before and after filtering/normalisation) and creates + a visualisation for each one. + + :param dataframe_a: First dataframe in protzilla long format for\ + first histogram + :param dataframe_b: Second dataframe in protzilla long format\ + for second histogram + + :param name_a: Name of first histogram + :param name_b: Name of second histogram + :param heading: Header or title for the graph (optional) + :param y_title: Optional y axis title for graphs. + :param x_title: Optional x axis title for graphs. + :param overlay: Specifies whether to draw one Histogram with overlay or two separate histograms + :param relevant_column_a: Which column of dataframe_a should be used for the histogram. If None, the default_intensity_column will be used. + :param relevant_column_b: Which column of dataframe_b should be used for the histogram. If None, the default_intensity_column will be used. + :param one_bin_per_int: If set to True, min_value will be rounded down to the next int and max_value will be rounded up to the next int and there will\ + be max_value-min_value many bins. 
+ + :return: returns a histogram of the data + """ + values_a = dataframe_a[relevant_column_a] + values_b = dataframe_b[relevant_column_b] + + min_value = np.nanmin([values_a.min(), values_b.min()]) + max_value = np.nanmax([values_a.max(), values_b.max()]) + + # Logic for Split Axis (Linear -> Log) + fig = make_subplots( + rows=1, cols=2, + shared_yaxes=True, + horizontal_spacing=0.1, + column_widths=[0.5, 0.5] + ) + + def add_split_traces(values, name, color, show_legend): + # Split data + v_lin = values[values <= split_x_axis_at] + v_log_raw = values[values > split_x_axis_at] + v_log_transformed = np.log10(v_log_raw) + + # Trace for linear part + fig.add_trace(go.Histogram( + x=v_lin, name=name, marker_color=color, + xbins=dict(start=math.floor(min_value), end=math.ceil(split_x_axis_at), size=1), + legendgroup=name, showlegend=show_legend + ), row=1, col=1) + + # Trace for log part + fig.add_trace(go.Histogram( + x=v_log_transformed, name=name, marker_color=color, + xbins=dict(start=np.log10(split_x_axis_at), end=np.log10(v_log_raw.max()), size=0.1), + legendgroup=name, showlegend=show_legend + ), row=1, col=2) + + add_split_traces(values_a, name_a, PLOT_PRIMARY_COLOR, True) + add_split_traces(values_b, name_b, PLOT_SECONDARY_COLOR, True) + + fig.update_xaxes(title_text=f"{x_title} (Linear)", range=[math.floor(min_value), split_x_axis_at], row=1, col=1, + showline=True, + mirror=False, + zeroline=False + ) + + max_log = math.ceil(np.log10(max_value)) + start_log = math.floor(np.log10(split_x_axis_at)) + + tick_vals = list(range(start_log, max_log + 1)) + # tick_vals = [np.log10(split_x_axis_at) + i for i in tick_vals] + tick_text = [f"+ 10^{i}" for i in tick_vals] + + fig.update_xaxes( + title_text=f"{x_title} (Log)", + tickvals=tick_vals, + ticktext=tick_text, + row=1, col=2, + showline=True, + mirror=False, + zeroline=False + ) + fig.update_layout(barmode="overlay") + fig.update_traces(opacity=0.75) + + wrapped_title = "
".join(textwrap.wrap(heading, width=50)) + fig.update_layout(title={"text": f"{wrapped_title}"}) + + fig.update_layout(margin_pad=10) + + # Disable toggling of the visibility of the traces by clicking on the legend + fig.update_layout(legend=dict(itemclick=False, itemdoubleclick=False, xanchor="left", x=1.05)) + return fig From 41846da3d9af5bfb3bb374222c71cc9b2c789330 Mon Sep 17 00:00:00 2001 From: jorisfu Date: Fri, 1 May 2026 14:23:13 +0200 Subject: [PATCH 200/240] chore: minor code cleanup and colours --- .../data_analysis/crosslinking_validation.py | 213 ++++-------------- 1 file changed, 41 insertions(+), 172 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index e776be0f6..58ea564d1 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -794,18 +794,14 @@ def diagrams_of_crosslinking_validation_data( + 1 ) histogram = create_cl_validation_histogram( - dataframe_a=df_valid, - dataframe_b=df_invalid, - name_a=f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", - name_b=f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", - heading=f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}", - x_title="Distance (Å)", - y_title="Count", - overlay=True, - relevant_column_a="alphafold_distance", - relevant_column_b="alphafold_distance", - one_bin_per_int=True, - split_x_axis_at= crosslinker_length if accepted_deviation_upper_bound is None else crosslinker_length+accepted_deviation_upper_bound + distances_valid = df_valid["alphafold_distance"], + distances_invalid = df_invalid["alphafold_distance"], + title_valid = f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", + title_invalid = f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", + heading = f"Predicted distances for 
{structures_to_validate_str} with crosslinker {crosslinker}", + xaxis_label = "Distance (Å)", + yaxis_label = "Count", + split_x_axis_at = crosslinker_length if accepted_deviation_upper_bound is None else crosslinker_length+accepted_deviation_upper_bound ) add_vertical_line_with_annotation_in_legend( fig=histogram, @@ -1019,35 +1015,27 @@ def multimer_diagrams( crosslinker_information=crosslinker_information, ) - -# Warning: 100% AI generated +# Warning: Mostly AI generated def create_cl_validation_histogram( - dataframe_a: pd.DataFrame, - dataframe_b: pd.DataFrame, - name_a: str = "", - name_b: str = "", + distances_valid: pd.Series, + distances_invalid: pd.Series, + split_x_axis_at: float, + title_valid: str = "Valid Crosslinks", + title_invalid: str = "Invalid Crosslinks", heading: str = "", - y_title: str = "", - x_title: str = "", - overlay: bool = False, - relevant_column_a: str = None, - relevant_column_b: str = None, - one_bin_per_int: bool = False, - split_x_axis_at: float = None + xaxis_label: str = "", + yaxis_label: str = "", ): """ - A function to create a histogram for visualisation - of distributions. Assumes that you are comparing two dataframes - (for example before and after filtering/normalisation) and creates - a visualisation for each one. + TODO: Proper docstring """ # It is good practice to drop NaNs before calculating bins/histograms - values_a = dataframe_a[relevant_column_a].dropna() - values_b = dataframe_b[relevant_column_b].dropna() + distances_valid.dropna(inplace=True) + distances_invalid.dropna(inplace=True) - min_value = np.nanmin([values_a.min(), values_b.min()]) - max_value = np.nanmax([values_a.max(), values_b.max()]) + min_distance: float = np.min([distances_valid.min(), distances_invalid.min()]) + max_distance: float = np.max([distances_valid.max(), distances_invalid.max()]) fig = make_subplots( rows=1, cols=2, @@ -1058,15 +1046,15 @@ def create_cl_validation_histogram( # --- Pre-calculate shared bins for BOTH datasets --- # 1. 
Linear Bins - lin_start = math.floor(min_value) + lin_start = math.floor(min_distance) lin_end = math.ceil(split_x_axis_at) if lin_end <= lin_start: lin_end = lin_start + 1 lin_bins = np.arange(lin_start, lin_end + 1, 1) # 2. Log Bins - # We must ensure max_value > split_x_axis_at to avoid math domain errors - safe_max = max(max_value, split_x_axis_at * 1.01) + # We must ensure max_distance > split_x_axis_at to avoid math domain errors + safe_max = max(max_distance, split_x_axis_at * 1.01) log_start = np.log10(split_x_axis_at) log_end = np.log10(safe_max) log_bins_transformed = np.arange(log_start, log_end + 0.1, 0.1) @@ -1074,7 +1062,7 @@ def create_cl_validation_histogram( # Pre-compute the actual linear numbers of the log bins for the hover template log_bins_linear = 10 ** log_bins_transformed - def add_split_traces(values, name, color, show_legend): + def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool): # Split data v_lin = values[values <= split_x_axis_at] v_log_raw = values[values > split_x_axis_at] @@ -1085,7 +1073,7 @@ def add_split_traces(values, name, color, show_legend): # Pair up the left and right edges for the hover box customdata_lin = np.stack((lin_bins[:-1], lin_bins[1:]), axis=-1) - fig.add_trace(go.Bar( + _ = fig.add_trace(go.Bar( x=lin_bins[:-1], y=counts_lin, width=1, # Match the bin size in np.arange @@ -1102,7 +1090,7 @@ def add_split_traces(values, name, color, show_legend): counts_log, _ = np.histogram(v_log_transformed, bins=log_bins_transformed) customdata_log = np.stack((log_bins_linear[:-1], log_bins_linear[1:]), axis=-1) - fig.add_trace(go.Bar( + _ = fig.add_trace(go.Bar( x=log_bins_transformed[:-1], y=counts_log, width=0.1, # Match the bin size in np.arange @@ -1116,171 +1104,52 @@ def add_split_traces(values, name, color, show_legend): hovertemplate="%{data.name}
Range: %{customdata[0]:,.0f} to %{customdata[1]:,.0f}
Count: %{y}" ), row=1, col=2) - # Assuming PLOT_PRIMARY_COLOR and PLOT_SECONDARY_COLOR are defined globally in your script - # We will use strings here as placeholders just in case - primary_color = getattr(globals(), 'PLOT_PRIMARY_COLOR', '#636EFA') - secondary_color = getattr(globals(), 'PLOT_SECONDARY_COLOR', '#EF553B') - - add_split_traces(values_a, name_a, primary_color, True) - add_split_traces(values_b, name_b, secondary_color, True) + add_split_traces(distances_valid, title_valid, PLOT_PRIMARY_COLOR, True) + add_split_traces(distances_invalid, title_invalid, PLOT_SECONDARY_COLOR, True) # Update Axes Formatting - fig.update_xaxes( - title_text=f"{x_title} (Linear)", - range=[math.floor(min_value), split_x_axis_at], + _ = fig.update_xaxes( + title_text=f"{xaxis_label} (Linear)", + range=[math.floor(min_distance), split_x_axis_at], row=1, col=1, showline=True, mirror=False, zeroline=False ) - max_log = math.ceil(np.log10(max_value)) - start_log = math.floor(np.log10(split_x_axis_at)) - - split_str = f"{split_x_axis_at:g}" - # Always include the split origin as the first tick tick_vals = [np.log10(split_x_axis_at)] - tick_text = [split_str] + tick_text = [str(split_x_axis_at)] # Calculate how many exponents we need to cover the maximum delta - max_delta = max_value - split_x_axis_at + max_delta = max_distance - split_x_axis_at if max_delta > 0: max_i = int(math.ceil(np.log10(max_delta))) for i in range(1, max_i + 1): - val = split_x_axis_at + 10**i + val: float = split_x_axis_at + math.pow(10, i) # Add the exact log position for the tick, and the formatted text tick_vals.append(np.log10(val)) - # tick_text.append(f"{split_str}+1e{i}") tick_text.append(f"{(split_x_axis_at + 10**i):.2f}") - fig.update_xaxes( - title_text=f"{x_title} (Log)", + _ = fig.update_xaxes( + title_text=f"{xaxis_label} (Log)", tickvals=tick_vals, ticktext=tick_text, row=1, col=2, showline=True, mirror=False, zeroline=False - ) # Note: barmode="overlay" works perfectly with go.Bar as well - 
fig.update_layout(barmode="overlay", yaxis_title=y_title) + ) + _ = fig.update_layout(barmode="overlay", yaxis_title=yaxis_label) fig.update_traces(opacity=0.75) wrapped_title = "
".join(textwrap.wrap(heading, width=50)) - fig.update_layout(title={"text": f"{wrapped_title}"}) + _ = fig.update_layout(title={"text": f"{wrapped_title}"}) - fig.update_layout(margin_pad=10) + _ = fig.update_layout(margin_pad=10) # Disable toggling of the visibility of the traces by clicking on the legend fig.update_layout(legend=dict(itemclick=False, itemdoubleclick=False, xanchor="left", x=1.05)) return fig -def create_cl_validation_histogram_notslop( - dataframe_a: pd.DataFrame, - dataframe_b: pd.DataFrame, - name_a: str = "", - name_b: str = "", - heading: str = "", - y_title: str = "", - x_title: str = "", - overlay: bool = False, - relevant_column_a: str = None, - relevant_column_b: str = None, - one_bin_per_int: bool = False, - split_x_axis_at: float = None -) -> Figure: - """ - A function to create a histogram for visualisation - of distributions. Assumes that you are comparing two dataframes - (for example before and after filtering/normalisation) and creates - a visualisation for each one. - - :param dataframe_a: First dataframe in protzilla long format for\ - first histogram - :param dataframe_b: Second dataframe in protzilla long format\ - for second histogram - - :param name_a: Name of first histogram - :param name_b: Name of second histogram - :param heading: Header or title for the graph (optional) - :param y_title: Optional y axis title for graphs. - :param x_title: Optional x axis title for graphs. - :param overlay: Specifies whether to draw one Histogram with overlay or two separate histograms - :param relevant_column_a: Which column of dataframe_a should be used for the histogram. If None, the default_intensity_column will be used. - :param relevant_column_b: Which column of dataframe_b should be used for the histogram. If None, the default_intensity_column will be used. 
- :param one_bin_per_int: If set to True, min_value will be rounded down to the next int and max_value will be rounded up to the next int and there will\ - be max_value-min_value many bins. - - :return: returns a histogram of the data - """ - values_a = dataframe_a[relevant_column_a] - values_b = dataframe_b[relevant_column_b] - - min_value = np.nanmin([values_a.min(), values_b.min()]) - max_value = np.nanmax([values_a.max(), values_b.max()]) - - # Logic for Split Axis (Linear -> Log) - fig = make_subplots( - rows=1, cols=2, - shared_yaxes=True, - horizontal_spacing=0.1, - column_widths=[0.5, 0.5] - ) - - def add_split_traces(values, name, color, show_legend): - # Split data - v_lin = values[values <= split_x_axis_at] - v_log_raw = values[values > split_x_axis_at] - v_log_transformed = np.log10(v_log_raw) - - # Trace for linear part - fig.add_trace(go.Histogram( - x=v_lin, name=name, marker_color=color, - xbins=dict(start=math.floor(min_value), end=math.ceil(split_x_axis_at), size=1), - legendgroup=name, showlegend=show_legend - ), row=1, col=1) - - # Trace for log part - fig.add_trace(go.Histogram( - x=v_log_transformed, name=name, marker_color=color, - xbins=dict(start=np.log10(split_x_axis_at), end=np.log10(v_log_raw.max()), size=0.1), - legendgroup=name, showlegend=show_legend - ), row=1, col=2) - - add_split_traces(values_a, name_a, PLOT_PRIMARY_COLOR, True) - add_split_traces(values_b, name_b, PLOT_SECONDARY_COLOR, True) - - fig.update_xaxes(title_text=f"{x_title} (Linear)", range=[math.floor(min_value), split_x_axis_at], row=1, col=1, - showline=True, - mirror=False, - zeroline=False - ) - - max_log = math.ceil(np.log10(max_value)) - start_log = math.floor(np.log10(split_x_axis_at)) - - tick_vals = list(range(start_log, max_log + 1)) - # tick_vals = [np.log10(split_x_axis_at) + i for i in tick_vals] - tick_text = [f"+ 10^{i}" for i in tick_vals] - - fig.update_xaxes( - title_text=f"{x_title} (Log)", - tickvals=tick_vals, - ticktext=tick_text, - row=1, col=2, 
- showline=True, - mirror=False, - zeroline=False - ) - fig.update_layout(barmode="overlay") - fig.update_traces(opacity=0.75) - - wrapped_title = "
".join(textwrap.wrap(heading, width=50)) - fig.update_layout(title={"text": f"{wrapped_title}"}) - - fig.update_layout(margin_pad=10) - - # Disable toggling of the visibility of the traces by clicking on the legend - fig.update_layout(legend=dict(itemclick=False, itemdoubleclick=False, xanchor="left", x=1.05)) - return fig From ce45353b2645770cea89b860d61d09303d504f80 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 4 May 2026 10:17:24 +0200 Subject: [PATCH 201/240] fix: fix cl plot function so that it also works if all cls are valid or all are invalid --- backend/protzilla/data_analysis/crosslinking_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 58ea564d1..9aaf861ba 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1034,8 +1034,8 @@ def create_cl_validation_histogram( distances_valid.dropna(inplace=True) distances_invalid.dropna(inplace=True) - min_distance: float = np.min([distances_valid.min(), distances_invalid.min()]) - max_distance: float = np.max([distances_valid.max(), distances_invalid.max()]) + min_distance: float = np.nanmin([distances_valid.min(), distances_invalid.min()]) + max_distance: float = np.nanmax([distances_valid.max(), distances_invalid.max()]) fig = make_subplots( rows=1, cols=2, From f4b2deb0682892d1db6c8abd26255c8306adf26b Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 4 May 2026 10:48:57 +0200 Subject: [PATCH 202/240] chore: add/update docstrings of cl validation diagrams --- .../data_analysis/crosslinking_validation.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 9aaf861ba..9abc44f90 100644 --- 
a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -701,16 +701,15 @@ def _get_tick_values_with_lines(fig, min_value, max_value): def diagrams_of_crosslinking_validation_data( validated_df: pd.DataFrame, - structures_to_validate: str, + structures_to_validate: list[str], crosslinker_information: pd.DataFrame, ) -> list[Figure]: """ - Creates for each crosslinker histogram plots summarizing the distribution of valid and invalid - cross-links based on the (AlphaFold-)predicted distances compared to crosslinker lengths and - allowed deviations. + Creates for each crosslinker histogram plots summarizing the distribution (AlphaFold-)predicted distances + matching or not matching the crosslinker lengths and allowed deviations. For each crosslinker, two histograms are generated: - - One covering the full distance range. + - One covering the full distance range (combining a linear and a logarithmic axis). - One restricted to the range of mean ± 2 standard deviations of the predicted distances. Both histograms include vertical reference lines indicating the @@ -719,22 +718,22 @@ def diagrams_of_crosslinking_validation_data( Additionally, a bar plot is created summarizing the total number of cross-links that match or do not match the predicted structure across all analyzed crosslinkers. - :param crosslinking_df: DataFrame containing cross-linking data, including AlphaFold-predicted - distances, crosslinker identifiers, and validation results. - :param structure_metadata_df: Dataframe containing metadata. 
:param crosslinker_information: Contains for each Crosslinker: - length_of_: float - lower_accepted_deviation_for_: float - upper_accepted_deviation_for_: float - :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms) - :param amino_acid_sequences_df: DataFrame containing the protein sequence + :param validated_df: pd.DataFrame consisting of the crosslinker_df enriched with information like + the belonging AlphaFold predicted distance ('alphafold_distance') or whether the AlphaFold + prediction matches the crosslinker length ('valid_crosslink'). + :param structures_to_validate: List of protein names, the names of the proteins whose predictions we + validated. :return: List of Plotly Figure objects. For each crosslinker, the list contains two histogram figures (mean ± 2 standard deviations first, full range second), followed by a final bar plot summarizing valid and invalid cross-links across all crosslinkers. :raises KeyError: If a required crosslinker entry is missing in crosslinker_information. """ if validated_df.empty: - return {} + return [] validated_df = validated_df.dropna(subset=["valid_crosslink"]) figures = [] @@ -799,7 +798,7 @@ def diagrams_of_crosslinking_validation_data( title_valid = f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", title_invalid = f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", heading = f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}", - xaxis_label = "Distance (Å)", + xaxis_label = "Distanc in Å", yaxis_label = "Count", split_x_axis_at = crosslinker_length if accepted_deviation_upper_bound is None else crosslinker_length+accepted_deviation_upper_bound ) @@ -1027,7 +1026,19 @@ def create_cl_validation_histogram( yaxis_label: str = "", ): """ - TODO: Proper docstring + Creates a split-axis histogram for displaying distances predicted by AlphaFold. 
+ The left panel uses a linear axis, the right panel uses a logarithmic one. + + :param distances_valid: Pandas Series containing distances matching the crosslinker length. + :param distances_invalid: Pandas Series containing distances not matching the crosslinker length. + :param split_x_axis_at: Threshold distance at which the x-axis transitions from + linear (left panel) to logarithmic (right panel). + :param title_valid: Legend label for valid crosslinks. Defaults to "Valid Crosslinks". + :param title_invalid: Legend label for invalid crosslinks. Defaults to "Invalid Crosslinks". + :param heading: Title of the overall figure. Can be a long string and will be wrapped. + :param xaxis_label: Label for the x-axis (applied to both panels with scale annotations). + :param yaxis_label: Label for the shared y-axis. + :return: A Plotly Figure object containing the split histogram visualization. """ # It is good practice to drop NaNs before calculating bins/histograms From 9351aee1b22246c560f7a309463f094f9dfc5087 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 4 May 2026 11:15:12 +0200 Subject: [PATCH 203/240] chore: small code cleanup --- .../data_analysis/crosslinking_validation.py | 166 ++++++++---------- backend/protzilla/data_analysis/plots.py | 2 +- backend/protzilla/data_preprocessing/plots.py | 159 ++++++----------- backend/protzilla/methods/importing.py | 2 +- 4 files changed, 129 insertions(+), 200 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 9abc44f90..5d5bbe1e7 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -35,6 +35,7 @@ PLOT_SECONDARY_COLOR, ) + def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: """ Returns the atom of an amino acid residue that is considered reactive for @@ -702,7 +703,7 @@ def _get_tick_values_with_lines(fig, 
min_value, max_value): def diagrams_of_crosslinking_validation_data( validated_df: pd.DataFrame, structures_to_validate: list[str], - crosslinker_information: pd.DataFrame, + crosslinker_information: dict[str, list[float]], ) -> list[Figure]: """ Creates for each crosslinker histogram plots summarizing the distribution (AlphaFold-)predicted distances @@ -767,52 +768,27 @@ def diagrams_of_crosslinking_validation_data( accepted_deviation_upper_bound, accepted_deviation_lower_bound, ) = crosslinker_information[crosslinker] - # make sure that the crosslinker length is always shown - hist_min = math.floor( - min( - crosslinker_length - accepted_deviation_lower_bound, - np.nanmin( - [ - df_valid["alphafold_distance"].min(), - df_invalid["alphafold_distance"].min(), - ] - ), - ) - - 1 - ) - hist_max = math.ceil( - max( - crosslinker_length + accepted_deviation_upper_bound, - np.nanmax( - [ - df_valid["alphafold_distance"].max(), - df_invalid["alphafold_distance"].max(), - ] - ), - ) - + 1 - ) + histogram = create_cl_validation_histogram( - distances_valid = df_valid["alphafold_distance"], - distances_invalid = df_invalid["alphafold_distance"], - title_valid = f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", - title_invalid = f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", - heading = f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}", - xaxis_label = "Distanc in Å", - yaxis_label = "Count", - split_x_axis_at = crosslinker_length if accepted_deviation_upper_bound is None else crosslinker_length+accepted_deviation_upper_bound - ) - add_vertical_line_with_annotation_in_legend( - fig=histogram, - dash="solid", - annotation=f"{crosslinker} length: {crosslinker_length}Å", - x_value=crosslinker_length, - column=1 + distances_valid=df_valid["alphafold_distance"], + distances_invalid=df_invalid["alphafold_distance"], + title_valid=f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", + 
title_invalid=f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", + heading=f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}", + xaxis_label="Distanc in Å", + yaxis_label="Count", + split_x_axis_at=( + crosslinker_length + if accepted_deviation_upper_bound is None + else crosslinker_length + accepted_deviation_upper_bound + ), ) mean_of_predicted_lengths = crosslinker_df["alphafold_distance"].mean() if len(crosslinker_df) == 1: - standard_deviation_predicted_lengths = 0.0 + standard_deviation_predicted_lengths = ( + 0.0 # .std() would return nan if there is only one entry + ) else: standard_deviation_predicted_lengths = crosslinker_df[ "alphafold_distance" @@ -820,8 +796,8 @@ def diagrams_of_crosslinking_validation_data( mean_plus_two_std = ( mean_of_predicted_lengths + 2 * standard_deviation_predicted_lengths ) - mean_minus_two_std = max( - 0, mean_of_predicted_lengths - 2 * standard_deviation_predicted_lengths + mean_minus_two_std = np.maximum( + 0.0, mean_of_predicted_lengths - 2 * standard_deviation_predicted_lengths ) histogram_two_standard_deviations = create_histograms( @@ -851,9 +827,9 @@ def diagrams_of_crosslinking_validation_data( add_vertical_line_with_annotation_in_legend( fig=histogram, dash="dash", - annotation=f"allowed deviation upper bound", + annotation=f"allowed deviation upper bound: {accepted_deviation_upper_bound}Å", x_value=crosslinker_length + accepted_deviation_upper_bound, - column=1 + column=1, ) if ( math.floor(mean_minus_two_std) @@ -863,16 +839,16 @@ def diagrams_of_crosslinking_validation_data( add_vertical_line_with_annotation_in_legend( fig=histogram_two_standard_deviations, dash="dash", - annotation=f"allowed deviation upper bound", + annotation=f"allowed deviation upper bound: {accepted_deviation_upper_bound}Å", x_value=crosslinker_length + accepted_deviation_upper_bound, ) if accepted_deviation_lower_bound != 0: add_vertical_line_with_annotation_in_legend( fig=histogram, 
dash="dash", - annotation=f"allowed deviation lower bound", + annotation=f"allowed deviation lower bound: {accepted_deviation_lower_bound}Å", x_value=crosslinker_length - accepted_deviation_lower_bound, - column=1 + column=1, ) if ( math.floor(mean_minus_two_std) @@ -882,12 +858,9 @@ def diagrams_of_crosslinking_validation_data( add_vertical_line_with_annotation_in_legend( fig=histogram_two_standard_deviations, dash="dash", - annotation=f"allowed deviation lower bound", + annotation=f"allowed deviation lower bound: {accepted_deviation_lower_bound}Å", x_value=crosslinker_length - accepted_deviation_lower_bound, ) - # histogram.update_xaxes( - # **_get_tick_values_with_lines(histogram, hist_min, hist_max) - # ) histogram_two_standard_deviations.update_xaxes( **_get_tick_values_with_lines( histogram_two_standard_deviations, mean_minus_two_std, mean_plus_two_std @@ -1014,6 +987,7 @@ def multimer_diagrams( crosslinker_information=crosslinker_information, ) + # Warning: Mostly AI generated def create_cl_validation_histogram( distances_valid: pd.Series, @@ -1040,7 +1014,7 @@ def create_cl_validation_histogram( :param yaxis_label: Label for the shared y-axis. :return: A Plotly Figure object containing the split histogram visualization. 
""" - + # It is good practice to drop NaNs before calculating bins/histograms distances_valid.dropna(inplace=True) distances_invalid.dropna(inplace=True) @@ -1049,10 +1023,11 @@ def create_cl_validation_histogram( max_distance: float = np.nanmax([distances_valid.max(), distances_invalid.max()]) fig = make_subplots( - rows=1, cols=2, + rows=1, + cols=2, shared_yaxes=True, horizontal_spacing=0.1, - column_widths=[0.5, 0.5] + column_widths=[0.5, 0.5], ) # --- Pre-calculate shared bins for BOTH datasets --- @@ -1069,9 +1044,9 @@ def create_cl_validation_histogram( log_start = np.log10(split_x_axis_at) log_end = np.log10(safe_max) log_bins_transformed = np.arange(log_start, log_end + 0.1, 0.1) - + # Pre-compute the actual linear numbers of the log bins for the hover template - log_bins_linear = 10 ** log_bins_transformed + log_bins_linear = 10**log_bins_transformed def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool): # Split data @@ -1084,36 +1059,44 @@ def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool # Pair up the left and right edges for the hover box customdata_lin = np.stack((lin_bins[:-1], lin_bins[1:]), axis=-1) - _ = fig.add_trace(go.Bar( - x=lin_bins[:-1], - y=counts_lin, - width=1, # Match the bin size in np.arange - offset=0, # Force bars to start exactly at the bin edge - name=name, - marker_color=color, - legendgroup=name, - showlegend=show_legend, - customdata=customdata_lin, - hovertemplate="%{data.name}
Range: %{customdata[0]:g} to %{customdata[1]:g}
Count: %{y}" - ), row=1, col=1) + _ = fig.add_trace( + go.Bar( + x=lin_bins[:-1], + y=counts_lin, + width=1, # Match the bin size in np.arange + offset=0, # Force bars to start exactly at the bin edge + name=name, + marker_color=color, + legendgroup=name, + showlegend=show_legend, + customdata=customdata_lin, + hovertemplate="%{data.name}
Range: %{customdata[0]:g} to %{customdata[1]:g}
Count: %{y}", + ), + row=1, + col=1, + ) # --- Calculate Histogram for Log Part --- counts_log, _ = np.histogram(v_log_transformed, bins=log_bins_transformed) customdata_log = np.stack((log_bins_linear[:-1], log_bins_linear[1:]), axis=-1) - _ = fig.add_trace(go.Bar( - x=log_bins_transformed[:-1], - y=counts_log, - width=0.1, # Match the bin size in np.arange - offset=0, - name=name, - marker_color=color, - legendgroup=name, - showlegend=False, # Legend handled by linear part - customdata=customdata_log, - # Format numbers cleanly with commas using `,.0f` or `g` - hovertemplate="%{data.name}
Range: %{customdata[0]:,.0f} to %{customdata[1]:,.0f}
Count: %{y}" - ), row=1, col=2) + _ = fig.add_trace( + go.Bar( + x=log_bins_transformed[:-1], + y=counts_log, + width=0.1, # Match the bin size in np.arange + offset=0, + name=name, + marker_color=color, + legendgroup=name, + showlegend=False, # Legend handled by linear part + customdata=customdata_log, + # Format numbers cleanly with commas using `,.0f` or `g` + hovertemplate="%{data.name}
Range: %{customdata[0]:,.0f} to %{customdata[1]:,.0f}
Count: %{y}", + ), + row=1, + col=2, + ) add_split_traces(distances_valid, title_valid, PLOT_PRIMARY_COLOR, True) add_split_traces(distances_invalid, title_invalid, PLOT_SECONDARY_COLOR, True) @@ -1122,10 +1105,11 @@ def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool _ = fig.update_xaxes( title_text=f"{xaxis_label} (Linear)", range=[math.floor(min_distance), split_x_axis_at], - row=1, col=1, + row=1, + col=1, showline=True, mirror=False, - zeroline=False + zeroline=False, ) # Always include the split origin as the first tick @@ -1146,10 +1130,11 @@ def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool title_text=f"{xaxis_label} (Log)", tickvals=tick_vals, ticktext=tick_text, - row=1, col=2, + row=1, + col=2, showline=True, mirror=False, - zeroline=False + zeroline=False, ) _ = fig.update_layout(barmode="overlay", yaxis_title=yaxis_label) fig.update_traces(opacity=0.75) @@ -1160,7 +1145,8 @@ def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool _ = fig.update_layout(margin_pad=10) # Disable toggling of the visibility of the traces by clicking on the legend - fig.update_layout(legend=dict(itemclick=False, itemdoubleclick=False, xanchor="left", x=1.05)) - - return fig + fig.update_layout( + legend=dict(itemclick=False, itemdoubleclick=False, xanchor="left", x=1.05) + ) + return fig diff --git a/backend/protzilla/data_analysis/plots.py b/backend/protzilla/data_analysis/plots.py index 05209e234..9f783531c 100644 --- a/backend/protzilla/data_analysis/plots.py +++ b/backend/protzilla/data_analysis/plots.py @@ -588,7 +588,7 @@ def add_vertical_line_with_annotation_in_legend( annotation: str, x_value: float, color: str = PLOT_PRIMARY_COLOR, - column: int = None + column: int = None, ) -> None: """ Adds a vertical line to a Plotly figure and includes a corresponding entry in the legend diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py 
index a49aa325b..81bb31a8f 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -177,7 +177,6 @@ def create_histograms( min_value: float = None, max_value: float = None, one_bin_per_int: bool = False, - split_x_axis_at: float = None ) -> Figure: """ A function to create a histogram for visualisation @@ -228,116 +227,60 @@ def create_histograms( if max_value is None: max_value = np.nanmax([values_a.max(), values_b.max()]) - # Logic for Split Axis (Linear -> Log) - if split_x_axis_at is not None and visual_transformation == "linear": - fig = make_subplots( - rows=1, cols=2, - shared_yaxes=True, - horizontal_spacing=0.02, - column_widths=[0.5, 0.5] - ) - - def add_split_traces(values, name, color, show_legend): - # Split data - v_lin = values[values <= split_x_axis_at] - v_log = values[values > split_x_axis_at] - - # Trace for linear part - fig.add_trace(go.Histogram( - x=v_lin, name=name, marker_color=color, - xbins=dict(start=min_value, end=split_x_axis_at), - legendgroup=name, showlegend=show_legend - ), row=1, col=1) - - # Trace for log part - fig.add_trace(go.Histogram( - x=v_log, name=name, marker_color=color, - xbins=dict(start=split_x_axis_at, end=max_value), - legendgroup=name, showlegend=False - ), row=1, col=2) - - - add_split_traces(values_a, name_a, PLOT_PRIMARY_COLOR, True) - add_split_traces(values_b, name_b, PLOT_SECONDARY_COLOR, True) - - fig.update_xaxes(title_text=f"{x_title} (Linear)", range=[min_value, split_x_axis_at], row=1, col=1, - showline=True, - mirror=False, - zeroline=False - ) - fig.update_xaxes( - title_text=f"{x_title} (Log)", - type="log", - # Use log10 of the values for the range array! 
- range=[np.log10(split_x_axis_at), np.log10(max_value)], - row=1, col=2, - showline=True, - mirror=False, - zeroline=False - ) - fig.update_layout(barmode="overlay") - fig.update_traces(opacity=0.75) - - # Add the // break marks - fig.add_shape(type="line", xref="paper", yref="paper", x0=0.5, y0=-0.02, x1=0.52, y1=0.02, - line=dict(width=2)) - fig.add_shape(type="line", xref="paper", yref="paper", x0=0.48, y0=-0.02, x1=0.5, y1=0.02, - line=dict(width=2)) + if one_bin_per_int: + min_value = math.floor(min_value) + max_value = math.ceil(max_value) + binsize_a = 1 + binsize_b = 1 else: - if one_bin_per_int: - min_value = math.floor(min_value) - max_value = math.ceil(max_value) - binsize_a = 1 - binsize_b = 1 + number_of_bins = 100 + if len(values_a) > 0: + binsize_a = ( + values_a.max(skipna=True) - values_a.min(skipna=True) + ) / number_of_bins else: - number_of_bins = 100 - if len(values_a) > 0: - binsize_a = ( - values_a.max(skipna=True) - values_a.min(skipna=True) - ) / number_of_bins - else: - binsize_a = 1 # default value of 1 in case that values_a is empty - if len(values_b) > 0: - binsize_b = ( - values_b.max(skipna=True) - values_b.min(skipna=True) - ) / number_of_bins - else: - binsize_b = 1 # default value of 1 in case that values_b is empty - - if overlay and len(values_a) > 0 and len(values_b) > 0: - binsize_a = binsize_b = max(binsize_a, binsize_b) - - trace0 = go.Histogram( - x=values_a, - marker_color=PLOT_PRIMARY_COLOR, - name=name_a, - xbins=dict(start=min_value, end=max_value, size=binsize_a), - ) - trace1 = go.Histogram( - x=values_b, - marker_color=PLOT_SECONDARY_COLOR, - name=name_b, - xbins=dict(start=min_value, end=max_value, size=binsize_b), - ) - if not overlay: - fig = make_subplots(rows=1, cols=2) - fig.add_trace(trace0, 1, 1) - fig.add_trace(trace1, 1, 2) - if visual_transformation == "log10": - fig.update_layout( - xaxis=generate_tics(0, max_value, True), - xaxis2=generate_tics(0, max_value, True), - ) + binsize_a = 1 # default value of 
1 in case that values_a is empty + if len(values_b) > 0: + binsize_b = ( + values_b.max(skipna=True) - values_b.min(skipna=True) + ) / number_of_bins else: - fig = go.Figure() - fig.add_trace(trace0) - fig.add_trace(trace1) - fig.update_layout(barmode="overlay") - fig.update_traces(opacity=0.75) - if visual_transformation == "log10": - fig.update_layout(xaxis=generate_tics(0, max_value, True)) - fig.update_xaxes(title=x_title) - fig.update_yaxes(title=y_title, rangemode="tozero") + binsize_b = 1 # default value of 1 in case that values_b is empty + + if overlay and len(values_a) > 0 and len(values_b) > 0: + binsize_a = binsize_b = max(binsize_a, binsize_b) + + trace0 = go.Histogram( + x=values_a, + marker_color=PLOT_PRIMARY_COLOR, + name=name_a, + xbins=dict(start=min_value, end=max_value, size=binsize_a), + ) + trace1 = go.Histogram( + x=values_b, + marker_color=PLOT_SECONDARY_COLOR, + name=name_b, + xbins=dict(start=min_value, end=max_value, size=binsize_b), + ) + if not overlay: + fig = make_subplots(rows=1, cols=2) + fig.add_trace(trace0, 1, 1) + fig.add_trace(trace1, 1, 2) + if visual_transformation == "log10": + fig.update_layout( + xaxis=generate_tics(0, max_value, True), + xaxis2=generate_tics(0, max_value, True), + ) + else: + fig = go.Figure() + fig.add_trace(trace0) + fig.add_trace(trace1) + fig.update_layout(barmode="overlay") + fig.update_traces(opacity=0.75) + if visual_transformation == "log10": + fig.update_layout(xaxis=generate_tics(0, max_value, True)) + fig.update_xaxes(title=x_title) + fig.update_yaxes(title=y_title, rangemode="tozero") wrapped_title = "
".join(textwrap.wrap(heading, width=50)) fig.update_layout(title={"text": f"{wrapped_title}"}) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index e9de1d010..cf4af4b60 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -45,7 +45,7 @@ FeatureOrientationType, ) from backend.protzilla.constants.intensity_types import IntensityType, IntensityNameType -from protzilla.importing.query_generation import generate_alphafold_query_json +from backend.protzilla.importing.query_generation import generate_alphafold_query_json class ImportingStep(Step, ABC): From bcfdcb3d796c5172a6d5aee9b02f2a8a7d7c02ed Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 4 May 2026 12:08:28 +0200 Subject: [PATCH 204/240] fix: fix cl validation plot tests --- .../data_analysis/crosslinking_validation.py | 73 ++++++++++------- backend/protzilla/methods/data_analysis.py | 2 +- .../test_ptm_visualization.py | 8 +- .../test_crosslinking_validation.py | 80 ++++++++----------- .../test_differential_expression.py | 4 +- .../importing/test_crosslinking_import.py | 8 +- backend/tests/protzilla/test_disk_operator.py | 2 +- 7 files changed, 93 insertions(+), 84 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 5d5bbe1e7..2ce322a7e 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -683,7 +683,11 @@ def _get_tick_values_with_lines(fig, min_value, max_value): if shape.type == "line" and shape.x0 == shape.x1 ] - step_size = pow(10, math.floor(np.log10(max_value - min_value))) + step_size = ( + pow(10, math.floor(np.log10(max_value - min_value))) + if max_value - min_value > 0 + else 1 + ) first_step = math.ceil(min_value / step_size) * step_size last_step = math.ceil(max_value / step_size) * step_size + 3 * step_size 
tick_values = list(np.arange(first_step, last_step, step_size)) @@ -783,6 +787,13 @@ def diagrams_of_crosslinking_validation_data( else crosslinker_length + accepted_deviation_upper_bound ), ) + add_vertical_line_with_annotation_in_legend( + fig=histogram, + dash="solid", + annotation=f"{crosslinker} length: {crosslinker_length}Å", + x_value=crosslinker_length, + column=1, + ) mean_of_predicted_lengths = crosslinker_df["alphafold_distance"].mean() if len(crosslinker_df) == 1: @@ -869,32 +880,8 @@ def diagrams_of_crosslinking_validation_data( figures.append(histogram_two_standard_deviations) figures.append(histogram) - valid_crosslinks = (validated_df["valid_crosslink"]).sum() - invalid_crosslinks = (~validated_df["valid_crosslink"]).sum() - valid_intra_total = ( - (validated_df["valid_crosslink"]) & (validated_df["link_type"] == "intra") - ).sum() - valid_inter_total = ( - (validated_df["valid_crosslink"]) & (validated_df["link_type"] == "inter") - ).sum() - invalid_intra_total = ( - (~validated_df["valid_crosslink"]) & (validated_df["link_type"] == "intra") - ).sum() - invalid_inter_total = ( - (~validated_df["valid_crosslink"]) & (validated_df["link_type"] == "inter") - ).sum() - - bar_plot_over_all_checked_crosslinks = create_bar_plot( - values_of_sectors=[ - valid_crosslinks, - invalid_crosslinks, - ], - names_of_sectors=[ - f"Cross-Links matching predicted data (intra: {valid_intra_total}, inter: {valid_inter_total})", - f"Cross-Links not matching predicted data (intra: {invalid_intra_total}, inter: {invalid_inter_total})", - ], - heading=f"All Cross-Links used for validation of {structures_to_validate_str}", - y_title="Number of Cross-Links", + bar_plot_over_all_checked_crosslinks = _create_summarizing_cl_validation_bar_plot( + validated_df, structures_to_validate_str ) figures.append(bar_plot_over_all_checked_crosslinks) @@ -1150,3 +1137,35 @@ def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool ) return fig + + +def 
_create_summarizing_cl_validation_bar_plot( + validated_df: pd.DataFrame, structures_to_validate_str: str +) -> Figure: + valid_crosslinks = (validated_df["valid_crosslink"]).sum() + invalid_crosslinks = (~validated_df["valid_crosslink"]).sum() + valid_intra_total = ( + (validated_df["valid_crosslink"]) & (validated_df["link_type"] == "intra") + ).sum() + valid_inter_total = ( + (validated_df["valid_crosslink"]) & (validated_df["link_type"] == "inter") + ).sum() + invalid_intra_total = ( + (~validated_df["valid_crosslink"]) & (validated_df["link_type"] == "intra") + ).sum() + invalid_inter_total = ( + (~validated_df["valid_crosslink"]) & (validated_df["link_type"] == "inter") + ).sum() + + return create_bar_plot( + values_of_sectors=[ + valid_crosslinks, + invalid_crosslinks, + ], + names_of_sectors=[ + f"Cross-Links matching predicted data (intra: {valid_intra_total}, inter: {valid_inter_total})", + f"Cross-Links not matching predicted data (intra: {invalid_intra_total}, inter: {invalid_inter_total})", + ], + heading=f"All Cross-Links used for validation of {structures_to_validate_str}", + y_title="Number of Cross-Links", + ) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 51d487642..3a1b9d361 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -89,7 +89,7 @@ create_overview_ptm_visualization, get_detected_modifications, ) -from protzilla.data_analysis.crosslinking_validation import ( +from backend.protzilla.data_analysis.crosslinking_validation import ( monomer_diagrams, multimer_diagrams, monomer_validation, diff --git a/backend/tests/protzilla/data_analysis/ptm_visualization/test_ptm_visualization.py b/backend/tests/protzilla/data_analysis/ptm_visualization/test_ptm_visualization.py index 47d9f6e5e..ac686f8dd 100644 --- a/backend/tests/protzilla/data_analysis/ptm_visualization/test_ptm_visualization.py +++ 
b/backend/tests/protzilla/data_analysis/ptm_visualization/test_ptm_visualization.py @@ -9,19 +9,19 @@ get_detected_modifications, create_overview_ptm_visualization, ) -from protzilla.data_analysis.ptm_visualization.ptm_bar_plot import ( +from backend.protzilla.data_analysis.ptm_visualization.ptm_bar_plot import ( create_bar_ptm_visualization, ) -from protzilla.data_analysis.ptm_visualization.ptm_details_plot import ( +from backend.protzilla.data_analysis.ptm_visualization.ptm_details_plot import ( create_details_ptm_visualization, ) -from tests.paths import ( +from backend.tests.paths import ( TEST_PTM_VISUALIZATION_PATH, TEST_FASTA_PATH, TEST_PEPTIDES_PATH, TEST_METADATA_PATH, ) -from tests.protzilla.data_analysis.ptm_visualization.ptm_vis_test_utils import ( +from backend.tests.protzilla.data_analysis.ptm_visualization.ptm_vis_test_utils import ( get_evidence_df, get_metadata_df, mock_settings_file, diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 5a66cd8a7..196c7e3b4 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -21,7 +21,9 @@ ) from backend.protzilla.form import Form -from protzilla.methods.data_analysis import CrosslinkingValidationWithAngstromDeviation +from backend.protzilla.methods.data_analysis import ( + CrosslinkingValidationWithAngstromDeviation, +) @pytest.mark.parametrize( @@ -611,8 +613,8 @@ def test_diagrams_of_crosslinking_validation_data_with_drawing_all_vertical_line mock_add_vline.call_count == 8 ) # for both crosslinkers: 1 call for crosslinker length for each histogram and 1 call for bound on deviation for each histogram - # Check that create_histograms was called 4 times (2 per crosslinker) - assert mock_create_hist.call_count == 4 + # Check that create_histograms was called 2 times (1 per crosslinker) + assert 
mock_create_hist.call_count == 2 # Check that create_bar_plot was called once mock_create_bar.assert_called_once() @@ -671,8 +673,8 @@ def test_diagrams_of_crosslinking_validation_data_without_drawing_all_vertical_l # the bounds are only drawn for the histogram that is not limited to the range of +- 2 standard deviations assert mock_add_vline.call_count == 10 - # Check that create_histograms was called 4 times (2 per crosslinker) - assert mock_create_hist.call_count == 4 + # Check that create_histograms was called 2 times (1 per crosslinker) + assert mock_create_hist.call_count == 2 # Check that create_bar_plot was called once mock_create_bar.assert_called_once() @@ -709,7 +711,8 @@ def test_diagrams_calls_with_correct_parameters( "backend.protzilla.data_analysis.crosslinking_validation.create_bar_plot" ) as mock_bar: - mock_hist.side_effect = lambda **kwargs: f"hist_{kwargs['heading']}" + # mock_hist.side_effect = lambda **kwargs: f"hist_{kwargs['heading']}" + mock_hist.return_value = Figure() mock_bar.return_value = "bar_fig" figures = diagrams_of_crosslinking_validation_data( @@ -718,40 +721,21 @@ def test_diagrams_calls_with_correct_parameters( crosslinker_information=sample_crosslinker_info_with_one_crosslinker, ) - # There should be 2 histogram calls: 2 per crosslinker - assert mock_hist.call_count == 2 - - # Check histogram call parameters for crosslinker full-range - first_hist_call = mock_hist.call_args_list[0].kwargs - assert first_hist_call["name_a"] == "Valid Crosslinks (intra: 1, inter: 1)" - assert first_hist_call["name_b"] == "Invalid Crosslinks (intra: 1, inter: 1)" - assert ( - first_hist_call["heading"] - == "Predicted distances for P12345 with crosslinker CL1" - ) - assert first_hist_call["relevant_column_a"] == "alphafold_distance" - assert first_hist_call["relevant_column_b"] == "alphafold_distance" - assert first_hist_call["one_bin_per_int"] == True - - valid_crosslinks = sample_crosslinking_df_with_one_crosslinker.loc[ - 
sample_crosslinking_df_with_one_crosslinker["valid_crosslink"] == True, - "alphafold_distance", - ] - invalid_crosslinks = sample_crosslinking_df_with_one_crosslinker.loc[ - sample_crosslinking_df_with_one_crosslinker["valid_crosslink"] == False, - "alphafold_distance", - ] - dataframe_a = pd.DataFrame({"alphafold_distance": valid_crosslinks}) - dataframe_b = pd.DataFrame({"alphafold_distance": invalid_crosslinks}) - pdt.assert_frame_equal(first_hist_call["dataframe_a"], dataframe_a) - pdt.assert_frame_equal(first_hist_call["dataframe_b"], dataframe_b) + # There should be 1 histogram calls: 1 per crosslinker + assert mock_hist.call_count == 1 # Check histogram call parameters for crosslinker ±2 std - second_hist_call = mock_hist.call_args_list[1].kwargs + hist_call = mock_hist.call_args_list[0].kwargs + assert hist_call["name_a"] == "Valid Crosslinks (intra: 1, inter: 1)" + assert hist_call["name_b"] == "Invalid Crosslinks (intra: 1, inter: 1)" assert ( - second_hist_call["heading"] + hist_call["heading"] == "Predicted distances for P12345 with crosslinker CL1, mean +/- 2 σ" ) + assert hist_call["relevant_column_a"] == "alphafold_distance" + assert hist_call["relevant_column_b"] == "alphafold_distance" + assert hist_call["one_bin_per_int"] == True + mean_predicted_lengths = sample_crosslinking_df_with_one_crosslinker[ "alphafold_distance" ].mean() @@ -762,24 +746,30 @@ def test_diagrams_calls_with_correct_parameters( max(0, mean_predicted_lengths - 2 * standard_deviation_predicted_lengths), mean_predicted_lengths + 2 * standard_deviation_predicted_lengths, ) - assert second_hist_call["min_value"] == mean_plus_minus_two_std_range[0] - assert second_hist_call["max_value"] == mean_plus_minus_two_std_range[1] + assert hist_call["min_value"] == mean_plus_minus_two_std_range[0] + assert hist_call["max_value"] == mean_plus_minus_two_std_range[1] + + valid_crosslinks = sample_crosslinking_df_with_one_crosslinker.loc[ + 
sample_crosslinking_df_with_one_crosslinker["valid_crosslink"] == True, + "alphafold_distance", + ] + invalid_crosslinks = sample_crosslinking_df_with_one_crosslinker.loc[ + sample_crosslinking_df_with_one_crosslinker["valid_crosslink"] == False, + "alphafold_distance", + ] + dataframe_a = pd.DataFrame({"alphafold_distance": valid_crosslinks}) + dataframe_b = pd.DataFrame({"alphafold_distance": invalid_crosslinks}) + pdt.assert_frame_equal(hist_call["dataframe_a"], dataframe_a) + pdt.assert_frame_equal(hist_call["dataframe_b"], dataframe_b) call_args_list = [call.kwargs for call in mock_vline.call_args_list] assert any( - call["annotation"] == "CL1 length" and call["x_value"] == 11.0 + call["annotation"] == "CL1 length: 11.0Å" and call["x_value"] == 11.0 for call in call_args_list ) mock_bar.assert_called_once() - expected_figures = [ - "hist_Predicted distances for P12345 with crosslinker CL1, mean +/- 2 σ", - "hist_Predicted distances for P12345 with crosslinker CL1", - "bar_fig", - ] - assert figures == expected_figures - def test_validate_multimer_with_invalid_crosslinks(): sequences_df = pd.DataFrame( diff --git a/backend/tests/protzilla/data_analysis/test_differential_expression.py b/backend/tests/protzilla/data_analysis/test_differential_expression.py index 049d89366..d7294db27 100644 --- a/backend/tests/protzilla/data_analysis/test_differential_expression.py +++ b/backend/tests/protzilla/data_analysis/test_differential_expression.py @@ -15,10 +15,10 @@ kruskal_wallis_test_on_ptm_data, ) from backend.protzilla.data_analysis.plots import create_volcano_plot -from protzilla.data_analysis.differential_expression_t_test import ( +from backend.protzilla.data_analysis.differential_expression_t_test import ( get_z_score_based_fold_change_significance, ) -from tests.paths import TEST_AML_DATA_PATH +from backend.tests.paths import TEST_AML_DATA_PATH @pytest.fixture diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py 
b/backend/tests/protzilla/importing/test_crosslinking_import.py index 923b1b7bc..abf5f6bd8 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -2,7 +2,7 @@ import pandas as pd from unittest.mock import patch, Mock from requests.exceptions import Timeout -from protzilla.importing.crosslinking_import import ( +from backend.protzilla.importing.crosslinking_import import ( aggregate_data, remove_brackets_from_peptide, get_amino_acid_where_crosslink_is_connected_proteomediscoverer_xlinkx_format, @@ -105,7 +105,7 @@ def test_process_uniprot_response_id_to_gene_name(): def test_uniprot_lookup_successful_request_but_no_results(monkeypatch): - from protzilla.importing.crosslinking_import import uniprot_lookup + from backend.protzilla.importing.crosslinking_import import uniprot_lookup def mock_execute(*args, **kwargs): mock = Mock() @@ -204,7 +204,7 @@ def mock_lookup(ids): def test_process_organism_id_from_text_field( monkeypatch, input_string, mock_result, expected ): - from protzilla.importing.crosslinking_import import ( + from backend.protzilla.importing.crosslinking_import import ( process_organism_id_from_text_field, ) @@ -232,7 +232,7 @@ def test_aggregate_failed_proteins_for_display(): } ) - from protzilla.importing.crosslinking_import import ( + from backend.protzilla.importing.crosslinking_import import ( aggregate_failed_proteins_for_display, ) diff --git a/backend/tests/protzilla/test_disk_operator.py b/backend/tests/protzilla/test_disk_operator.py index 58d88e3cf..e158ebff0 100644 --- a/backend/tests/protzilla/test_disk_operator.py +++ b/backend/tests/protzilla/test_disk_operator.py @@ -2,7 +2,7 @@ import pytest -from protzilla.disk_operator import YamlOperator +from backend.protzilla.disk_operator import YamlOperator @pytest.fixture() From 1785ff4343cc8b516c4e955ea819935d90bf7316 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Mon, 4 May 2026 14:21:08 +0200 
Subject: [PATCH 205/240] refactor: increase width of plots and rename "(in)valid CLs" to "predictions (not) matching CLs" in legends --- .../data_analysis/crosslinking_validation.py | 24 ++++++++++--------- backend/protzilla/data_preprocessing/plots.py | 2 +- .../test_crosslinking_validation.py | 7 +++--- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 2ce322a7e..9574b8705 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -776,10 +776,10 @@ def diagrams_of_crosslinking_validation_data( histogram = create_cl_validation_histogram( distances_valid=df_valid["alphafold_distance"], distances_invalid=df_invalid["alphafold_distance"], - title_valid=f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", - title_invalid=f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", + title_valid=f"Predictions matching CLs (intra: {valid_intra}, inter: {valid_inter})", + title_invalid=f"Predictions not matching CLs (intra: {invalid_intra}, inter: {invalid_inter})", heading=f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}", - xaxis_label="Distanc in Å", + xaxis_label="Distance in Å", yaxis_label="Count", split_x_axis_at=( crosslinker_length @@ -814,8 +814,8 @@ def diagrams_of_crosslinking_validation_data( histogram_two_standard_deviations = create_histograms( dataframe_a=df_valid, dataframe_b=df_invalid, - name_a=f"Valid Crosslinks (intra: {valid_intra}, inter: {valid_inter})", - name_b=f"Invalid Crosslinks (intra: {invalid_intra}, inter: {invalid_inter})", + name_a=f"Predictions matching CLs (intra: {valid_intra}, inter: {valid_inter})", + name_b=f"Predictions not matching CLs (intra: {invalid_intra}, inter: {invalid_inter})", heading=f"Predicted distances for {structures_to_validate_str} 
with crosslinker {crosslinker}, mean +/- 2 σ", x_title="Distance (Å)", y_title="Count", @@ -833,6 +833,7 @@ def diagrams_of_crosslinking_validation_data( annotation=f"{crosslinker} length: {crosslinker_length}Å", x_value=crosslinker_length, ) + histogram_two_standard_deviations.update_layout(width=900) if accepted_deviation_upper_bound != 0: add_vertical_line_with_annotation_in_legend( @@ -980,8 +981,8 @@ def create_cl_validation_histogram( distances_valid: pd.Series, distances_invalid: pd.Series, split_x_axis_at: float, - title_valid: str = "Valid Crosslinks", - title_invalid: str = "Invalid Crosslinks", + title_valid: str = "Predictions matching CLs", + title_invalid: str = "Predictions not matching CLs", heading: str = "", xaxis_label: str = "", yaxis_label: str = "", @@ -994,8 +995,8 @@ def create_cl_validation_histogram( :param distances_invalid: Pandas Series containing distances not matching the crosslinker length. :param split_x_axis_at: Threshold distance at which the x-axis transitions from linear (left panel) to logarithmic (right panel). - :param title_valid: Legend label for valid crosslinks. Defaults to "Valid Crosslinks". - :param title_invalid: Legend label for invalid crosslinks. Defaults to "Invalid Crosslinks". + :param title_valid: Legend label for valid crosslinks. Defaults to "Predictions matching CLs". + :param title_invalid: Legend label for invalid crosslinks. Defaults to "Predictions not matching CLs". :param heading: Title of the overall figure. Can be a long string and will be wrapped. :param xaxis_label: Label for the x-axis (applied to both panels with scale annotations). :param yaxis_label: Label for the shared y-axis. @@ -1016,6 +1017,7 @@ def create_cl_validation_histogram( horizontal_spacing=0.1, column_widths=[0.5, 0.5], ) + fig.update_layout(width=900) # --- Pre-calculate shared bins for BOTH datasets --- # 1. 
Linear Bins @@ -1111,7 +1113,7 @@ def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool val: float = split_x_axis_at + math.pow(10, i) # Add the exact log position for the tick, and the formatted text tick_vals.append(np.log10(val)) - tick_text.append(f"{(split_x_axis_at + 10**i):.2f}") + tick_text.append(f"{(split_x_axis_at + 10**i):.2f}".rstrip("0").rstrip(".")) _ = fig.update_xaxes( title_text=f"{xaxis_label} (Log)", @@ -1126,7 +1128,7 @@ def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool _ = fig.update_layout(barmode="overlay", yaxis_title=yaxis_label) fig.update_traces(opacity=0.75) - wrapped_title = "
".join(textwrap.wrap(heading, width=50)) + wrapped_title = "
".join(textwrap.wrap(heading, width=60)) _ = fig.update_layout(title={"text": f"{wrapped_title}"}) _ = fig.update_layout(margin_pad=10) diff --git a/backend/protzilla/data_preprocessing/plots.py b/backend/protzilla/data_preprocessing/plots.py index 81bb31a8f..445908609 100644 --- a/backend/protzilla/data_preprocessing/plots.py +++ b/backend/protzilla/data_preprocessing/plots.py @@ -282,7 +282,7 @@ def create_histograms( fig.update_xaxes(title=x_title) fig.update_yaxes(title=y_title, rangemode="tozero") - wrapped_title = "
".join(textwrap.wrap(heading, width=50)) + wrapped_title = "
".join(textwrap.wrap(heading, width=60)) fig.update_layout(title={"text": f"{wrapped_title}"}) fig.update_layout(margin_pad=20) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 196c7e3b4..d620dc368 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -711,7 +711,6 @@ def test_diagrams_calls_with_correct_parameters( "backend.protzilla.data_analysis.crosslinking_validation.create_bar_plot" ) as mock_bar: - # mock_hist.side_effect = lambda **kwargs: f"hist_{kwargs['heading']}" mock_hist.return_value = Figure() mock_bar.return_value = "bar_fig" @@ -726,8 +725,10 @@ def test_diagrams_calls_with_correct_parameters( # Check histogram call parameters for crosslinker ±2 std hist_call = mock_hist.call_args_list[0].kwargs - assert hist_call["name_a"] == "Valid Crosslinks (intra: 1, inter: 1)" - assert hist_call["name_b"] == "Invalid Crosslinks (intra: 1, inter: 1)" + assert hist_call["name_a"] == "Predictions matching CLs (intra: 1, inter: 1)" + assert ( + hist_call["name_b"] == "Predictions not matching CLs (intra: 1, inter: 1)" + ) assert ( hist_call["heading"] == "Predicted distances for P12345 with crosslinker CL1, mean +/- 2 σ" From eb93a2762d2a8bd684e29bec14d4ad4f6ad8e4de Mon Sep 17 00:00:00 2001 From: jorisfu Date: Mon, 4 May 2026 14:36:41 +0200 Subject: [PATCH 206/240] chore: clean float format --- backend/protzilla/data_analysis/crosslinking_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 9574b8705..2f05dd343 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1113,7 +1113,7 @@ def add_split_traces(values: 
pd.Series, name: str, color: str, show_legend: bool val: float = split_x_axis_at + math.pow(10, i) # Add the exact log position for the tick, and the formatted text tick_vals.append(np.log10(val)) - tick_text.append(f"{(split_x_axis_at + 10**i):.2f}".rstrip("0").rstrip(".")) + tick_text.append(f"{(split_x_axis_at + 10**i):.4g}") _ = fig.update_xaxes( title_text=f"{xaxis_label} (Log)", From 5f5a40e22bbd7aaa73f4248180900702000ce614 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 4 May 2026 14:55:15 +0200 Subject: [PATCH 207/240] format --- .../other-settings/cl-default-settings.tsx | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx b/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx index 382e790d7..1dade5f31 100644 --- a/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx +++ b/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx @@ -25,11 +25,14 @@ interface CrosslinkDefaultProps { handleDelete?: () => void; } -type ApiCrosslinkDefaults = Record; +type ApiCrosslinkDefaults = Record< + string, + { + cl_length: number; + cl_upper_deviation: number; + cl_lower_deviation: number; + } +>; const CrosslinkDefaultContainer = styled.div` display: flex; @@ -65,7 +68,7 @@ const CrosslinkDefaultEntry = ({ `length: ${String(cl_length)} | ` + `accepted upper deviation: ${String(cl_upper_deviation)} | ` + `accepted lower deviation: ${String(cl_lower_deviation)}` - } + } /> @@ -81,7 +84,7 @@ export const CrosslinkDefaultUpload = () => { const fetchCrosslinkDefaults = async () => { const crosslinkDefaults = (await callApi("get_cl_defaults")) as ApiCrosslinkDefaults | null; - + if (crosslinkDefaults) { const transformedList: CrosslinkDefaultProps[] = Object.entries(crosslinkDefaults).map( ([name, properties]) => ({ @@ -89,7 +92,7 @@ export const CrosslinkDefaultUpload = () => { cl_length: 
properties.cl_length, cl_upper_deviation: properties.cl_upper_deviation, cl_lower_deviation: properties.cl_lower_deviation, - }) + }), ); setCrosslinkDefaultList(transformedList); } @@ -166,9 +169,7 @@ export const CrosslinkDefaultUpload = () => { /> @@ -214,16 +215,9 @@ export const CrosslinkDefaultUpload = () => { ); }} /> - + {crosslinkDefaultList.length === 0 ? ( - + ) : ( {crosslinkDefaultList.map((ps) => ( From 185bb5181308d8803c6f0238e5720e960e280bae Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 4 May 2026 14:56:48 +0200 Subject: [PATCH 208/240] format --- backend/tests/main/test_views_settings.py | 33 ++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/backend/tests/main/test_views_settings.py b/backend/tests/main/test_views_settings.py index 2d8b49d8c..95e7a7c80 100644 --- a/backend/tests/main/test_views_settings.py +++ b/backend/tests/main/test_views_settings.py @@ -18,22 +18,31 @@ PATCH_PATH = "backend.main.views_settings.DefaultsOperator" + def test_get_cl_defaults(monkeypatch): request = mock.Mock() request.method = "GET" - + mock_defaults_operator = mock.Mock() mock_defaults_operator.get_all_defaults.return_value = { - "DSSO": {"cl_length": 10.3, "cl_upper_deviation": 1.0, "cl_lower_deviation": 1.0} + "DSSO": { + "cl_length": 10.3, + "cl_upper_deviation": 1.0, + "cl_lower_deviation": 1.0, + } } monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) response = get_cl_defaults(request) - + assert response.status_code == 200 response_data = json.loads(response.content.decode("utf-8")) assert response_data == { - "DSSO": {"cl_length": 10.3, "cl_upper_deviation": 1.0, "cl_lower_deviation": 1.0} + "DSSO": { + "cl_length": 10.3, + "cl_upper_deviation": 1.0, + "cl_lower_deviation": 1.0, + } } @@ -42,9 +51,9 @@ def test_update_cl_default_success(monkeypatch): "cl_name": "DSSO", "cl_length": 10.3, "cl_upper_deviation": 1.0, - "cl_lower_deviation": 1.2 + "cl_lower_deviation": 1.2, } - + request = mock.Mock() 
request.method = "POST" request.body = json.dumps(payload).encode("utf-8") @@ -60,7 +69,7 @@ def test_update_cl_default_success(monkeypatch): "cl_length": 10.3, "cl_upper_deviation": 1.0, "cl_lower_deviation": 1.2, - } + }, ) assert response.status_code == 200 response_data = json.loads(response.content.decode("utf-8")) @@ -69,7 +78,7 @@ def test_update_cl_default_success(monkeypatch): def test_update_cl_default_exception(monkeypatch): payload = {"cl_name": "DSSO", "cl_length": 10.3} - + request = mock.Mock() request.method = "POST" request.body = json.dumps(payload).encode("utf-8") @@ -87,7 +96,7 @@ def test_update_cl_default_exception(monkeypatch): def test_delete_cl_default_success(monkeypatch): payload = {"cl_name": "DSSO"} - + request = mock.Mock() request.method = "POST" request.body = json.dumps(payload).encode("utf-8") @@ -98,7 +107,7 @@ def test_delete_cl_default_success(monkeypatch): response = delete_cl_default(request) mock_defaults_operator.delete_default.assert_called_once_with("DSSO") - + assert response.status_code == 200 response_data = json.loads(response.content.decode("utf-8")) assert response_data["success"] is True @@ -106,7 +115,7 @@ def test_delete_cl_default_success(monkeypatch): def test_delete_cl_default_exception(monkeypatch): payload = {"cl_name": "DSSO"} - + request = mock.Mock() request.method = "POST" request.body = json.dumps(payload).encode("utf-8") @@ -119,4 +128,4 @@ def test_delete_cl_default_exception(monkeypatch): assert response.status_code == 405 response_data = json.loads(response.content.decode("utf-8")) - assert response_data["success"] is False \ No newline at end of file + assert response_data["success"] is False From 69cd20eaf43f61dedd05aca856ab3f54e396efac Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Mon, 4 May 2026 16:57:25 +0200 Subject: [PATCH 209/240] Adjust handling of CL defaults according to review --- backend/main/views_settings.py | 23 ++++++++++++++++------ backend/protzilla/methods/data_analysis.py | 17 
+++++++++++----- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index d9559eabf..9b62b33e3 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -579,9 +579,12 @@ def delete_multimer_structure(request): ) +# <--- Crosslink defaults ---> + + def get_cl_defaults(request): default_operator = DefaultsOperator() - defaults = default_operator.get_all_defaults() + defaults = default_operator.read_default(name="crosslinker_lengths") return JsonResponse(defaults, safe=False) @@ -602,13 +605,17 @@ def update_cl_default(request): ) cl_default_dict = { - "cl_length": cl_length, - "cl_upper_deviation": cl_upper_deviation, - "cl_lower_deviation": cl_lower_deviation, + cl_name: { + "cl_length": cl_length, + "cl_upper_deviation": cl_upper_deviation, + "cl_lower_deviation": cl_lower_deviation, + } } try: defaults_operator = DefaultsOperator() - defaults_operator.write_default(name=cl_name, value=cl_default_dict) + defaults_operator.write_default( + name="crosslinker_lengths", value=cl_default_dict + ) return JsonResponse( { "success": True, @@ -633,7 +640,11 @@ def delete_cl_default(request): data = json.loads(request.body) cl_name = data.get("cl_name") defaults_operator = DefaultsOperator() - defaults_operator.delete_default(cl_name) + cl_defaults = defaults_operator.read_default(name="crosslinker_lengths") + del cl_defaults[cl_name] + defaults_operator.write_default( + name="crosslinker_lengths", value=cl_defaults + ) return JsonResponse( { "success": True, diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 762cd29d6..e5e5f4532 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2377,11 +2377,18 @@ def create_crosslink_input_fields(self, form: Form, run: Run): for crosslinker in crosslinkers: field_name = f"{crosslinker}_length" if field_name not in form: - 
cl_defaults = run.disk_operator.defaults.read_default(crosslinker) - if cl_defaults: - length_default = cl_defaults["cl_length"] - upper_deviation_default = cl_defaults["cl_upper_deviation"] - lower_deviation_default = cl_defaults["cl_lower_deviation"] + cl_defaults = ( + run.disk_operator.defaults.read_default("crosslinker_lengths") or {} + ) + specific_cl_defaults = cl_defaults.get(crosslinker, {}) + if specific_cl_defaults: + length_default = specific_cl_defaults.get("cl_length") + upper_deviation_default = specific_cl_defaults.get( + "cl_upper_deviation" + ) + lower_deviation_default = specific_cl_defaults.get( + "cl_lower_deviation" + ) else: length_default = 0 upper_deviation_default = 0 From f91c97ed9d72259107749bafdacfb27ed6dc2a28 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 5 May 2026 13:05:55 +0200 Subject: [PATCH 210/240] feat: overwrite default labels with crosslinker description and add legend --- .../molstar-viewer/molstar-viewer.service.ts | 65 +++++++++++++++++++ .../shared/molstar-viewer/molstar-viewer.tsx | 7 +- .../molstar-viewer/molstar-viewer.ui.tsx | 28 ++++++++ .../core/shared/molstar-viewer/styles.ts | 22 ++++++- 4 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 frontend/src/components/core/shared/molstar-viewer/molstar-viewer.ui.tsx diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts index 93b450c89..c387b6e01 100644 --- a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts @@ -1,4 +1,5 @@ import { useNotification } from "@protzilla/app"; +import { OrderedSet } from "molstar/lib/mol-data/int"; import { PluginUIContext } from "molstar/lib/mol-plugin-ui/context"; import { MolScriptBuilder as MS } from "molstar/lib/mol-script/language/builder"; 
@@ -9,6 +10,14 @@ import { } from "./crosslinker-processing"; import { CROSSLINKER_COLORS } from "./molstar-viewer.config"; +type PluginWithCrosslinks = PluginUIContext & { + crosslinkerGroups?: Record; +}; + +interface LabelProvider { + label: (loci: any) => string | undefined; +} + export async function addCrosslinks( plugin: PluginUIContext, cifText: string, @@ -46,6 +55,62 @@ export async function addCrosslinks( }); } } + + (plugin as PluginWithCrosslinks).crosslinkerGroups = crosslinkerGroups; +} + +export function overrideLabels(plugin: PluginUIContext) { + const labelManager = plugin.managers.lociLabels as { + providers: LabelProvider[]; + addProvider: (p: LabelProvider) => void; + }; + + const defaultLabelProviders = [...labelManager.providers]; + labelManager.providers = []; + + plugin.managers.lociLabels.addProvider({ + label: (loci) => { + if (loci.kind !== "element-loci") { + return defaultLabelProviders + .map((p) => p.label(loci)) + .filter(Boolean) + .join(" | "); + } + + const structureElements = loci.elements[0]; + const firstElement = OrderedSet.getAt(structureElements.indices, 0); + + const crosslinkerGroups = (plugin as PluginWithCrosslinks).crosslinkerGroups; + if (!crosslinkerGroups) { + return defaultLabelProviders + .map((p) => p.label(loci)) + .filter(Boolean) + .join(" | "); + } + + const atomId = + structureElements.unit.model.atomicHierarchy.atoms.label_atom_id.value(firstElement); + + const crosslinkerGroupWithAtomIds = Object.entries(crosslinkerGroups).find(([, ids]) => + ids.includes(atomId), + ); + + if (crosslinkerGroupWithAtomIds) { + const [crosslinkerGroupName] = crosslinkerGroupWithAtomIds as [CrosslinkerType, string[]]; + const stringColor = getCrosslinkerColor(crosslinkerGroupName); + return `${crosslinkerGroupName}`; + } + + return defaultLabelProviders + .map((p) => p.label(loci)) + .filter(Boolean) + .join(" | "); + }, + }); +} + +export function getCrosslinkerColor(type: CrosslinkerType) { + return 
`#${CROSSLINKER_COLORS[type].toString(16).padStart(6, "0")}`; } export function handleError( diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx index 34ee45713..5d124c03b 100644 --- a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx @@ -6,7 +6,8 @@ import { renderReact18 } from "molstar/lib/mol-plugin-ui/react18"; import React, { useEffect, useRef, useState } from "react"; import { MolstarViewerProps } from "./molstar-viewer.props"; -import { addCrosslinks, handleError } from "./molstar-viewer.service"; +import { addCrosslinks, handleError, overrideLabels } from "./molstar-viewer.service"; +import { LegendOverlay } from "./molstar-viewer.ui"; import { CanvasWrapper, Container } from "./styles"; import "molstar/lib/mol-plugin-ui/skin/base/base.scss"; @@ -48,6 +49,8 @@ const MolstarViewer: React.FC = ({ cifText, crosslinks }) => await addCrosslinks(plugin, cifText, crosslinks); } + overrideLabels(plugin); + setIsLoading(false); } catch (error: unknown) { handleError(error, "MolstarViewer Error:", notify); @@ -74,6 +77,8 @@ const MolstarViewer: React.FC = ({ cifText, crosslinks }) => )} + + {crosslinks !== undefined && } ); }; diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.ui.tsx b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.ui.tsx new file mode 100644 index 000000000..97e77de31 --- /dev/null +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.ui.tsx @@ -0,0 +1,28 @@ +import React from "react"; + +import { CrosslinkerType } from "./crosslinker-processing"; +import { getCrosslinkerColor } from "./molstar-viewer.service"; +import { LegendContainer } from "./styles"; + +export const LegendOverlay: React.FC = () => { + return ( + +
+ {" "} + {CrosslinkerType.ValidIntra}{" "} +
+
+ {" "} + {CrosslinkerType.InvalidIntra}{" "} +
+
+ {" "} + {CrosslinkerType.ValidInter}{" "} +
+
+ {" "} + {CrosslinkerType.InvalidInter}{" "} +
+
+ ); +}; diff --git a/frontend/src/components/core/shared/molstar-viewer/styles.ts b/frontend/src/components/core/shared/molstar-viewer/styles.ts index e49e66c5e..a28b823ee 100644 --- a/frontend/src/components/core/shared/molstar-viewer/styles.ts +++ b/frontend/src/components/core/shared/molstar-viewer/styles.ts @@ -1,4 +1,4 @@ -import { color } from "@protzilla/theme"; +import { color, font, fontSize, fontWeight } from "@protzilla/theme"; import { styled } from "styled-components"; export const Container = styled.div` @@ -10,7 +10,7 @@ export const Container = styled.div` gap: 1rem; `; -const molstarTheme = { +export const molstarTheme = { primary: color("protzillaDarkBlue"), surface: color("protzillaLightGray"), hover: color("secondaryHover"), @@ -248,3 +248,21 @@ export const CanvasWrapper = styled.div` } } `; + +export const LegendContainer = styled.div` + position: absolute; + bottom: 0vh; + right: 1.5vh; + + background: ${molstarTheme.surface}; + color: ${molstarTheme.primary}; + + padding: 1vh; + font-family: ${font("defaultWithFallbacks")}; + font-size: ${fontSize("h6")}; + font-weight: ${fontWeight("default")}; + border-radius: 0px; + + pointer-events: none; + z-index: 10; +`; From 0467dfac29808414c299f0c83aef091342313e30 Mon Sep 17 00:00:00 2001 From: jorisfu Date: Tue, 5 May 2026 19:05:04 +0200 Subject: [PATCH 211/240] fix: handle x < 0 split case --- backend/Dockerfile.dev | 2 +- backend/protzilla/data_analysis/crosslinking_validation.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/Dockerfile.dev b/backend/Dockerfile.dev index ecd735bc8..e5ab79341 100644 --- a/backend/Dockerfile.dev +++ b/backend/Dockerfile.dev @@ -15,5 +15,5 @@ RUN --mount=type=bind,source=install_scripts/database_download.py,target=install --mount=type=bind,source=backend/protzilla/constants/paths.py,target=backend/protzilla/constants/paths.py \ python install_scripts/database_download.py -ENV DEBUGMODE=1 +# ENV DEBUGMODE=1 ENTRYPOINT ["bash", "-c", 
"python -m debugpy --listen 0.0.0.0:5678 backend/manage.py runserver 0.0.0.0:8000 --nothreading"] diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 2f05dd343..10e0be35f 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1003,6 +1003,9 @@ def create_cl_validation_histogram( :return: A Plotly Figure object containing the split histogram visualization. """ + if split_x_axis_at <= 0.0: + raise ValueError("x-axis split must be at x > 0") + # It is good practice to drop NaNs before calculating bins/histograms distances_valid.dropna(inplace=True) distances_invalid.dropna(inplace=True) From b4e0b80095651e3abbc80a5631f5385855c63bd5 Mon Sep 17 00:00:00 2001 From: jorisfu Date: Tue, 5 May 2026 19:09:35 +0200 Subject: [PATCH 212/240] fix: spelling --- .../protzilla/data_analysis/crosslinking_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 10e0be35f..5aaf2f35f 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1168,9 +1168,9 @@ def _create_summarizing_cl_validation_bar_plot( invalid_crosslinks, ], names_of_sectors=[ - f"Cross-Links matching predicted data (intra: {valid_intra_total}, inter: {valid_inter_total})", - f"Cross-Links not matching predicted data (intra: {invalid_intra_total}, inter: {invalid_inter_total})", + f"Crosslinks matching predicted data (intra: {valid_intra_total}, inter: {valid_inter_total})", + f"Crosslinks not matching predicted data (intra: {invalid_intra_total}, inter: {invalid_inter_total})", ], - heading=f"All Cross-Links used for validation of {structures_to_validate_str}", - y_title="Number of Cross-Links", + heading=f"All 
Crosslinks used for validation of {structures_to_validate_str}", + y_title="Number of Crosslinks", ) From bea130b825d5241d5ad44685f0f16019d362b7f6 Mon Sep 17 00:00:00 2001 From: jorisfu Date: Tue, 5 May 2026 19:10:20 +0200 Subject: [PATCH 213/240] revert dockerfile change --- backend/Dockerfile.dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/Dockerfile.dev b/backend/Dockerfile.dev index e5ab79341..ecd735bc8 100644 --- a/backend/Dockerfile.dev +++ b/backend/Dockerfile.dev @@ -15,5 +15,5 @@ RUN --mount=type=bind,source=install_scripts/database_download.py,target=install --mount=type=bind,source=backend/protzilla/constants/paths.py,target=backend/protzilla/constants/paths.py \ python install_scripts/database_download.py -# ENV DEBUGMODE=1 +ENV DEBUGMODE=1 ENTRYPOINT ["bash", "-c", "python -m debugpy --listen 0.0.0.0:5678 backend/manage.py runserver 0.0.0.0:8000 --nothreading"] From 2c3b130576e178a2c18b45ad4b87c6761b0f2ad9 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 5 May 2026 23:49:40 +0200 Subject: [PATCH 214/240] fix: select correct crosslinker group for label based on both connected atoms --- .../molstar-viewer/molstar-viewer.service.ts | 81 +++++++++++++------ 1 file changed, 56 insertions(+), 25 deletions(-) diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts index c387b6e01..4b6001f9d 100644 --- a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts @@ -65,46 +65,77 @@ export function overrideLabels(plugin: PluginUIContext) { addProvider: (p: LabelProvider) => void; }; - const defaultLabelProviders = [...labelManager.providers]; + const defaultProviders = [...labelManager.providers]; labelManager.providers = []; - 
plugin.managers.lociLabels.addProvider({ + const getDefaultLabel = (loci: any) => + defaultProviders + .map((p) => p.label(loci)) + .filter(Boolean) + .join(" | "); + + const getAtomIdsFromLoci = (loci: any): string[] => { + const ids: string[] = []; + + for (const element of loci.elements) { + const { indices, unit } = element; + const atoms = unit.model.atomicHierarchy.atoms.label_atom_id; + + for (let i = 0; i < OrderedSet.size(indices); i++) { + const idx = OrderedSet.getAt(indices, i); + ids.push(String(atoms.value(idx))); + } + } + + return [...new Set(ids)]; + }; + + const findMatchingAtomPair = (ids: string[]) => { + // since the atom-pair of one crosslink is always XL...A, XL...B those are the two ids we need + // (there can be atoms of other crosslinks at the exact same place, which is why they are listed here) + const getNumber = (id: string) => /\d+/.exec(id)?.[0]; + + for (let i = 0; i < ids.length; i++) { + for (let j = i + 1; j < ids.length; j++) { + if (getNumber(ids[i]) === getNumber(ids[j])) { + return [ids[i], ids[j]] as const; + } + } + } + return undefined; + }; + + labelManager.addProvider({ label: (loci) => { if (loci.kind !== "element-loci") { - return defaultLabelProviders - .map((p) => p.label(loci)) - .filter(Boolean) - .join(" | "); + return getDefaultLabel(loci); } - const structureElements = loci.elements[0]; - const firstElement = OrderedSet.getAt(structureElements.indices, 0); - const crosslinkerGroups = (plugin as PluginWithCrosslinks).crosslinkerGroups; if (!crosslinkerGroups) { - return defaultLabelProviders - .map((p) => p.label(loci)) - .filter(Boolean) - .join(" | "); + return getDefaultLabel(loci); + } + + const atomIds = getAtomIdsFromLoci(loci); + const pair = findMatchingAtomPair(atomIds); + + if (!pair) { + return getDefaultLabel(loci); } - const atomId = - structureElements.unit.model.atomicHierarchy.atoms.label_atom_id.value(firstElement); + const [atomId1, atomId2] = pair; - const crosslinkerGroupWithAtomIds = 
Object.entries(crosslinkerGroups).find(([, ids]) => - ids.includes(atomId), + const match = Object.entries(crosslinkerGroups).find( + ([, ids]) => ids.includes(atomId1) && ids.includes(atomId2), ); - if (crosslinkerGroupWithAtomIds) { - const [crosslinkerGroupName] = crosslinkerGroupWithAtomIds as [CrosslinkerType, string[]]; - const stringColor = getCrosslinkerColor(crosslinkerGroupName); - return `${crosslinkerGroupName}`; + if (!match) { + return getDefaultLabel(loci); } - return defaultLabelProviders - .map((p) => p.label(loci)) - .filter(Boolean) - .join(" | "); + const [groupName] = match as [CrosslinkerType, string[]]; + const color = getCrosslinkerColor(groupName); + return `${groupName}`; }, }); } From 6fa83f3de42a5f30256036181aeef1124a1909cd Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Wed, 6 May 2026 00:26:01 +0200 Subject: [PATCH 215/240] refactor: change spelling to 'crosslink...' everywhere --- .../data_analysis/crosslinking_validation.py | 36 +++++++++---------- .../importing/crosslinking_import.py | 8 ++--- backend/protzilla/methods/data_analysis.py | 12 +++---- .../test_crosslinking_validation.py | 2 +- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 32535d895..3a410b462 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -23,7 +23,7 @@ def get_reactive_atom_of_amino_acid_residue(amino_acid_type: str) -> str: """ Returns the atom of an amino acid residue that is considered reactive for - cross-linking. Currently, this always returns the central alpha carbon (CA). + crosslinking. Currently, this always returns the central alpha carbon (CA). 
:param amino_acid_type: code of the amino acid @@ -42,7 +42,7 @@ def get_coordinates_of_atom_crosslinker_bound_to( chain_id: str, ) -> tuple[float, float, float]: """ - Returns the Cartesian coordinates of the atom to which the cross-linker is + Returns the Cartesian coordinates of the atom to which the crosslinker is bound for a given amino acid residue in a protein structure. :param amino_acid_position_where_crosslinker_bound: 1-based position of the amino acid residue @@ -174,7 +174,7 @@ def add_protein_crosslink_positions_to_df( If either peptide cannot be matched in its corresponding protein sequence, the row is removed and a warning message is recorded. - :param input_crosslinking_df: DataFrame containing cross-linking data with at least the following columns: + :param input_crosslinking_df: DataFrame containing crosslinking data with at least the following columns: - 'Peptide1': first peptide sequence - 'Peptide2': second peptide sequence - 'CL_position_within_peptide1': 0-based crosslinker position within Peptide1 @@ -491,12 +491,12 @@ def validate_with_angstrom_deviation( structures_to_validate: list, ) -> dict: """ - Validates cross-links by comparing the cross-linker lengths with the distances between the linked - amino acids in the AlphaFold protein structure. A cross-link is regarded as valid if it matches the AlphaFold data, - so if the distance between the connected amino acids in AlphaFold is less than (cross-linker length + the upper allowed deviation) - and more than (cross-linker length - the lower allowed deviation). If one of the bounds is zero only the other bound will be applied. + Validates crosslinks by comparing the crosslinker lengths with the distances between the linked + amino acids in the AlphaFold protein structure. 
A crosslink is regarded as valid if it matches the AlphaFold data, + so if the distance between the connected amino acids in AlphaFold is less than (crosslinker length + the upper allowed deviation) + and more than (crosslinker length - the lower allowed deviation). If one of the bounds is zero only the other bound will be applied. - :param crosslinking_df: DataFrame containing the cross-linking data to validate. + :param crosslinking_df: DataFrame containing the crosslinking data to validate. :param crosslinker_information: Dictionary mapping crosslinker names to a list of three floats: [crosslinker_length, upper_accepted_deviation, lower_accepted_deviation]. :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms). @@ -520,7 +520,7 @@ def validate_with_angstrom_deviation( # Check if dataframe is empty if relevant_crosslinks_df.empty: - msg = "There are no cross links between the structures to validate." + msg = "There are no crosslinks between the structures to validate." messages = [dict(level=logging.WARNING, msg=msg)] logger.warning(msg) return dict(crosslinking_result_df=pd.DataFrame(), messages=messages) @@ -540,7 +540,7 @@ def validate_with_angstrom_deviation( ) if relevant_crosslinks_df.empty: - msg = "There are no cross links between the structures to validate." + msg = "There are no crosslinks between the structures to validate." messages = [dict(level=logging.WARNING, msg=msg)] logger.warning(msg) return dict(crosslinking_result_df=pd.DataFrame(), messages=messages) @@ -654,7 +654,7 @@ def diagrams_of_crosslinking_validation_data( ) -> list[Figure]: """ Creates for each crosslinker histogram plots summarizing the distribution of valid and invalid - cross-links based on the (AlphaFold-)predicted distances compared to crosslinker lengths and + crosslinks based on the (AlphaFold-)predicted distances compared to crosslinker lengths and allowed deviations. 
For each crosslinker, two histograms are generated: @@ -664,10 +664,10 @@ def diagrams_of_crosslinking_validation_data( Both histograms include vertical reference lines indicating the crosslinker length and, if applicable, the upper and/or lower accepted deviation bounds. - Additionally, a bar plot is created summarizing the total number of cross-links that match + Additionally, a bar plot is created summarizing the total number of crosslinks that match or do not match the predicted structure across all analyzed crosslinkers. - :param crosslinking_df: DataFrame containing cross-linking data, including AlphaFold-predicted + :param crosslinking_df: DataFrame containing crosslinking data, including AlphaFold-predicted distances, crosslinker identifiers, and validation results. :param structure_metadata_df: Dataframe containing metadata. :param crosslinker_information: Contains for each Crosslinker: @@ -678,7 +678,7 @@ def diagrams_of_crosslinking_validation_data( :param amino_acid_sequences_df: DataFrame containing the protein sequence :return: List of Plotly Figure objects. For each crosslinker, the list contains two histogram figures (mean ± 2 standard deviations first, full range second), followed by a final - bar plot summarizing valid and invalid cross-links across all crosslinkers. + bar plot summarizing valid and invalid crosslinks across all crosslinkers. :raises KeyError: If a required crosslinker entry is missing in crosslinker_information. 
""" if validated_df.empty: @@ -835,11 +835,11 @@ def diagrams_of_crosslinking_validation_data( invalid_crosslinks, ], names_of_sectors=[ - f"Cross-Links matching predicted data (intra: {valid_intra_total}, inter: {valid_inter_total})", - f"Cross-Links not matching predicted data (intra: {invalid_intra_total}, inter: {invalid_inter_total})", + f"Crosslinks matching predicted data (intra: {valid_intra_total}, inter: {valid_inter_total})", + f"Crosslinks not matching predicted data (intra: {invalid_intra_total}, inter: {invalid_inter_total})", ], - heading=f"All Cross-Links used for validation of {structures_to_validate_str}", - y_title="Number of Cross-Links", + heading=f"All Crosslinks used for validation of {structures_to_validate_str}", + y_title="Number of Crosslinks", ) figures.append(bar_plot_over_all_checked_crosslinks) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index e7747a20a..2a15bed5c 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -782,17 +782,17 @@ def crosslinking_import(file_path: Path, organism_ids: str) -> dict: else: raise ValueError(f"Unsupported file type: {file_path.suffix}") except Exception as e: - msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid cross linking file." + msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid crosslinking file." 
return error_output(msg, trace=format_trace(traceback.format_exception(e))) def base_message(): if file_type == ".csv": organism_names_string = ", ".join(scientific_organism_names) - return f"{len(good_df)} cross-links for the {organism_names_string} organism(s)" - return f"{len(good_df)} cross-links" + return f"{len(good_df)} crosslinks for the {organism_names_string} organism(s)" + return f"{len(good_df)} crosslinks" if good_df.empty: - msg = f"No cross-links could be processed from this file. File was read successfully, but the data of {base_message()} could be imported." + msg = f"No crosslinks could be processed from this file. File was read successfully, but the data of {base_message()} could be imported." messages = [dict(level=logging.ERROR, msg=msg)] elif failed_df.empty: msg = f"Successfully imported data of {base_message()}." diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 3a1b9d361..52410f64c 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2391,12 +2391,12 @@ def create_crosslink_input_fields(self, form: Form, run: Run): ) upper_bound_length_deviation_field = FloatField( name=f"{crosslinker}_upper_accepted_deviation", - label=f"Upper bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", + label=f"Upper bound on the accepted deviation for {crosslinker} Crosslinks in Ångström (0 equals no bound)", min=0, ) lower_bound_length_deviation_field = FloatField( name=f"{crosslinker}_lower_accepted_deviation", - label=f"Lower bound on the accepted deviation for {crosslinker} Cross-Links in Ångström (0 equals no bound)", + label=f"Lower bound on the accepted deviation for {crosslinker} Crosslinks in Ångström (0 equals no bound)", min=0, ) form.add_field(crosslinker_length_field) @@ -2429,8 +2429,8 @@ class CrosslinkingValidationWithAngstromDeviation( CrosslinkingValidationWithAngstromStep ): display_name = 
"Ångström Deviation For Monomer Structures" - operation = "Cross Linking Validation" - method_description = "Validates cross links within the one protein structure based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. (in Ångström)" + operation = "Crosslinking Validation" + method_description = "Validates crosslinks within the one protein structure based on the difference between the length of the crosslinker and the distance between the amino acids which were connected by the crosslinker. (in Ångström)" calc_method = staticmethod(monomer_validation) plot_method = staticmethod(monomer_diagrams) @@ -2442,8 +2442,8 @@ class CrosslinkingValidationWithAngstromDeviationForMultimer( CrosslinkingValidationWithAngstromStep ): display_name = "Ångström Deviation For Multimer Structures" - operation = "Cross Linking Validation" - method_description = "Validates cross links between proteins based on the difference between the length of the cross linker and the distance between the amino acids which were connected by the cross linker. (in Ångström)" + operation = "Crosslinking Validation" + method_description = "Validates crosslinks between proteins based on the difference between the length of the crosslinker and the distance between the amino acids which were connected by the crosslinker. 
(in Ångström)" calc_method = staticmethod(multimer_validation) plot_method = staticmethod(multimer_diagrams) diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 5a66cd8a7..f62a1f716 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -437,7 +437,7 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning assert isinstance(messages, list) assert len(messages) >= 1 assert messages[0].get("level") is not None - assert "There are no cross links between the structures to validate." in messages[ + assert "There are no crosslinks between the structures to validate." in messages[ 0 ].get("msg", "") From 3bf42ce4d80a9066c3ab059888dc8bc01c60e20a Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Wed, 6 May 2026 00:36:52 +0200 Subject: [PATCH 216/240] fix: backend formatting --- backend/protzilla/importing/crosslinking_import.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/importing/crosslinking_import.py b/backend/protzilla/importing/crosslinking_import.py index 2a15bed5c..8b1ae718e 100644 --- a/backend/protzilla/importing/crosslinking_import.py +++ b/backend/protzilla/importing/crosslinking_import.py @@ -788,7 +788,9 @@ def crosslinking_import(file_path: Path, organism_ids: str) -> dict: def base_message(): if file_type == ".csv": organism_names_string = ", ".join(scientific_organism_names) - return f"{len(good_df)} crosslinks for the {organism_names_string} organism(s)" + return ( + f"{len(good_df)} crosslinks for the {organism_names_string} organism(s)" + ) return f"{len(good_df)} crosslinks" if good_df.empty: From 429612b90e787300c2085baf89aedc058c531204 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 6 May 2026 14:19:27 +0200 Subject: 
[PATCH 217/240] fix bug which prevents showing predictions in tables in frontend --- backend/main/views_settings.py | 2 ++ backend/protzilla/methods/importing.py | 4 ++-- .../app/settings/other-settings/monomer-structure-upload.tsx | 4 ++-- .../app/settings/other-settings/multimer-structure-upload.tsx | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 63ede4254..0874b6c03 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -355,6 +355,7 @@ def get_monomer_structure(request): ] df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) + df = df.fillna("") df_infos = df.rename( columns={ @@ -470,6 +471,7 @@ def get_multimer_structure(request): "model_used", ] df = get_metadata_df(csv_file_path=metadata_csv, expected_columns=expected_columns) + df = df.fillna("") df_infos = df.rename( columns={ diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index b17ebee6b..690eb6151 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -548,14 +548,14 @@ def create_form(self): input_fields=[ TextField( name="entry_id", - label="Entry ID of the prediction to be loaded into the run.", + label="Entry ID of the prediction to be loaded into the run. (required)", ), InfoField( label="The entry ID should be a unique name given to the uploaded prediction.", ), TextField( name="uniprot_ids", - label="Protein IDs of all proteins used in the sequence. ", + label="Protein IDs of all proteins used in the sequence.", ), InfoField( label="Please provide a list of Protein IDs separated by a comma \n e.g.: P68871, P69905, Q5VSL9." 
diff --git a/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx index 4122e09ee..48a9326b1 100644 --- a/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/monomer-structure-upload.tsx @@ -200,13 +200,13 @@ export const MonomerStructureUpload = () => { { type: "text", name: "model_used", - label: "Alphafold Version Number (required):", + label: "Alphafold Version Number:", isVisible: true, }, { type: "text", name: "gene", - label: "Gene Name (required):", + label: "Gene Name:", isVisible: true, }, { diff --git a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx index 11ec79ea6..98627e491 100644 --- a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx @@ -202,7 +202,7 @@ export const MultimerStructureUpload = () => { { type: "text", name: "model_used", - label: "AlphaFold Model used to predict the structure (required)", + label: "AlphaFold Model used to predict the structure", isVisible: true, }, { From 7fd00cd5bb75dd3e52eb889331c354239309999a Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 6 May 2026 16:34:01 +0200 Subject: [PATCH 218/240] refactor: update x-axis annotation and add vertical line to right subplot --- .../data_analysis/crosslinking_validation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 5aaf2f35f..bb6a8fd94 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -794,6 
+794,14 @@ def diagrams_of_crosslinking_validation_data( x_value=crosslinker_length, column=1, ) + # assumes that accepted_deviation_upper_bound = 0, if not set + add_vertical_line_with_annotation_in_legend( + fig=histogram, + dash="dash", + annotation=f"allowed deviation upper bound: {accepted_deviation_upper_bound}Å", + x_value=np.log10(crosslinker_length + accepted_deviation_upper_bound), + column=2, + ) mean_of_predicted_lengths = crosslinker_df["alphafold_distance"].mean() if len(crosslinker_df) == 1: @@ -817,7 +825,7 @@ def diagrams_of_crosslinking_validation_data( name_a=f"Predictions matching CLs (intra: {valid_intra}, inter: {valid_inter})", name_b=f"Predictions not matching CLs (intra: {invalid_intra}, inter: {invalid_inter})", heading=f"Predicted distances for {structures_to_validate_str} with crosslinker {crosslinker}, mean +/- 2 σ", - x_title="Distance (Å)", + x_title="Distance in Å", y_title="Count", overlay=True, visual_transformation="linear", @@ -1119,7 +1127,7 @@ def add_split_traces(values: pd.Series, name: str, color: str, show_legend: bool tick_text.append(f"{(split_x_axis_at + 10**i):.4g}") _ = fig.update_xaxes( - title_text=f"{xaxis_label} (Log)", + title_text=f"{xaxis_label} (Log10)", tickvals=tick_vals, ticktext=tick_text, row=1, From 9a3cedeb740ff73ff59cfcf06abe20f8e5ee711e Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 6 May 2026 16:47:18 +0200 Subject: [PATCH 219/240] chore: run black and add comments --- .../data_analysis/crosslinking_validation.py | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index bb6a8fd94..32d6528ba 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -794,14 +794,15 @@ def diagrams_of_crosslinking_validation_data( x_value=crosslinker_length, column=1, ) - # 
assumes that accepted_deviation_upper_bound = 0, if not set - add_vertical_line_with_annotation_in_legend( - fig=histogram, - dash="dash", - annotation=f"allowed deviation upper bound: {accepted_deviation_upper_bound}Å", - x_value=np.log10(crosslinker_length + accepted_deviation_upper_bound), - column=2, - ) + if accepted_deviation_upper_bound == 0: + # also add rightmost line (upper_bound/CL length to right subplot) + histogram.add_vline( + x=np.log10(crosslinker_length), + line_color=PLOT_PRIMARY_COLOR, + line_dash="solid", + line_width=2, + col=2, + ) mean_of_predicted_lengths = crosslinker_df["alphafold_distance"].mean() if len(crosslinker_df) == 1: @@ -851,6 +852,14 @@ def diagrams_of_crosslinking_validation_data( x_value=crosslinker_length + accepted_deviation_upper_bound, column=1, ) + # also add rightmost line (upper_bound/CL length to right subplot) + histogram.add_vline( + x=np.log10(crosslinker_length + accepted_deviation_upper_bound), + line_color=PLOT_PRIMARY_COLOR, + line_dash="dash", + line_width=2, + col=2, + ) if ( math.floor(mean_minus_two_std) <= crosslinker_length + accepted_deviation_upper_bound From 5ecce6537f2cc954c99756deb7061f5c6a1da96d Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 7 May 2026 11:13:54 +0200 Subject: [PATCH 220/240] fix tests --- backend/tests/main/test_views_settings.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/backend/tests/main/test_views_settings.py b/backend/tests/main/test_views_settings.py index 95e7a7c80..dd70744ad 100644 --- a/backend/tests/main/test_views_settings.py +++ b/backend/tests/main/test_views_settings.py @@ -24,7 +24,7 @@ def test_get_cl_defaults(monkeypatch): request.method = "GET" mock_defaults_operator = mock.Mock() - mock_defaults_operator.get_all_defaults.return_value = { + mock_defaults_operator.read_default.return_value = { "DSSO": { "cl_length": 10.3, "cl_upper_deviation": 1.0, @@ -59,16 +59,19 @@ def test_update_cl_default_success(monkeypatch): 
request.body = json.dumps(payload).encode("utf-8") mock_defaults_operator = mock.Mock() + mock_defaults_operator.read_default.return_value = {} monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) response = update_cl_default(request) mock_defaults_operator.write_default.assert_called_once_with( - name="DSSO", + name="crosslinker_lengths", value={ - "cl_length": 10.3, - "cl_upper_deviation": 1.0, - "cl_lower_deviation": 1.2, + "DSSO": { + "cl_length": 10.3, + "cl_upper_deviation": 1.0, + "cl_lower_deviation": 1.2, + } }, ) assert response.status_code == 200 @@ -102,11 +105,17 @@ def test_delete_cl_default_success(monkeypatch): request.body = json.dumps(payload).encode("utf-8") mock_defaults_operator = mock.Mock() + mock_defaults_operator.read_default.return_value = { + "DSSO": {"cl_length": 10.3} + } monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) response = delete_cl_default(request) - mock_defaults_operator.delete_default.assert_called_once_with("DSSO") + mock_defaults_operator.write_default.assert_called_once_with( + name="crosslinker_lengths", + value={} + ) assert response.status_code == 200 response_data = json.loads(response.content.decode("utf-8")) From ba3e4e3acfc34ee2dd36324e3de2af45852b626d Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 7 May 2026 11:14:18 +0200 Subject: [PATCH 221/240] fix crosslink spelling --- .../other-settings/cl-default-settings.tsx | 18 +++++++++--------- .../src/components/app/settings/settings.tsx | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx b/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx index 1dade5f31..d01a854d7 100644 --- a/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx +++ b/frontend/src/components/app/settings/other-settings/cl-default-settings.tsx @@ -143,14 +143,14 @@ export const CrosslinkDefaultUpload = () => { }); if 
(response?.success) { notify({ - title: "Cross Link default deleted", + title: "Crosslink default deleted", message: response.message as string, type: "success", isClosingAutomatically: true, }); } else { notify({ - title: "Cross Link default deletion failed", + title: "Crosslink default deletion failed", message: response?.message ?? "Unknown error", type: "error", isClosingAutomatically: true, @@ -164,7 +164,7 @@ export const CrosslinkDefaultUpload = () => {
{ { type: "text", name: "cl_name", - label: "Name of the Cross-Link", + label: "Name of the crosslink", isVisible: true, }, { type: "number", name: "cl_length", - label: "Length of the specified Cross-Link:", + label: "Length of the specified crosslink:", isVisible: true, }, { type: "number", name: "cl_upper_deviation", - label: "Upper deviation of the specified Cross-Link:", + label: "Upper deviation of the specified crosslink:", isVisible: true, }, { type: "number", name: "cl_lower_deviation", - label: "Lower deviation of the specified Cross-Link:", + label: "Lower deviation of the specified crosslink:", isVisible: true, }, ], @@ -215,9 +215,9 @@ export const CrosslinkDefaultUpload = () => { ); }} /> - + {crosslinkDefaultList.length === 0 ? ( - + ) : ( {crosslinkDefaultList.map((ps) => ( diff --git a/frontend/src/components/app/settings/settings.tsx b/frontend/src/components/app/settings/settings.tsx index 7792d46ef..adde4c13f 100644 --- a/frontend/src/components/app/settings/settings.tsx +++ b/frontend/src/components/app/settings/settings.tsx @@ -139,7 +139,7 @@ export const Settings: React.FC = ({ id={"crosslink-defaults"} isActive={selectedSetting === "crosslink-defaults"} icon={"handleCrosslinkingIcon"} - text={"Cross-Links Defaults"} + text={"Crosslinks Defaults"} onPress={() => { handleSwitchSection("crosslink-defaults"); }} From af1778ad2e4c2bc77278dd831451e6c95137ac48 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 7 May 2026 11:14:32 +0200 Subject: [PATCH 222/240] fix update bug --- backend/main/views_settings.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 9b62b33e3..fdd716608 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -604,17 +604,16 @@ def update_cl_default(request): else 0 ) - cl_default_dict = { - cl_name: { + try: + defaults_operator = DefaultsOperator() + all_cl_defaults = 
defaults_operator.read_default(name="crosslinker_lengths") + all_cl_defaults[cl_name] = { "cl_length": cl_length, "cl_upper_deviation": cl_upper_deviation, "cl_lower_deviation": cl_lower_deviation, } - } - try: - defaults_operator = DefaultsOperator() defaults_operator.write_default( - name="crosslinker_lengths", value=cl_default_dict + name="crosslinker_lengths", value=all_cl_defaults ) return JsonResponse( { From ec85ed2f1dad6413a18c9cf27539f1ca85b7b212 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 7 May 2026 11:18:03 +0200 Subject: [PATCH 223/240] format --- backend/tests/main/test_views_settings.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/backend/tests/main/test_views_settings.py b/backend/tests/main/test_views_settings.py index dd70744ad..b8eca6449 100644 --- a/backend/tests/main/test_views_settings.py +++ b/backend/tests/main/test_views_settings.py @@ -105,16 +105,13 @@ def test_delete_cl_default_success(monkeypatch): request.body = json.dumps(payload).encode("utf-8") mock_defaults_operator = mock.Mock() - mock_defaults_operator.read_default.return_value = { - "DSSO": {"cl_length": 10.3} - } + mock_defaults_operator.read_default.return_value = {"DSSO": {"cl_length": 10.3}} monkeypatch.setattr(PATCH_PATH, lambda: mock_defaults_operator) response = delete_cl_default(request) mock_defaults_operator.write_default.assert_called_once_with( - name="crosslinker_lengths", - value={} + name="crosslinker_lengths", value={} ) assert response.status_code == 200 From 01b0e0e5089b900a03fbd47c19ab899ef6e6973c Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 11 May 2026 19:54:45 +0200 Subject: [PATCH 224/240] feat: add crosslinker colors to settings --- backend/main/urls.py | 10 + backend/main/views_settings.py | 30 ++ .../other-settings/cl-colors-settings.tsx | 294 ++++++++++++++++++ .../app/settings/other-settings/index.ts | 1 + .../src/components/app/settings/settings.tsx | 11 + 
.../molstar-viewer/molstar-viewer.config.ts | 4 +- .../molstar-viewer/molstar-viewer.service.ts | 23 +- .../shared/molstar-viewer/molstar-viewer.tsx | 5 +- 8 files changed, 373 insertions(+), 5 deletions(-) create mode 100644 frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx diff --git a/backend/main/urls.py b/backend/main/urls.py index 79ffe4303..8b33843fe 100644 --- a/backend/main/urls.py +++ b/backend/main/urls.py @@ -130,6 +130,16 @@ views_settings.delete_cl_default, name="delete_cl_default", ), + path( + "api/get_cl_colors", + views_settings.get_cl_colors, + name="get_cl_colors", + ), + path( + "api/update_cl_colors", + views_settings.update_cl_colors, + name="update_cl_colors", + ), path( "api/load_ptm_settings", views_settings.load_ptm_settings, diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index fdd716608..7fd115c8a 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -661,6 +661,36 @@ def delete_cl_default(request): ) +# <--- Crosslink colors ---> + + +def get_cl_colors(request): + operator = DefaultsOperator() + colors = operator.read_default(name="crosslinker_colors") + return JsonResponse(colors or {}, safe=False) + + +def update_cl_colors(request): + if request.method == "POST": + data = json.loads(request.body) + + try: + operator = DefaultsOperator() + operator.write_default(name="crosslinker_colors", value=data) + + return JsonResponse( + {"success": True, "message": "Colours updated successfully."}, + status=200, + ) + except Exception: + return JsonResponse( + {"success": False, "message": "Could not update colours."}, + status=405, + ) + + return JsonResponse({"success": False, "message": "Invalid method"}, status=405) + + # <--- Databases ---> diff --git a/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx b/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx new file mode 100644 index 000000000..3399a101b --- 
/dev/null +++ b/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx @@ -0,0 +1,294 @@ +import { useNotification } from "@protzilla/app"; +import { DeleteModal, Form, SecondaryButton, SectionTitle, Text } from "@protzilla/core"; +import { useToggleableState } from "@protzilla/hooks"; +import { spacing } from "@protzilla/theme"; +import { callApi, callApiWithParameters } from "@protzilla/utils"; +import { useEffect, useState } from "react"; +import { styled } from "styled-components"; + +import { CrosslinkerType } from "../../../core/shared/molstar-viewer/crosslinker-processing"; +import { CROSSLINK_DEFAULT_COLORS } from "../../../core/shared/molstar-viewer/molstar-viewer.config"; + +const CurrentColorsTitle = styled(SectionTitle)` + padding-top: ${spacing("large")}; + padding-bottom: ${spacing("small")}; + + margin: 0; +`; + +const CurrentColorsList = styled.div` + display: flex; + flex-direction: column; + gap: ${spacing("verySmall")}; +`; + +const CurrentColorsHeader = styled.div` + display: flex; + flex-direction: row; + align-items: flex-end; + justify-content: space-between; +`; + +const ColorEntryContainer = styled.div` + display: flex; + flex-direction: row; + align-items: center; + justify-content: space-between; + + padding-left: ${spacing("listIndentation")}; + padding-top: ${spacing("verySmall")}; + padding-bottom: ${spacing("verySmall")}; +`; + +const ColorInfo = styled.div` + display: flex; + flex-direction: row; + align-items: center; + gap: ${spacing("small")}; +`; + +const ColorPreview = styled.div<{ color: string }>` + width: 24px; + height: 24px; + border-radius: 4px; + border: 1px solid black; + + background-color: ${({ color }) => color}; +`; + +interface ColorEntryProps { + label: string; + color: number; +} + +const toHexColor = (color: number) => `#${color.toString(16).padStart(6, "0")}`; + +const ColorEntry = ({ label, color }: ColorEntryProps) => { + return ( + + + + + + + + ); +}; + +const entries = [ + { label: 
"Valid intra-crosslinks", key: CrosslinkerType.ValidIntra }, + { label: "Invalid intra-crosslinks", key: CrosslinkerType.InvalidIntra }, + { label: "Valid inter-crosslinks", key: CrosslinkerType.ValidInter }, + { label: "Invalid inter-crosslinks", key: CrosslinkerType.InvalidInter }, +]; + +export const CrosslinkColors = () => { + const notify = useNotification(); + const [colors, setColors] = useState(CROSSLINK_DEFAULT_COLORS); + const [isDeleteModalOpen, openDeleteModal, closeDeleteModal] = useToggleableState(false); + const [formKey, setFormKey] = useState(0); + + useEffect(() => { + const loadColors = async () => { + const result = await callApi("get_cl_colors"); + + if (result && Object.keys(result).length > 0) { + setColors(result); + } + }; + + void loadColors(); + }, []); + + const parseColor = (value: unknown): number => { + const str = String(value).trim(); + + if (/^-?\d+$/.test(str)) return Number(str); + + if (str.startsWith("0x")) { + const parsed = parseInt(str, 16); + if (!Number.isNaN(parsed)) return parsed; + } + + if (str.startsWith("#")) { + const parsed = parseInt(str.slice(1), 16); + if (!Number.isNaN(parsed)) return parsed; + } + + if (/^[0-9a-fA-F]{6}$/.test(str)) { + return parseInt(str, 16); + } + + throw new Error("Invalid colour format"); + }; + + const updateColors = (prev: typeof CROSSLINK_DEFAULT_COLORS, data: Record) => { + const update = (key: CrosslinkerType) => { + const input = data[key]; + + // if the field was left empty, we keep the old color + if (input == null || (typeof input === "string" && input.trim() === "")) { + return prev[key]; + } + + // non-empty fields are validated + try { + return parseColor(input); + } catch { + notify({ + title: "Invalid colour input", + message: + `Invalid value for ${key}.` + + "Please enter a valid colour-code" + + "(e.g. 
#FF00AA or 0xFF00AA or 6-digit hex code).", + type: "error", + isClosingAutomatically: true, + }); + + throw new Error("Abort update"); + } + }; + + return { + [CrosslinkerType.ValidIntra]: update(CrosslinkerType.ValidIntra), + [CrosslinkerType.InvalidIntra]: update(CrosslinkerType.InvalidIntra), + [CrosslinkerType.ValidInter]: update(CrosslinkerType.ValidInter), + [CrosslinkerType.InvalidInter]: update(CrosslinkerType.InvalidInter), + }; + }; + + const handleChangeColors = async (data: Record) => { + try { + const updated = updateColors(colors, data); + setColors(updated); + + const response = await callApiWithParameters("update_cl_colors", updated); + if (response?.success) { + notify({ + title: "Crosslink colour update", + message: response.message as string, + type: "success", + isClosingAutomatically: true, + }); + } else { + notify({ + title: "Crosslink colour update failed", + message: response.message ?? "Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + setFormKey((prev) => prev + 1); + } catch { + return; + } + }; + + const handleResetToDefaults = async () => { + const updated = CROSSLINK_DEFAULT_COLORS; + setColors(updated); + + const response = await callApiWithParameters("update_cl_colors", updated); + if (response?.success) { + notify({ + title: "Crosslink colour reset", + message: response.message as string, + type: "success", + isClosingAutomatically: true, + }); + } else { + notify({ + title: "Crosslink colour reset failed", + message: response.message ?? "Unknown error", + type: "error", + isClosingAutomatically: true, + }); + } + closeDeleteModal(); + }; + + const handleDelete = () => { + openDeleteModal(); + }; + + return ( +
+ + + + { + handleChangeColors(data).catch(console.error); + }} + /> + + + + + + + + + {entries.map((entry) => ( + + ))} + + + { + void handleResetToDefaults; + }} + title={ + `All crosslink colours will be resetted to the developer defaults.` + + `Your currently selected colours will be permanently deleted. Would you like to proceed?` + } + /> +
+ ); +}; diff --git a/frontend/src/components/app/settings/other-settings/index.ts b/frontend/src/components/app/settings/other-settings/index.ts index c55241e68..1cb9d4e23 100644 --- a/frontend/src/components/app/settings/other-settings/index.ts +++ b/frontend/src/components/app/settings/other-settings/index.ts @@ -5,3 +5,4 @@ export * from "./ptm-vis-settings"; export * from "./monomer-structure-upload"; export * from "./multimer-structure-upload"; export * from "./cl-default-settings"; +export * from "./cl-colors-settings"; diff --git a/frontend/src/components/app/settings/settings.tsx b/frontend/src/components/app/settings/settings.tsx index adde4c13f..a0b1c54c2 100644 --- a/frontend/src/components/app/settings/settings.tsx +++ b/frontend/src/components/app/settings/settings.tsx @@ -4,6 +4,7 @@ import { useState } from "react"; import { styled } from "styled-components"; import { + CrosslinkColors, CrosslinkDefaultUpload, DatabaseSettings, GitHub, @@ -144,6 +145,15 @@ export const Settings: React.FC = ({ handleSwitchSection("crosslink-defaults"); }} /> + { + handleSwitchSection("crosslink-colors"); + }} + /> = ({ {selectedSetting === "monomer-structure-upload" && } {selectedSetting === "multimer-structure-upload" && } {selectedSetting === "crosslink-defaults" && } + {selectedSetting === "crosslink-colors" && } {selectedSetting === "github" && } = { +export type CrosslinkColors = Record; + +export const CROSSLINK_DEFAULT_COLORS: CrosslinkColors = { [CrosslinkerType.ValidIntra]: 0xe03e00, // bright orange-red [CrosslinkerType.InvalidIntra]: 0xfca311, // pale yellow-orange [CrosslinkerType.ValidInter]: 0x8a2be2, // bright purple diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts index 93b450c89..7cf16f9a7 100644 --- a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts +++ 
b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts @@ -1,4 +1,5 @@ import { useNotification } from "@protzilla/app"; +import { callApi } from "@protzilla/utils"; import { PluginUIContext } from "molstar/lib/mol-plugin-ui/context"; import { MolScriptBuilder as MS } from "molstar/lib/mol-script/language/builder"; @@ -7,12 +8,13 @@ import { CrosslinkerType, generateCrosslinkCIF, } from "./crosslinker-processing"; -import { CROSSLINKER_COLORS } from "./molstar-viewer.config"; +import { CROSSLINK_DEFAULT_COLORS, CrosslinkColors } from "./molstar-viewer.config"; export async function addCrosslinks( plugin: PluginUIContext, cifText: string, crosslinks: CrosslinkerInformation[], + crosslinkColors: CrosslinkColors, ) { const { crosslinkerCifText: crosslinkerCifText, crosslinkerGroups: crosslinkerGroups } = generateCrosslinkCIF(cifText, crosslinks); @@ -42,12 +44,29 @@ export async function addCrosslinks( await plugin.builders.structure.representation.addRepresentation(component, { type: "line", color: "uniform", - colorParams: { value: CROSSLINKER_COLORS[type] }, + colorParams: { value: crosslinkColors[type] }, }); } } } +export const initCrosslinkColors = async (): Promise => { + try { + const userColors = await callApi("get_cl_colors"); + + if (userColors && Object.keys(userColors).length > 0) { + return { + ...CROSSLINK_DEFAULT_COLORS, + ...userColors, + }; + } + + return CROSSLINK_DEFAULT_COLORS; + } catch { + return CROSSLINK_DEFAULT_COLORS; + } +}; + export function handleError( error: unknown, errorTitle: string, diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx index 34ee45713..56059ea7e 100644 --- a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.tsx @@ -6,7 +6,7 @@ import { renderReact18 } from "molstar/lib/mol-plugin-ui/react18"; import 
React, { useEffect, useRef, useState } from "react"; import { MolstarViewerProps } from "./molstar-viewer.props"; -import { addCrosslinks, handleError } from "./molstar-viewer.service"; +import { addCrosslinks, handleError, initCrosslinkColors } from "./molstar-viewer.service"; import { CanvasWrapper, Container } from "./styles"; import "molstar/lib/mol-plugin-ui/skin/base/base.scss"; @@ -45,7 +45,8 @@ const MolstarViewer: React.FC = ({ cifText, crosslinks }) => // add crosslinks to structure, if available if (crosslinks !== undefined) { - await addCrosslinks(plugin, cifText, crosslinks); + const crosslinkColors = await initCrosslinkColors(); + await addCrosslinks(plugin, cifText, crosslinks, crosslinkColors); } setIsLoading(false); From f484e1b48061c362c6eefb655ba263d0d5b1c5df Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 11 May 2026 22:59:00 +0200 Subject: [PATCH 225/240] fix: fix reset to default colors --- .../app/settings/other-settings/cl-colors-settings.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx b/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx index 3399a101b..c73ce238c 100644 --- a/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx +++ b/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx @@ -282,10 +282,10 @@ export const CrosslinkColors = () => { isOpen={isDeleteModalOpen} onClose={closeDeleteModal} onConfirm={() => { - void handleResetToDefaults; + void handleResetToDefaults(); }} title={ - `All crosslink colours will be resetted to the developer defaults.` + + `All crosslink colours will be reset to the developer defaults.` + `Your currently selected colours will be permanently deleted. 
Would you like to proceed?` } /> From 53499009d4e31256c61d3b74cedc0b0b71f00ada Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 12 May 2026 00:23:49 +0200 Subject: [PATCH 226/240] feat: add icon to new settings tab --- .../src/components/app/settings/settings.tsx | 2 +- .../core/shared/icon/icons/color_brush.svg | 23 +++++++++++++++++++ .../core/shared/icon/icons/index.ts | 1 + 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 frontend/src/components/core/shared/icon/icons/color_brush.svg diff --git a/frontend/src/components/app/settings/settings.tsx b/frontend/src/components/app/settings/settings.tsx index a0b1c54c2..972dd096b 100644 --- a/frontend/src/components/app/settings/settings.tsx +++ b/frontend/src/components/app/settings/settings.tsx @@ -148,7 +148,7 @@ export const Settings: React.FC = ({ { handleSwitchSection("crosslink-colors"); diff --git a/frontend/src/components/core/shared/icon/icons/color_brush.svg b/frontend/src/components/core/shared/icon/icons/color_brush.svg new file mode 100644 index 000000000..e1cdc4386 --- /dev/null +++ b/frontend/src/components/core/shared/icon/icons/color_brush.svg @@ -0,0 +1,23 @@ + + + + + + + + + + + diff --git a/frontend/src/components/core/shared/icon/icons/index.ts b/frontend/src/components/core/shared/icon/icons/index.ts index b69cb5b6f..abd4a95b8 100644 --- a/frontend/src/components/core/shared/icon/icons/index.ts +++ b/frontend/src/components/core/shared/icon/icons/index.ts @@ -13,6 +13,7 @@ export { default as chevronUp } from "./chevron-up.svg?react"; export { default as clipboard } from "./clipboard.svg?react"; export { default as close } from "./close.svg?react"; export { default as complete } from "./complete.svg?react"; +export { default as colorBrush } from "./color_brush.svg?react"; export { default as data_analysis } from "./data_analysis.svg?react"; export { default as data_integration } from "./data_integration.svg?react"; 
export { default as data_preprocessing } from "./data_preprocessing.svg?react"; From 91e72446d39dd9a0201335e482c089fb9817bc27 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 13 May 2026 09:45:21 +0200 Subject: [PATCH 227/240] fix bugs pointed out in review --- backend/main/views_settings.py | 113 +++++++++++++----- .../alphafold_protein_structure_load.py | 12 +- .../multimer-structure-upload.tsx | 2 +- 3 files changed, 93 insertions(+), 34 deletions(-) diff --git a/backend/main/views_settings.py b/backend/main/views_settings.py index 0874b6c03..f9127adf6 100644 --- a/backend/main/views_settings.py +++ b/backend/main/views_settings.py @@ -248,6 +248,7 @@ def check_and_copy_files_to_directory(file_names: list, target_dir: str): source_file = settings.FILE_UPLOAD_TEMP_DIR / file_name success, message = copy_file_to_directory(source_file, target_dir) if not success: + shutil.rmtree(target_dir, ignore_errors=True) return False, message return True, "All files successfully uploaded" @@ -325,13 +326,6 @@ def extend_metadata_csv( metadata_df: pd.DataFrame, ) -> None: try: - mask = ( - existing_metadata_df["entry_id"].astype(str).str.upper() == entry_id.upper() - ) - if mask.any(): - msg = f'Entry ID "{entry_id}" not unique. Entry IDs are compared case insensitively, so "ABC" and "abc" are treated as the same ID.' - return False, msg - combined = pd.concat([existing_metadata_df, metadata_df], ignore_index=True) combined.to_csv(metadata_csv, index=False) return True, f'"{metadata_csv}" updated successfully.' 
@@ -382,7 +376,24 @@ def upload_monomer_structure(request): pae = data.get("pae") fasta_file = data.get("fasta_file") - # add row to metadata csv + if not entry_id: + return JsonResponse( + data={ + "success": False, + "message": "The entry Id cannot be empty or None.", + }, + status=500, + ) + + if not uniprot_id: + return JsonResponse( + data={ + "success": False, + "message": "Uniprot Id cannot be empty or None.", + }, + status=500, + ) + ALPHAFOLD_MONOMER_PATH.mkdir(parents=True, exist_ok=True) metadata_csv = AF_MONOMER_METADATA_CSV_PATH @@ -404,22 +415,18 @@ def upload_monomer_structure(request): "entry_id": entry_id, "uniprot_accession": uniprot_id, "model_created_date": timestamp, - "gene": gene, - "model_used": model_used, + "gene": "" if gene is None else gene, + "model_used": "" if model_used is None else model_used, } metadata_df = pd.DataFrame([new_row]) - success, message = extend_metadata_csv( - entry_id=entry_id, - metadata_csv=metadata_csv, - existing_metadata_df=existing_metadata_df, - metadata_df=metadata_df, + + mask = ( + existing_metadata_df["entry_id"].astype(str).str.upper() == entry_id.upper() ) - if not success: - return JsonResponse( - {"success": False, "message": message}, - status=500, - ) + if mask.any(): + msg = f'Entry ID "{entry_id}" not unique. Entry IDs are compared case insensitively, so "ABC" and "abc" are treated as the same ID.' 
+ return False, msg # Copy files to source directory out of temp directory @@ -428,12 +435,27 @@ def upload_monomer_structure(request): success, message = check_and_copy_files_to_directory( file_names=file_names, target_dir=target_dir ) + if not success: return JsonResponse( {"success": False, "message": message}, status=500, ) + # add row to metadata csv + success, message = extend_metadata_csv( + entry_id=entry_id, + metadata_csv=metadata_csv, + existing_metadata_df=existing_metadata_df, + metadata_df=metadata_df, + ) + if not success: + shutil.rmtree(target_dir, ignore_errors=True) + return JsonResponse( + {"success": False, "message": message}, + status=500, + ) + return JsonResponse( { "success": True, @@ -499,7 +521,24 @@ def upload_multimer_structure(request): ALPHAFOLD_MULTIMER_PATH.mkdir(parents=True, exist_ok=True) - # add row to metadata csv + if not entry_id: + return JsonResponse( + data={ + "success": False, + "message": "The entry Id cannot be empty or None.", + }, + status=500, + ) + + if not uniprot_ids: + return JsonResponse( + data={ + "success": False, + "message": "Uniprot Ids cannot be empty or None.", + }, + status=500, + ) + metadata_csv = AF_MULTIMER_METADATA_CSV_PATH expected_columns = [ "entry_id", @@ -520,21 +559,17 @@ def upload_multimer_structure(request): "entry_id": entry_id, "uniprot_ids": uniprot_ids_as_list, "model_created_date": timestamp, - "model_used": model_used, + "model_used": "" if model_used is None else model_used, } metadata_df = pd.DataFrame([new_row]) - success, message = extend_metadata_csv( - entry_id=entry_id, - metadata_csv=metadata_csv, - existing_metadata_df=existing_metadata_df, - metadata_df=metadata_df, + + mask = ( + existing_metadata_df["entry_id"].astype(str).str.upper() == entry_id.upper() ) - if not success: - return JsonResponse( - {"success": False, "message": message}, - status=500, - ) + if mask.any(): + msg = f'Entry ID "{entry_id}" not unique. 
Entry IDs are compared case insensitively, so "ABC" and "abc" are treated as the same ID.' + return False, msg # Copy files to source directory out of temp directory @@ -550,6 +585,20 @@ def upload_multimer_structure(request): file_names=file_names, target_dir=target_dir ) if not success: + return JsonResponse( + data={"success": False, "message": message}, + status=500, + ) + + # add row to metadata csv + success, message = extend_metadata_csv( + entry_id=entry_id, + metadata_csv=metadata_csv, + existing_metadata_df=existing_metadata_df, + metadata_df=metadata_df, + ) + if not success: + shutil.rmtree(target_dir, ignore_errors=True) return JsonResponse( {"success": False, "message": message}, status=500, diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 7519c1d9b..4fd343fac 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -862,6 +862,16 @@ def upload_multimer_prediction( will propagate after cleanup of any temporary directory. """ + if not entry_id: + msg = "The entry Id cannot be empty or None." + logger.error(msg) + raise ValueError(msg) + + if not uniprot_ids: + msg = "Uniprot Ids cannot be empty or None." 
+ logger.error(msg) + raise ValueError(msg) + messages = [] temp_dir, work_dir = get_correct_af_directories( @@ -878,7 +888,7 @@ def upload_multimer_prediction( "entry_id": entry_id, "uniprot_ids": uniprot_ids_as_list, "model_created_date": timestamp, - "model_used": model_used, + "model_used": "" if model_used is None else model_used, } try: diff --git a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx index 98627e491..57b4a485f 100644 --- a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx @@ -189,7 +189,7 @@ export const MultimerStructureUpload = () => { { type: "text", name: "uniprot_ids", - label: "Protein IDs of all proteins used in the sequence (required):", + label: "Protein IDs of all proteins used in the sequence:", isVisible: true, }, { From 841846681b21852302f2568f75a67f3788bd6079 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Wed, 13 May 2026 09:47:26 +0200 Subject: [PATCH 228/240] add required to uniprot ids field --- backend/protzilla/methods/importing.py | 2 +- .../app/settings/other-settings/multimer-structure-upload.tsx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index 690eb6151..9c8223ec7 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -555,7 +555,7 @@ def create_form(self): ), TextField( name="uniprot_ids", - label="Protein IDs of all proteins used in the sequence.", + label="Protein IDs of all proteins used in the sequence. (required)", ), InfoField( label="Please provide a list of Protein IDs separated by a comma \n e.g.: P68871, P69905, Q5VSL9." 
diff --git a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx index 57b4a485f..c86b4272f 100644 --- a/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx +++ b/frontend/src/components/app/settings/other-settings/multimer-structure-upload.tsx @@ -189,7 +189,7 @@ export const MultimerStructureUpload = () => { { type: "text", name: "uniprot_ids", - label: "Protein IDs of all proteins used in the sequence:", + label: "Protein IDs of all proteins used in the sequence: (required)", isVisible: true, }, { From 73c1438cf19a2a68daf070dafae0239ea345a216 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 14 May 2026 10:38:19 +0200 Subject: [PATCH 229/240] fix: fix frontend crash after alphafold query generation --- backend/main/views.py | 2 +- backend/protzilla/importing/query_generation.py | 4 +++- frontend/src/components/app/run-screen/run-screen.tsx | 2 +- frontend/src/utils/protzilla-types.ts | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/backend/main/views.py b/backend/main/views.py index ec5ac6925..a45e537d4 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -712,7 +712,7 @@ def get_downloads_from_step(request: HttpRequest): { "success": True, "message": "Got the available download(s) for the step", - "data": downloads, + "data": {"json_downloads": downloads}, } ) diff --git a/backend/protzilla/importing/query_generation.py b/backend/protzilla/importing/query_generation.py index 937788a16..32e43c755 100644 --- a/backend/protzilla/importing/query_generation.py +++ b/backend/protzilla/importing/query_generation.py @@ -101,5 +101,7 @@ def generate_alphafold_query_json( ) return dict( messages=messages, - downloads=OutputItem(output_type=OutputType.DOWNLOAD, value={name: [query]}), + downloads=OutputItem( + output_type=OutputType.DOWNLOAD, value={f"{name}.json": [query]} + ), ) diff --git 
a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index cee1c0d28..ac7c9b54e 100644 --- a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -182,7 +182,7 @@ export const RunScreen: React.FC = () => { const transformDownload = useCallback( (output: StepOutputInfo, response: ApiResponse) => ({ title: output.label, - data: response.data.data, + data: response.data.json_downloads, }), [], ); diff --git a/frontend/src/utils/protzilla-types.ts b/frontend/src/utils/protzilla-types.ts index 4317559bd..e4c02e6c7 100644 --- a/frontend/src/utils/protzilla-types.ts +++ b/frontend/src/utils/protzilla-types.ts @@ -32,7 +32,7 @@ export interface Image { } export interface Download { - data: Record; + json_downloads: Record; } export interface Visualization { From efdd788e974c1cbe76bb470c0d182cb42e68a711 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Thu, 14 May 2026 11:56:37 +0200 Subject: [PATCH 230/240] fix: fix broken test due to file name change of downloadable jsons --- backend/tests/protzilla/importing/test_query_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/protzilla/importing/test_query_generation.py b/backend/tests/protzilla/importing/test_query_generation.py index 4a768664c..5da4c7a3d 100644 --- a/backend/tests/protzilla/importing/test_query_generation.py +++ b/backend/tests/protzilla/importing/test_query_generation.py @@ -30,7 +30,7 @@ def test_generate_alphafold_multimer_json_query_for_multiple_proteins(mock_get): assert len(downloads) == 1 key = list(downloads.keys())[0] - assert key == "name" + assert key == "name.json" parsed_json = downloads[key][0] From aead6b48b967ca5f45ee68f0d3e310ec5504fda6 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 18 May 2026 12:33:50 +0200 Subject: [PATCH 231/240] fix: minor changes because of linter 
warnings --- .../shared/molstar-viewer/molstar-viewer.service.ts | 10 ++++++---- .../components/core/shared/molstar-viewer/styles.ts | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts index 4b6001f9d..d03e89657 100644 --- a/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts +++ b/frontend/src/components/core/shared/molstar-viewer/molstar-viewer.service.ts @@ -1,5 +1,7 @@ import { useNotification } from "@protzilla/app"; import { OrderedSet } from "molstar/lib/mol-data/int"; +import { Loci } from "molstar/lib/mol-model/loci"; +import { StructureElement } from "molstar/lib/mol-model/structure"; import { PluginUIContext } from "molstar/lib/mol-plugin-ui/context"; import { MolScriptBuilder as MS } from "molstar/lib/mol-script/language/builder"; @@ -15,7 +17,7 @@ type PluginWithCrosslinks = PluginUIContext & { }; interface LabelProvider { - label: (loci: any) => string | undefined; + label: (loci: Loci) => string | undefined; } export async function addCrosslinks( @@ -68,13 +70,13 @@ export function overrideLabels(plugin: PluginUIContext) { const defaultProviders = [...labelManager.providers]; labelManager.providers = []; - const getDefaultLabel = (loci: any) => + const getDefaultLabel = (loci: Loci) => defaultProviders .map((p) => p.label(loci)) .filter(Boolean) .join(" | "); - const getAtomIdsFromLoci = (loci: any): string[] => { + const getAtomIdsFromLoci = (loci: StructureElement.Loci): string[] => { const ids: string[] = []; for (const element of loci.elements) { @@ -83,7 +85,7 @@ export function overrideLabels(plugin: PluginUIContext) { for (let i = 0; i < OrderedSet.size(indices); i++) { const idx = OrderedSet.getAt(indices, i); - ids.push(String(atoms.value(idx))); + ids.push(atoms.value(idx)); } } diff --git 
a/frontend/src/components/core/shared/molstar-viewer/styles.ts b/frontend/src/components/core/shared/molstar-viewer/styles.ts index a28b823ee..5f147e511 100644 --- a/frontend/src/components/core/shared/molstar-viewer/styles.ts +++ b/frontend/src/components/core/shared/molstar-viewer/styles.ts @@ -62,7 +62,7 @@ const lightSurfaces = ` .msp-plugin .msp-left-panel-controls-buttons, .msp-plugin .msp-layout-right, .msp-plugin .msp-layout-left, - .msp-plugin .msp-highlight-info + .msp-plugin .msp-highlight-info, `; const layoutBlocks = ` @@ -76,6 +76,7 @@ const layoutBlocks = ` .msp-animation-viewport-controls .msp-animation-viewport-controls-select, .msp-plugin .msp-viewport-controls-panel, + .msp-plugin .msp-no-webgl `; const elementsWithDarkText = ` From f01dac16ddcad16dd7d806d2c09dd7fc3c732381 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Mon, 18 May 2026 13:05:24 +0200 Subject: [PATCH 232/240] fix: user-facing strings --- .../app/settings/other-settings/cl-colors-settings.tsx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx b/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx index c73ce238c..a1d542662 100644 --- a/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx +++ b/frontend/src/components/app/settings/other-settings/cl-colors-settings.tsx @@ -138,8 +138,8 @@ export const CrosslinkColors = () => { notify({ title: "Invalid colour input", message: - `Invalid value for ${key}.` + - "Please enter a valid colour-code" + + `Invalid value for ${key}. ` + + "Please enter a valid colour-code " + "(e.g. #FF00AA or 0xFF00AA or 6-digit hex code).", type: "error", isClosingAutomatically: true, @@ -215,13 +215,13 @@ export const CrosslinkColors = () => {
Date: Mon, 18 May 2026 15:49:02 +0200 Subject: [PATCH 233/240] fix: edit svg to take current color --- .../core/shared/icon/icons/handle_crosslinking.svg | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/frontend/src/components/core/shared/icon/icons/handle_crosslinking.svg b/frontend/src/components/core/shared/icon/icons/handle_crosslinking.svg index c933facf8..dd3080bf7 100644 --- a/frontend/src/components/core/shared/icon/icons/handle_crosslinking.svg +++ b/frontend/src/components/core/shared/icon/icons/handle_crosslinking.svg @@ -1,16 +1,8 @@ - - - - + - - + + \ No newline at end of file From 3e8802be522a4c50a0e06bb87a29eb8baf82e93a Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Tue, 19 May 2026 11:30:59 +0200 Subject: [PATCH 234/240] chore: fix typo in query generation form --- backend/protzilla/methods/importing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index c5b887d00..d5bd49217 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -669,7 +669,7 @@ def create_form(self): ), InfoField( label="For each entered ID a number should be entered.\n" - "Numbers should be should be space- or comma-separated." + "Numbers should be space- or comma-separated." 
), NumberField( name="model_seed", From 4e45cedf7f75a062438d21318eb7f051ce443b82 Mon Sep 17 00:00:00 2001 From: Nele Riediger <75492653+NeleRiediger@users.noreply.github.com> Date: Tue, 19 May 2026 16:57:29 +0200 Subject: [PATCH 235/240] fix: fix failing test (hopefully) --- backend/tests/protzilla/importing/test_crosslinking_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/protzilla/importing/test_crosslinking_import.py b/backend/tests/protzilla/importing/test_crosslinking_import.py index abf5f6bd8..d6e26ade0 100644 --- a/backend/tests/protzilla/importing/test_crosslinking_import.py +++ b/backend/tests/protzilla/importing/test_crosslinking_import.py @@ -113,7 +113,7 @@ def mock_execute(*args, **kwargs): return mock monkeypatch.setattr( - "protzilla.importing.crosslinking_import.execute_uniprot_request", + "backend.protzilla.importing.crosslinking_import.execute_uniprot_request", mock_execute, ) From fab59c96e30e0709409b194622a5fec233d3e6e0 Mon Sep 17 00:00:00 2001 From: Anna Polensky Date: Wed, 27 May 2026 09:42:29 +0200 Subject: [PATCH 236/240] fix: matplotlib images are loaded again in the frontend --- backend/main/views.py | 4 +++- frontend/src/components/app/run-screen/run-screen.tsx | 2 +- frontend/src/utils/protzilla-types.ts | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/main/views.py b/backend/main/views.py index a45e537d4..425260d03 100644 --- a/backend/main/views.py +++ b/backend/main/views.py @@ -820,7 +820,9 @@ def get_png_from_step(request: HttpRequest): ) content = output.decode("utf-8") - return JsonResponse({"success": True, "message": "OK", "data": content}) + return JsonResponse( + {"success": True, "message": "OK", "data": {"base64image": content}} + ) def get_current_step_table_data(request): diff --git a/frontend/src/components/app/run-screen/run-screen.tsx b/frontend/src/components/app/run-screen/run-screen.tsx index ac7c9b54e..deec91f79 100644 --- 
a/frontend/src/components/app/run-screen/run-screen.tsx +++ b/frontend/src/components/app/run-screen/run-screen.tsx @@ -204,7 +204,7 @@ export const RunScreen: React.FC = () => { (output: StepOutputInfo, response: ApiResponse) => ({ title: output.label, alt: output.label, - data: "data:image/png;base64," + response.data.data, + data: "data:image/png;base64," + response.data.base64image, }), [], ); diff --git a/frontend/src/utils/protzilla-types.ts b/frontend/src/utils/protzilla-types.ts index e4c02e6c7..07b123564 100644 --- a/frontend/src/utils/protzilla-types.ts +++ b/frontend/src/utils/protzilla-types.ts @@ -28,7 +28,7 @@ export interface ApiResponse { export interface Image { title: string; alt: string; - data: string; + base64image: string; } export interface Download { From f6b2be06cbb80a97de316e4dec9e81eeb688f264 Mon Sep 17 00:00:00 2001 From: Joris <148121026+jorisfu@users.noreply.github.com> Date: Thu, 28 May 2026 08:36:58 +0000 Subject: [PATCH 237/240] 404 Extend CL validation and imports (#423) * feat: add pLDDT to CL results table * feat: add trivial PAE based validation * feat: trivial plDDT based validation * fix: broken formula * fix monomer validation test * refactor: expose PAE as matrix for monomers * feat: PAE for multimers * feat: pLDDT for multimers * feat: add PAE/plDDT consistently to multimer imports * feat: proper PAE validation for multimers * fix: adjust existing cl validation tests * fix: some alphafold import tests * tempfix: bridge monomer plots so method doesn't fail * tempfix: bridge multimer plots so method doesn't fail * chore: remove obsolete todos * chore: adjust some tests * chore: adjust some tests * feat: introduce parsing of _chem_comp table in cif-files * chore: fix existing tests * chore: test for no pLDDT data within cif * chore: tests for PAE based CL validation * chore: tests for pLDDT based CL validation * feat: AF3 to AF2 PAE matrix translation * (AI) tests: PAE matrix reduction * chore: remove unused imports * feat: 
only make bounds fields visible if manual bounds is selected mode * chore: black * fix tests * chore: black * fix more tests --------- Co-authored-by: Tarek Massini --- backend/protzilla/constants/cif_columns.py | 63 +++ backend/protzilla/constants/data_types.py | 2 +- backend/protzilla/constants/option_types.py | 7 + .../data_analysis/crosslinking_validation.py | 301 ++++++++++--- .../alphafold_protein_structure_load.py | 293 ++++++++++++- backend/protzilla/methods/data_analysis.py | 31 +- backend/protzilla/methods/importing.py | 8 +- .../test_crosslinking_validation.py | 328 +++++++++++++- .../test_alphafold_protein_structure_load.py | 399 +++++++++++++----- .../importing/test_pae_matrix_reduction.py | 162 +++++++ .../app/run-screen/node-editor/StepNode.tsx | 2 +- 11 files changed, 1403 insertions(+), 193 deletions(-) create mode 100644 backend/protzilla/constants/cif_columns.py create mode 100644 backend/tests/protzilla/importing/test_pae_matrix_reduction.py diff --git a/backend/protzilla/constants/cif_columns.py b/backend/protzilla/constants/cif_columns.py new file mode 100644 index 000000000..5715f2f32 --- /dev/null +++ b/backend/protzilla/constants/cif_columns.py @@ -0,0 +1,63 @@ +from enum import StrEnum + + +ATOM_SITE_PREFIX = "_atom_site." + + +class ATOM_SITE_COLUMNS(StrEnum): + """ + Enum containing all column names that should be present in + the _atom_site. 
table for mmCIF files from PDB or AFDB + """ + + ID = f"{ATOM_SITE_PREFIX}id" + TYPE_SYMBOL = f"{ATOM_SITE_PREFIX}type_symbol" + LABEL_ATOM_ID = f"{ATOM_SITE_PREFIX}label_atom_id" + LABEL_ALT_ID = f"{ATOM_SITE_PREFIX}label_alt_id" + LABEL_COMP_ID = f"{ATOM_SITE_PREFIX}label_comp_id" + LABEL_ASYM_ID = f"{ATOM_SITE_PREFIX}label_asym_id" + LABEL_ENTITY_ID = f"{ATOM_SITE_PREFIX}label_entity_id" + LABEL_SEQ_ID = f"{ATOM_SITE_PREFIX}label_seq_id" + PDBX_PDB_INS_CODE = f"{ATOM_SITE_PREFIX}pdbx_PDB_ins_code" + CARTN_X = f"{ATOM_SITE_PREFIX}Cartn_x" + CARTN_Y = f"{ATOM_SITE_PREFIX}Cartn_y" + CARTN_Z = f"{ATOM_SITE_PREFIX}Cartn_z" + OCCUPANCY = f"{ATOM_SITE_PREFIX}occupancy" + B_ISO_OR_EQUIV = f"{ATOM_SITE_PREFIX}B_iso_or_equiv" + PDBX_FORMAL_CHARGE = f"{ATOM_SITE_PREFIX}pdbx_formal_charge" + AUTH_SEQ_ID = f"{ATOM_SITE_PREFIX}auth_seq_id" + AUTH_COMP_ID = f"{ATOM_SITE_PREFIX}auth_comp_id" + AUTH_ASYM_ID = f"{ATOM_SITE_PREFIX}auth_asym_id" + AUTH_ATOM_ID = f"{ATOM_SITE_PREFIX}auth_atom_id" + PDBX_PDB_MODEL_NUM = f"{ATOM_SITE_PREFIX}pdbx_PDB_model_num" + + +ATOM_SITE_LABEL_COMP_ID = ATOM_SITE_COLUMNS.LABEL_COMP_ID + +ATOM_SITE_COLUMNS_NUMERIC = [ + ATOM_SITE_COLUMNS.ID, + ATOM_SITE_COLUMNS.LABEL_SEQ_ID, + ATOM_SITE_COLUMNS.CARTN_X, + ATOM_SITE_COLUMNS.CARTN_Y, + ATOM_SITE_COLUMNS.CARTN_Z, + ATOM_SITE_COLUMNS.OCCUPANCY, + ATOM_SITE_COLUMNS.B_ISO_OR_EQUIV, + ATOM_SITE_COLUMNS.AUTH_SEQ_ID, +] + +CHEM_COMP_PREFIX = "_chem_comp." + + +class CHEM_COMP_COLUMNS(StrEnum): + """ + Enum containing all column names that should be present in + the _chem_comp. 
table for mmCIF files from PDB or AFDB + """ + + ID = f"{CHEM_COMP_PREFIX}id" + TYPE = f"{CHEM_COMP_PREFIX}type" + MON_NSTD_FLAG = f"{CHEM_COMP_PREFIX}mon_nstd_flag" + NAME = f"{CHEM_COMP_PREFIX}name" + PDBX_SYNONYMS = f"{CHEM_COMP_PREFIX}pdbx_synonyms" + FORMULA = f"{CHEM_COMP_PREFIX}formula" + FORMULA_WEIGHT = f"{CHEM_COMP_PREFIX}formula_weight" diff --git a/backend/protzilla/constants/data_types.py b/backend/protzilla/constants/data_types.py index 2665cfe3e..94f4db695 100644 --- a/backend/protzilla/constants/data_types.py +++ b/backend/protzilla/constants/data_types.py @@ -23,7 +23,7 @@ class DataKey(StrEnum): GENE_MAPPING_DF = "gene_mapping_df" CIF_DF = "cif_df" AMINO_ACID_SEQUENCES_DF = "amino_acid_sequences_df" - PAE_DF = "pae_df" # pae = predicted aligned error + PAE_MATRIX = "pae_matrix" # pae = predicted aligned error PLDDT_DF = "plddt_df" # plddt = predicted local distance difference test CROSSLINKING_DF = "crosslinking_df" CONFIDENCE_DF = "confidence_df" diff --git a/backend/protzilla/constants/option_types.py b/backend/protzilla/constants/option_types.py index 678efe9cb..30b2a61bc 100644 --- a/backend/protzilla/constants/option_types.py +++ b/backend/protzilla/constants/option_types.py @@ -60,6 +60,13 @@ class PValueColumnName(StrEnum): ptm = "PTM" +class CrosslinkingValidationCriterion(Enum): + manual_bounds = "Manual Bounds (set below)" + max_pae = "CL length +/- maximum PAE between sites" + min_pae = "CL length +/- minimum PAE between sites" + plddt_adjusted = "plDDT adjusted" + + FC_SIGNIFICANCE_COLUMNS = ["Protein ID", "fc_z_score", "fc_significance"] CORRECTED_P_VALUES_COLUMNS = [ "Protein ID", diff --git a/backend/protzilla/data_analysis/crosslinking_validation.py b/backend/protzilla/data_analysis/crosslinking_validation.py index 4f02ee3bb..8d8b765df 100644 --- a/backend/protzilla/data_analysis/crosslinking_validation.py +++ b/backend/protzilla/data_analysis/crosslinking_validation.py @@ -1,14 +1,15 @@ import itertools import ast import math 
-from pipes import stepkinds +from typing import Callable + +from backend.protzilla.constants.option_types import CrosslinkingValidationCriterion import pandas as pd import numpy as np import re import logging -from pandas.io.stata import stata_epoch import plotly.graph_objects as go from plotly.graph_objects import Figure @@ -371,6 +372,9 @@ def monomer_validation( crosslinker_information: dict[str, list[float]], cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, + pae_matrix: np.ndarray[tuple[int, int]], + plddt_df: pd.DataFrame, + validation_criterion: CrosslinkingValidationCriterion, ) -> dict: """ Validates crosslinking data for a monomeric protein structure by checking @@ -382,6 +386,8 @@ def monomer_validation( allowed distance boundaries (e.g., [min_dist, max_dist]). :param cif_df: DataFrame containing mmCIF information. :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. + :param pae_matrix: NumPy 2D array containing AlphaFold PAE data. + :param plddt_df: DataFrame containing AlphaFold pLDDT data. :return: A dictionary containing the validation results and distance metrics. """ protein_id = structure_metadata_df["uniprot_accession"].iloc[0] @@ -392,9 +398,12 @@ def monomer_validation( structure_metadata_df=structure_metadata_df, cif_df=cif_df, amino_acid_sequences_df=amino_acid_sequences_df, + pae_matrix=pae_matrix, + plddt_df=plddt_df, valid_ids=valid_ids, id_column_name="_atom_site.pdbx_sifts_xref_db_acc", structures_to_validate=[protein_id], + validation_criterion=validation_criterion, ) @@ -453,6 +462,47 @@ def get_valid_ids_per_protein_id_from_job_request( return valid_ids +def get_global_residue_index( + position_within_protein: int, # 1-based index + chain_id: str, + cif_df: pd.DataFrame, +): + """ + For multimer PAE lookup: For a position within a given protein in a chain, + get the global 0-based residue index used to find that position in the PAE matrix. 
+ + Note: This assumes that the order of AAs in the _atom_site table corresponds + to the order of residues in the pae matrix and thus the other residue-based tables in + the cif. + + :param position_within_protein: index of the amino acid within the protein (1-based) + :param chain_id: the chain ID of the protein within the complex + :param cif_df: DataFrame containing the _atom_site table of the complex structure + """ + + # Get table with only unique chain and sequence IDs and infer global index + index_lookup_df = ( + cif_df[["_atom_site.label_asym_id", "_atom_site.label_seq_id"]] + .drop_duplicates() + .reset_index(drop=True) + ) + index_lookup_df.reset_index(inplace=True) + + index_lookup_df = index_lookup_df[ + index_lookup_df["_atom_site.label_asym_id"] == chain_id + ] + index_lookup_df = index_lookup_df[ + index_lookup_df["_atom_site.label_seq_id"] == position_within_protein + ] + + if len(index_lookup_df) != 1: + raise ValueError( + "Invalid input: CIF contains multiple atoms mapped to same chain/sequence ID pair!" + ) + + return index_lookup_df["index"].iloc[0] + + def multimer_validation( crosslinking_df: pd.DataFrame, structure_metadata_df: pd.DataFrame, @@ -460,6 +510,9 @@ def multimer_validation( cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, job_request_df: pd.DataFrame, + plddt_df: pd.DataFrame, + pae_matrix: np.ndarray[tuple[int, int]], + validation_criterion: CrosslinkingValidationCriterion, ) -> dict: """ Validates crosslinking data for a multimeric protein complex by checking @@ -477,6 +530,8 @@ def multimer_validation( :param cif_df: DataFrame containing mmCIF information. :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. :param job_request_df: DataFrame containing the loaded AlphaFold job request JSON. + :param plddt_df: DataFrame containing per-residue pLDDT values. + :param pae_matrix: NumPy 2D array containing the PAE values for each residue pair. 
:return: A dictionary containing the validation results and distance metrics. """ valid_ids = get_valid_ids_per_protein_id_from_job_request( @@ -493,6 +548,9 @@ def multimer_validation( valid_ids=valid_ids, id_column_name="_atom_site.label_entity_id", structures_to_validate=structures_to_validate, + pae_matrix=pae_matrix, + plddt_df=plddt_df, + validation_criterion=validation_criterion, ) @@ -502,9 +560,12 @@ def validate_with_angstrom_deviation( structure_metadata_df: pd.DataFrame, cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, - valid_ids: dict, + valid_ids: dict[str, list[int]], id_column_name: str, structures_to_validate: list, + validation_criterion: CrosslinkingValidationCriterion, + plddt_df: pd.DataFrame | None = None, + pae_matrix: np.ndarray[tuple[int, int]] | None = None, ) -> dict: """ Validates crosslinks by comparing the crosslinker lengths with the distances between the linked @@ -516,6 +577,8 @@ def validate_with_angstrom_deviation( :param crosslinker_information: Dictionary mapping crosslinker names to a list of three floats: [crosslinker_length, upper_accepted_deviation, lower_accepted_deviation]. :param cif_df: DataFrame containing CIF information (predicted coordinates of all the protein's atoms). + :param plddt_df: DataFrame containing the local AlphaFold pLDDT values for each residue. + :param pae_matrix: NumPy 2D array containing the PAE values for each residue pair. :param amino_acid_sequences_df: Dataframe that contains all known amino acid sequences. :param valid_ids: Dictionary mapping protein IDs to their valid chain/entity identifiers in the CIF data. :param id_column_name: The column name in the cif_df to use for matching against valid_ids. 
@@ -575,6 +638,47 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: amino_acid_sequences_df=amino_acid_sequences_df, protein_id=protein_id2 ) + def get_site_plddts(crosslink: pd.Series): + if plddt_df is None: + return np.nan, np.nan + + plddt_at_position1 = float( + plddt_df.query( + "residueNumber == @crosslink.crosslinker_position1 and " + + "chainID == @crosslink.Chain_id1" + ).iloc[0]["confidenceScore"] + ) + plddt_at_position2 = float( + plddt_df.query( + "residueNumber == @crosslink.crosslinker_position2 and " + + "chainID == @crosslink.Chain_id2" + ).iloc[0]["confidenceScore"] + ) + + return plddt_at_position1, plddt_at_position2 + + def get_paes(): + if pae_matrix is None: + return np.nan, np.nan + + pae_index_pos1 = get_global_residue_index( + crosslink.crosslinker_position1, crosslink.Chain_id1, cif_df + ) + pae_index_pos2 = get_global_residue_index( + crosslink.crosslinker_position2, crosslink.Chain_id2, cif_df + ) + pae_x_position1 = pae_matrix[ + pae_index_pos1, pae_index_pos2 + ] # Using position1 as scored residue + pae_x_position2 = pae_matrix[ + pae_index_pos2, pae_index_pos1 + ] # Using position2 as scored residue + + return pae_x_position1, pae_x_position2 + + plddt_at_position1, plddt_at_position2 = get_site_plddts(crosslink) + pae_x_position1, pae_x_position2 = get_paes() + predicted_distance = get_distance_between_two_amino_acids_in_angstrom( amino_acid_position1=crosslink.crosslinker_position1, amino_acid_position2=crosslink.crosslinker_position2, @@ -595,13 +699,68 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: f"Missing required information regarding crosslinker length " f"and/or accepted deviation for crosslinker '{crosslink.Crosslinker}'." 
) - # Fallback to default deviation bounds when not explicitly provided - accepted_distance_lower_bound = crosslinker_length - ( - accepted_deviation_lower_bound or crosslinker_length - ) - accepted_distance_upper_bound = ( - accepted_deviation_upper_bound or float("inf") - ) + crosslinker_length + + accepted_distance_lower_bound: float = 0.0 + accepted_distance_upper_bound: float = 0.0 + + match validation_criterion: + case CrosslinkingValidationCriterion.manual_bounds.value: + # Fallback to default deviation bounds when not explicitly provided + accepted_distance_lower_bound = crosslinker_length - ( + accepted_deviation_lower_bound or crosslinker_length + ) + accepted_distance_upper_bound = ( + accepted_deviation_upper_bound or float("inf") + ) + crosslinker_length + + case CrosslinkingValidationCriterion.max_pae.value: + if np.isnan(pae_x_position1) or np.isnan(pae_x_position2): + raise ValueError("No PAE data given.") + + pae_tolerance = max(pae_x_position1, pae_x_position2) + accepted_distance_lower_bound = float( + max(crosslinker_length - pae_tolerance, 0.0) + ) + accepted_distance_upper_bound = float( + crosslinker_length + pae_tolerance + ) + + case CrosslinkingValidationCriterion.min_pae.value: + if np.isnan(pae_x_position1) or np.isnan(pae_x_position2): + raise ValueError("No PAE data given.") + pae_x_position1, pae_x_position2 = get_paes() + pae_tolerance = min(pae_x_position1, pae_x_position2) + accepted_distance_lower_bound = float( + max(crosslinker_length - pae_tolerance, 0.0) + ) + accepted_distance_upper_bound = float( + crosslinker_length + pae_tolerance + ) + + case CrosslinkingValidationCriterion.plddt_adjusted.value: + if np.isnan(plddt_at_position1) or np.isnan(plddt_at_position2): + raise ValueError("No pLDDT data given.") + + get_plddt_factor: Callable[[float], float] = lambda plddt: 1 - ( + plddt / 100 + ) + + plddt_factor_pos1 = get_plddt_factor(plddt_at_position1) + plddt_factor_pos2 = get_plddt_factor(plddt_at_position2) + + 
max_half_tolerance = crosslinker_length # Note: This is quite lenient + tolerance_pos1 = plddt_factor_pos1 * max_half_tolerance + tolerance_pos2 = plddt_factor_pos2 * max_half_tolerance + + accepted_distance_lower_bound = max( + crosslinker_length - tolerance_pos1 - tolerance_pos2, 0 + ) + accepted_distance_upper_bound = ( + crosslinker_length + tolerance_pos1 + tolerance_pos2 + ) + + case _: + raise ValueError("Invalid validation strategy") valid = ( accepted_distance_lower_bound @@ -615,6 +774,10 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: "valid_crosslink": valid, "crosslinker_position1": crosslink.crosslinker_position1, "crosslinker_position2": crosslink.crosslinker_position2, + "plddt_at_position1": plddt_at_position1, + "plddt_at_position2": plddt_at_position2, + "pae_x_position1": pae_x_position1, + "pae_x_position2": pae_x_position2, } ) @@ -624,6 +787,10 @@ def check_crosslink(crosslink: pd.Series) -> pd.Series: "valid_crosslink", "crosslinker_position1", "crosslinker_position2", + "plddt_at_position1", + "plddt_at_position2", + "pae_x_position1", + "pae_x_position2", ] relevant_crosslinks_df["crosslinker_position1"] = relevant_crosslinks_df[ @@ -907,69 +1074,76 @@ def diagrams_of_crosslinking_validation_data( def monomer_diagrams( - crosslinking_df: pd.DataFrame, + output_crosslinking_result_df: pd.DataFrame, structure_metadata_df: pd.DataFrame, crosslinker_information: dict[str, list[float]], - cif_df: pd.DataFrame, - amino_acid_sequences_df: pd.DataFrame, + validation_criterion: CrosslinkingValidationCriterion, ) -> list[Figure]: """ Generates visual diagrams to evaluate crosslinking validation results for a monomeric protein structure. - This function acts as a wrapper that first runs the crosslink validation - step via `monomer_validation`. It then extracts the resulting dataframe - of validated crosslinks and passes it to the diagram generator to create - the final plots. 
- - :param crosslinking_df: DataFrame containing the full set of crosslinks. + :param output_crosslinking_result_df: DataFrame containing the CL validation results. :param structure_metadata_df: DataFrame containing structural metadata; the first row's 'uniprot_accession' is used as the target. :param crosslinker_information: Dictionary mapping crosslinker names to a list of three floats: [length, upper_bound, lower_bound]. - :param cif_df: DataFrame containing parsed mmCIF structural coordinate data. - :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. + :param validation_criterion: The validation criterion used for validation. :return: A list of Figure objects visualizing the crosslinking validation data. """ structures_to_validate = [structure_metadata_df["uniprot_accession"].iloc[0]] - validated_df = monomer_validation( - crosslinking_df, - structure_metadata_df, - crosslinker_information, - cif_df, - amino_acid_sequences_df, - )["crosslinking_result_df"] - return diagrams_of_crosslinking_validation_data( - validated_df=validated_df, - structures_to_validate=structures_to_validate, - crosslinker_information=crosslinker_information, - ) + + match validation_criterion: + case CrosslinkingValidationCriterion.manual_bounds.value: + return diagrams_of_crosslinking_validation_data( + validated_df=output_crosslinking_result_df, + structures_to_validate=structures_to_validate, + crosslinker_information=crosslinker_information, + ) + + # TODO: Separate Issue #429 + case ( + CrosslinkingValidationCriterion.max_pae.value + | CrosslinkingValidationCriterion.min_pae.value + ): + return diagrams_of_crosslinking_validation_data( + validated_df=output_crosslinking_result_df, + structures_to_validate=structures_to_validate, + crosslinker_information=crosslinker_information, + ) + + # TODO: Separate Issue #429 + case CrosslinkingValidationCriterion.plddt_adjusted.value: + return diagrams_of_crosslinking_validation_data( + 
validated_df=output_crosslinking_result_df, + structures_to_validate=structures_to_validate, + crosslinker_information=crosslinker_information, + ) + + case _: + return [] def multimer_diagrams( - crosslinking_df: pd.DataFrame, - structure_metadata_df: pd.DataFrame, + output_crosslinking_result_df: pd.DataFrame, crosslinker_information: dict[str, list[float]], - cif_df: pd.DataFrame, amino_acid_sequences_df: pd.DataFrame, job_request_df: pd.DataFrame, + validation_criterion: CrosslinkingValidationCriterion, ) -> list[Figure]: """ Generates visual diagrams to evaluate crosslinking validation results for a multimeric protein complex. This function parses an AlphaFold job request to determine the valid chain - compositions. It then runs `multimer_validation` to filter and validate - the relevant crosslinks, extracting the result to generate structural - distance and validation plots. + compositions and uses the passed result from the validation. - :param crosslinking_df: DataFrame containing the full set of crosslinks. - :param structure_metadata_df: DataFrame containing structural metadata. + :param output_crosslinking_result_df: DataFrame containing the CL validation results. :param crosslinker_information: Dictionary mapping crosslinker names to a list of three floats: [length, upper_bound, lower_bound]. - :param cif_df: DataFrame containing parsed mmCIF structural coordinate data. :param amino_acid_sequences_df: DataFrame containing known amino acid sequences. :param job_request_df: DataFrame containing the loaded AlphaFold job request JSON. + :param validation_criterion: The validation criterion used for validation. :return: A list of Figure objects visualizing the crosslinking validation data. 
""" valid_ids = get_valid_ids_per_protein_id_from_job_request( @@ -977,20 +1151,35 @@ def multimer_diagrams( ) structures_to_validate = list(valid_ids.keys()) - validated_df = multimer_validation( - crosslinking_df, - structure_metadata_df, - crosslinker_information, - cif_df, - amino_acid_sequences_df, - job_request_df, - )["crosslinking_result_df"] - - return diagrams_of_crosslinking_validation_data( - validated_df=validated_df, - structures_to_validate=structures_to_validate, - crosslinker_information=crosslinker_information, - ) + match validation_criterion: + case CrosslinkingValidationCriterion.manual_bounds.value: + return diagrams_of_crosslinking_validation_data( + validated_df=output_crosslinking_result_df, + structures_to_validate=structures_to_validate, + crosslinker_information=crosslinker_information, + ) + + # TODO: Separate Issue #429 + case ( + CrosslinkingValidationCriterion.max_pae.value + | CrosslinkingValidationCriterion.min_pae.value + ): + return diagrams_of_crosslinking_validation_data( + validated_df=output_crosslinking_result_df, + structures_to_validate=structures_to_validate, + crosslinker_information=crosslinker_information, + ) + + # TODO: Separate Issue #429 + case CrosslinkingValidationCriterion.plddt_adjusted.value: + return diagrams_of_crosslinking_validation_data( + validated_df=output_crosslinking_result_df, + structures_to_validate=structures_to_validate, + crosslinker_information=crosslinker_information, + ) + + case _: + return [] # Warning: Mostly AI generated diff --git a/backend/protzilla/importing/alphafold_protein_structure_load.py b/backend/protzilla/importing/alphafold_protein_structure_load.py index 4fd343fac..38f4d1307 100644 --- a/backend/protzilla/importing/alphafold_protein_structure_load.py +++ b/backend/protzilla/importing/alphafold_protein_structure_load.py @@ -11,15 +11,24 @@ from datetime import datetime, timezone import gemmi import pandas as pd +import numpy as np +import ast import requests import re from 
backend.protzilla.constants import paths from backend.protzilla.constants.protzilla_logging import logger +from backend.protzilla.constants.cif_columns import ( + ATOM_SITE_PREFIX, + ATOM_SITE_COLUMNS, + ATOM_SITE_COLUMNS_NUMERIC, + CHEM_COMP_PREFIX, + CHEM_COMP_COLUMNS, +) from backend.protzilla.importing.fasta_import import fasta_import from backend.protzilla.networking import download_file_from_url from backend.protzilla.utilities.utilities import copy_file_to_directory -from backend.protzilla.steps import OutputItem, OutputType +from backend.protzilla.steps import Output, OutputItem, OutputType def get_monomer_metadata_df() -> pd.DataFrame: @@ -108,26 +117,54 @@ def read_alphafold_mmcif(path: Path) -> pd.DataFrame: block = doc.sole_block() - cat_name = "_atom_site." - if cat_name not in block.get_mmcif_category_names(): + if ATOM_SITE_PREFIX not in block.get_mmcif_category_names(): return pd.DataFrame() - table = block.find_mmcif_category(cat_name) - - columns = list(table.tags) - nrows = len(table) - data = {} - for j, col in enumerate(columns): - col_values = [] - for i in range(nrows): - row = table[i] - if j < len(row): - col_values.append(row[j]) - else: - col_values.append(None) - data[col] = col_values + atom_site_table = block.find_mmcif_category(ATOM_SITE_PREFIX) + + atom_site_df = pd.DataFrame( + list(atom_site_table), + columns=list(atom_site_table.tags), + dtype=pd.StringDtype(), + ) + + # convert to numeric dtype for numeric columns present in the dataframe + present_numeric_columns = [ + column for column in ATOM_SITE_COLUMNS_NUMERIC if column in atom_site_table.tags + ] + atom_site_df[present_numeric_columns] = atom_site_df[present_numeric_columns].apply( + pd.to_numeric, errors="coerce" + ) + + atom_site_df = atom_site_df.convert_dtypes() + + if CHEM_COMP_PREFIX not in block.get_mmcif_category_names(): + raise ValueError( + f"Required table with prefix {CHEM_COMP_PREFIX} not found in {path}" + ) + + chem_comp_table = 
block.find_mmcif_category(CHEM_COMP_PREFIX) + + chem_comp_df = pd.DataFrame( + list(chem_comp_table), + columns=list(chem_comp_table.tags), + dtype=pd.StringDtype(), + )[[CHEM_COMP_COLUMNS.ID, CHEM_COMP_COLUMNS.MON_NSTD_FLAG]] - return pd.DataFrame(data) + # convert flags to native booleans + bool_map = {"y": True, "n": False, ".": pd.NA} + + chem_comp_df[CHEM_COMP_COLUMNS.MON_NSTD_FLAG] = ( + chem_comp_df[CHEM_COMP_COLUMNS.MON_NSTD_FLAG].map(bool_map).astype("boolean") + ) + + # merge on the comp_id and drop the duplicate column + return atom_site_df.merge( + chem_comp_df, + how="left", + left_on=ATOM_SITE_COLUMNS.LABEL_COMP_ID, + right_on=CHEM_COMP_COLUMNS.ID, + ).drop(CHEM_COMP_COLUMNS.ID, axis=1) def get_correct_af_directories( @@ -328,6 +365,9 @@ def handle_alphafold_files( if temp_dir is not None: shutil.rmtree(temp_dir, ignore_errors=True) + # For consistency with multimer pLDDT + plddt_df["chainID"] = "A" + return { "cif_df": cif_df, "pae_df": pae_df, @@ -426,8 +466,13 @@ def fetch_alphafold_protein_structure( messages.append(dict(level=logging.WARNING, msg=message)) data_for_visualization = None + pae_string = str(df_dict["pae_df"]["predicted_aligned_error"].iloc[0]) + pae_matrix = np.array(ast.literal_eval(pae_string)) + del df_dict["pae_df"] + return dict( **df_dict, + pae_matrix=OutputItem(output_type=OutputType.JOBLIB_ARTIFACT, value=pae_matrix), messages=messages, visualization=OutputItem( output_type=OutputType.VISUALIZATION, value=data_for_visualization @@ -435,6 +480,88 @@ def fetch_alphafold_protein_structure( ) +def reduce_pae_to_per_amino_acid( + pae_matrix: np.ndarray, + token_res_ids: list[int], + cif_df: pd.DataFrame, +): + """ + Reduces AlphaFold3 PAE matrices (per-token) to AlphaFold2 PAE matrices (per-amino acid). + If the number of tokens mapping to one AA equals the number of atoms (common for predicted PTMs), + the CA token gets used. Otherwise, the first token gets used. + Required for predictions with PTMs! 
+ + :param pae_matrix: the per-token PAE matrix + :param token_res_ids: the token_res_ids table from the AF3 full_data json + :param cif_df: the atom_site table as a dataframe + + :return: the per-AA/per-residue PAE matrix + """ + + indices_to_delete = [] + + current_idx = 0 + runs = [] + + current_chain_idx = 0 + # Get all runs (start_token_idx, len, chain_idx, res_id) of same res ids into one list + while current_idx < len(token_res_ids): + start_token_idx = current_idx + res_id = token_res_ids[start_token_idx] + length = 1 + + if res_id == 1: + current_chain_idx += 1 + + while True: + current_idx += 1 + if ( + current_idx < len(token_res_ids) + and token_res_ids[current_idx] == res_id + ): + length += 1 + else: + break + + runs.append((start_token_idx, length, current_chain_idx, res_id)) + + for start_token_idx, length, chain_idx, res_id in runs: + if length == 1: + continue + + # Get corresponding entries of _atom_site table for the token + relevant_cif_df = cif_df[cif_df["_atom_site.label_entity_id"] == str(chain_idx)] + relevant_cif_df = relevant_cif_df[ + relevant_cif_df["_atom_site.label_seq_id"] == res_id + ] + + keep_offset = 0 # Relative index to keep within duplicate tokens for one amino acid. 
Default: first token + + # If we have one token per atom, we try to take the CA atom + if len(relevant_cif_df) == length: + # Reset index twice to get 0..length enumeration for atoms in index + relevant_cif_df.reset_index(drop=True, inplace=True) + relevant_cif_df.reset_index(inplace=True) + + relevant_cif_df = relevant_cif_df[ + relevant_cif_df["_atom_site.label_atom_id"] == "CA" + ] + # 0 or 2+ CA atoms -> default + if len(relevant_cif_df) == 1: + keep_offset = int(relevant_cif_df.iloc[0]["index"]) + + for duplicate_idx in range(0, length): + if duplicate_idx != keep_offset: + indices_to_delete.append(start_token_idx + duplicate_idx) + + # Apply deletion + mask = np.ones(len(pae_matrix), dtype=bool) + mask[indices_to_delete] = False + pae_matrix = pae_matrix[np.ix_(mask, mask)] + + return pae_matrix + + def get_all_available_entry_ids_of_monomer_metadata() -> list[str]: """ " Get the entry ids of all the protein structure predictions that can be found on disk. @@ -694,6 +821,9 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: logger.exception(msg) raise RuntimeError(msg) from e + # For consistency with multimer pLDDT + plddt_df["chainID"] = "A" + df_dict = { "structure_metadata_df": monomer_metadata_df, "cif_df": cif_df, @@ -702,12 +832,19 @@ def get_monomer_structure_dfs(entry_id: str) -> dict[str, Any]: "amino_acid_sequences_df": amino_acid_sequences_df, } check_success_of_get_df(entry_id=entry_id, df_dict=df_dict, messages=messages) + data_for_visualization = { "structure_entry_id": entry_id, "cif_df": cif_df, } + + pae_string = str(df_dict["pae_df"]["predicted_aligned_error"].iloc[0]) + pae_matrix = np.array(ast.literal_eval(pae_string)) + del df_dict["pae_df"] + return dict( **df_dict, + pae_matrix=OutputItem(output_type=OutputType.JOBLIB_ARTIFACT, value=pae_matrix), messages=messages, visualization=OutputItem( output_type=OutputType.VISUALIZATION, value=data_for_visualization @@ -715,6 +852,73 @@ def get_monomer_structure_dfs(entry_id: str) 
-> dict[str, Any]: ) +def unwrap_full_data_df(full_data_df: pd.DataFrame) -> dict[str, Any]: + """ + Extracts certain data from a full_data_df, deletes the extracted columns + and returns the "remaining" full_data_df as well as the extracted data. + + :param full_data_df: The AlphaFold3 full_data_df + :return dict: + - "full_data_df": The updated reduced full_data_df + - "pae_matrix": Numpy matrix with the PAE values for each residue pair + - "token_res_ids": List with the token -> AA mappings + """ + + try: + pae_matrix = np.array(full_data_df["pae"].iloc[0]) + full_data_df = full_data_df.drop(columns=["pae"]) + except KeyError: + pae_matrix = None + + try: + token_res_ids = np.array(full_data_df["token_res_ids"].iloc[0]) + full_data_df = full_data_df.drop(columns=["token_res_ids"]) + except KeyError as e: + raise KeyError( + "Prediction data does not contain required prediction token to amino acid mapping." + ) from e + + return dict( + full_data_df=full_data_df, + pae_matrix=pae_matrix, + token_res_ids=token_res_ids, + ) + + +def get_plddt_from_cif(cif_df: pd.DataFrame) -> pd.DataFrame | None: + """ + For use with multimers predicted using Alphafold3. + Returns per-residue pLDDT values for the predicted structure. + Note that sine AlphaFold3 uses per-atom pLDDT, we use the pLDDT for the CA atom. + See also https://github.com/google-deepmind/alphafold3/issues/330 + + :param cif_df: the cif_df holding the _atom_site table. 
+ :return: DataFrame containing columns + "chainID", "residueNumber", "confidenceScore", "confidenceCategory" + """ + + try: + filtered_cif_df = cif_df[cif_df["_atom_site.label_atom_id"] == "CA"] + filtered_cif_df = filtered_cif_df[ + [ + "_atom_site.auth_asym_id", + "_atom_site.label_seq_id", + "_atom_site.B_iso_or_equiv", + ] + ] + filtered_cif_df = filtered_cif_df.rename( + columns={ + "_atom_site.auth_asym_id": "chainID", + "_atom_site.label_seq_id": "residueNumber", + "_atom_site.B_iso_or_equiv": "confidenceScore", + } + ) + return filtered_cif_df + + except KeyError: + return None + + def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: """ Writes multimer structure data from disk of a specific entry ID into dataframes. @@ -798,9 +1002,31 @@ def get_multimer_structure_dfs(entry_id: str) -> dict[str, Any]: "structure_entry_id": entry_id, "cif_df": cif_df, } + + unwrapped_full_data = unwrap_full_data_df(df_dict["full_data_df"]) + df_dict["full_data_df"] = unwrapped_full_data["full_data_df"] + + pae_matrix = unwrapped_full_data["pae_matrix"] + token_res_ids = unwrapped_full_data["token_res_ids"] + plddt_df = get_plddt_from_cif(df_dict["cif_df"]) + + pae_matrix = reduce_pae_to_per_amino_acid( + pae_matrix, token_res_ids, df_dict["cif_df"] + ) + + if plddt_df is None: + messages.append( + dict( + level=logging.WARNING, + msg=f"Could not parse pLDDT values from CIF file. 
File is likely malformed!", + ) + ) + return dict( **df_dict, messages=messages, + plddt_df=plddt_df, + pae_matrix=OutputItem(output_type=OutputType.JOBLIB_ARTIFACT, value=pae_matrix), visualization=OutputItem( output_type=OutputType.VISUALIZATION, value=data_for_visualization ), @@ -946,13 +1172,38 @@ def upload_multimer_prediction( } if not any(df.empty for df in df_dict.values()): - success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" - logger.info(success_msg) - messages.append(dict(level=logging.INFO, msg=success_msg)) + + unwrapped_full_data = unwrap_full_data_df(df_dict["full_data_df"]) + df_dict["full_data_df"] = unwrapped_full_data["full_data_df"] + + pae_matrix = reduce_pae_to_per_amino_acid( + unwrapped_full_data["pae_matrix"], + unwrapped_full_data["token_res_ids"], + df_dict["cif_df"], + ) + + pae_matrix = OutputItem( + output_type=OutputType.JOBLIB_ARTIFACT, + value=pae_matrix, + ) + df_dict["pae_matrix"] = pae_matrix + df_dict["plddt_df"] = get_plddt_from_cif(df_dict["cif_df"]) + + if df_dict["plddt_df"] is None: + messages.append( + dict( + level=logging.WARNING, + msg=f"Could not parse pLDDT values from CIF file. 
File is likely malformed!", + ) + ) data_for_visualization = { "structure_entry_id": entry_id, "cif_df": cif_df, } + + success_msg = f"Successfully loaded AlphaFold data for entry '{entry_id}'" + logger.info(success_msg) + messages.append(dict(level=logging.INFO, msg=success_msg)) else: message = f"Could not load AlphaFold data for entry '{entry_id}'" logger.warning(message) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 5a758f5bd..b7f47667c 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -3,6 +3,7 @@ import ast from backend.protzilla.constants.option_types import ( + CrosslinkingValidationCriterion, LogBaseWithNoneType, SimpleImputerStrategyType, ) @@ -70,6 +71,7 @@ MultiSelectField, NumberField, TextField, + FormDivider, ) from backend.protzilla.steps import Step, Section from backend.protzilla.step_manager import StepManager @@ -2415,6 +2417,13 @@ def create_crosslink_input_fields(self, form: Form, run: Run): form.add_field(upper_bound_length_deviation_field) form.add_field(lower_bound_length_deviation_field) + bounds_visible = ( + form["validation_criterion"].value + == CrosslinkingValidationCriterion.manual_bounds.value + ) + form[f"{crosslinker}_upper_accepted_deviation"].isVisible = bounds_visible + form[f"{crosslinker}_lower_accepted_deviation"].isVisible = bounds_visible + def collect_crosslinking_information(self, steps: StepManager, inputs) -> dict: # although crosslinker_information is not a dataframe we need to insert the user information regarding the crosslinks as a dictionary into the inputs crosslinker_to_length_and_deviation = {} @@ -2450,9 +2459,18 @@ def create_form(self): return Form( label="Ångström Deviation - Monomer", input_fields=[ + DropdownField( + name="validation_criterion", + label="Validation criterion", + options=CrosslinkingValidationCriterion, + value=CrosslinkingValidationCriterion.manual_bounds, + ), + 
FormDivider( + label="Crosslinker lengths and bounds", + ), InfoField( label="Set default cross-link lengths and their upper/lower deviations in settings under 'Cross-Links Defaults'.", - ) + ), ], ) @@ -2470,8 +2488,17 @@ def create_form(self): return Form( label="Ångström Deviation - Multimer", input_fields=[ + DropdownField( + name="validation_criterion", + label="Validation criterion", + options=CrosslinkingValidationCriterion, + value=CrosslinkingValidationCriterion.manual_bounds, + ), + FormDivider( + label="Crosslinker lengths and bounds", + ), InfoField( label="Set default cross-link lengths and their upper/lower deviations in settings under 'Cross-Links Defaults'.", - ) + ), ], ) diff --git a/backend/protzilla/methods/importing.py b/backend/protzilla/methods/importing.py index f4fdd50ad..4d07d53f9 100644 --- a/backend/protzilla/methods/importing.py +++ b/backend/protzilla/methods/importing.py @@ -438,9 +438,9 @@ class AlphaFoldPredictionLoad(ImportingStep): output_keys = [ DataKey.STRUCTURE_METADATA_DF, DataKey.CIF_DF, - DataKey.PAE_DF, DataKey.PLDDT_DF, DataKey.AMINO_ACID_SEQUENCES_DF, + DataKey.PAE_MATRIX, ] plot_method = None @@ -503,7 +503,7 @@ class ImportMonomerStructurePredictionFromDisk(ImportingStep): output_keys = [ DataKey.STRUCTURE_METADATA_DF, DataKey.CIF_DF, - DataKey.PAE_DF, + DataKey.PAE_MATRIX, DataKey.PLDDT_DF, DataKey.AMINO_ACID_SEQUENCES_DF, ] @@ -540,6 +540,8 @@ class UploadMultimerPredictions(ImportingStep): DataKey.FULL_DATA_DF, DataKey.JOB_REQUEST_DF, DataKey.AMINO_ACID_SEQUENCES_DF, + DataKey.PAE_MATRIX, + DataKey.PLDDT_DF, ] def create_form(self): @@ -617,6 +619,8 @@ class ImportMultimerStructurePredictionFromDisk(ImportingStep): DataKey.FULL_DATA_DF, DataKey.JOB_REQUEST_DF, DataKey.AMINO_ACID_SEQUENCES_DF, + DataKey.PAE_MATRIX, + DataKey.PLDDT_DF, ] def create_form(self): diff --git a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py index 
251b0cf6f..0e815a17d 100644 --- a/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py +++ b/backend/tests/protzilla/data_analysis/test_crosslinking_validation.py @@ -1,10 +1,12 @@ import pandas as pd +from backend.protzilla.constants.option_types import CrosslinkingValidationCriterion import pytest import logging from unittest.mock import patch, MagicMock import plotly.graph_objects as go from plotly.graph_objects import Figure import pandas.testing as pdt +import numpy as np from backend.protzilla.data_analysis.crosslinking_validation import ( @@ -35,11 +37,14 @@ (6.01, False), # outside bounds ], ) -def test_validate_with_angstrom_deviation(distance, expected): +def test_monomer_validation_baseline_manual_bounds(distance, expected): + crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Length 5 Å ± 1 Å + # Fake AlphaFold Data with chain IDs cif_df = pd.DataFrame( { "_atom_site.label_atom_id": ["CA", "CA"], + "_atom_site.label_asym_id": ["A", "A"], "_atom_site.label_seq_id": [1, 2], "_atom_site.Cartn_x": [0, distance], "_atom_site.Cartn_y": [0, 0], @@ -70,7 +75,6 @@ def test_validate_with_angstrom_deviation(distance, expected): {"entry_id": ["test"], "uniprot_accession": ["P12345"]} ) - crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Länge 5 Å ± 1 Å valid_ids = {"P12345": ["P12345"]} structures_to_validate = ["P12345"] @@ -83,9 +87,10 @@ def test_validate_with_angstrom_deviation(distance, expected): valid_ids=valid_ids, id_column_name="_atom_site.pdbx_sifts_xref_db_acc", structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.manual_bounds.value, ) - df = result["crosslinking_result_df"] + df: pd.DataFrame = result["crosslinking_result_df"] assert "alphafold_distance" in df.columns assert "valid_crosslink" in df.columns @@ -97,6 +102,318 @@ def test_validate_with_angstrom_deviation(distance, expected): assert df.loc[0, "link_type"] == "intra" +@pytest.mark.parametrize( + "distance, expected", + 
[ + (4.99, False), + (5.0, True), + (5.1, False), + ], +) +def test_cl_validation_pae_noerrror(distance, expected): + crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Length 5 Å (± 1 Å) + pae_matrix = np.array([[np.nan, 0], [0, np.nan]]) + + # Fake AlphaFold Data with chain IDs + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA", "CA"], + "_atom_site.label_asym_id": ["A", "A"], + "_atom_site.label_seq_id": [1, 2], + "_atom_site.Cartn_x": [0, distance], + "_atom_site.Cartn_y": [0, 0], + "_atom_site.Cartn_z": [0, 0], + "_atom_site.auth_asym_id": ["A", "A"], + "_atom_site.pdbx_sifts_xref_db_acc": ["P12345", "P12345"], + } + ) + + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P12345-1"], "Protein Sequence": ["AB"]} + ) + + # Fake Crosslink Data + crosslinking_df = pd.DataFrame( + { + "Protein_id1": ["P12345"], + "Protein_id2": ["P12345"], + "Peptide1": ["A"], + "Peptide2": ["B"], + "CL_position_within_peptide1": [0], + "CL_position_within_peptide2": [0], + "Crosslinker": ["DSS"], + } + ) + + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_accession": ["P12345"]} + ) + + valid_ids = {"P12345": ["P12345"]} + structures_to_validate = ["P12345"] + + result = validate_with_angstrom_deviation( + crosslinking_df=crosslinking_df, + structure_metadata_df=structure_metadata_df, + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=amino_acid_sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.pdbx_sifts_xref_db_acc", + structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.min_pae.value, + pae_matrix=pae_matrix, + ) + + df: pd.DataFrame = result["crosslinking_result_df"] + assert df.loc[0, "valid_crosslink"] == expected + + +@pytest.mark.parametrize( + "distance, expected_min, expected_max", + [ + (2.0, False, False), + (3.0, False, True), + (4.0, True, True), + (5.0, True, True), + (6.0, True, True), + (7.0, False, True), + 
(8.0, False, False), + ], +) +def test_cl_validation_pae_haserror(distance, expected_min, expected_max): + crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Length 5 Å (± 1 Å) + pae_matrix = np.array([[np.nan, 1], [2, np.nan]]) + + # Fake AlphaFold Data with chain IDs + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA", "CA"], + "_atom_site.label_asym_id": ["A", "A"], + "_atom_site.label_seq_id": [1, 2], + "_atom_site.Cartn_x": [0, distance], + "_atom_site.Cartn_y": [0, 0], + "_atom_site.Cartn_z": [0, 0], + "_atom_site.auth_asym_id": ["A", "A"], + "_atom_site.pdbx_sifts_xref_db_acc": ["P12345", "P12345"], + } + ) + + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P12345-1"], "Protein Sequence": ["AB"]} + ) + + # Fake Crosslink Data + crosslinking_df = pd.DataFrame( + { + "Protein_id1": ["P12345"], + "Protein_id2": ["P12345"], + "Peptide1": ["A"], + "Peptide2": ["B"], + "CL_position_within_peptide1": [0], + "CL_position_within_peptide2": [0], + "Crosslinker": ["DSS"], + } + ) + + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_accession": ["P12345"]} + ) + + valid_ids = {"P12345": ["P12345"]} + structures_to_validate = ["P12345"] + + result_min = validate_with_angstrom_deviation( + crosslinking_df=crosslinking_df, + structure_metadata_df=structure_metadata_df, + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=amino_acid_sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.pdbx_sifts_xref_db_acc", + structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.min_pae.value, + pae_matrix=pae_matrix, + ) + + df: pd.DataFrame = result_min["crosslinking_result_df"] + assert df.loc[0, "valid_crosslink"] == expected_min + + result_max = validate_with_angstrom_deviation( + crosslinking_df=crosslinking_df, + structure_metadata_df=structure_metadata_df, + crosslinker_information=crosslinker_information, + cif_df=cif_df, + 
amino_acid_sequences_df=amino_acid_sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.pdbx_sifts_xref_db_acc", + structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.max_pae.value, + pae_matrix=pae_matrix, + ) + + df: pd.DataFrame = result_max["crosslinking_result_df"] + assert df.loc[0, "valid_crosslink"] == expected_max + + +@pytest.mark.parametrize( + "distance, expected", + [ + (4.99, False), + (5.0, True), + (5.1, False), + ], +) +def test_cl_validation_plddt_noerrror(distance, expected): + crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Length 5 Å (± 1 Å) + + plddt_df_noerror = pd.DataFrame( + { + "chainID": ["A", "A"], + "residueNumber": [1, 2], + "confidenceScore": [100, 100], + # confidenceCategory is not required + } + ) + + # Fake AlphaFold Data with chain IDs + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA", "CA"], + "_atom_site.label_asym_id": ["A", "A"], + "_atom_site.label_seq_id": [1, 2], + "_atom_site.Cartn_x": [0, distance], + "_atom_site.Cartn_y": [0, 0], + "_atom_site.Cartn_z": [0, 0], + "_atom_site.auth_asym_id": ["A", "A"], + "_atom_site.pdbx_sifts_xref_db_acc": ["P12345", "P12345"], + } + ) + + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P12345-1"], "Protein Sequence": ["AB"]} + ) + + # Fake Crosslink Data + crosslinking_df = pd.DataFrame( + { + "Protein_id1": ["P12345"], + "Protein_id2": ["P12345"], + "Peptide1": ["A"], + "Peptide2": ["B"], + "CL_position_within_peptide1": [0], + "CL_position_within_peptide2": [0], + "Crosslinker": ["DSS"], + } + ) + + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_accession": ["P12345"]} + ) + + valid_ids = {"P12345": ["P12345"]} + structures_to_validate = ["P12345"] + + result = validate_with_angstrom_deviation( + crosslinking_df=crosslinking_df, + structure_metadata_df=structure_metadata_df, + crosslinker_information=crosslinker_information, + cif_df=cif_df, + 
amino_acid_sequences_df=amino_acid_sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.pdbx_sifts_xref_db_acc", + structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.plddt_adjusted.value, + plddt_df=plddt_df_noerror, + ) + + df: pd.DataFrame = result["crosslinking_result_df"] + assert df.loc[0, "valid_crosslink"] == expected + + +# l_cl = 5, t_x = 1.25, t_y = 3.5. +# So range is 0.25 <= d <= 9.75 +@pytest.mark.parametrize( + "distance, expected", + [ + (0.0, False), + (0.24, False), + (0.25, True), + (5.0, True), + (9.0, True), + (9.74, True), + (9.75, True), + (9.76, False), + (10.0, False), + ], +) +def test_cl_validation_plddt_witherror(distance, expected): + crosslinker_information = {"DSS": [5.0, 1.0, 1.0]} # Length 5 Å (± 1 Å) + + plddt_df_noerror = pd.DataFrame( + { + "chainID": ["A", "A"], + "residueNumber": [1, 2], + "confidenceScore": [75, 30], + # confidenceCategory is not required + } + ) + + # Fake AlphaFold Data with chain IDs + cif_df = pd.DataFrame( + { + "_atom_site.label_atom_id": ["CA", "CA"], + "_atom_site.label_asym_id": ["A", "A"], + "_atom_site.label_seq_id": [1, 2], + "_atom_site.Cartn_x": [0, distance], + "_atom_site.Cartn_y": [0, 0], + "_atom_site.Cartn_z": [0, 0], + "_atom_site.auth_asym_id": ["A", "A"], + "_atom_site.pdbx_sifts_xref_db_acc": ["P12345", "P12345"], + } + ) + + amino_acid_sequences_df = pd.DataFrame( + {"Protein ID": ["P12345-1"], "Protein Sequence": ["AB"]} + ) + + # Fake Crosslink Data + crosslinking_df = pd.DataFrame( + { + "Protein_id1": ["P12345"], + "Protein_id2": ["P12345"], + "Peptide1": ["A"], + "Peptide2": ["B"], + "CL_position_within_peptide1": [0], + "CL_position_within_peptide2": [0], + "Crosslinker": ["DSS"], + } + ) + + structure_metadata_df = pd.DataFrame( + {"entry_id": ["test"], "uniprot_accession": ["P12345"]} + ) + + valid_ids = {"P12345": ["P12345"]} + structures_to_validate = ["P12345"] + + result = validate_with_angstrom_deviation( + 
crosslinking_df=crosslinking_df, + structure_metadata_df=structure_metadata_df, + crosslinker_information=crosslinker_information, + cif_df=cif_df, + amino_acid_sequences_df=amino_acid_sequences_df, + valid_ids=valid_ids, + id_column_name="_atom_site.pdbx_sifts_xref_db_acc", + structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.plddt_adjusted.value, + plddt_df=plddt_df_noerror, + ) + + df: pd.DataFrame = result["crosslinking_result_df"] + assert df.loc[0, "valid_crosslink"] == expected + + def test_modify_form_creates_crosslinker_fields(): crosslinking_df = pd.DataFrame({"Crosslinker": ["DSS", "BS3", "DSS"]}) @@ -353,6 +670,7 @@ def test_validate_multimer_filters_only_pairs_within_structures_to_validate(): valid_ids=valid_ids, id_column_name="_atom_site.label_entity_id", structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.manual_bounds.value, ) result_df = out["crosslinking_result_df"] @@ -428,6 +746,7 @@ def test_validate_multimer_no_links_between_structures_returns_empty_and_warning valid_ids=valid_ids, id_column_name="_atom_site.label_entity_id", structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.manual_bounds.value, ) result_df = out["crosslinking_result_df"] @@ -499,6 +818,7 @@ def test_validate_multimer_duplicates_rows_for_multiple_peptide_matches_and_vali valid_ids=valid_ids, id_column_name="_atom_site.label_entity_id", structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.manual_bounds.value, ) result_df = out["crosslinking_result_df"] @@ -827,6 +1147,7 @@ def test_validate_multimer_with_invalid_crosslinks(): valid_ids=valid_ids, id_column_name="_atom_site.label_entity_id", structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.manual_bounds.value, ) result_df = out["crosslinking_result_df"] @@ -1003,6 +1324,7 @@ 
def test_validate_multimer_same_protein_different_chains_intra_vs_inter(): valid_ids=valid_ids, id_column_name="_atom_site.label_entity_id", structures_to_validate=structures_to_validate, + validation_criterion=CrosslinkingValidationCriterion.manual_bounds.value, ) result_df = out["crosslinking_result_df"] diff --git a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py index 940bbcea2..334cdbccf 100644 --- a/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py +++ b/backend/tests/protzilla/importing/test_alphafold_protein_structure_load.py @@ -1,9 +1,11 @@ +from backend.protzilla.steps import OutputItem import pandas as pd import pytest import json import logging import shutil from pathlib import Path +import numpy as np from backend.protzilla.importing.alphafold_protein_structure_load import ( @@ -29,6 +31,12 @@ check_success_of_get_df, ) from backend.protzilla.constants import paths +from backend.protzilla.constants.cif_columns import ( + ATOM_SITE_PREFIX, + ATOM_SITE_COLUMNS, + CHEM_COMP_COLUMNS, +) +from backend.protzilla.constants.data_types import DataKey def test_to_fasta_default_header_and_newline(): @@ -90,11 +98,18 @@ def test_read_alphafold_mmcif_valid_atom_site(tmp_path): """ data_test loop_ +_chem_comp.id +_chem_comp.mon_nstd_flag +SER y +# +loop_ _atom_site.id _atom_site.type_symbol +_atom_site.label_atom_id +_atom_site.label_comp_id _atom_site.Cartn_x -N N 1.0 -CA C 2.0 +1 N N SER 1.0 +2 C CA SER 2.0 """ ) @@ -102,14 +117,19 @@ def test_read_alphafold_mmcif_valid_atom_site(tmp_path): assert isinstance(df, pd.DataFrame) assert list(df.columns) == [ - "_atom_site.id", - "_atom_site.type_symbol", - "_atom_site.Cartn_x", + ATOM_SITE_COLUMNS.ID, + ATOM_SITE_COLUMNS.TYPE_SYMBOL, + ATOM_SITE_COLUMNS.LABEL_ATOM_ID, + ATOM_SITE_COLUMNS.LABEL_COMP_ID, + ATOM_SITE_COLUMNS.CARTN_X, + CHEM_COMP_COLUMNS.MON_NSTD_FLAG, ] assert len(df) == 2 - 
assert df["_atom_site.id"].tolist() == ["N", "CA"] - assert df["_atom_site.type_symbol"].tolist() == ["N", "C"] - assert df["_atom_site.Cartn_x"].tolist() == ["1.0", "2.0"] + assert df[ATOM_SITE_COLUMNS.ID].tolist() == [1, 2] + assert df[ATOM_SITE_COLUMNS.TYPE_SYMBOL].tolist() == ["N", "C"] + assert df[ATOM_SITE_COLUMNS.LABEL_ATOM_ID].tolist() == ["N", "CA"] + assert df[ATOM_SITE_COLUMNS.CARTN_X].tolist() == [1.0, 2.0] + assert df[CHEM_COMP_COLUMNS.MON_NSTD_FLAG].tolist() == [True, True] def test_fetch_alphafold_protein_structure_wrong_uniprot_id(): @@ -127,11 +147,11 @@ def test_fetch_alphafold_returned_keys(tmp_path, monkeypatch): out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) assert out.keys() == { - "structure_metadata_df", - "cif_df", - "pae_df", - "plddt_df", - "amino_acid_sequences_df", + DataKey.STRUCTURE_METADATA_DF, + DataKey.CIF_DF, + DataKey.PAE_MATRIX, + DataKey.PLDDT_DF, + DataKey.AMINO_ACID_SEQUENCES_DF, "messages", "visualization", } @@ -146,16 +166,16 @@ def test_fetch_alphafold_monomer_metadata(tmp_path, monkeypatch): ) out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) - assert isinstance(out["structure_metadata_df"], pd.DataFrame) - assert not out["structure_metadata_df"].empty - assert out["structure_metadata_df"].iloc[0]["uniprot_accession"] == "Q8WP00" + assert isinstance(out[DataKey.STRUCTURE_METADATA_DF], pd.DataFrame) + assert not out[DataKey.STRUCTURE_METADATA_DF].empty + assert out[DataKey.STRUCTURE_METADATA_DF].iloc[0]["uniprot_accession"] == "Q8WP00" assert ( - out["structure_metadata_df"].iloc[0]["model_created_date"] + out[DataKey.STRUCTURE_METADATA_DF].iloc[0]["model_created_date"] == "2025-08-01T00:00:00Z" ) - assert out["structure_metadata_df"].iloc[0]["gene"] == "PRM1" + assert out[DataKey.STRUCTURE_METADATA_DF].iloc[0]["gene"] == "PRM1" assert ( - out["structure_metadata_df"].iloc[0]["model_used"] + out[DataKey.STRUCTURE_METADATA_DF].iloc[0]["model_used"] == "AlphaFold Monomer v2.0 
pipeline" ) @@ -197,20 +217,20 @@ def test_fetch_alphafold_dfs_exist(tmp_path, monkeypatch): out = fetch_alphafold_protein_structure("Q8WP00", persist_upload=True) - cif_df = out["cif_df"] + cif_df = out[DataKey.CIF_DF] assert isinstance(cif_df, pd.DataFrame) assert not cif_df.empty - assert any(col.startswith("_atom_site.") for col in cif_df.columns) + assert any(col.startswith(ATOM_SITE_PREFIX) for col in cif_df.columns) - pae_df = out["pae_df"] - assert isinstance(pae_df, pd.DataFrame) - assert not pae_df.empty + pae_matrix = out[DataKey.PAE_MATRIX] + assert isinstance(pae_matrix, OutputItem) + assert len(pae_matrix.value) != 0 - plddt_df = out["plddt_df"] + plddt_df = out[DataKey.PLDDT_DF] assert isinstance(plddt_df, pd.DataFrame) assert not plddt_df.empty - seq_df = out["amino_acid_sequences_df"] + seq_df = out[DataKey.AMINO_ACID_SEQUENCES_DF] assert isinstance(seq_df, pd.DataFrame) assert not seq_df.empty @@ -280,11 +300,18 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): """ data_test loop_ +_chem_comp.id +_chem_comp.mon_nstd_flag +SER y +# +loop_ _atom_site.id _atom_site.type_symbol +_atom_site.label_atom_id +_atom_site.label_comp_id _atom_site.Cartn_x -N N 1.0 -CA C 2.0 +1 N N SER 1.0 +2 C CA SER 2.0 """ ) @@ -303,34 +330,43 @@ def test_get_prot_structure_dfs_success(tmp_path, monkeypatch): out = get_monomer_structure_dfs("Q8WP00") - assert isinstance(out["structure_metadata_df"], pd.DataFrame) - assert not out["structure_metadata_df"].empty - assert out["structure_metadata_df"].iloc[0]["entry_id"] == "Q8WP00" + assert isinstance(out[DataKey.STRUCTURE_METADATA_DF], pd.DataFrame) + assert not out[DataKey.STRUCTURE_METADATA_DF].empty + assert out[DataKey.STRUCTURE_METADATA_DF].iloc[0]["entry_id"] == "Q8WP00" - assert isinstance(out["cif_df"], pd.DataFrame) - assert not out["cif_df"].empty - assert list(out["cif_df"].columns) == [ - "_atom_site.id", - "_atom_site.type_symbol", - "_atom_site.Cartn_x", + cif_df = out[DataKey.CIF_DF] + assert 
isinstance(cif_df, pd.DataFrame) + assert not cif_df.empty + assert list(cif_df.columns) == [ + ATOM_SITE_COLUMNS.ID, + ATOM_SITE_COLUMNS.TYPE_SYMBOL, + ATOM_SITE_COLUMNS.LABEL_ATOM_ID, + ATOM_SITE_COLUMNS.LABEL_COMP_ID, + ATOM_SITE_COLUMNS.CARTN_X, + CHEM_COMP_COLUMNS.MON_NSTD_FLAG, ] - assert out["cif_df"]["_atom_site.id"].tolist() == ["N", "CA"] - assert out["cif_df"]["_atom_site.type_symbol"].tolist() == ["N", "C"] - assert out["cif_df"]["_atom_site.Cartn_x"].tolist() == ["1.0", "2.0"] - - assert isinstance(out["pae_df"], pd.DataFrame) - assert not out["pae_df"].empty - assert out["pae_df"]["predicted_aligned_error"].tolist() == [0.1] + assert len(cif_df) == 2 + assert cif_df[ATOM_SITE_COLUMNS.ID].tolist() == [1, 2] + assert cif_df[ATOM_SITE_COLUMNS.TYPE_SYMBOL].tolist() == ["N", "C"] + assert cif_df[ATOM_SITE_COLUMNS.LABEL_ATOM_ID].tolist() == ["N", "CA"] + assert cif_df[ATOM_SITE_COLUMNS.CARTN_X].tolist() == [1.0, 2.0] + assert cif_df[CHEM_COMP_COLUMNS.MON_NSTD_FLAG].tolist() == [True, True] + + assert isinstance(out[DataKey.PAE_MATRIX], OutputItem) + assert isinstance(out[DataKey.PAE_MATRIX].value, np.ndarray) + assert ( + out[DataKey.PAE_MATRIX].value == 0.1 + ) # 0D array (only one value) TODO: Change this to something more reasonable? 
idk - assert isinstance(out["plddt_df"], pd.DataFrame) - assert not out["plddt_df"].empty - assert out["plddt_df"]["residueNumber"].tolist() == [1] - assert out["plddt_df"]["confidenceScore"].tolist() == [90] + assert isinstance(out[DataKey.PLDDT_DF], pd.DataFrame) + assert not out[DataKey.PLDDT_DF].empty + assert out[DataKey.PLDDT_DF]["residueNumber"].tolist() == [1] + assert out[DataKey.PLDDT_DF]["confidenceScore"].tolist() == [90] - assert isinstance(out["amino_acid_sequences_df"], pd.DataFrame) - assert not out["amino_acid_sequences_df"].empty - assert out["amino_acid_sequences_df"]["Protein ID"].tolist() == ["Q8WP00-1"] - assert out["amino_acid_sequences_df"]["Protein Sequence"].tolist() == ["AAAA"] + assert isinstance(out[DataKey.AMINO_ACID_SEQUENCES_DF], pd.DataFrame) + assert not out[DataKey.AMINO_ACID_SEQUENCES_DF].empty + assert out[DataKey.AMINO_ACID_SEQUENCES_DF]["Protein ID"].tolist() == ["Q8WP00-1"] + assert out[DataKey.AMINO_ACID_SEQUENCES_DF]["Protein Sequence"].tolist() == ["AAAA"] assert any(d.get("level") == logging.INFO for d in out["messages"]) or any( "Successfully loaded" in d.get("msg", "") for d in out["messages"] @@ -421,11 +457,13 @@ def test_get_amino_acid_sequences_df_and_handle_files(tmp_path, monkeypatch): out = handle_alphafold_files( {}, "P", "TESTSEQ", metadata_df, "P", persist_upload=False ) - assert "amino_acid_sequences_df" in out - assert isinstance(out["cif_df"], pd.DataFrame) and out["cif_df"].empty + assert DataKey.AMINO_ACID_SEQUENCES_DF in out + assert isinstance(out[DataKey.CIF_DF], pd.DataFrame) and out[DataKey.CIF_DF].empty assert isinstance(out["pae_df"], pd.DataFrame) and out["pae_df"].empty - assert isinstance(out["plddt_df"], pd.DataFrame) and out["plddt_df"].empty - assert isinstance(out["amino_acid_sequences_df"], pd.DataFrame) + assert ( + isinstance(out[DataKey.PLDDT_DF], pd.DataFrame) and out[DataKey.PLDDT_DF].empty + ) + assert isinstance(out[DataKey.AMINO_ACID_SEQUENCES_DF], pd.DataFrame) def 
test_upload_multimer_prediction_basic(tmp_path, monkeypatch): @@ -435,19 +473,41 @@ def test_upload_multimer_prediction_basic(tmp_path, monkeypatch): fasta = tmp_path / "seqs.fasta" fasta.write_text(">alpha|X\nAAAA\n") cif = tmp_path / "m.cif" + # Note that we only write the absolutely necessary columns here + # Also this is not biologically plausible cif.write_text( """ -data_test -loop_ -_atom_site.id -_atom_site.type_symbol -N N -""" + data_test + loop_ + _chem_comp.id + _chem_comp.mon_nstd_flag + SER y + GLY y + # + loop_ + _atom_site.id + _atom_site.label_atom_id + _atom_site.label_comp_id + _atom_site.auth_asym_id + _atom_site.label_seq_id + _atom_site.B_iso_or_equiv + 1 N SER A 1 99.99 + 2 CA SER A 1 67.76 + 3 CA SER A 2 33.65 + 4 O SER A 2 5.52 + 5 N GLY B 1 0 + 6 CA GLY B 1 13.37 + # + """ ) conf = tmp_path / "conf.json" - conf.write_text('[{"residueNumber":1, "confidenceScore":99}]') + conf.write_text( + '{"chain_iptm": [0.42, 0.89]}' + ) # Note that we do not use these metrics anywhere full = tmp_path / "full.json" - full.write_text('{"a": [1,2]}') + full.write_text( + '{"random_column": [1,2], "pae": [[1, 2], [3, 4]], "token_res_ids": [1, 2, 1]}' + ) job_request = tmp_path / "job_request.json" job_request.write_text( json.dumps( @@ -458,11 +518,18 @@ def test_upload_multimer_prediction_basic(tmp_path, monkeypatch): "sequences": [ { "proteinChain": { - "sequence": "AAAA", + "sequence": "PE", "count": 1, "useStructureTemplate": True, } - } + }, + { + "proteinChain": { + "sequence": "T", + "count": 1, + "useStructureTemplate": True, + } + }, ], "dialect": "alphafoldserver", "version": 3, @@ -484,7 +551,7 @@ def _copy(src, dest_dir): out = upload_multimer_prediction( entry_id="M1", - uniprot_ids="X", + uniprot_ids="X, Y", model_used="m", amino_acid_sequences=fasta, cif_file=cif, @@ -494,53 +561,135 @@ def _copy(src, dest_dir): persist_upload=True, ) - assert isinstance(out["structure_metadata_df"], pd.DataFrame) + assert 
isinstance(out[DataKey.STRUCTURE_METADATA_DF], pd.DataFrame) # check metadata contents - mdf = out["structure_metadata_df"] + mdf = out[DataKey.STRUCTURE_METADATA_DF] assert mdf.iloc[0]["entry_id"] == "M1" - assert mdf.iloc[0]["uniprot_ids"] == ["X"] + assert mdf.iloc[0]["uniprot_ids"] == ["X", "Y"] assert mdf.iloc[0]["model_used"] == "m" # cif contents - cif_df = out["cif_df"] + cif_df = out[DataKey.CIF_DF] assert isinstance(cif_df, pd.DataFrame) - assert list(cif_df.columns) == ["_atom_site.id", "_atom_site.type_symbol"] - assert cif_df["_atom_site.id"].tolist() == ["N"] - assert cif_df["_atom_site.type_symbol"].tolist() == ["N"] + assert list(cif_df.columns) == [ + ATOM_SITE_COLUMNS.ID, + ATOM_SITE_COLUMNS.LABEL_ATOM_ID, + ATOM_SITE_COLUMNS.LABEL_COMP_ID, + ATOM_SITE_COLUMNS.AUTH_ASYM_ID, + ATOM_SITE_COLUMNS.LABEL_SEQ_ID, + ATOM_SITE_COLUMNS.B_ISO_OR_EQUIV, + CHEM_COMP_COLUMNS.MON_NSTD_FLAG, + ] + assert cif_df[ATOM_SITE_COLUMNS.ID].tolist() == list(range(1, 7)) + assert cif_df[ATOM_SITE_COLUMNS.LABEL_ATOM_ID].tolist() == [ + "N", + "CA", + "CA", + "O", + "N", + "CA", + ] + assert cif_df[ATOM_SITE_COLUMNS.AUTH_ASYM_ID].tolist() == ["A"] * 4 + ["B"] * 2 + assert cif_df[ATOM_SITE_COLUMNS.B_ISO_OR_EQUIV].tolist() == [ + 99.99, + 67.76, + 33.65, + 5.52, + 0, + 13.37, + ] + assert cif_df[ATOM_SITE_COLUMNS.LABEL_COMP_ID].tolist() == ["SER"] * 4 + ["GLY"] * 2 + assert cif_df[CHEM_COMP_COLUMNS.MON_NSTD_FLAG].tolist() == [True] * 6 # confidence JSON - conf_df = out["confidence_df"] + conf_df = out[DataKey.CONFIDENCE_DF] assert isinstance(conf_df, pd.DataFrame) - assert conf_df["residueNumber"].tolist() == [1] - assert conf_df["confidenceScore"].tolist() == [99] + assert conf_df["chain_iptm"].tolist() == [0.42, 0.89] # full data normalization - full_df = out["full_data_df"] + full_df = out[DataKey.FULL_DATA_DF] assert isinstance(full_df, pd.DataFrame) - assert full_df.iloc[0]["a"] == [1, 2] + assert list(full_df.columns) == ["random_column"] + assert 
full_df.iloc[0]["random_column"] == [1, 2] # job request JSON - job_df = out["job_request_df"] + job_df = out[DataKey.JOB_REQUEST_DF] assert isinstance(job_df, pd.DataFrame) assert job_df.iloc[0]["name"] == "test_job" assert job_df.iloc[0]["dialect"] == "alphafoldserver" # sequences - seqs = out["amino_acid_sequences_df"] + seqs = out[DataKey.AMINO_ACID_SEQUENCES_DF] assert isinstance(seqs, pd.DataFrame) assert seqs["Protein Sequence"].tolist() == ["AAAA"] assert any(str(v).startswith("X") for v in seqs["Protein ID"].tolist()) + # pLDDT values + plddt_df = out[DataKey.PLDDT_DF] + assert isinstance(plddt_df, pd.DataFrame) + assert list(plddt_df.columns) == ["chainID", "residueNumber", "confidenceScore"] + assert plddt_df["confidenceScore"].tolist() == [ + 67.76, + 33.65, + 13.37, + ] # Keep only CA atoms + + # PAE values + pae_matrix = out[DataKey.PAE_MATRIX].value + assert isinstance(pae_matrix, np.ndarray) + assert pae_matrix[0, 0] == 1 + assert pae_matrix[0, 1] == 2 + assert pae_matrix[1, 0] == 3 + assert pae_matrix[1, 1] == 4 + upload_dir = tmp_path / "M1" assert upload_dir.exists() assert any(upload_dir.glob("*.fasta")) or any(upload_dir.glob("*.fa")) assert any(upload_dir.glob("*.json")) assert any(upload_dir.glob("*.cif")) + # Test no plDDT Data -> plddt_df should be None + cif.write_text( + """ + data_test + loop_ + _chem_comp.id + _chem_comp.mon_nstd_flag + SER y + GLY y + # + loop_ + _atom_site.id + _atom_site.label_atom_id + _atom_site.label_comp_id + _atom_site.auth_asym_id + _atom_site.label_seq_id + 1 N SER A 1 + 2 CA SER A 1 + 3 CA SER A 2 + 4 O SER A 2 + 5 N GLY B 1 + 6 CA GLY B 1 + # + """ + ) -# Additional comprehensive tests for error cases and edge cases + out = upload_multimer_prediction( + entry_id="M1", + uniprot_ids="X, Y", + model_used="m", + amino_acid_sequences=fasta, + cif_file=cif, + confidence_file=conf, + full_data_file=full, + job_request_file=job_request, + persist_upload=True, + ) + + assert out[DataKey.PLDDT_DF] is None +# 
Additional comprehensive tests for error cases and edge cases def test_get_monomer_metadata_df_existing_csv(tmp_path, monkeypatch): """Test reading existing monomer metadata CSV""" csv_path = tmp_path / "alphafold_monomer_metadata.csv" @@ -605,18 +754,23 @@ def test_to_fasta_lowercase_conversion(): def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): - """Test upload_multimer_prediction with persist_upload=False""" + """ + Test upload_multimer_prediction with persist_upload=False. + Also tests full_data without PAE values + """ monkeypatch.setattr(paths, "ALPHAFOLD_MONOMER_PATH", tmp_path) monkeypatch.setattr(paths, "ALPHAFOLD_MULTIMER_PATH", tmp_path) fasta = tmp_path / "seqs.fasta" fasta.write_text(">alpha|X\nAAAA\n") cif = tmp_path / "m.cif" - cif.write_text("data_test\nloop_\n_atom_site.id\nN\n") + cif.write_text( + "data_test\nloop_\n_chem_comp.id\n_chem_comp.mon_nstd_flag\nSER y\nloop_\n#\n_atom_site.id\n_atom_site.label_comp_id\nN SER\n" + ) conf = tmp_path / "conf.json" conf.write_text('[{"residueNumber":1, "confidenceScore":99}]') full = tmp_path / "full.json" - full.write_text('{"a": [1,2]}') + full.write_text('{"a": [1,2], "token_res_ids": [1], "pae": [[2]]}') job_request = tmp_path / "job_request.json" job_request.write_text( json.dumps( @@ -653,10 +807,11 @@ def test_upload_multimer_prediction_no_persist(tmp_path, monkeypatch): ) # verify dataframes are returned - assert isinstance(out["structure_metadata_df"], pd.DataFrame) - assert isinstance(out["cif_df"], pd.DataFrame) - assert isinstance(out["job_request_df"], pd.DataFrame) - assert out["job_request_df"].iloc[0]["name"] == "test_job_2" + assert isinstance(out[DataKey.STRUCTURE_METADATA_DF], pd.DataFrame) + assert isinstance(out[DataKey.CIF_DF], pd.DataFrame) + assert isinstance(out[DataKey.JOB_REQUEST_DF], pd.DataFrame) + assert out[DataKey.PAE_MATRIX].value is not None + assert out[DataKey.JOB_REQUEST_DF].iloc[0]["name"] == "test_job_2" # directory should still exist (created 
for the entry) upload_dir = tmp_path / "M2" assert not upload_dir.exists() @@ -692,7 +847,9 @@ def test_get_prot_structure_dfs_missing_fasta(tmp_path, monkeypatch): # create CIF but no FASTA cif = prot_dir / "test.cif" - cif.write_text("data_test\nloop_\n_atom_site.id\nN\n") + cif.write_text( + "data_test\nloop_\n_chem_comp.id\n_chem_comp.mon_nstd_flag\nSER y\nloop_\n#\n_atom_site.id\n_atom_site.label_comp_id\nN SER\n" + ) with pytest.raises(FileNotFoundError, match="No FASTA file found"): get_monomer_structure_dfs("NOFASTA") @@ -712,7 +869,9 @@ def test_get_prot_structure_dfs_missing_json(tmp_path, monkeypatch): # create CIF and FASTA but no JSON cif = prot_dir / "test.cif" - cif.write_text("data_test\nloop_\n_atom_site.id\nN\n") + cif.write_text( + "data_test\nloop_\n_chem_comp.id\n_chem_comp.mon_nstd_flag\nSER y\nloop_\n#\n_atom_site.id\n_atom_site.label_comp_id\nN SER\n" + ) fasta = prot_dir / "test.fasta" # valid header for parse_fasta_id (expects at least one "|" in the id) @@ -820,18 +979,30 @@ def test_get_cif_df_from_disk_multiple_cif_warns(tmp_path): """ data_test loop_ +_chem_comp.id +_chem_comp.mon_nstd_flag +SER y +# +loop_ _atom_site.id _atom_site.type_symbol -N N +_atom_site.label_comp_id +N N SER """ ) cif2.write_text( """ data_test loop_ +_chem_comp.id +_chem_comp.mon_nstd_flag +SER y +# +loop_ _atom_site.id _atom_site.type_symbol -CA C +_atom_site.label_comp_id +CA C SER """ ) @@ -872,9 +1043,15 @@ def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): """ data_test loop_ +_chem_comp.id +_chem_comp.mon_nstd_flag +SER y +# +loop_ _atom_site.id _atom_site.type_symbol -N N +_atom_site.label_comp_id +N N SER """ ) @@ -885,7 +1062,9 @@ def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): full_data = prot_dir / "full.json" job_request = prot_dir / "job_request.json" confidence.write_text(json.dumps({"chain_iptm": [0.75]})) - full_data.write_text(json.dumps({"pae": [[0.1, 0.2], [0.3, 0.4]]})) + full_data.write_text( + 
json.dumps({"pae": [[0.1, 0.2], [0.3, 0.4]], "token_res_ids": [1]}) + ) job_request.write_text( json.dumps( [ @@ -909,17 +1088,17 @@ def test_get_multimer_structure_dfs_success(tmp_path, monkeypatch): ) out = get_multimer_structure_dfs("M1") - assert isinstance(out["structure_metadata_df"], pd.DataFrame) - assert isinstance(out["cif_df"], pd.DataFrame) - assert isinstance(out["amino_acid_sequences_df"], pd.DataFrame) - assert isinstance(out["confidence_df"], pd.DataFrame) - assert isinstance(out["full_data_df"], pd.DataFrame) - assert isinstance(out["job_request_df"], pd.DataFrame) - - assert "chain_iptm" in out["confidence_df"].columns - assert "pae" in out["full_data_df"].columns - assert out["job_request_df"].iloc[0]["name"] == "multimer_job" - assert out["job_request_df"].iloc[0]["version"] == 3 + assert isinstance(out[DataKey.STRUCTURE_METADATA_DF], pd.DataFrame) + assert isinstance(out[DataKey.CIF_DF], pd.DataFrame) + assert isinstance(out[DataKey.AMINO_ACID_SEQUENCES_DF], pd.DataFrame) + assert isinstance(out[DataKey.CONFIDENCE_DF], pd.DataFrame) + assert isinstance(out[DataKey.FULL_DATA_DF], pd.DataFrame) + assert isinstance(out[DataKey.JOB_REQUEST_DF], pd.DataFrame) + + assert "chain_iptm" in out[DataKey.CONFIDENCE_DF].columns + assert "pae" not in out[DataKey.FULL_DATA_DF].columns + assert out[DataKey.JOB_REQUEST_DF].iloc[0]["name"] == "multimer_job" + assert out[DataKey.JOB_REQUEST_DF].iloc[0]["version"] == 3 assert any(m.get("level") == logging.INFO for m in out["messages"]) or any( "Successfully loaded" in str(m.get("msg", "")) for m in out["messages"] @@ -956,9 +1135,15 @@ def test_get_multimer_structure_dfs_json_fallback_warns(tmp_path, monkeypatch): """ data_test loop_ +_chem_comp.id +_chem_comp.mon_nstd_flag +SER y +# +loop_ _atom_site.id _atom_site.type_symbol -N N +_atom_site.label_comp_id +N N SER """ ) diff --git a/backend/tests/protzilla/importing/test_pae_matrix_reduction.py b/backend/tests/protzilla/importing/test_pae_matrix_reduction.py new 
file mode 100644 index 000000000..4daf5449e --- /dev/null +++ b/backend/tests/protzilla/importing/test_pae_matrix_reduction.py @@ -0,0 +1,162 @@ +import numpy as np +import pandas as pd +from backend.protzilla.importing.alphafold_protein_structure_load import ( + reduce_pae_to_per_amino_acid, +) +import pytest + +# These test cases were generated by AI (Gemini) but have been manually checked + + +@pytest.fixture +def empty_cif_df(): + """Returns an empty atom_site DataFrame with required columns.""" + return pd.DataFrame( + columns=[ + "_atom_site.label_entity_id", + "_atom_site.label_seq_id", + "_atom_site.label_atom_id", + ] + ) + + +def test_no_duplicates(empty_cif_df): + """Edge Case 1: Every residue has exactly one token. + + The PAE matrix should remain entirely untouched. + """ + pae_matrix = np.array([[1.0, 2.0], [3.0, 4.0]]) + token_res_ids = [1, 2] + + result = reduce_pae_to_per_amino_acid(pae_matrix, token_res_ids, empty_cif_df) + + assert np.array_equal(result, pae_matrix) + + +def test_duplicates_fallback_to_first_token(empty_cif_df): + """Edge Case 2: Multi-token residue, but length mismatch with CIF. + + Should fall back to keeping the first token (offset 0) and deleting the rest. + """ + # 3 tokens: Residue 1 has 2 tokens, Residue 2 has 1 token. + pae_matrix = np.array([[10, 11, 12], [20, 21, 22], [30, 31, 32]]) + token_res_ids = [1, 1, 2] + + # Keeping token 0 (first of res 1) and token 2 (res 2). Token 1 should be deleted. + expected_indices = [0, 2] + expected_matrix = pae_matrix[np.ix_(expected_indices, expected_indices)] + + result = reduce_pae_to_per_amino_acid(pae_matrix, token_res_ids, empty_cif_df) + + assert np.array_equal(result, expected_matrix) + + +def test_duplicates_keep_ca_atom(): + """Edge Case 3: Run length matches CIF length, and exactly one CA atom is found. + + Should keep the token corresponding exactly to the 'CA' atom position. + """ + # 4 tokens: Residue 1 has 3 tokens, Residue 2 has 1 token. 
+ pae_matrix = np.diag([1.0, 2.0, 3.0, 4.0]) + token_res_ids = [1, 1, 1, 2] + + # CIF setup: 3 atoms for chain 1, residue 1. 'CA' sits at relative index 1. + cif_df = pd.DataFrame( + { + "_atom_site.label_entity_id": ["1", "1", "1"], + "_atom_site.label_seq_id": [1, 1, 1], + "_atom_site.label_atom_id": ["N", "CA", "C"], + } + ) + + # Expected: Keep global index 1 (the CA atom) and global index 3 (residue 2). + # Global indices 0 and 2 should be wiped out. + expected_indices = [1, 3] + expected_matrix = pae_matrix[np.ix_(expected_indices, expected_indices)] + + result = reduce_pae_to_per_amino_acid(pae_matrix, token_res_ids, cif_df) + + assert np.array_equal(result, expected_matrix) + + +def test_duplicates_cif_match_but_no_ca_fallback(): + """Edge Case 4a: Run length matches CIF length, but no CA atom exists. + + Should fall back to keeping the first token (offset 0). + """ + pae_matrix = np.diag([10, 20, 30]) + token_res_ids = [1, 1, 2] + + # CIF matches length (2 atoms), but neither is 'CA' + cif_df = pd.DataFrame( + { + "_atom_site.label_entity_id": ["1", "1"], + "_atom_site.label_seq_id": [1, 1], + "_atom_site.label_atom_id": ["N", "O"], + } + ) + + # Expected to keep global index 0 (fallback) and global index 2. + expected_indices = [0, 2] + expected_matrix = pae_matrix[np.ix_(expected_indices, expected_indices)] + + result = reduce_pae_to_per_amino_acid(pae_matrix, token_res_ids, cif_df) + + assert np.array_equal(result, expected_matrix) + + +def test_duplicates_cif_match_multiple_ca_fallback(): + """Edge Case 4b: Run length matches CIF length, but multiple CA atoms exist. + + Should fall back to keeping the first token (offset 0). 
+ """ + pae_matrix = np.diag([10, 20, 30]) + token_res_ids = [1, 1, 2] + + # CIF matches length (2 atoms), but both claim to be 'CA' + cif_df = pd.DataFrame( + { + "_atom_site.label_entity_id": ["1", "1"], + "_atom_site.label_seq_id": [1, 1], + "_atom_site.label_atom_id": ["CA", "CA"], + } + ) + + # Expected to keep global index 0 (fallback) and global index 2. + expected_indices = [0, 2] + expected_matrix = pae_matrix[np.ix_(expected_indices, expected_indices)] + + result = reduce_pae_to_per_amino_acid(pae_matrix, token_res_ids, cif_df) + + assert np.array_equal(result, expected_matrix) + + +def test_multiple_chains_tracking(): + """Edge Case 5: The system has multiple chains. + + Verifies that `current_chain_idx` increments whenever `res_id == 1` starts a run, + and queries the correct stringified `_atom_site.label_entity_id`. + """ + # Chain 1: res 1 (len 1), res 2 (len 1) + # Chain 2: res 1 (len 2) -> Triggered by encountering 1 again + token_res_ids = [1, 2, 1, 1] + pae_matrix = np.diag([100, 200, 300, 400]) + + cif_df = pd.DataFrame( + { + "_atom_site.label_entity_id": ["1", "1", "2", "2"], + "_atom_site.label_seq_id": [1, 2, 1, 1], + "_atom_site.label_atom_id": ["CA", "CA", "N", "CA"], + } + ) + + # Chain 1, Res 1 (idx 0): len 1 -> Keep + # Chain 1, Res 2 (idx 1): len 1 -> Keep + # Chain 2, Res 1 (idx 2, 3): len 2 -> Matches CIF length for chain '2'. + # CA is at relative index 1 (global idx 3). Global idx 2 is dropped. 
+ expected_indices = [0, 1, 3] + expected_matrix = pae_matrix[np.ix_(expected_indices, expected_indices)] + + result = reduce_pae_to_per_amino_acid(pae_matrix, token_res_ids, cif_df) + + assert np.array_equal(result, expected_matrix) diff --git a/frontend/src/components/app/run-screen/node-editor/StepNode.tsx b/frontend/src/components/app/run-screen/node-editor/StepNode.tsx index a83c617fd..7adb84d93 100644 --- a/frontend/src/components/app/run-screen/node-editor/StepNode.tsx +++ b/frontend/src/components/app/run-screen/node-editor/StepNode.tsx @@ -53,7 +53,7 @@ const DATA_TYPE_ICON_MAP: Partial> = { full_data_df: handleFullDataIcon, gene_mapping_df: handleDnaIcon, metadata_df: handleMetadataIcon, - pae_df: handlePaeIcon, + pae_matrix: handlePaeIcon, peptide_df: handlePeptidesIcon, plddt_df: handlePlddtIcon, protein_df: handleProteinIcon, From 5f8afd46b6ec402dc1f38143692a98fd09fa7a04 Mon Sep 17 00:00:00 2001 From: Elena-kal Date: Thu, 28 May 2026 11:20:55 +0200 Subject: [PATCH 238/240] fix spelling of cross-link --- backend/protzilla/methods/data_analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index b7f47667c..03318a97a 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -2469,7 +2469,7 @@ def create_form(self): label="Crosslinker lengths and bounds", ), InfoField( - label="Set default cross-link lengths and their upper/lower deviations in settings under 'Cross-Links Defaults'.", + label="Set default crosslink lengths and their upper/lower deviations in settings under 'Crosslinks Defaults'.", ), ], ) @@ -2498,7 +2498,7 @@ def create_form(self): label="Crosslinker lengths and bounds", ), InfoField( - label="Set default cross-link lengths and their upper/lower deviations in settings under 'Cross-Links Defaults'.", + label="Set default crosslink lengths and their upper/lower deviations in 
settings under 'Crosslinks Defaults'.", ), ], ) From 1eb2c9b0a5d3a33909eb3b74fe43c185b693c46d Mon Sep 17 00:00:00 2001 From: jorisfu Date: Fri, 29 May 2026 11:48:28 +0200 Subject: [PATCH 239/240] (incomplete) begin work on fix --- .../components/core/data-table/data-table.tsx | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/frontend/src/components/core/data-table/data-table.tsx b/frontend/src/components/core/data-table/data-table.tsx index 619188862..941307614 100644 --- a/frontend/src/components/core/data-table/data-table.tsx +++ b/frontend/src/components/core/data-table/data-table.tsx @@ -107,7 +107,7 @@ export const DataTable: React.FC = ({ filters: JSON.stringify(filterModel.items), }); - if (response.rows.length > 0 && !columnsInitializedRef.current) { + if (response.rows.length > 0 && !columnsInitializedRef.current && Object.keys(response.rows[0]).length <= MAX_COLUMNS) { const generatedColumns = Object.keys(response.rows[0]).map((key) => { const isNumeric = response.rows.every( (row: TableRecord) => typeof row[key] === "number" || row[key] === null, @@ -127,16 +127,30 @@ export const DataTable: React.FC = ({ }); setColumns(generatedColumns); + setCurrentRows(response.rows); columnsInitializedRef.current = true; } - if (response.rows.length > 0 && Object.keys(response.rows[0]).length > MAX_COLUMNS) { + else if (response.rows.length > 0 && Object.keys(response.rows[0]).length > MAX_COLUMNS) { + const generatedColumns = Object.keys(FALLBACK_TOO_MANY_COLUMNS[0]).map((key) => { + return { + field: key, + headerName: key, + flex: 1, + type: "string", + align: "left", + headerAlign: "left", + filterable: true, + filterOperators: stringOperators, + valueFormatter: (value: unknown) => value ?? 
"NaN", + } as GridColDef; + }); + + setColumns(generatedColumns); setCurrentRows(FALLBACK_TOO_MANY_COLUMNS); setTotalRowCount(FALLBACK_TOO_MANY_COLUMNS.length); - } else { - setCurrentRows(response.rows); - setTotalRowCount(response.total_row_count); - } + } + } catch (error) { console.error("Failed to fetch table data:", error); } finally { From 841e64afc03b06f86aad2659588fc42ba1aa0923 Mon Sep 17 00:00:00 2001 From: jorisfu Date: Fri, 29 May 2026 14:55:15 +0200 Subject: [PATCH 240/240] (AI) fix: filtering/sorting not working and proper fallback logic --- .../components/core/data-table/data-table.tsx | 64 ++++++++++++------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/frontend/src/components/core/data-table/data-table.tsx b/frontend/src/components/core/data-table/data-table.tsx index 941307614..d769e77eb 100644 --- a/frontend/src/components/core/data-table/data-table.tsx +++ b/frontend/src/components/core/data-table/data-table.tsx @@ -78,19 +78,25 @@ export const DataTable: React.FC = ({ items: [], }); const [columns, setColumns] = useState([]); + const columnsInitializedRef = useRef(false); + const isFallbackRef = useRef(false); - // necessary for updating which columns exist when switching between tables + // Reset state when switching between tables useEffect(() => { setColumns([]); + setCurrentRows([]); setFilterModel({ items: [] }); setSortModel([]); columnsInitializedRef.current = false; + isFallbackRef.current = false; }, [tableLabel]); - // Fetch data when pagination changes + // Fetch data when pagination, sorts, or filters change useEffect(() => { const fetchData = async () => { + if (isFallbackRef.current) return; + setLoading(true); const startIndex = paginationModel.page * paginationModel.pageSize; @@ -107,7 +113,35 @@ export const DataTable: React.FC = ({ filters: JSON.stringify(filterModel.items), }); - if (response.rows.length > 0 && !columnsInitializedRef.current && Object.keys(response.rows[0]).length <= MAX_COLUMNS) { + // 1. 
Column Generation & Fallback Logic (Only runs once per table) + if (!columnsInitializedRef.current && response.rows.length > 0) { + const numCols = Object.keys(response.rows[0]).length; + + // Handle Too Many Columns (Fallback) + if (numCols > MAX_COLUMNS) { + const generatedColumns = Object.keys(FALLBACK_TOO_MANY_COLUMNS[0]).map((key) => { + return { + field: key, + headerName: key, + flex: 1, + type: "string", + align: "left", + headerAlign: "left", + sortable: false, // Prevent users from sorting fallback rows + filterable: false, // Prevent users from filtering fallback rows + } as GridColDef; + }); + + setColumns(generatedColumns); + setCurrentRows(FALLBACK_TOO_MANY_COLUMNS); + setTotalRowCount(FALLBACK_TOO_MANY_COLUMNS.length); + + columnsInitializedRef.current = true; + isFallbackRef.current = true; + return; + } + + // Handle Normal Columns const generatedColumns = Object.keys(response.rows[0]).map((key) => { const isNumeric = response.rows.every( (row: TableRecord) => typeof row[key] === "number" || row[key] === null, @@ -127,29 +161,15 @@ export const DataTable: React.FC = ({ }); setColumns(generatedColumns); - setCurrentRows(response.rows); columnsInitializedRef.current = true; + isFallbackRef.current = false; } - else if (response.rows.length > 0 && Object.keys(response.rows[0]).length > MAX_COLUMNS) { - const generatedColumns = Object.keys(FALLBACK_TOO_MANY_COLUMNS[0]).map((key) => { - return { - field: key, - headerName: key, - flex: 1, - type: "string", - align: "left", - headerAlign: "left", - filterable: true, - filterOperators: stringOperators, - valueFormatter: (value: unknown) => value ?? "NaN", - } as GridColDef; - }); + if(!isFallbackRef.current) { + setCurrentRows(response.rows); + setTotalRowCount(response.total_row_count); + } - setColumns(generatedColumns); - setCurrentRows(FALLBACK_TOO_MANY_COLUMNS); - setTotalRowCount(FALLBACK_TOO_MANY_COLUMNS.length); - } } catch (error) { console.error("Failed to fetch table data:", error);