diff --git a/massql/msql_engine.py b/massql/msql_engine.py index 9f92450..8f889e0 100644 --- a/massql/msql_engine.py +++ b/massql/msql_engine.py @@ -209,7 +209,8 @@ def _evalute_variable_query(parsed_dict, input_filename, presearch_parse["conditions"] = non_variable_conditions - ms1_df, ms2_df = _executeconditions_query(presearch_parse, input_filename, + ms1_df, ms2_df = _executeconditions_query(presearch_parse, input_filename, + ms1_input_df=ms1_df, ms2_input_df=ms2_df, cache=cache, cache_dir=cache_dir, cache_file=cache_file) variable_x_ms1_df = ms1_df diff --git a/tests/get_data.sh b/tests/get_data.sh index 1ae9ce0..db5bb62 100644 --- a/tests/get_data.sh +++ b/tests/get_data.sh @@ -9,7 +9,7 @@ wget --no-verbose --output-document=NS_1x_test.mzML "https://massiveproxy.gnps2. wget --no-verbose --output-document=JB_182_2_fe.mzML "https://massiveproxy.gnps2.org/massiveproxy/MSV000084289/ccms_peak/JB_182_2_fe.mzML" wget --no-verbose --output-document=S_N2_neutral_Zn.mzML "https://massiveproxy.gnps2.org/massiveproxy/MSV000083387/updates/2019-11-12_allegraaron_e893cb7e/peak/S_N2_neutral_Zn.mzML" wget --no-verbose --output-document=gnps-library.json "https://external.gnps2.org/gnpslibrary/GNPS-LIBRARY.json" -wget --no-verbose --output-document=specs_ms.mgf "http://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?task=5ecfcf81cb3c471698995b194d8246a0&block=main&file=spectra/specs_ms.mgf" +wget --no-verbose --tries=3 --waitretry=5 --output-document=specs_ms.mgf "https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?task=5ecfcf81cb3c471698995b194d8246a0&block=main&file=spectra/specs_ms.mgf" wget --no-verbose --output-document=1810E-II.mzML "https://massiveproxy.gnps2.org/massiveproxy/MSV000084691/ccms_peak/1810E-II.mzML" wget --no-verbose --output-document=T04251505.mzXML "https://massiveproxy.gnps2.org/massiveproxy/MSV000082797/ccms_peak/raw/MTBLS368/T04251505.mzXML" wget --no-verbose --output-document=isa_9_fe.mzML "https://massiveproxy.gnps2.org/massiveproxy/MSV000084030/ccms_peak/isa_9_fe.mzML" diff --git a/tests/reference_parses/QUERY_scaninfo_MS2DATA__WHERE_MS2PROD_337.25TOLERA___0395c7beeeccee4aa47652257f60526d.json b/tests/reference_parses/QUERY_scaninfo_MS2DATA__WHERE_MS2PROD_337.25TOLERA___0395c7beeeccee4aa47652257f60526d.json new file mode 100644 index 0000000..e826a16 --- /dev/null +++ b/tests/reference_parses/QUERY_scaninfo_MS2DATA__WHERE_MS2PROD_337.25TOLERA___0395c7beeeccee4aa47652257f60526d.json @@ -0,0 +1,114 @@ +{ + "conditions": [ + { + "conditiontype": "where", + "qualifiers": { + "qualifiermztolerance": { + "comparator": "equal", + "name": "qualifiermztolerance", + "unit": "mz", + "value": 0.05 + }, + "type": "qualifier" + }, + "type": "ms2productcondition", + "value": [ + 337.25 + ] + }, + { + "conditiontype": "where", + "qualifiers": { + "qualifiermztolerance": { + "comparator": "equal", + "name": "qualifiermztolerance", + "unit": "mz", + "value": 0.05 + }, + "type": "qualifier" + }, + "type": "ms2productcondition", + "value": [ + 319.24 + ] + }, + { + "conditiontype": "where", + "type": "ms2precursorcondition", + "value": [ + "X" + ] + }, + { + "conditiontype": "where", + "qualifiers": { + "qualifiermztolerance": { + "comparator": "equal", + "name": "qualifiermztolerance", + "unit": "mz", + "value": 0.05 + }, + "type": "qualifier" + }, + "type": "ms2productcondition", + "value": [ + "X-390.277" + ] + }, + { + "conditiontype": "where", + "qualifiers": { + "qualifierintensitymatch": { + "comparator": "equal", + "name": "qualifierintensitymatch", + "value": "Y" + }, + "qualifierintensityreference": { + "name": "qualifierintensityreference" + }, + "qualifierppmtolerance": { + "comparator": "equal", + "name": "qualifierppmtolerance", + "unit": "ppm", + "value": 40.0 + }, + "type": "qualifier" + }, + "type": "ms2productcondition", + "value": [ + 319.24 + ] + }, + { + "conditiontype": "where", + "qualifiers": { + "qualifierintensitymatch": { + "comparator": "equal", + "name": "qualifierintensitymatch", + "value": "Y*300.0" + }, + "qualifierintensitytolpercent": { + "comparator": "equal", + "name": "qualifierintensitytolpercent", + "value": 99.0 + }, + "qualifierppmtolerance": { + "comparator": "equal", + "name": "qualifierppmtolerance", + "unit": "ppm", + "value": 40.0 + }, + "type": "qualifier" + }, + "type": "ms2productcondition", + "value": [ + 201.16 + ] + } + ], + "query": "QUERY scaninfo(MS2DATA) WHERE MS2PROD=337.25:TOLERANCEMZ=0.05 AND MS2PROD=319.24:TOLERANCEMZ=0.05 AND MS2PREC=X AND MS2PROD=X-390.277:TOLERANCEMZ=0.05 AND MS2PROD=319.24:TOLERANCEPPM=40:INTENSITYMATCH=Y:INTENSITYMATCHREFERENCE AND MS2PROD=201.16:TOLERANCEPPM=40:INTENSITYMATCH=Y*300:INTENSITYMATCHPERCENT=99", + "querytype": { + "datatype": "datams2data", + "function": "functionscaninfo" + } +} \ No newline at end of file diff --git a/tests/test_extraction.py b/tests/test_extraction.py index ec55ec1..331f874 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -51,8 +51,12 @@ def test_extract_mzXML(): assert(len(merged_summary_df) == 5) def test_extract_MGF(): + mgf_path = "tests/data/specs_ms.mgf" + assert os.path.exists(mgf_path), f"Test data file {mgf_path} not found - download may have failed" + assert os.path.getsize(mgf_path) > 1000, f"Test data file {mgf_path} appears corrupt (too small) - download may have failed" + query = "QUERY scaninfo(MS2DATA)" - results_df = msql_engine.process_query(query, "tests/data/specs_ms.mgf") + results_df = msql_engine.process_query(query, mgf_path) print(results_df) assert(len(results_df) > 1) diff --git a/tests/test_queries.txt b/tests/test_queries.txt index d4b992d..6df820d 100644 --- a/tests/test_queries.txt +++ b/tests/test_queries.txt @@ -45,4 +45,5 @@ QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND MOBILITY=range(min=X*0.0006775+0.405 QUERY scaninfo(MS2DATA) WHERE MS2PROD=(58.06513 OR 60.04439 OR 70.06513 OR 72.08078 OR 74.06004 OR 84.04439 OR 84.08078 OR 86.09643 OR 87.05529 OR 88.0393 OR 88.07569 OR 100.11208 OR 101.07094 OR 101.10732 OR 102.05495 OR 102.09134 OR 104.05285 OR 110.07127 OR 114.12773 OR 115.08659 OR 115.12297 OR 116.0706 OR 118.0685 OR 120.08078 OR 124.08692 OR 129.10224 OR 129.11347 OR 129.13862 OR 130.08625 OR 132.08415 OR 134.09643 OR 136.07569 OR 138.10257 OR 143.12912 OR 148.11208 OR 150.09134 OR 157.14477 OR 159.09167 OR 164.10699 OR 173.10732 OR 187.12297):CARDINALITY=range(min=2,max=5):TOLERANCEPPM=10:INTENSITYPERCENT=5 QUERY scaninfo(MS2DATA) WHERE MS2PROD=226.18:TOLERANCEPPM=5:EXCLUDED QUERY scaninfo(MS2DATA) WHERE MS2PROD=formula(C10) -QUERY scaninfo(MS2DATA) WHERE MS2PROD=341.28:TOLERANCEMZ=0.01:INTENSITYPERCENT=2 AND MS2PROD=323.27:TOLERANCEMZ=0.01:INTENSITYPERCENT=2 AND MS2PREC=X AND MS2PROD=X-358.2871:TOLERANCEMZ=0.01:INTENSITYPERCENT=2 \ No newline at end of file +QUERY scaninfo(MS2DATA) WHERE MS2PROD=341.28:TOLERANCEMZ=0.01:INTENSITYPERCENT=2 AND MS2PROD=323.27:TOLERANCEMZ=0.01:INTENSITYPERCENT=2 AND MS2PREC=X AND MS2PROD=X-358.2871:TOLERANCEMZ=0.01:INTENSITYPERCENT=2 +QUERY scaninfo(MS2DATA) WHERE MS2PROD=337.25:TOLERANCEMZ=0.05 AND MS2PROD=319.24:TOLERANCEMZ=0.05 AND MS2PREC=X AND MS2PROD=X-390.277:TOLERANCEMZ=0.05 AND MS2PROD=319.24:TOLERANCEPPM=40:INTENSITYMATCH=Y:INTENSITYMATCHREFERENCE AND MS2PROD=201.16:TOLERANCEPPM=40:INTENSITYMATCH=Y*300:INTENSITYMATCHPERCENT=99 \ No newline at end of file diff --git a/tests/test_query.py b/tests/test_query.py index 8745ef3..83ca887 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -917,6 +917,61 @@ def test_ms2_intensitypercent_gt_lt_eq_tripartite(): assert scans_gt.issubset(scans_eq), "INTENSITYPERCENT= (>=) must include all scans matched by INTENSITYPERCENT>" +def test_ms2_variable_with_intensitymatch(): + """Test a complex query combining MS2PREC=X variable, MS2PROD with INTENSITYMATCH and INTENSITYMATCHREFERENCE.""" + import pandas as pd + import numpy as np + + query = "QUERY scaninfo(MS2DATA) WHERE MS2PROD=337.25:TOLERANCEMZ=0.05 AND MS2PROD=319.24:TOLERANCEMZ=0.05 AND MS2PREC=X AND MS2PROD=X-390.277:TOLERANCEMZ=0.05 AND MS2PROD=319.24:TOLERANCEPPM=40:INTENSITYMATCH=Y:INTENSITYMATCHREFERENCE AND MS2PROD=201.16:TOLERANCEPPM=40:INTENSITYMATCH=Y*300:INTENSITYMATCHPERCENT=99" + + # Verify it parses correctly + parse_obj = msql_parser.parse_msql(query) + print(json.dumps(parse_obj, indent=4)) + + condition_types = [c["type"] for c in parse_obj["conditions"]] + assert condition_types.count("ms2productcondition") == 5 + assert condition_types.count("ms2precursorcondition") == 1 + + # Check INTENSITYMATCH qualifiers are present + ref_conditions = [c for c in parse_obj["conditions"] + if "qualifiers" in c and "qualifierintensityreference" in c.get("qualifiers", {})] + assert len(ref_conditions) == 1 + + match_conditions = [c for c in parse_obj["conditions"] + if "qualifiers" in c and "qualifierintensitytolpercent" in c.get("qualifiers", {})] + assert len(match_conditions) == 1 + assert match_conditions[0]["qualifiers"]["qualifierintensitymatch"]["value"] == "Y*300.0" + + # Test execution with synthetic data that has matching peaks + precmz = 710.0 + rows = [] + peaks = [ + (337.25, 1000.0), + (319.24, 500.0), # reference Y + (201.16, 150000.0), # Y*300 = 500*300 = 150000 + (319.723, 800.0), # X-390.277 = 710-390.277 = 319.723 + ] + for mz, intensity in peaks: + rows.append({ + 'scan': 1, 'ms1scan': 0, 'rt': 1.0, + 'mz': mz, 'i': intensity, + 'precmz': precmz, 'charge': 1, 'polarity': 1, + 'i_norm': intensity / max(p[1] for p in peaks), + 'i_tic_norm': intensity / sum(p[1] for p in peaks), + }) + ms2_df = pd.DataFrame(rows) + ms1_df = pd.DataFrame({ + 'scan': [0], 'rt': [1.0], 'mz': [precmz], 'i': [10000.0], + 'i_norm': [1.0], 'i_tic_norm': [1.0], 'polarity': [1] + }) + + results_df = msql_engine.process_query(query, "tests/data/GNPS00002_A3_p.mzML", + ms1_df=ms1_df, ms2_df=ms2_df) + print(results_df) + assert len(results_df) > 0, "Query should find the matching scan in synthetic data" + assert 1 in results_df["scan"].values + + def debug_query(): query = "QUERY scaninfo(MS2DATA) WHERE MS2PROD=341.28:TOLERANCEMZ=0.01:INTENSITYPERCENT=2 AND MS2PROD=323.27:TOLERANCEMZ=0.01:INTENSITYPERCENT=2 AND MS2PREC=X AND MS2PROD=X-358.2871:TOLERANCEMZ=0.01:INTENSITYPERCENT=2"