diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 94660c4..b07d2de 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - dev pull_request: branches: - main @@ -17,78 +18,52 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9', '3.12'] - + python-version: ['3.9', '3.14'] + steps: - - uses: actions/checkout@v4 - + - uses: actions/checkout@v6 + - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - + - name: Install system dependencies run: | sudo apt-get update sudo apt-get install -y poppler-utils - + - name: Upgrade pip, setuptools, and packaging run: | python -m pip install --upgrade pip setuptools packaging - + - name: Cache src directory - uses: actions/cache@v4 + uses: actions/cache@v5 with: - path: ./src/ + path: ${{ github.workspace }}/src/ key: ${{ runner.os }}-src-grch37 restore-keys: | ${{ runner.os }}-src- - + - name: Download GRCh37.tar.gz if not present run: | - if [ ! -f ./src/GRCh37.tar.gz ]; then - wget --connect-timeout=10 --tries=20 ftp://alexandrovlab-ftp.ucsd.edu/pub/tools/SigProfilerMatrixGenerator/GRCh37.tar.gz -P ./src/ + if [ ! 
-f ${{ github.workspace }}/src/GRCh37.tar.gz ]; then + wget --connect-timeout=10 --tries=20 ftp://alexandrovlab-ftp.ucsd.edu/pub/tools/SigProfilerMatrixGenerator/GRCh37.tar.gz -P ${{ github.workspace }}/src/ fi - + - name: Install package with tests run: | pip install .[tests] - + - name: Install genome run: | - python install_genome.py ${{ github.workspace }}/src/ - + SigProfilerMatrixGenerator install GRCh37 --local_genome ${{ github.workspace }}/src/ + - name: Run unit tests run: | - pytest tests - + pip install pytest + pytest -s -rw tests + - name: Run integration test run: | python3 test.py - - - name: Build and push Docker image - if: github.ref == 'refs/heads/main' && github.event_name == 'push' && matrix.python-version == '3.12' - run: | - echo "Starting Docker deployment to GHCR for sigprofilersuite..." - - VERSION_TAG=$(grep "VERSION = " setup.py | cut -d'"' -f2) - - # Get the repository name and convert it to lowercase - REPO_NAME=$(basename ${{ github.repository }} | tr '[:upper:]' '[:lower:]') - IMAGE_NAME="ghcr.io/sigprofilersuite/$REPO_NAME" - - echo "Building version: $VERSION_TAG for image: $IMAGE_NAME" - - echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io \ - --username "${{ github.actor }}" \ - --password-stdin - - docker build \ - --build-arg COMMIT_SHA=${{ github.sha }} \ - -t $IMAGE_NAME:$VERSION_TAG \ - -t $IMAGE_NAME:latest . 
- - docker push $IMAGE_NAME:$VERSION_TAG - docker push $IMAGE_NAME:latest - - echo "Docker deployment to GHCR successful" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..5802fce --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,55 @@ +name: Docs + +on: + push: + branches: + - master + - dev + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: github-pages + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Configure Pages + uses: actions/configure-pages@v5 + + - name: Install docs dependencies + run: | + python -m pip install --upgrade pip + python -m pip install mkdocs mkdocs-material pymdown-extensions + + - name: Build site + run: | + mkdocs build --clean + + - name: Upload Pages artifact + uses: actions/upload-pages-artifact@v3 + with: + path: site + + deploy: + runs-on: ubuntu-latest + needs: build + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - id: deployment + uses: actions/deploy-pages@v4 + diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..8789486 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,112 @@ +name: Create Release from setup.py + +on: + pull_request: + types: + - closed + branches: + - main + +jobs: + release: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + + permissions: + contents: write + + outputs: + version: ${{ steps.get_version.outputs.VERSION }} + + steps: + - name: Checkout repo + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.14" + + - name: Upgrade pip, setuptools, packaging and install git-cliff + run: | + python -m pip install --upgrade pip 
setuptools packaging + pip install git-cliff + + - name: Extract version from setup.py + id: get_version + run: | + VERSION=$(python setup.py --version) + echo "VERSION=$VERSION" >> $GITHUB_OUTPUT + + - name: Get latest release tag + id: get_last_release + run: | + latest=$(curl -s \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/repos/${{ github.repository }}/releases/latest \ + | jq -r '.tag_name // "v0.0.0"' | sed 's/^v//') + echo "LATEST_RELEASE=$latest" >> $GITHUB_ENV + + - name: Check release version + run: | + echo "New version: ${{ steps.get_version.outputs.VERSION }}" + echo "Latest release: $LATEST_RELEASE" + python - < RELEASE_NOTES.md + git-cliff --unreleased --tag "v${{ steps.get_version.outputs.VERSION }}" --prepend CHANGELOG.md + + - name: Commit and push CHANGELOG.md + run: | + git config user.name "github-actions" + git config user.email "github-actions@github.com" + git add CHANGELOG.md + git commit -m "chore: update CHANGELOG.md for v${{ steps.get_version.outputs.VERSION }}" + git push origin HEAD:main + + - name: Create Git tag + run: | + git tag v${{ steps.get_version.outputs.VERSION }} + git push origin v${{ steps.get_version.outputs.VERSION }} + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + tag_name: v${{ steps.get_version.outputs.VERSION }} + body_path: RELEASE_NOTES.md + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + sync-dev: + needs: release + runs-on: ubuntu-latest + + permissions: + contents: write + + steps: + - name: Checkout repo + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Merge main into dev and push + run: | + git config user.name "github-actions" + git config user.email "github-actions@github.com" + git fetch origin main dev + git checkout -B dev origin/dev + git merge origin/main --no-edit + git push origin dev diff --git a/README.md b/README.md index 9522c7f..bbcf34e 100644 --- a/README.md +++ b/README.md @@ -1,188 +1,52 @@ 
-[![Docs](https://img.shields.io/badge/docs-latest-blue.svg)](https://osf.io/mz79v/wiki/home/) +[![Docs](https://img.shields.io/badge/docs-latest-blue.svg)](https://sigprofilersuite.github.io/SigProfilerAssignment/) [![License](https://img.shields.io/badge/License-BSD\%202--Clause-orange.svg)](https://opensource.org/licenses/BSD-2-Clause) [![CI](https://github.com/SigProfilerSuite/SigProfilerAssignment/actions/workflows/ci.yml/badge.svg)](https://github.com/SigProfilerSuite/SigProfilerAssignment/actions/workflows/ci.yml) -drawing +SigProfilerAssignment # SigProfilerAssignment -SigProfilerAssignment enables assignment of previously known mutational signatures to individual samples and individual somatic mutations. The tool refits different types of reference mutational signatures, including [COSMIC signatures](https://cancer.sanger.ac.uk/signatures/), as well as custom signature databases. Refitting of known mutational signatures is a numerical optimization approach that not only identifies the set of operative mutational signatures in a particular sample, but also quantifies the number of mutations assigned to each signature found in that sample. SigProfilerAssignment makes use of [SigProfilerMatrixGenerator](https://github.com/AlexandrovLab/SigProfilerMatrixGenerator) and [SigProfilerPlotting](https://github.com/AlexandrovLab/SigProfilerPlotting), seamlessly integrating with other [SigProfiler tools](https://cancer.sanger.ac.uk/signatures/tools/). +SigProfilerAssignment enables assignment of previously known mutational signatures to individual samples and individual somatic mutations. The tool refits different types of reference mutational signatures, including [COSMIC signatures](https://cancer.sanger.ac.uk/signatures/), as well as custom signature databases. 
SigProfilerAssignment makes use of [SigProfilerMatrixGenerator](https://github.com/SigProfilerSuite/SigProfilerMatrixGenerator) and [SigProfilerPlotting](https://github.com/SigProfilerSuite/SigProfilerPlotting), seamlessly integrating with other tools in [SigProfilerSuite](https://github.com/SigProfilerSuite). -For users that prefer working in an R environment, a wrapper package is provided and can be found and installed from: https://github.com/AlexandrovLab/SigProfilerAssignmentR. Detailed documentation can be found at: https://osf.io/mz79v/wiki/home/. +## Documentation +Detailed documentation can be found at https://sigprofilersuite.github.io/SigProfilerAssignment. - -## Table of contents -- [Installation](#installation) -- [Running](#running) - - [Main Parameters](#parameters) - - [Signature Subgroups](#subgroups) -- [Examples](#examples) -- [_De novo_ extraction of mutational signatures downstream analysis](#denovo) -- [Citation](#citation) -- [Copyright](#copyright) -- [Contact Information](#contact) - -## Installation +## Quick Start Guide +### Installation Install the current stable PyPi version of SigProfilerAssignment: ``` $ pip install SigProfilerAssignment ``` -If mutation calling files (MAF, VCF, or simple text files) are used as input, please install your desired reference genome as follows (available reference genomes are: GRCh37, GRCh38, mm9, mm10, and rn6): +If mutation calling files (MAF, VCF, or simple text files) are used as input, please install your desired reference genome as follows (available reference genomes are: GRCh37, GRCh38, mm9, mm10, rn6, and rn7): ```python $ python from SigProfilerMatrixGenerator import install as genInstall genInstall.install('GRCh37') ``` -If you plan to use `sample_reconstruction_plots='png'` or `'both'`, the external `poppler` binary is required. 
You can install it using one of the following commands: - -- For Conda-based environments: - `conda install -c conda-forge poppler` - -## Running - -Assignment of known mutational signatures to individual samples is performed using the `cosmic_fit` function. Input samples are provided using the `samples` parameter in the form of mutation calling files (VCFs, MAFs, or simple text files), segmentation files or mutational matrices. COSMIC mutational signatures v3.5 are used as the default reference signatures, although previous COSMIC versions and custom signature databases are also supported using the `cosmic_version` and `signature_database` parameters. Results will be found in the folder specified in the `output` parameter. - -```python -from SigProfilerAssignment import Analyzer as Analyze -Analyze.cosmic_fit(samples, output, input_type="matrix", context_type="96", - collapse_to_SBS96=True, cosmic_version=3.5, exome=False, - genome_build="GRCh37", signature_database=None, - exclude_signature_subgroups=None, export_probabilities=False, - export_probabilities_per_mutation=False, make_plots=False, - sample_reconstruction_plots=False, verbose=False) -``` - - - -### Main Parameters - -| Parameter | Variable Type | Parameter Description | -| ------ | ----------- | ----------- | -| samples | String | Path to the input somatic mutations file (if using segmentation file/mutational matrix) or input folder (mutation calling file/s). | -| output | String | Path to the output folder. | -| input_type | String | Three accepted input types:The default value is "matrix". | -| context_type | String | Required context type if `input_type` is "vcf". `context_type` takes which context type of the input data is considered for assignment. Valid options include "96", "288", "1536", "DINUC", and "ID". The default value is "96". | -| cosmic_version | Float | Defines the version of the COSMIC reference signatures. Takes a positive float among 1, 2, 3, 3.1, 3.2, 3.3, 3.4, and 3.5. 
The default value is 3.5. | -| exome | Boolean | Defines if the exome renormalized COSMIC signatures will be used. The default value is False. | -| genome_build | String | The reference genome build, used for select the appropriate version of the COSMIC reference signatures, as well as processing the mutation calling file/s. Supported genomes include "GRCh37", "GRCh38", "mm9", "mm10" and "rn6". The default value is "GRCh37". If the selected genome is not in the supported list, the default genome will be used. | -| signature_database | String | Path to the input set of known mutational signatures (only in case that COSMIC reference signatures are not used), a tab delimited file that contains the signature matrix where the rows are mutation types and columns are signature IDs. | -| exclude_signature_subgroups | List | Removes the signatures corresponding to specific subtypes to improve refitting (only available when using default COSMIC reference signatures). The usage is explained below. The default value is None, which corresponds to use all COSMIC signatures. | -| export_probabilities | Boolean | Defines if the probability matrix per mutational context for all samples is created. The default value is True. | -| export_probabilities_per_mutation | Boolean | Defines if the probability matrices per mutation for all samples are created. Only available when `input_type` is "vcf". The default value is False. | -| make_plots | Boolean | Toggle on and off for making and saving plots. The default value is True. | -| sample_reconstruction_plots | String | Select the output format for sample reconstruction plots. Valid inputs are {'pdf', 'png', 'both', 'none'}. The default value is 'none'. If set to 'png' or 'both', the external binary `poppler` must be installed. Install via `conda install -c conda-forge poppler` or `brew install poppler` on macOS. | -| verbose | Boolean | Prints detailed statements. The default value is False. 
| -| cpu | Integer | Number of processor cores to use during assignment. The default value is -1, which uses all available cores. | -| volume | String | Path to SigProfilerAssignment volumes. Used for Docker/Singularity. Environmental variable "SIGPROFILERASSIGNMENT_VOLUME" takes precedence. Default value is None. | - - - - -### Signature Subgroups - -When using COSMIC reference signatures, some subgroups of signatures can be removed to improve the refitting analysis. To use this feature, the `exclude_signature_subgroups` parameter should be added, following the sintax below: - -```python -exclude_signature_subgroups = ['MMR_deficiency_signatures', - 'POL_deficiency_signatures', - 'HR_deficiency_signatures' , - 'BER_deficiency_signatures', - 'Chemotherapy_signatures', - 'Immunosuppressants_signatures', - 'Treatment_signatures', - 'APOBEC_signatures', - 'Tobacco_signatures', - 'UV_signatures', - 'AA_signatures', - 'Colibactin_signatures', - 'Artifact_signatures', - 'Lymphoid_signatures'] -``` - -The full list of signature subgroups is included in the following table: - -|Signature subgroup | SBS signatures excluded | DBS signatures excluded | ID signatures excluded | -| ----------- | ----------- | ----------- | ----------- | -|MMR_deficiency_signatures| 6, 14, 15, 20, 21, 26, 44| 7, 10| 7| -|POL_deficiency_signatures| 10a, 10b, 10c, 10d, 28| 3| -| -|HR_deficiency_signatures| 3| 13| 6| -|BER_deficiency_signatures| 30, 36| -| -| -|Chemotherapy_signatures| 11, 25, 31, 35, 86, 87, 90, 99| 5| -| -|Immunosuppressants_signatures| 32| -| -| -|Treatment_signatures| 11, 25, 31, 32, 35, 86, 87, 90, 99| 5| -| -|APOBEC_signatures| 2, 13| -| -| -|Tobacco_signatures | 4, 29, 92, 100, 109| 2| 3| -|UV_signatures| 7a, 7b, 7c, 7d, 38| 1| 13| -|AA_signatures| 22a, 22b| 20| 23| -|Colibactin_signatures| 88| -| 18| -|Artifact_signatures| 27, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 95|14|-| -|Lymphoid_signatures| 9, 84, 85| -| -| - - - -## Examples - -### Using 
mutation calling files (VCFs) as input - -```python -import SigProfilerAssignment as spa -from SigProfilerAssignment import Analyzer as Analyze - -Analyze.cosmic_fit(samples=spa.__path__[0]+"/data/tests/vcf_input", - output="example_vcf", - input_type="vcf", - context_type="96", - genome_build="GRCh37", - cosmic_version=3.5) -``` +### Running -### Using a multi-sample segmentation file as input +Assignment of known mutational signatures to individual samples is performed using the `cosmic_fit` function. Input samples are provided using the `samples` parameter in the form of mutation calling files (VCFs, MAFs, or simple text files), segmentation files, or mutational matrices. COSMIC mutational signatures v3.5 are used as the default reference signatures, although previous COSMIC versions and custom signature databases are also supported using the `cosmic_version` and `signature_database` parameters. Results will be found in the folder specified in the `output` parameter. ```python -import SigProfilerAssignment as spa from SigProfilerAssignment import Analyzer as Analyze - -Analyze.cosmic_fit(samples=spa.__path__[0]+"/data/tests/cnv_input/all.breast.ascat.summary.sample.tsv", - output="example_sf", - input_type="seg:ASCAT_NGS", - cosmic_version=3.5, - collapse_to_SBS96=False) +Analyze.cosmic_fit(samples, output, input_type="matrix", context_type="96") ``` -### Using a mutational matrix as input +You can also run SigProfilerAssignment `cosmic_fit` function from command line: -```python -import SigProfilerAssignment as spa -from SigProfilerAssignment import Analyzer as Analyze +``` bash +$ SigProfilerAssignment cosmic_fit samples output --input_type "matrix" --context_type "96" -Analyze.cosmic_fit(samples=spa.__path__[0]+"/data/tests/txt_input/sample_matrix_SBS.txt", - output="example_mm", - input_type="matrix", - genome_build="GRCh37", - cosmic_version=3.5) ``` -## _De novo_ extraction of mutational signatures downstream analysis -Additional functionalities for 
downstream analysis of _de novo_ extraction of mutational signatures are also available as part of SigProfilerAssignment, including assignment of _de novo_ extracted mutational signatures and decomposition of _de novo_ signatures using a known set of signatures. More information can be found on the wiki page at https://osf.io/mz79v/wiki/5.%20Advanced%20mode/. - -## Unit Tests -Unit tests can be run with the following commands: - -```bash -python setup.py sdist -pip install .[tests] -pytest tests -``` +## Reference -## Citation +Díaz-Gay M, Vangara R, Barnes M, *et al.*, Alexandrov LB. Assigning mutational signatures to individual samples and individual somatic mutations with SigProfilerAssignment. *Bioinformatics*. 2023;39(12):btad756. [https://doi.org/10.1093/bioinformatics/btad756](https://doi.org/10.1093/bioinformatics/btad756) -Díaz-Gay, M., Vangara, R., Barnes, M., ... & Alexandrov, L. B. (2023). Assigning mutational signatures to individual samples and individual somatic mutations with SigProfilerAssignment, Bioinformatics, 2023-07. doi: [https://doi.org/10.1093/bioinformatics/btad756](https://doi.org/10.1093/bioinformatics/btad756) +## Contact -## Copyright -This software and its documentation are copyright 2022 as a part of the SigProfiler project. The SigProfilerAssignment framework is free software and is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +For questions, support requests, or bug reports, please contact the SigProfilerSuite team via GitHub [issues](https://github.com/SigProfilerSuite/SigProfilerAssignment/issues) or by email at [contact@sigprofilersuite.org](mailto:contact@sigprofilersuite.org). -## Contact Information -Please address any queries or bug reports to Raviteja Vangara at rvangara@health.ucsd.edu or Marcos Díaz-Gay at mdiazgay@health.ucsd.edu. 
diff --git a/SigProfilerAssignment/DecompositionPlots/PlotDecomposition.py b/SigProfilerAssignment/DecompositionPlots/PlotDecomposition.py index f539683..5272c05 100644 --- a/SigProfilerAssignment/DecompositionPlots/PlotDecomposition.py +++ b/SigProfilerAssignment/DecompositionPlots/PlotDecomposition.py @@ -1046,6 +1046,7 @@ def run_PlotSSDecomposition( custom_text=None, exome=False, volume=None, + use_custom_basis=False, ): """ Generates a reconstruction of a sample given a set of signatures. @@ -1084,30 +1085,45 @@ def run_PlotSSDecomposition( exome: Boolean. True if using exome COSMIC signatures, and False if not. + use_custom_basis: Boolean. True if basis_mtx comes from a custom signature_database + (renders basis plots directly from basis_mtx instead of the cached COSMIC plots). + Returns: ------- None. """ - # Create the denovo plots - denovo_plots_dict = gen_sub_plots( - denovo_mtx, - None, - output_path, - project, - context_type, - ss_decomp=True, - volume=volume, - ) - denovo_plots_dict = denovo_plots_dict[0] - # Load in the COSMIC plots - basis_plots_dict = install_cosmic_plots( - context_type=context_type, - genome_build=genome_build, - cosmic_version=cosmic_version, - exome=exome, - volume=volume, - ) + if use_custom_basis: + # Custom signature database: render basis (signature) plots directly + # from the provided matrix instead of the cached COSMIC reference plots. 
+ denovo_plots_dict, basis_plots_dict = gen_sub_plots( + denovo_mtx, + basis_mtx, + output_path, + project, + context_type, + ss_decomp=True, + volume=volume, + ) + else: + # Create the denovo plots + denovo_plots_dict, _ = gen_sub_plots( + denovo_mtx, + None, + output_path, + project, + context_type, + ss_decomp=True, + volume=volume, + ) + # Load in the COSMIC plots + basis_plots_dict = install_cosmic_plots( + context_type=context_type, + genome_build=genome_build, + cosmic_version=cosmic_version, + exome=exome, + volume=volume, + ) # Create reconstructed matrix and plot reconstructed_mtx, reconstruction_plot_dict = gen_reconstructed_png_numerical( diff --git a/SigProfilerAssignment/controllers/cli_controller.py b/SigProfilerAssignment/controllers/cli_controller.py index 181b9db..d77d5a8 100644 --- a/SigProfilerAssignment/controllers/cli_controller.py +++ b/SigProfilerAssignment/controllers/cli_controller.py @@ -13,6 +13,8 @@ def str2bool(v): else: raise argparse.ArgumentTypeError("Boolean value expected.") +def str2list(arg): + return arg.split(",") def parse_arguments_common(args: List[str], description: str) -> argparse.Namespace: parser = argparse.ArgumentParser(description=description) @@ -99,6 +101,7 @@ def parse_arguments_common(args: List[str], description: str) -> argparse.Namesp ) parser.add_argument( "--exclude_signature_subgroups", + type=str2list, default=None, help="Remove specific signature subgroups.", ) diff --git a/SigProfilerAssignment/decomposition.py b/SigProfilerAssignment/decomposition.py index 1748b34..29e7396 100644 --- a/SigProfilerAssignment/decomposition.py +++ b/SigProfilerAssignment/decomposition.py @@ -103,6 +103,10 @@ def generate_sample_reconstruction( # png = pdf generation + png conversion + pdf removal project = "test_run" mtype = "96" + is_custom_database = execution_parameters.get("signature_database") is not None + cosmic_version_label = ( + "Custom" if is_custom_database else str(execution_parameters["cosmic_version"]) + ) 
final_pdf = PdfWriter() samples = samples_input.copy(deep=True) @@ -140,9 +144,10 @@ def generate_sample_reconstruction( project, mtype, genome_build=execution_parameters["reference_genome"], - cosmic_version=str(execution_parameters["cosmic_version"]), + cosmic_version=cosmic_version_label, exome=execution_parameters["exome"], volume=get_storage_dir(execution_parameters["volume"]), + use_custom_basis=is_custom_database, ) result.seek(0) @@ -1015,9 +1020,6 @@ def spa_analyze( cosmic_version=cosmic_version, exome=exome, )[0] - # for sample reconstruction plots - cosmic_sig_ref = processAvg.copy(deep=True) - cosmic_sig_ref.reset_index(inplace=True) else: try: @@ -1055,6 +1057,12 @@ def spa_analyze( # # processAvg.drop(sig_exclusion_list, axis=1, inplace=True, errors="ignore") + # for sample reconstruction plots; built after collapsing/exclusion so it + # matches the signatures actually used, whether from COSMIC or a custom + # signature_database. + cosmic_sig_ref = processAvg.copy(deep=True) + cosmic_sig_ref.reset_index(inplace=True) + # processAvg= originalProcessAvg # index = genomes.index # colnames = genomes.columns @@ -1138,7 +1146,6 @@ def spa_analyze( isinstance(sample_reconstruction_plots, str) and sample_reconstruction_plots.lower() in recon_output_types and mutation_type == "96" - and signature_database is None ): ss_recon_odir = os.path.join( layer_directory3, "Activities", "SampleReconstruction" diff --git a/cliff.toml b/cliff.toml new file mode 100644 index 0000000..20b3f0d --- /dev/null +++ b/cliff.toml @@ -0,0 +1,36 @@ +[changelog] +header = """# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +""" +body = """ +## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }} +{% for group, commits in commits | group_by(attribute="group") %} +### {{ group | trim_end_matches(pat=":") }} +{% for commit in commits %} +- {{ commit.message | split(pat="\\n") | first | trim }} +{% endfor %} +{% endfor %} +""" +trim = true +footer = "" + +[git] +conventional_commits = false +filter_unconventional = false +split_commits = false +filter_commits = true + +commit_parsers = [ + { message = "(?i)^merge ", skip = true }, + { message = "(?i)^revert", skip = true }, + { message = "(?i)^(feat|feature|add|added)(\\(.*\\))?:?\\b", group = "Added" }, + { message = "(?i)^(fix|fixed|bugfix|resolve[sd]?)(\\(.*\\))?:?\\b", group = "Fixed" }, + { message = "(?i)^(chore|ci)(\\(.*\\))?:?\\s", skip = true }, + { message = ".*", group = "Changed" }, +] diff --git a/docs/1_installation.md b/docs/1_installation.md new file mode 100644 index 0000000..9c5a460 --- /dev/null +++ b/docs/1_installation.md @@ -0,0 +1,73 @@ +# Installation + + +---------- + + +This section will help you set up the necessary software and packages required to run SigProfilerAssignment. + +---------- + + +## Prerequisites ## + +- [Python][1] version >= 3.9 +- Downloaded reference genomes using [SigProfilerMatrixGenerator][2] (only if mutation calling files are used as input) +- Other dependencies and necessary packages are downloaded during the installation + +## Installation ## + +SigProfilerAssignment can be executed on any Windows/MacOS/Unix system. First follow the [SigProfilerMatrixGenerator][2] guide for installing `Python` and `pip`. Next, follow the download instructions for the latest stable release or the current GitHub version. 
+ +### Installation with `pip` ### + +Install the latest `SigProfilerAssignment` PyPI version using `pip`: +``` +$ pip install SigProfilerAssignment +``` + +### Install specific GitHub Release ### + +First, download the [zip file][3] or clone the GitHub repository with: +``` +$ git clone https://github.com/SigProfilerSuite/SigProfilerAssignment.git +``` + +Next, unzip the contents of SigProfilerAssignment-master or the zip file of the corresponding branch (if you downloaded a zip file), then enter the resulting directory and install the package: +``` +$ cd SigProfilerAssignment +$ pip install . +``` + +## Download Reference Genome ## + +In case you want to use SigProfilerAssignment with mutation calling files as input, you first need to download the appropriate reference genome. Current reference genomes supported include GRCh37, GRCh38, mm9, mm10, rn6 and rn7. To install the reference genome/s, you need to use [SigProfilerMatrixGenerator][2]. + +The latest PyPI [SigProfilerMatrixGenerator][2] version is installed with SigProfilerAssignment by default. You can also install a specific version following the instructions in [SigProfilerMatrixGenerator Wiki][2]. + +Once [SigProfilerMatrixGenerator][2] is installed, install your desired reference genome from the command line/terminal as follows. 
+ +### Installation from command line ### + +``` +$ SigProfilerMatrixGenerator install GRCh37 +``` +### Installation from Python terminal ### + +``` python +$ python +>> from SigProfilerMatrixGenerator import install as genInstall +>> genInstall.install('GRCh37', rsync=False, bash=True) +``` + +In case you prefer to install a reference genome that you have saved locally, you can do the following: +``` python +$ python +>> from SigProfilerMatrixGenerator import install as genInstall +>> genInstall.install('GRCh37', offline_files_path='path/to/directory/containing/GRCh37.tar.gz') +``` + + [1]: https://www.python.org/downloads + [2]: https://sigprofilersuite.github.io/SigProfilerMatrixGenerator/Installation-Python.html + [3]: https://github.com/SigProfilerSuite/SigProfilerAssignment/releases + diff --git a/docs/2_quick_start_example.md b/docs/2_quick_start_example.md new file mode 100644 index 0000000..85059d7 --- /dev/null +++ b/docs/2_quick_start_example.md @@ -0,0 +1,125 @@ +

Quick Start Example

+ +---------- + +This section provides an example for users to quickly get started with using the SigProfilerAssignment tool. The following example will use somatic mutational data from breast cancer samples from [Nik-Zainal et al. 2012 Cell][1], and will showcase how to use SigProfilerAssignment with different types of files containing the input somatic mutations, including variant calling files (VCFs) and mutational matrices. + + +---------- + +## Prerequisites ## +This tutorial requires that you have completed all steps in the [installation guide][2], specifically: + + - Installed SigProfilerAssignment + - Downloaded **GRCh37** reference genome using SigProfilerMatrixGenerator + + +## Downloading input example data ## +This example uses somatic mutational data from a breast cancer genome. Download the example dataset `BRCA.zip` at the following location or use the command line: + + ftp://alexandrovlab-ftp.ucsd.edu/pub/tools/SigProfilerAssignment/Example_data/ + +If using the command line, then enter the following command in bash on MacOS X or Unix systems: + + $ wget ftp://alexandrovlab-ftp.ucsd.edu/pub/tools/SigProfilerAssignment/Example_data/BRCA.zip + +Once `BRCA.zip` has been downloaded, unzip the file. The unzipped `BRCA` folder contains `BRCA.txt` and another folder `BRCA_vcf`. The file `BRCA.txt` is a mutational matrix defined using SBS-96 classification (created by [SigProfilerMatrixGenerator][3]) and `BRCA_vcf` contains the corresponding VCF file associated to the sample. + + +## Running SigProfilerAssignment from VCF ## +You will be assigning reference mutational signatures from [COSMIC][4] v3.5 to the breast cancer sample in the subfolder `BRCA_vcf` used as input for this example. + + +First, start a Python interactive shell and import the SigProfilerAssignment library. + +``` python +$ python +>>> from SigProfilerAssignment import Analyzer as Analyze +``` + +Next, assign reference COSMIC signatures by running the following command. 
**Note**: Update `"path/to/BRCA_vcf"` with the actual path to the `BRCA_vcf` folder. + +``` python +Analyze.cosmic_fit(samples="path/to/BRCA_vcf", + output="output_vcf", + input_type="vcf", + context_type="96", + genome_build="GRCh37") +``` + +You can also run the SigProfilerAssignment `cosmic_fit` function from the command line: + +``` bash +$ SigProfilerAssignment cosmic_fit "path/to/BRCA_vcf" "output_vcf" --input_type "vcf" --context_type "96" --genome_build "GRCh37" + +``` + +After SigProfilerAssignment has finished running, an output directory named `output_vcf` will be created. This directory will contain the output files and is located in the directory where the Python instance was started. To learn more about the output produced by SigProfilerAssignment, please refer to the [Using the Tool - Output][5] section. + +## Running SigProfilerAssignment (Mutational matrix) ## +You will be assigning reference mutational signatures from [COSMIC][4] v3.5 to `BRCA.txt`, the mutational matrix defined using the SBS-96 classification that is used as input for this example. + +First, start a Python interactive shell and import the SigProfilerAssignment library. + +``` python +$ python +>>> from SigProfilerAssignment import Analyzer as Analyze +``` + +Next, assign reference COSMIC signatures by running the following command. **Note**: Update `"path/to/BRCA.txt"` with the actual path to the `BRCA.txt` file. + +``` python +Analyze.cosmic_fit(samples="path/to/BRCA.txt", + output="output_mm", + input_type="matrix") +``` + +You can also run the SigProfilerAssignment `cosmic_fit` function from the command line: + +``` bash +$ SigProfilerAssignment cosmic_fit "path/to/BRCA.txt" "output_mm" --input_type "matrix" + +``` +After SigProfilerAssignment has finished running, an output directory named `output_mm` will be created. This directory will contain the output files and is located in the directory where the Python instance was started. 
To learn more about the output produced by SigProfilerAssignment, please refer to the [Using the Tool - Output][5] section. + +## Running SigProfilerAssignment (Multi-sample segmentation) ## + +You will be assigning reference mutational signatures from [COSMIC][4] v3.5 to the multi-sample segmentation file `all.breast.ascat.summary.sample.tsv`, obtained from a copy number calling tool and used as input for this example. + +First, start a Python interactive shell and import the SigProfilerAssignment library. + +``` python +$ python +>>> from SigProfilerAssignment import Analyzer as Analyze +``` + +Next, assign reference COSMIC signatures by running the following command. **Note**: Update `"path/to/all.breast.ascat.summary.sample.tsv"` with the actual path to the `all.breast.ascat.summary.sample.tsv` file. + +``` python +Analyze.cosmic_fit(samples="path/to/all.breast.ascat.summary.sample.tsv", + output="example_sf", + input_type="seg:ASCAT_NGS", + cosmic_version=3.5, + collapse_to_SBS96=False) +``` + +You can also run the SigProfilerAssignment `cosmic_fit` function from the command line: + +``` bash +$ SigProfilerAssignment cosmic_fit "path/to/all.breast.ascat.summary.sample.tsv" "example_sf" --input_type "seg:ASCAT_NGS" --cosmic_version "3.5" --collapse_to_SBS96 False + +``` + +After SigProfilerAssignment has finished running, an output directory named `example_sf` will be created. This directory will contain the output files and is located in the directory where the Python instance was started. To learn more about the output produced by SigProfilerAssignment, please refer to the [Using the Tool - Output][5] section. + +## Additional Information ## +In the above examples, all unspecified parameters are passed in with their default values. All of the function arguments and their types are explained in detail in the [Using the Tool - Input section][6]. To learn more about the files that were produced, you can refer to [Using the Tool - Output][5]. 
+ + + + [1]: https://doi.org/10.1016/j.cell.2012.04.024 + [2]: https://sigprofilersuite.github.io/SigProfilerAssignment/1_installation.html + [3]: https://sigprofilersuite.github.io/SigProfilerMatrixGenerator/ + [4]: https://cancer.sanger.ac.uk/signatures/ + [5]: https://sigprofilersuite.github.io/SigProfilerAssignment/4_using_the_tool_output.html + [6]: https://sigprofilersuite.github.io/SigProfilerAssignment/3_using_the_tool_input.html diff --git a/docs/3_using_the_tool_input.md b/docs/3_using_the_tool_input.md new file mode 100644 index 0000000..02fd03f --- /dev/null +++ b/docs/3_using_the_tool_input.md @@ -0,0 +1,113 @@ +# Using SigProfilerAssignment # + + +---------- + + +This section describes SigProfilerAssignment's main function for mutational signatures assignment, as well as the different parameters accepted. + + +---------- + +## Function ## + +The main function available in SigProfilerAssignment to perform *refitting of known* mutational signatures is the `cosmic_fit` function. + +### Input files ### + +Two main input files are needed to use this function: + +* **Somatic mutations**: three different formats are allowed (the selected format should be specified in the `input_type` parameter): + * Mutation calling files (VCFs, MAFs, or simple text files, as described [here][1]). One file per sample is required. Example input [vcf files][6]. Use `input_type = "vcf"`. + * Segmentation files. Used for copy number analysis. Only one multi-sample file is allowed. An example [segmentation file][5]. Use `input_type = "seg:TYPE"` (Check the segmentation files that are supported by [SigProfilerMatrixGenerator][2]). + * Mutational matrices. From different mutational classifications and generated by [SigProfilerMatrixGenerator][2]. An example mutational [matrix file][4]. Use `input_type = "matrix"` (default option). + +* **Set of known mutational signatures**: [COSMIC][3] v3.5 mutational signatures are used by default as the input reference signatures. 
Custom signature databases can also be used, and should be provided to the `cosmic_fit` function using the `signature_database` parameter. + +### Required parameters ### +To run the `cosmic_fit` function, first import the package within your Python script or from within an interactive Python session: + +``` python +$ python +>>> from SigProfilerAssignment import Analyzer as Analyze +``` + +Now, you are able to assign known signatures to your sample/s. The required parameters for the `cosmic_fit` function are: + +``` python +Analyze.cosmic_fit(samples, output, input_type = input_type) +``` + +You can also run the SigProfilerAssignment `cosmic_fit` function from the command line: + +``` bash +$ SigProfilerAssignment cosmic_fit samples output --input_type input_type + +``` + +### Full list of parameters ### +The full list of parameters is included in the following table: + +| Parameter | Variable Type | Parameter Description | +| ------ | ----------- | ----------- | +| samples | String | Path to the input somatic mutations file (if using segmentation file/mutational matrix) or input folder (mutation calling file/s) | +| output | String | Path to the output folder | +| input_type | String | Three accepted input types: "vcf", "seg:TYPE", and "matrix". The default value is "matrix".| +| context_type | String | Required context type if `input_type` is "vcf". `context_type` specifies which context type of the input data is considered for assignment. Valid options include "96", "288", "1536", "DINUC", and "ID". The default value is "96" | +| cosmic_version | Float | Defines the version of the COSMIC reference signatures. Takes a positive float among 1, 2, 3, 3.1, 3.2, 3.3, 3.4 and 3.5. The default value is 3.5| +| exome | Boolean | Defines if the exome renormalized COSMIC signatures will be used. The default value is False | +| genome_build | String | The reference genome build, used for selecting the appropriate version of the COSMIC reference signatures, as well as processing the mutation calling file/s. 
Supported genomes include "GRCh37", "GRCh38", "mm9", "mm10", "rn6" and "rn7". The default value is "GRCh37". If the selected genome is not in the supported list, the default genome will be used | +| signature_database | String | Path to the input set of known mutational signatures (only in case that COSMIC reference signatures are not used), a tab delimited file that contains the signature matrix where the rows are mutation types and columns are signature IDs | +| exclude_signature_subgroups | List | Removes the signatures corresponding to specific subtypes to improve refitting (only available when using default COSMIC reference signatures). The usage is explained below. The default value is None, which corresponds to using all COSMIC signatures | +| export_probabilities | Boolean | Defines if the probability matrix per mutational context for all samples is created. The default value is True | +| export_probabilities_per_mutation | Boolean | Defines if the probability matrices per mutation for all samples are created. Only available when `input_type` is "vcf". The default value is False | +| make_plots | Boolean | Toggle on and off for making and saving plots. The default value is True | +| sample_reconstruction_plots | String | Select the output format for sample reconstruction plots. Valid inputs are 'pdf', 'png', 'both', and None. The default value is None | +| verbose | Boolean | Prints detailed statements. The default value is False | + +### Signature subgroups ### +When using [COSMIC][3] reference signatures, some subgroups of signatures can be removed to improve the *refitting* analysis. 
To use this feature, the `exclude_signature_subgroups` parameter should be added, following the syntax below: + +``` python +exclude_signature_subgroups = ['MMR_deficiency_signatures', + 'POL_deficiency_signatures', + 'HR_deficiency_signatures', + 'BER_deficiency_signatures', + 'Chemotherapy_signatures', + 'Immunosuppressants_signatures', + 'Treatment_signatures', + 'APOBEC_signatures', + 'Tobacco_signatures', + 'UV_signatures', + 'AA_signatures', + 'Colibactin_signatures', + 'Artifact_signatures', + 'Lymphoid_signatures'] +``` + +The full list of signature subgroups is included in the following table: + +|Signature subgroup | SBS signatures excluded | DBS signatures excluded | ID signatures excluded | +| ----------- | ----------- | ----------- | ----------- | +|MMR_deficiency_signatures| 6, 14, 15, 20, 21, 26, 44| 7, 10| 7| +|POL_deficiency_signatures| 10a, 10b, 10c, 10d, 28| 3| -| +|HR_deficiency_signatures| 3| -| 6| +|BER_deficiency_signatures| 30, 36| -| -| +|Chemotherapy_signatures| 11, 25, 31, 35, 86, 87, 90| 5| -| +|Immunosuppressants_signatures| 32| -| -| +|Treatment_signatures| 11, 25, 31, 32, 35, 86, 87, 90| 5| -| +|APOBEC_signatures| 2, 13| -| -| +|Tobacco_signatures | 4, 29, 92, 100, 109| 2| 3| +|UV_signatures| 7a, 7b, 7c, 7d, 38| 1| 13| +|AA_signatures| 22| -| -| +|Colibactin_signatures| 88| -| 18| +|Artifact_signatures| 27, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 95|-|-| +|Lymphoid_signatures| 9, 84, 85| -| -| + + + [1]: https://sigprofilersuite.github.io/SigProfilerMatrixGenerator/Using-the-Tool-SBS-ID-DBS-Input.html + [2]: https://sigprofilersuite.github.io/SigProfilerMatrixGenerator/ + [3]: https://cancer.sanger.ac.uk/signatures/ + [4]: https://github.com/SigProfilerSuite/SigProfilerAssignment/blob/main/SigProfilerAssignment/data/Samples.txt + [5]: https://github.com/SigProfilerSuite/SigProfilerAssignment/blob/main/SigProfilerAssignment/data/tests/cnv_input/all.breast.ascat.summary.sample.tsv + [6]: 
https://github.com/SigProfilerSuite/SigProfilerAssignment/tree/main/SigProfilerAssignment/data/tests/vcf_inputs diff --git a/docs/4_using_the_tool_output.md b/docs/4_using_the_tool_output.md new file mode 100644 index 0000000..12d40f4 --- /dev/null +++ b/docs/4_using_the_tool_output.md @@ -0,0 +1,84 @@ +# Using SigProfilerAssignment - Output # + + +---------- + + +This section describes the output files of SigProfilerAssignment. The Assignment_Solution directory contains files organized into the three subdirectories: `Activities`, `Signatures`, and `Solution_Stats`. + + +---------- + +## Output Overview ## +In the screenshot below, there are 3 subdirectories `Activities`, `Signatures`, and `Solution_Stats`. The files examined below are from the SigProfilerAssignment results using as input a mutational matrix derived from 21 breast cancer samples from [Nik-Zainal et al. 2012 Cell][1]. + +![SPA_output](assets/images/SPA_output.png) + +The files in their respective directories are listed below: + + - Activities + - Assignment_Solution_Activities.txt + - Assignment_Solution_Activity_Plots.pdf + - Assignment_Solution_TMB_plot.pdf + - Decomposed_Mutation_Probabilities.txt + - Signatures + - Assignment_Solution_Signatures.txt + - SBS_96_plots_Assignment_Solution.pdf + - Solution_Stats + - Assignment_Solution_Samples_Stats.txt + - Assignment_Solution_Signature_Assignment_log.txt + +## Activities Directory ## + +### Assignment_Solution_Activities.txt ### + +The `Assignment_Solution_Activities.txt` file contains the activity matrix for the selected signatures. The first column lists all of the samples and the second and the following columns list the calculated activity value for the respective signatures. The number of columns is the number of signatures identified. + +Below is a screenshot of the first few rows and columns of a sample file, `Assignment_Solution_Activities.txt`. 
+![Assignment_Solution_Activities](assets/images/Matrix_Activities/Assignment_Solution_Activities.png) + +### Assignment_Solution_Activity_Plots.pdf ### +The `Assignment_Solution_Activity_Plots.pdf` plot shows the number of mutations in each signature on the y-axis and the sample name on the x-axis. The colors indicate which signature had the mutations and which signatures were found in each sample. + +![Assignment_Solution_Activity_Plots](assets/images/Matrix_Activities/Assignment_Solution_Activity_Plots.png) + +### Assignment_Solution_TMB_plot.pdf ### +The `Assignment_Solution_TMB_plot.pdf` file contains a tumor mutational burden plot. The y-axis is the somatic mutations per megabase and the x-axis is the number of samples plotted over the number of samples included. The column names are the mutational signatures and the plot is ordered by the median somatic mutations per megabase. + +![Assignment_Solution_TMB_plot](assets/images/Matrix_Activities/Assignment_Solution_TMB_plot.png) + +### Decomposed_Mutation_Probabilities.txt ### +The `Decomposed_Mutation_Probabilities.txt` file includes the probabilities of each mutation type (in this particular example, a total of 96 mutation types) in each sample. The first column lists all the samples, the second column lists all the mutation types, and the following columns list the calculated probability value for the respective signatures. + +![Decomposed_Mutation_Probabilities](assets/images/Matrix_Activities/Decomposed_Mutation_Probabilities.png) + +## Signatures Directory ## +### Assignment_Solution_Signatures.txt ### +The `Assignment_Solution_Signatures.txt` file contains the distribution of mutation types in the input mutational signatures. The first column lists all of the mutation types. There are 96 possible mutations that are considered for the SBS-96 context. The following columns are the signatures. 
Only the first few rows and columns are shown in the image below; however, the sum of each column is 1, and each value in a column indicates the proportion of a mutational context in the signature. + +![Assignment_Solution_Signatures](assets/images/Matrix_Signatures/Assignment_Solution_Signatures.png) + +### SBS_96_plots_Assignment_Solution.pdf ### +The `SBS_96_plots_Assignment_Solution.pdf` file has a plot for each signature identified, depicting the proportion of the mutation types for that signature. + +The example below shows the plots generated for the first two signatures (SBS1 and SBS2) identified in the input samples. The top right corner also lists the total number of mutations and the percentage of total mutations assigned to the mutational signature. + +![SBS_96_plots_Assignment_Solution](assets/images/Matrix_Signatures/SBS_96_plots_Assignment_Solution.png) + +## Solution_Stats Directory ## +### Assignment_Solution_Samples_Stats.txt ### +The `Assignment_Solution_Samples_Stats.txt` file contains the statistics for each sample including the total number of mutations, cosine similarity, L1 norm (calculated as the sum of the absolute values of the vector), L1 norm percentage, L2 norm (calculated as the square root of the sum of the squared vector values), and L2 norm percentage, along with the KL divergence. + +Below is an example of an `Assignment_Solution_Samples_Stats.txt` file. + +![Assignment_Solution_Samples_Stats](assets/images/Solution_Stats/Assignment_Solution_Samples_Stats.png) + +### Assignment_Solution_Signature_Assignment_log.txt ### +The `Assignment_Solution_Signature_Assignment_log.txt` file records the events that occur when known signatures are assigned to an input sample. The information includes the L2 error and cosine similarity between the reconstructed and original sample within different composition steps. + +Below is an example of the start of the log file. 
+ +![Assignment_Solution_Signature_Assignment_log](assets/images/Solution_Stats/Assignment_Solution_Signature_Assignment_log.png) + + + [1]: https://doi.org/10.1016/j.cell.2012.04.024 diff --git a/docs/5_advanced_mode.md b/docs/5_advanced_mode.md new file mode 100644 index 0000000..c1d355e --- /dev/null +++ b/docs/5_advanced_mode.md @@ -0,0 +1,74 @@ +Advanced Mode Examples +===================== + + +---------- + + +This section provides examples for advanced users to get started with using SigProfilerAssignment for downstream analysis of *de novo* extracted mutational signatures, including: + +1. Assignment of *de novo* extracted mutational signatures using `denovo_fit`. +2. Decomposition of *de novo* extracted mutational signatures using a known set of signatures (reference [COSMIC][3] signatures or customized signature databases) by `decompose_fit`. + +---------- + + + +## `denovo_fit` ## +Attributes the somatic mutations of a given sample/s to a set of input *de novo* signatures. + +Two input files are required. First, a file containing the input somatic mutations, in any of the formats specified in the [Using the Tool - Input][1] section. Also, a matrix containing the *de novo* extracted mutational signatures, commonly derived from [SigProfilerExtractor][2]. + +**Note**: A reference genome build should also be specified if a mutation calling file is used as input. 
+ + +``` python +$ python +>>> from SigProfilerAssignment import Analyzer as Analyze +>>> Analyze.denovo_fit(samples="path/to/input/mutations/file", + output="path/to/output/folder", + input_type="desired/input/mutation/file/format", + signatures="path/to/input/denovo/signatures/file", + genome_build="GRCh37") +``` + + +You can also run the SigProfilerAssignment `denovo_fit` function from the command line: + +``` bash +$ SigProfilerAssignment denovo_fit "path/to/input/mutations/file" "path/to/output/folder" --input_type "desired/input/mutation/file/format" --signatures "path/to/input/denovo/signatures/file" --genome_build "GRCh37" + +``` + +## `decompose_fit` ## +Decomposes a set of *de novo* extracted mutational signatures into a known set of signatures (reference COSMIC signatures or customized signature databases) and assigns these known signatures to a given sample/s. + +Two input files are required. First, a file containing the input somatic mutations, in any of the formats specified in the [Using the Tool - Input][1] section. Also, a matrix containing the *de novo* extracted mutational signatures, commonly derived from [SigProfilerExtractor][2]. + +An optional third input file is needed in case a custom reference signature database is used. By default, reference [COSMIC][3] signatures v3.5 are used for decomposing the set of *de novo* extracted signatures. + +**Note**: A reference genome build should also be specified if a mutation calling file is used as input. 
+ +``` python +$ python +>>> from SigProfilerAssignment import Analyzer as Analyze +>>> Analyze.decompose_fit(samples="path/to/input/mutations/file", + output="path/to/output/folder", + input_type="desired/input/mutation/file/format", + signatures="path/to/input/denovo/signatures/file", + signature_database="path/to/optional/reference/signatures/database/file", + genome_build="GRCh37") +``` + + +You can also run the SigProfilerAssignment `decompose_fit` function from the command line: + +``` bash +$ SigProfilerAssignment decompose_fit "path/to/input/mutations/file" "path/to/output/folder" --input_type "desired/input/mutation/file/format" --signatures "path/to/input/denovo/signatures/file" --signature_database "path/to/optional/reference/signatures/database/file" --genome_build "GRCh37" + +``` + + + [1]: https://sigprofilersuite.github.io/SigProfilerAssignment/3_using_the_tool_input.html + [2]: https://sigprofilersuite.github.io/SigProfilerExtractor + [3]: https://cancer.sanger.ac.uk/signatures/ diff --git a/docs/assets/images/Matrix_Activities/Assignment_Solution_Activities.png b/docs/assets/images/Matrix_Activities/Assignment_Solution_Activities.png new file mode 100755 index 0000000..97bb90f Binary files /dev/null and b/docs/assets/images/Matrix_Activities/Assignment_Solution_Activities.png differ diff --git a/docs/assets/images/Matrix_Activities/Assignment_Solution_Activity_Plots.png b/docs/assets/images/Matrix_Activities/Assignment_Solution_Activity_Plots.png new file mode 100755 index 0000000..9da67f1 Binary files /dev/null and b/docs/assets/images/Matrix_Activities/Assignment_Solution_Activity_Plots.png differ diff --git a/docs/assets/images/Matrix_Activities/Assignment_Solution_TMB_plot.png b/docs/assets/images/Matrix_Activities/Assignment_Solution_TMB_plot.png new file mode 100755 index 0000000..e5ab3fa Binary files /dev/null and b/docs/assets/images/Matrix_Activities/Assignment_Solution_TMB_plot.png differ diff --git 
a/docs/assets/images/Matrix_Activities/Decomposed_Mutation_Probabilities.png b/docs/assets/images/Matrix_Activities/Decomposed_Mutation_Probabilities.png new file mode 100755 index 0000000..78d510a Binary files /dev/null and b/docs/assets/images/Matrix_Activities/Decomposed_Mutation_Probabilities.png differ diff --git a/docs/assets/images/Matrix_Signatures/Assignment_Solution_Signatures.png b/docs/assets/images/Matrix_Signatures/Assignment_Solution_Signatures.png new file mode 100755 index 0000000..4806cb2 Binary files /dev/null and b/docs/assets/images/Matrix_Signatures/Assignment_Solution_Signatures.png differ diff --git a/docs/assets/images/Matrix_Signatures/SBS_96_plots_Assignment_Solution.png b/docs/assets/images/Matrix_Signatures/SBS_96_plots_Assignment_Solution.png new file mode 100755 index 0000000..0d4c0b1 Binary files /dev/null and b/docs/assets/images/Matrix_Signatures/SBS_96_plots_Assignment_Solution.png differ diff --git a/docs/assets/images/SPA_output.png b/docs/assets/images/SPA_output.png new file mode 100755 index 0000000..58a2bd5 Binary files /dev/null and b/docs/assets/images/SPA_output.png differ diff --git a/SigProfilerAssignment/figures/SigProfilerAssignment.png b/docs/assets/images/SigProfilerAssignment.png old mode 100644 new mode 100755 similarity index 100% rename from SigProfilerAssignment/figures/SigProfilerAssignment.png rename to docs/assets/images/SigProfilerAssignment.png diff --git a/docs/assets/images/Solution_Stats/Assignment_Solution_Samples_Stats.png b/docs/assets/images/Solution_Stats/Assignment_Solution_Samples_Stats.png new file mode 100755 index 0000000..fffbe71 Binary files /dev/null and b/docs/assets/images/Solution_Stats/Assignment_Solution_Samples_Stats.png differ diff --git a/docs/assets/images/Solution_Stats/Assignment_Solution_Signature_Assignment_log.png b/docs/assets/images/Solution_Stats/Assignment_Solution_Signature_Assignment_log.png new file mode 100755 index 0000000..a4068ff Binary files /dev/null and 
b/docs/assets/images/Solution_Stats/Assignment_Solution_Signature_Assignment_log.png differ diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css new file mode 100644 index 0000000..c2e11d9 --- /dev/null +++ b/docs/assets/stylesheets/extra.css @@ -0,0 +1,30 @@ +:root { + --md-text-font: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, + Helvetica, Arial, "Apple Color Emoji", "Segoe UI Emoji"; +} + +.md-main__inner { + max-width: 80%; +} + +.md-grid { + max-width: 80%; +} + + +/* Improve readability of long tables from OSF wiki. */ +.md-typeset table:not([class]) { + display: block; + overflow-x: auto; + white-space: nowrap; +} + +/* Make big OSF screenshots feel less cramped. */ +.md-typeset img { + border-radius: 8px; +} + +p { + text-align: justify +} + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..4d7f8c0 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,40 @@ +# SigProfilerAssignment + +![Logo](assets/images/SigProfilerAssignment.png) + +---------- + +**SigProfilerAssignment** is a Python framework that assigns and decomposes mutational signatures in individual samples and individual somatic mutations. + +Refitting of mutational signatures is a numerical optimization approach that not only identifies the set of operative mutational signatures in a particular sample, but also quantifies the number of mutations attributed to each signature detected in that sample. In addition to refitting reference signatures, the tool has an advanced mode that can assign _de novo_ extracted signatures and decompose them into reference signatures, facilitating biological interpretation and comparison with established catalogs. + +It supports multiple mutation contexts and genome builds, provides confidence estimation and signature activity thresholds, and generates detailed visual and tabular outputs to aid downstream analyses. 
The tool refits different types of known reference mutational signatures, including COSMIC [SBS][1], [DBS][2], [ID][3], and [CN][4] signatures, as well as custom signature databases. + +**SigProfilerAssignment** makes use of [SigProfilerMatrixGenerator][5] and [SigProfilerPlotting][6], enabling seamless integration with other tools in the SigProfiler suite. + +The SigProfilerAssignment library is available on [GitHub](https://github.com/SigProfilerSuite/SigProfilerAssignment) and [PyPI](https://pypi.org/project/SigProfilerAssignment) + + +---------- + +### Citation + +Díaz-Gay M, Vangara R, Barnes M, Wang X, Islam SMA, Vermes I, Duke S, Narasimman NB, Yang T, Jiang Z, Moody S, Senkin S, Brennan P, Stratton MR, Alexandrov LB. Assigning mutational signatures to individual samples and individual somatic mutations with SigProfilerAssignment. *Bioinformatics*. 2023;39(12):btad756. [https://doi.org/10.1093/bioinformatics/btad756](https://doi.org/10.1093/bioinformatics/btad756) + +### License + +This software and its documentation are part of the SigProfiler project and are copyrighted © 2022. The SigProfilerAssignment framework is free software and is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +### Contact + +For questions, support requests, or bug reports, please contact the SigProfilerSuite team via GitHub [issues](https://github.com/SigProfilerSuite/SigProfilerAssignment/issues) or by email at [contact@sigprofilersuite.org](mailto:contact@sigprofilersuite.org). 
+ + [1]: https://cancer.sanger.ac.uk/cosmic/signatures/SBS/ + [2]: https://cancer.sanger.ac.uk/signatures/dbs/ + [3]: https://cancer.sanger.ac.uk/signatures/id/ + [4]: https://cancer.sanger.ac.uk/signatures/cn/ + [5]: https://osf.io/s93d5/ + [6]: https://osf.io/2aj6t/ + [7]: https://cancer.sanger.ac.uk/signatures/tools/ + [8]: https://github.com/SigProfilerSuite/SigProfilerAssignment/ + [9]: https://github.com/SigProfilerSuite/SigProfilerAssignmentR/ diff --git a/install_genome.py b/install_genome.py deleted file mode 100644 index 1124b4a..0000000 --- a/install_genome.py +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env python3 -import sys -from SigProfilerMatrixGenerator import install as genInstall - - -def install_ref(ref_path): - genInstall.install("GRCh37", offline_files_path=ref_path) - - -if __name__ == "__main__": - ref_path = sys.argv[1] - install_ref(ref_path) diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..d8e1118 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,60 @@ +site_name: SigProfilerAssignment +repo_name: SigProfilerSuite/SigProfilerAssignment +repo_url: https://github.com/SigProfilerSuite/SigProfilerAssignment + +docs_dir: docs +site_url: https://sigprofilersuite.github.io/SigProfilerAssignment/ + +use_directory_urls: false + +theme: + name: material + features: + - navigation.instant + - navigation.tracking + - navigation.tabs + - navigation.sections + - navigation.top + - toc.follow + - content.code.copy + palette: + - scheme: default + primary: indigo + accent: indigo + - scheme: slate + primary: indigo + accent: indigo + toggle: + icon: material/weather-night + name: Switch to dark mode + +nav: + - Home: index.md + - Getting Started: + - '1. Installation': '1_installation.md' + - '2. Quick Start Example': '2_quick_start_example.md' + - Using The Tool: + - '3. Using the Tool - Input': '3_using_the_tool_input.md' + - '4. Using the Tool - Output': '4_using_the_tool_output.md' + - Advanced mode: + - '5. 
Advanced mode': '5_advanced_mode.md' + +plugins: + - search + +markdown_extensions: + - toc: + permalink: true + - attr_list + - md_in_html + - admonition + - pymdownx.superfences + - pymdownx.details + - pymdownx.tabbed: + alternate_style: true + +extra_css: + - assets/stylesheets/extra.css + +hooks: + - mkdocs_hooks.py diff --git a/mkdocs_hooks.py b/mkdocs_hooks.py new file mode 100644 index 0000000..cb3f3bc --- /dev/null +++ b/mkdocs_hooks.py @@ -0,0 +1,14 @@ +import re + + +_OSF_TOC_RE = re.compile(r"^\s*@\[toc\]\([^)]+\)\s*$", re.IGNORECASE) + + +def on_page_markdown(markdown: str, /, *, page, config, files): + # OSF wiki pages contain a non-standard marker like `@[toc](Sections)` which MkDocs + # otherwise renders as a broken relative link. We keep the source files unchanged + # (to match OSF) and strip the marker at build time. + lines = markdown.splitlines() + filtered = [line for line in lines if not _OSF_TOC_RE.match(line)] + return "\n".join(filtered) + ("\n" if markdown.endswith("\n") else "") + diff --git a/requirements.txt b/requirements.txt index 2d3e928..34ada41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -51,7 +51,7 @@ patsy==1.0.1 # via statsmodels pdf2image==1.17.0 # via SigProfilerAssignment (setup.py) -pillow==11.3.0 +pillow==12.1.1 # via # matplotlib # pdf2image diff --git a/setup.py b/setup.py index 6c360e2..fe7e98b 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ if os.path.exists("dist"): shutil.rmtree("dist") -VERSION = "1.1.3" +VERSION = "1.1.4" def write_version_py(filename="SigProfilerAssignment/version.py"): diff --git a/tests/test_cli.py b/tests/test_cli.py index d70a57d..9432033 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,6 +3,7 @@ from SigProfilerAssignment.controllers.cli_controller import ( parse_arguments_common, str2bool, + str2list ) @@ -44,6 +45,8 @@ def test_argument_parsing(): "True", "--sample_reconstruction_plots", "png", + "--exclude_signature_subgroups", + 
"MMR_deficiency_signatures,POL_deficiency_signatures,HR_deficiency_signatures,BER_deficiency_signatures,Chemotherapy_signatures,Immunosuppressants_signatures,Treatment_signatures,APOBEC_signatures,Tobacco_signatures,UV_signatures,AA_signatures,Colibactin_signatures,Artifact_signatures,Lymphoid_signatures" ], "Test argument parsing", ) @@ -59,6 +62,7 @@ def test_argument_parsing(): assert args.export_probabilities_per_mutation == True assert args.exome == True assert args.sample_reconstruction_plots == "png" + assert args.exclude_signature_subgroups == ['MMR_deficiency_signatures', 'POL_deficiency_signatures', 'HR_deficiency_signatures' , 'BER_deficiency_signatures', 'Chemotherapy_signatures', 'Immunosuppressants_signatures', 'Treatment_signatures', 'APOBEC_signatures', 'Tobacco_signatures', 'UV_signatures', 'AA_signatures', 'Colibactin_signatures', 'Artifact_signatures', 'Lymphoid_signatures'] def test_boolean_conversion(): @@ -75,6 +79,10 @@ def test_boolean_conversion(): with pytest.raises(argparse.ArgumentTypeError): str2bool("maybe") +def test_str2list(): + assert str2list("arg1,arg2,arg3") == ["arg1", "arg2","arg3"] + assert str2list("arg_unique") == ["arg_unique"] + assert str2list("wrong.sepparator") == ["wrong.sepparator"] if __name__ == "__main__": pytest.main()