Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions extras/assign_gtdb_taxonomy_to_gtdb_genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@
# Read GTDB metadata
logging.info("Parsing GTDB metadata ..")
gtdb = pl.concat([
pl.read_csv(args.gtdb_bacterial_metadata, separator="\t", infer_schema_length=10000000),
pl.read_csv(args.gtdb_archaeal_metadata, separator="\t", infer_schema_length=10000000)
pl.read_csv(args.gtdb_bacterial_metadata, separator="\t", columns=["accession", "gtdb_taxonomy"]),
pl.read_csv(args.gtdb_archaeal_metadata, separator="\t", columns=["accession", "gtdb_taxonomy"])
])
gtdb_id_to_taxonomy = {}
for row in gtdb.iter_rows(named=True):
Expand Down
45 changes: 40 additions & 5 deletions extras/update_metapackage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,48 @@ Download latest Uniprot swissprot annotations
wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz

# Running sprot_to_fasta_and_taxonomy.py:
python sprot_to_fasta_and_taxonomy.py -s uniprot_sprot.dat.gz -f uniprot_sprot.fa -t uniprot_sprot_taxonomy.tsv
pixi run -e update-metapackage \
python sprot_to_fasta_and_taxonomy.py -s uniprot_sprot.dat.gz -f uniprot_sprot.fa -t uniprot_sprot_taxonomy.tsv

# Output
# Viruses 17451
# Eukaryota 198986
# Bacteria 336739
# Archaea 19794
# Viruses 17497
# Eukaryota 200289
# Bacteria 336999
# Archaea 19842
```

## Rename preliminary taxonomy files

```python
# When taxonomy files have genome names like: G001261915 instead of RS_GCF_015356595.1
import polars as pl
import os

genome_files = (
pl.DataFrame({"filename": [f for r, d, files in os.walk("protein_faa_reps") for f in files if f.endswith(".faa.gz")]})
.select(
genome_id = pl.col("filename").str.replace(r"_protein\.faa\.gz$", ""),
g_id = pl.col("filename").str.replace(r"\.\d+_protein\.faa\.gz$", "").str.replace(r"^[RG][SB]_GC[FA]_", "G"),
)
)

fixed_ar = (
pl.read_csv("gtdb_r232_ar53_draft_taxonomy.tsv", separator="\t", has_header=False, new_columns=["g_id", "taxonomy"])
.join(genome_files, on="g_id", how="left")
.select("genome_id", "taxonomy")
)

fixed_ar.filter(pl.col("genome_id").is_null())
fixed_ar.write_csv("ar53_taxonomy_r232.tsv", separator="\t", include_header=False)

fixed_bac = (
pl.read_csv("gtdb_r232_bac120_draft_taxonomy.tsv", separator="\t", has_header=False, new_columns=["g_id", "taxonomy"])
.join(genome_files, on="g_id", how="left")
.select("genome_id", "taxonomy")
)

fixed_bac.filter(pl.col("genome_id").is_null())
fixed_bac.write_csv("bac120_taxonomy_r232.tsv", separator="\t", include_header=False)
```

## SRA test run comparison
Expand Down
49 changes: 11 additions & 38 deletions extras/update_metapackage/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,14 @@
"""
Steps:

pixi run -e update-metapackage snakemake --profile aqua --configfile config-R226.yaml

NOTE! This file is being migrated to pixi, but we aren't there yet, some rules haven't been migrated yet.
pixi run -e update-metapackage snakemake --profile aqua --configfile config-R232.yaml
"""

import pandas as pd
import polars as pl
import os

configfile: "config.yaml"

hmms_and_names = pd.read_csv("hmms_and_names", sep="\t").set_index("name", drop=False)
hmms_and_names = pl.read_csv("hmms_and_names", separator="\t")
singlem_packages = hmms_and_names.get_column("name").to_list()
singlem_bin = "pixi run --manifest-path ../../pixi.toml ../../singlem/main.py"

output_dir = config["output_dir"]
Expand Down Expand Up @@ -88,10 +85,8 @@ rule zenodo_backpack:
output:
zenodo_backpack=zenodo_backpack,
done=touch(output_dir + "/zenodo_backpack.done")
conda:
"envs/zenodo_backpack.yml"
shell:
"zenodo_backpack create --input-directory {input.metapackage} --output-file {output.zenodo_backpack} --data-version {zenodo_backpack_version}"
"pixi run -e zenodo-backpack zenodo_backpack create --input-directory {input.metapackage} --output-file {output.zenodo_backpack} --data-version {zenodo_backpack_version}"

####################
### HMM searches ###
Expand Down Expand Up @@ -275,7 +270,7 @@ rule concatenate_seqs_and_taxonomies:
# Rule not part of the main workflow here.
rule all_concatenate_seqs_and_taxonomies:
input:
expand(output_dir + "/concatenate_seqs_and_taxonomies.{spkg}.done", spkg = hmms_and_names.index)
expand(output_dir + "/concatenate_seqs_and_taxonomies.{spkg}.done", spkg = singlem_packages)
output:
touch(output_dir + "/concatenate_seqs_and_taxonomies.done")

Expand All @@ -292,15 +287,13 @@ rule create_SingleM_packages:
spkg = config["metapackage"] + "/{spkg}.spkg",
spkg_seq = output_dir + "/hmmseq_concat/{spkg}.faa",
spkg_tax = output_dir + "/hmmseq_concat/{spkg}_taxonomy.tsv",
spkg_name = lambda wildcards: hmms_and_names.loc[wildcards.spkg, "name_without_number"]
spkg_name = lambda wildcards: hmms_and_names.filter(pl.col("name") == wildcards.spkg).get_column("name_without_number").item(),
threads: 1
resources:
mem_mb = get_mem_mb,
runtime = 48*60,
log:
logs_dir + "/packages/{spkg}.log"
conda:
"../../singlem.yml"
shell:
"{params.singlem} regenerate "
"--input-singlem-package {params.spkg} "
Expand All @@ -317,7 +310,7 @@ rule create_SingleM_packages:
############################
rule create_draft_SingleM_metapackage:
input:
packages = expand(output_dir + "/packages/{spkg}.spkg", spkg = hmms_and_names.index)
packages = expand(output_dir + "/packages/{spkg}.spkg", spkg = singlem_packages)
output:
directory(output_dir + "/draft_metapackage.smpkg")
params:
Expand All @@ -329,8 +322,6 @@ rule create_draft_SingleM_metapackage:
runtime = 48*60,
log:
logs_dir + "/draft_metapackage.log"
conda:
"../../singlem.yml"
shell:
"{params.singlem} metapackage "
"--singlem-packages {input.packages} "
Expand Down Expand Up @@ -371,8 +362,6 @@ rule SingleM_transcripts:
params:
singlem = singlem_bin,
input_files = output_dir + "/transcript_lists/{n}.txt"
conda:
"../../singlem.yml"
log:
logs_dir + "/transcripts/{n}.log"
shell:
Expand Down Expand Up @@ -402,12 +391,10 @@ rule assign_taxonomy:
bac_metadata = config["gtdb_bac_metadata"],
arc_metadata = config["gtdb_arc_metadata"],
dir = output_dir + "/transcripts",
conda:
"../../singlem.yml"
log:
logs_dir + "/taxonomy.log"
shell:
"../assign_gtdb_taxonomy_to_gtdb_genomes.py "
"pixi run -e singlem ../assign_gtdb_taxonomy_to_gtdb_genomes.py "
"--otu-table-list <( find {params.dir} -name '*.otu_table.tsv' ) "
"--gtdb-bac {params.bac_metadata} "
"--gtdb-arc {params.arc_metadata} "
Expand All @@ -428,8 +415,6 @@ rule make_sdb:
singlem = singlem_bin
log:
logs_dir + "/taxonomy/makedb.log"
conda:
"../../singlem.yml"
shell:
"{params.singlem} makedb "
"--otu-table {input} "
Expand All @@ -439,7 +424,7 @@ rule make_sdb:

rule create_SingleM_metapackage:
input:
packages = expand(output_dir + "/packages/{spkg}.spkg", spkg = hmms_and_names.index),
packages = expand(output_dir + "/packages/{spkg}.spkg", spkg = singlem_packages),
sdb = output_dir + "/taxonomy/transcripts.sdb",
genome_sizes = output_dir + "/gtdb_mean_genome_sizes.tsv"
output:
Expand All @@ -453,8 +438,6 @@ rule create_SingleM_metapackage:
runtime = 48*60,
log:
logs_dir + "/metapackage.log"
conda:
"../../singlem.yml"
shell:
"{params.singlem} metapackage "
"--singlem-packages {input.packages} "
Expand Down Expand Up @@ -498,11 +481,9 @@ rule calculate_mean_genome_size:
runtime = '1h',
log:
logs_dir + "/calculate_mean_genome_size.log"
conda:
"envs/calculate_mean_genome_size.yml"
shell:
"""
scripts/genome_size_from_gtdb.py {params.checkm2_grep_arg} --gtdb-bac-metadata {params.gtdb_bac_metadata} --gtdb-ar-metadata {params.gtdb_arc_metadata} > {output} 2> {log}
pixi run -e calculate-mean-genome-size scripts/genome_size_from_gtdb.py {params.checkm2_grep_arg} --gtdb-bac-metadata {params.gtdb_bac_metadata} --gtdb-ar-metadata {params.gtdb_arc_metadata} > {output} 2> {log}
"""

###########################
Expand All @@ -526,8 +507,6 @@ rule test_metapackage:
runtime = 48*60,
log:
logs_dir + "/sra/{sra}_new_test.log"
conda:
"../../singlem.yml"
shell:
"{params.singlem} pipe "
"--forward {input.sra_1} "
Expand Down Expand Up @@ -558,8 +537,6 @@ rule test_old_metapackage:
runtime = 48*60,
log:
logs_dir + "/sra/{sra}_old_test.log"
conda:
"../../singlem.yml"
shell:
"{params.singlem} pipe "
"--forward {input.sra_1} "
Expand All @@ -581,8 +558,6 @@ rule sra_fractions_assigned_to_each_level:
resources:
mem_mb = get_mem_mb,
runtime = 48*60,
conda:
"../../singlem.yml"
shell:
"{singlem_bin} summarise "
"--input-taxonomic-profile {input.profile} "
Expand All @@ -601,8 +576,6 @@ rule sra_prokaryotic_fraction:
runtime = 48*60,
params:
metapackge = lambda wildcards: config["metapackage"] if wildcards.version == "old" else output_dir + "/metapackage/" + config["new_metapackage"],
conda:
"../../singlem.yml"
log:
logs_dir + "/sra/{sra}_{version}_smf.log"
shell:
Expand Down
27 changes: 27 additions & 0 deletions extras/update_metapackage/config-R232.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
max_threads: 32
compressed_genome_data: true
gtdb_protein_fna_reps: /work/microbiome/msingle/mess/214_R232_renew/protein_fna_reps
output_dir: /work/microbiome/msingle/mess/214_R232_renew/update_metapackage_final_taxonomy
metapackage: /work/microbiome/db/singlem/S5.4.0.GTDB_r226.metapackage_20250331.smpkg.zb/payload_directory
new_metapackage: S6.5.0.GTDB_r232.metapackage_20260319.smpkg
new_version_number: 6.5.0
gtdb_protein_faa_reps: /work/microbiome/msingle/mess/214_R232_renew/protein_faa_reps
gtdb_bac_tax: /work/microbiome/msingle/mess/214_R232_renew/bac120_taxonomy_r232.tsv
gtdb_arc_tax: /work/microbiome/msingle/mess/214_R232_renew/ar53_taxonomy_r232.tsv
pfams: /work/microbiome/msingle/sam/HMMs/Pfam
tigrfams: /work/microbiome/msingle/sam/HMMs/TIGRfams.HMM
uniprot_seq: /work/microbiome/msingle/mess/214_R232_renew/uniprot_sprot.fa
uniprot_tax: /work/microbiome/msingle/mess/214_R232_renew/uniprot_sprot_taxonomy.tsv
sra_seqs_1:
DRR083195: /work/microbiome/msingle/sam/SRA/DRR083195_1.fastq
SRR1926147: /work/microbiome/msingle/sam/SRA/SRR1926147_1.fastq
SRR9224014: /work/microbiome/msingle/sam/SRA/SRR9224014_1.fastq
sra_seqs_2:
DRR083195: /work/microbiome/msingle/sam/SRA/DRR083195_2.fastq
SRR1926147: /work/microbiome/msingle/sam/SRA/SRR1926147_2.fastq
SRR9224014: /work/microbiome/msingle/sam/SRA/SRR9224014_2.fastq
gtdb_arc_metadata: /work/microbiome/msingle/mess/214_R232_renew/gtdb_r232_metadata.updated_reps-draft-final_taxonomy.tsv
gtdb_bac_metadata: /work/microbiome/msingle/mess/214_R232_renew/gtdb_r232_metadata.updated_reps-draft-final_taxonomy.tsv
# These 2 constants must jive with those in Metapackage
taxonomy_database_name: Genome Taxonomy Database (GTDB)
taxonomy_database_version: R232
Loading
Loading