msk-access · buehlere · Apr 14, 2025
diff --git a/biometrics_extract/0.2.16/biometrics_extract.cwl b/biometrics_extract/0.2.16/biometrics_extract.cwl
@@ -0,0 +1,165 @@
+class: CommandLineTool
+cwlVersion: v1.0
+$namespaces:  # prefixes used by the metadata keys at the end of this file
+  dct: 'http://purl.org/dc/terms/'
+  doap: 'http://usefulinc.com/ns/doap#'
+  foaf: 'http://xmlns.com/foaf/0.1/'
+  sbg: 'https://www.sevenbridges.com/'
+id: biometrics_extract_0_2_16  # kept in step with the 0.2.16 directory
+baseCommand:  # runs `biometrics extract`
+  - biometrics
+  - extract
+inputs:  # every input with an inputBinding uses position 0 and a long-option prefix
+  - id: sample_bam
+    type: File
+    inputBinding:
+      position: 0
+      prefix: '--sample-bam'
+    doc: BAM file.
+    secondaryFiles:
+      - ^.bai  # caret drops one extension first: sample.bam -> sample.bai
+  - id: sample_sex
+    type: string?
+    inputBinding:
+      position: 0
+      prefix: '--sample-sex'
+    doc: Expected sample sex (i.e. M or F).
+  - id: sample_group
+    type: string?
+    inputBinding:
+      position: 0
+      prefix: '--sample-group'
+    doc: The sample group (e.g. the sample patient ID).
+  - id: sample_name
+    type: string  # required here: the pickle output glob is built from sample_name
+    inputBinding:
+      position: 0
+      prefix: '--sample-name'
+    doc: >-
+      Sample name. If not specified, sample name is automatically figured out
+      from the BAM file.
+  - id: fafile
+    type: File
+    inputBinding:
+      position: 0
+      prefix: '--fafile'
+    doc: Path to reference fasta.
+    secondaryFiles:
+      - ^.fasta.fai  # drops one extension, then appends: ref.fasta -> ref.fasta.fai
+  - id: vcf_file
+    type: File
+    inputBinding:
+      position: 0
+      prefix: '--vcf'
+    doc: VCF file containing the SNPs to be queried.
+  - id: bed_file
+    type: File?
+    inputBinding:
+      position: 0
+      prefix: '--bed'
+    doc: BED file containing the intervals to be queried.
+  - id: database
+    type: string?  # a path string, not a Directory; also prefixes the pickle output glob
+    inputBinding:
+      position: 0
+      prefix: '--database'
+    doc: >-
+      Directory to store the intermediate files after running the extraction
+      step.
+  - default: 1
+    id: min_mapping_quality
+    type: int?
+    inputBinding:
+      position: 0
+      prefix: '--min-mapping-quality'
+    doc: Minimum mapping quality of reads to be used for pileup.
+  - default: 1
+    id: min_base_quality
+    type: int?
+    inputBinding:
+      position: 0
+      prefix: '--min-base-quality'
+    doc: Minimum base quality of reads to be used for pileup.
+  - default: 10
+    id: min_coverage
+    type: int?
+    inputBinding:
+      position: 0
+      prefix: '--min-coverage'
+    doc: Minimum coverage to count a site.
+  - default: 0.1
+    id: min_homozygous_thresh
+    type: float?
+    inputBinding:
+      position: 0
+      prefix: '--min-homozygous-thresh'
+    doc: Minimum threshold to define homozygous.
+  - id: default_genotype
+    type: string?
+    inputBinding:
+      position: 0
+      prefix: '--default-genotype'
+    doc: Default genotype if coverage is too low (options are Het or Hom).
+  - id: file_type  # no inputBinding: not passed on the command line, only read by outputEval
+    type: string?
+    doc: >-
+      Specify the type of bam file you are generating the pickle for to be
+      incorporated in pickle file name (Myeloid_1_L001_duplex.pickle)
+outputs:
+  - id: ALL_FPsummary.txt  # NOTE(review): required File - confirm `biometrics extract` writes it
+    type: File
+    outputBinding:
+      glob: ALL_FPsummary.txt
+  - id: biometrics_extract_pickle
+    type: File
+    outputBinding:
+      glob: |-
+        ${
+          // The pickle is looked up under inputs.database when it is set,
+          // otherwise directly in the output directory.
+          var pickle = inputs.sample_name + '.pickle';
+          return inputs.database ? inputs.database + '/' + pickle : pickle;
+        }
+      outputEval: |-
+        ${
+           // file_type, when set, is folded into the pickle's basename, e.g.
+           // <sample_name>_<file_type>.pickle.
+           // Guard on a non-empty match: without it an empty glob result
+           // raises a TypeError on self[0] instead of the runner's own
+           // missing-output error.
+           if (inputs.file_type && self && self.length > 0) {
+             self[0].basename = inputs.sample_name + '_' + inputs.file_type + '.pickle';
+           }
+           return self;
+        }
+requirements:
+  - class: ResourceRequirement
+    ramMin: 24000
+    coresMin: 4
+  - class: DockerRequirement
+    dockerPull: 'ghcr.io/msk-access/biometrics:0.2.16'  # tag matches the 0.2.16 tool directory
+  - class: InlineJavascriptRequirement  # needed by the ${...} expressions under outputs
+'dct:contributor':  # descriptive metadata only; nothing below affects the command line
+  - class: 'foaf:Organization'
+    'foaf:member':
+      - class: 'foaf:Person'
+        'foaf:mbox': 'mailto:murphyc4@mskcc.org'
+        'foaf:name': Charlie Murphy
+      - class: 'foaf:Person'
+        'foaf:mbox': 'mailto:shahr2@mskcc.org'
+        'foaf:name': Ronak Shah
+      - class: 'foaf:Person'
+        'foaf:mbox': 'mailto:charlk@mskcc.org'
+        'foaf:name': Carmelina Charlambous
+    'foaf:name': Memorial Sloan Kettering Cancer Center
+'dct:creator':
+  - class: 'foaf:Organization'
+    'foaf:member':
+      - class: 'foaf:Person'
+        'foaf:mbox': 'mailto:shahr2@mskcc.org'
+        'foaf:name': Ronak Shah
+    'foaf:name': Memorial Sloan Kettering Cancer Center
+'doap:release':
+  - class: 'doap:Version'
+    'doap:name': biometrics
+    'doap:revision': '0.2.16'  # quoted so it always stays a string
diff --git a/biometrics_extract/0.2.16/example_inputs.yaml b/biometrics_extract/0.2.16/example_inputs.yaml
@@ -0,0 +1,24 @@
+# Example job for biometrics_extract.cwl. Scalar inputs are plain values and
+# sample_bam is a single File, matching the types declared by the tool.
+sample_sex: "M"
+sample_name: "test"
+sample_group: "test"
+fafile:
+  class: File
+  path: /path/to/fasta
+sample_bam:
+  class: File
+  path: /path/to/bam
+bed_file: null
+vcf_file:
+  class: File
+  path: /path/to/vcf
+database: null
+# null falls back to the default declared in the tool, where one exists.
+min_mapping_quality: null
+min_base_quality: null
+min_coverage: null
+min_homozygous_thresh: null
+default_genotype: null
+# Optional suffix used when renaming the output pickle.
+file_type: null
diff --git a/docs/biometrics/biometrics_extract_0.2.16.md b/docs/biometrics/biometrics_extract_0.2.16.md
@@ -0,0 +1,76 @@
+# CWL and Dockerfile for running biometrics
+
+## Version of tools in docker image (/container/Dockerfile)
+
+| Tool | Version | Location |
+|--- |--- |--- |
+| biometrics_extract   | 0.2.16  |  <https://github.com/msk-access/biometrics> |
+
+## CWL
+
+- CWL specification 1.0
+- Use example_inputs.yaml to see the inputs to the cwl
+- Example Command using [toil](https://toil.readthedocs.io):
+
+```bash
+    > toil-cwl-runner biometrics_extract.cwl example_inputs.yaml
+```
+
+```bash
+#Using CWLTOOL
+> cwltool --singularity --non-strict /path/to/biometrics_extract.cwl /path/to/example_inputs.yaml
+
+#Using toil-cwl-runner
+> mkdir tool_toil_log
+> toil-cwl-runner --singularity --logFile /path/to/tool_toil_log/cwltoil.log  --jobStore /path/to/tool_jobStore --batchSystem lsf --workDir /path/to/tool_toil_log --outdir . --writeLogs /path/to/tool_toil_log --logLevel DEBUG --stats --retryCount 2 --disableCaching --maxLogFileSize 20000000000 /path/to/biometrics_extract.cwl /path/to/example_inputs.yaml > tool_toil.stdout 2> tool_toil.stderr &
+```
+
+### Usage
+
+```bash
+> toil-cwl-runner biometrics_extract.cwl -h
+usage: biometrics_extract/0.2.16/biometrics_extract.cwl [-h] --sample_bam SAMPLE_BAM
+                                                        [--sample_sex SAMPLE_SEX]
+                                                        [--sample_group SAMPLE_GROUP] --sample_name
+                                                        SAMPLE_NAME --fafile FAFILE --vcf_file VCF_FILE
+                                                        [--bed_file BED_FILE] [--database DATABASE]
+                                                        [--min_mapping_quality MIN_MAPPING_QUALITY]
+                                                        [--min_base_quality MIN_BASE_QUALITY]
+                                                        [--min_coverage MIN_COVERAGE]
+                                                        [--min_homozygous_thresh MIN_HOMOZYGOUS_THRESH]
+                                                        [--default_genotype DEFAULT_GENOTYPE]
+                                                        [--file_type FILE_TYPE]
+                                                        [job_order]
+
+positional arguments:
+  job_order             Job input json file
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --sample_bam SAMPLE_BAM
+                        BAM file.
+  --sample_sex SAMPLE_SEX
+                        Expected sample sex (i.e. M or F).
+  --sample_group SAMPLE_GROUP
+                        The sample group (e.g. the sample patient ID).
+  --sample_name SAMPLE_NAME
+                        Sample name. If not specified, sample name is automatically figured out from the
+                        BAM file.
+  --fafile FAFILE       Path to reference fasta.
+  --vcf_file VCF_FILE   VCF file containing the SNPs to be queried.
+  --bed_file BED_FILE   BED file containing the intervals to be queried.
+  --database DATABASE   Directory to store the intermediate files after running the extraction step.
+  --min_mapping_quality MIN_MAPPING_QUALITY
+                        Minimum mapping quality of reads to be used for pileup.
+  --min_base_quality MIN_BASE_QUALITY
+                        Minimum base quality of reads to be used for pileup.
+  --min_coverage MIN_COVERAGE
+                        Minimum coverage to count a site.
+  --min_homozygous_thresh MIN_HOMOZYGOUS_THRESH
+                        Minimum threshold to define homozygous.
+  --default_genotype DEFAULT_GENOTYPE
+                        Default genotype if coverage is too low (options are Het or Hom).
+  --file_type FILE_TYPE
+                        Specify the type of bam file you are generating the pickle for to be incorporated
+                        in pickle file name (Myeloid_1_L001_duplex.pickle)
+```