Skip to content

Negative junction_length for reverse_complement sequence #3

@Zjianglin

Description

@Zjianglin

Hi developer, nice tool!

I have encountered a confused question about 'is_reverse_complement': True sequnce, whose junction_length attribute is negative. Is this a bug? (caculated from junction_end - junction_start). I think for later typically analysis procedures, suchsequene shoule be transformed to its 'reverse_complement' status to keep the 5'->3' direction. Howevere, the groudtruth of the coordinates for domains (such as v_region (v_sequence_start, v_sequence_end), 'junction_region'(junction_start, junction_end)) seems to be numbered according to raw sequence, how these coordinates shoule be properlly processed?

My procedure as below:

result = Experiment.on("human_tcrb").sequence( with_reverse_complement(0.3)).mutate(rate(0.02, 0.08))

for seq in result:
    ...:     if seq['productive'] and seq['is_reverse_complement']:
    ...:         print(seq)
    ...:         break
    ...: 
{'sequence': 'ccagcTctgagagccgggtcccggcgccgaagtactgaatgtttttggctAGcGttCcacagacatacacggccgagtccccctCctctgcaggctggatcttcagagtagagaatgatccatcaggcctctcGgctgagaatcActctCtgggcatacctgcGttatctataatagatcggctccggaagtaaGtcagAaattccaCtccctgcGcgGagCtatctctgtaccagaaaaggaaCttgtggccaaaaattggctcacatctcagagtcactgtttgtcccatttctgtcacctcatActtgggtgactggataatgccagcatc', 'germline_alignment': 'ggtcgtgactctcggcccagggccgcggcttcatgacttacaaaaaccgaNNgNaacgtgtctgtatgtgccggctcagggggacgagacgtccgacctagaagtctcatctcttactaggtagtccggagagtcgactcttagcgagacacccgtatggacgtaatagatattatctagccgaggccttcattgagtcgttaaggtcagggacgtgcttccatagagacatggtcttttcctttaacaccggtttttaaccgagtgtagagtctcagtgacaaacagggtaaagacagtggagtacgaacccactgacctattacggtcgtag', 'v_call': 'TRBV20/OR9-2*01', 'd_call': 'Short-D', 'j_call': 'TRBJ2-6*01,TRBJ2-7*01,TRBJ2-7*02', 'c_call': '', 'v_sequence_start': 54, 'v_sequence_end': 334, 'v_germline_start': 0, 'v_germline_end': 280, 'd_sequence_start': 52, 'd_sequence_end': 53, 'd_germline_start': 0, 'd_germline_end': 1, 'j_sequence_start': 0, 'j_sequence_end': 50, 'j_germline_start': 0, 'j_germline_end': 50, 'junction_start': 60, 'junction_end': 33, 'junction_nt': '', 'junction_aa': '', 'junction_length': -27, 'v_trim_5': 0, 'v_trim_3': 10, 'd_trim_5': 0, 'd_trim_3': 15, 'j_trim_5': 0, 'j_trim_3': 0, 'np1_region': 'G', 'np1_length': 1, 'np2_region': 'AG', 'np2_length': 2, 'mutation_rate': 0.045317, 'n_mutations': 15, 'mutations': '5:t>T,56:c>C,84:c>C,133:t>G,144:c>A,149:c>C,163:t>G,194:g>G,199:g>A,207:c>C,215:t>G,218:t>G,221:c>C,244:t>C,306:c>A', 'n_sequencing_errors': 0, 'sequencing_errors': '', 'n_pcr_errors': 0, 'pcr_errors': '', 'productive': True, 'stop_codon': False, 'vj_in_frame': True, 'note': '', 'is_reverse_complement': True, 'is_contaminant': False, 'sequence_length': 334}

Above sequence has a negative length: 'junction_start': 60, 'junction_end': 33, 'junction_nt': '', 'junction_aa': '', 'junction_length': -27. And this is a productive and vj_in_frame sequence, why its junction_aa and junction_nt is empty?

BTW, I want to use GenAIRR to simulate tens of human TCRB sequences. currently, I used following settings and procedures, is it correct?. Do you have any suggestion for my parameters settings and/or simulation procedures? Thanks.

LOCUS_TO_CONFIG = {
    'IGH': 'HUMAN_IGH_IMGT',
    'IGK': 'HUMAN_IGK_IMGT',
    'IGL': 'HUMAN_IGL_IMGT',
    'TRA': 'HUMAN_TCRA_IMGT',
    'TRB': 'HUMAN_TCRB_IMGT',
}
LOCUS_SIM_PARAMS = {
    'IGH': {'mutation': (0.01, 0.05), 'pcr_cycles': 30, 'five_prime_loss': 24, 'three_prime_loss': 18, 'indel_prob': 0.0025, 'ns_prob': 0.0025},
    'IGK': {'mutation': (0.002, 0.02), 'pcr_cycles': 25, 'five_prime_loss': 20, 'three_prime_loss': 14, 'indel_prob': 0.0015, 'ns_prob': 0.0020},
    'IGL': {'mutation': (0.002, 0.02), 'pcr_cycles': 25, 'five_prime_loss': 20, 'three_prime_loss': 14, 'indel_prob': 0.0015, 'ns_prob': 0.0020},
    'TRA': {'mutation': (0.0, 0.0005), 'pcr_cycles': 25, 'five_prime_loss': 18, 'three_prime_loss': 12, 'indel_prob': 0.0008, 'ns_prob': 0.0015},
    'TRB': {'mutation': (0.0, 0.0005), 'pcr_cycles': 25, 'five_prime_loss': 18, 'three_prime_loss': 12, 'indel_prob': 0.0008, 'ns_prob': 0.0015},
}
QUALITY_PROFILE = {'base': 0.001, 'peak': 0.02}

def _import_genairr():
    from GenAIRR import Experiment
    from GenAIRR.ops import rate, with_3prime_loss, with_5prime_loss, with_indels, with_ns, with_pcr, with_quality_profile, with_reverse_complement
    return {
        'Experiment': Experiment,
        'rate': rate,
        'with_3prime_loss': with_3prime_loss,
        'with_5prime_loss': with_5prime_loss,
        'with_indels': with_indels,
        'with_ns': with_ns,
        'with_pcr': with_pcr,
        'with_quality_profile': with_quality_profile,
        'with_reverse_complement': with_reverse_complement,
    }


def _experiment_for_locus(locus: str, productive: bool, seed: int):
    ga = _import_genairr()
    Experiment = ga['Experiment']
    cfg = LOCUS_TO_CONFIG[locus]
    exp = Experiment.on(cfg)
    params = LOCUS_SIM_PARAMS[locus]
    mutation_low, mutation_high = params['mutation']
    exp = exp.mutate(ga['rate'](mutation_low, mutation_high))
    exp = exp.prepare(ga['with_pcr'](error_rate=1e-4, cycles=params['pcr_cycles']))
    exp = exp.sequence(
        ga['with_5prime_loss'](min_remove=0, max_remove=params['five_prime_loss']),
        ga['with_3prime_loss'](min_remove=0, max_remove=params['three_prime_loss']),
        ga['with_quality_profile'](base=QUALITY_PROFILE['base'], peak=QUALITY_PROFILE['peak']),
        ga['with_reverse_complement'](prob=0.1),
    )
    exp = exp.observe(
        ga['with_indels'](prob=params['indel_prob']),
        ga['with_ns'](prob=params['ns_prob']),
    )
    sim = exp.compile(seed=seed, productive=productive)
    return sim

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions