Skip to content

Commit

Permalink
Merge pull request #754 from griffithlab/proximal_dnp
Browse files Browse the repository at this point in the history
Handle proximal DNP variants affecting multiple amino acids
  • Loading branch information
susannasiebert authored Jan 18, 2022
2 parents 1e86411 + d75b299 commit 02dd9d4
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 5 deletions.
32 changes: 27 additions & 5 deletions lib/fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,20 +111,42 @@ def add_proximal_variants(self, somatic_variant_index, wildtype_subsequence, mut
codon_changes = [ item['codon_change'] for item in filtered_lines ]
proximal_variant_mutant_amino_acid = ProximalVariant.combine_conflicting_variants(codon_changes)

proximal_variant_position = int(protein_position) - 1 - mutation_offset
if proximal_variant_position <= 0 or proximal_variant_position >= len(wildtype_subsequence):
if '-' in protein_position:
proximal_variant_start_position = int(protein_position.split('-')[0]) - 1 - mutation_offset
else:
proximal_variant_start_position = int(protein_position) - 1 - mutation_offset
proximal_variant_end_position = proximal_variant_start_position + len(proximal_variant_mutant_amino_acid)

if proximal_variant_end_position <= 0 or proximal_variant_start_position >= len(wildtype_subsequence):
continue
if len(proximal_variant_wildtype_amino_acid) != len(proximal_variant_mutant_amino_acid):
print("Nearby variant is not a missense mutation. Skipping.")
continue
if wildtype_subsequence[proximal_variant_position] != proximal_variant_wildtype_amino_acid:

if proximal_variant_end_position > len(wildtype_subsequence):
#the DNP extends past the end of the wildtype_subsequence
missing_amino_acids_count = proximal_variant_end_position - len(wildtype_subsequence)
#remove proximal variant amino acids after wildtype subsquence end
proximal_variant_wildtype_amino_acid = proximal_variant_wildtype_amino_acid[:len(proximal_variant_wildtype_amino_acid) - missing_amino_acids_count]
proximal_variant_mutant_amino_acid = proximal_variant_mutant_amino_acid[:len(proximal_variant_mutant_amino_acid) - missing_amino_acids_count]

if proximal_variant_start_position < 0:
#DNP starts before the beginning of the wildtype subsequence
missing_amino_acids_count = abs(proximal_variant_start_position)
proximal_variant_start_position = 0
#remove proximal variant amino acids before wildtype subsequence start
proximal_variant_wildtype_amino_acid = proximal_variant_wildtype_amino_acid[missing_amino_acids_count:]
proximal_variant_mutant_amino_acid = proximal_variant_mutant_amino_acid[missing_amino_acids_count:]

if wildtype_subsequence[proximal_variant_start_position:proximal_variant_end_position] != proximal_variant_wildtype_amino_acid:
sys.exit(
"Error when processing proximal variant.\n" +
"The wildtype amino acid for variant %s with substring %s is different than expected.\n" % (somatic_variant_index, wildtype_subsequence) +
"Actual wildtype amino acid: %s\n" % wildtype_subsequence[proximal_variant_position] +
"Actual wildtype amino acid: %s\n" % wildtype_subsequence[proximal_variant_start_position:proximal_variant_end_position] +
"Wildtype amino acid of the proximal_variant: %s" % proximal_variant_wildtype_amino_acid
)
wildtype_subsequence_with_proximal_variants = wildtype_subsequence_with_proximal_variants[:proximal_variant_position] + proximal_variant_mutant_amino_acid + wildtype_subsequence_with_proximal_variants[proximal_variant_position+1:]

wildtype_subsequence_with_proximal_variants = wildtype_subsequence_with_proximal_variants[:proximal_variant_start_position] + proximal_variant_mutant_amino_acid + wildtype_subsequence_with_proximal_variants[proximal_variant_end_position:]
return wildtype_subsequence_with_proximal_variants

def execute(self):
Expand Down
11 changes: 11 additions & 0 deletions tests/test_data/fasta_generator/proximal_dnp/input.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
chromosome_name start stop reference variant gene_name transcript_name transcript_support_level amino_acid_change codon_change ensembl_gene_id hgvsc hgvsp wildtype_amino_acid_sequence frameshift_amino_acid_sequence fusion_amino_acid_sequence variant_type protein_position transcript_expression gene_expression normal_depth normal_vaf tdna_depth tdna_vaf trna_depth trna_vaf index protein_length_change
chr16 28593449 28593450 AG GA SULT1A2 ENST00000335715 1 A/V gCT/gTC ENSG00000197165 ENST00000335715.9:c.491_492delinsTC ENSP00000338742.4:p.Ala164Val MELIQDISRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKVPGIPSGMETLKNTPAPRLLKTHLPLALLPQTLLDQKVKVVYVARNAKDVAVSYYHFYHMAKVYPHPGTWESFLEKFMAGEVSYGSWYQHVQEWWELSRTHPVLYLFYEDMKENPKREIQKILEFVGRSLPEETVDLMVEHTSFKEMKKNPMTNYTTVRREFMDHSISPFMRKGMAGDWKTTFTVAQNERFDADYAKKMAGCSLSFRSEL missense 164 NA 0.0534405615378929 NA NA 185 0.136 NA NA 1.SULT1A2.ENST00000335715.missense.164A/V
chr16 28593449 28593450 AG GA SULT1A2 ENST00000395630 5 A/V gCT/gTC ENSG00000197165 ENST00000395630.5:c.491_492delinsTC ENSP00000378992.1:p.Ala164Val MELIQDISRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKVPGIPSGMETLKNTPAPRLLKTHLPLALLPQTLLDQKVKVVYVARNAKDVAVSYYHFYHMAKVYPHPGTWESFLEKFMAGEVSYGSWYQHVQEWWELSRTHPVLYLFYEDMKENPKREIQKILEFVGRSLPEETVDLMVEHTSFKEMKKNPMTNYTTVRREFMDHSISPFMRKGMAGDWKTTFTVAQNERFDADYAKKMAGCSLSFRSEL missense 164 NA 0.0534405615378929 NA NA 185 0.136 NA NA 2.SULT1A2.ENST00000395630.missense.164A/V
chr16 28593449 28593450 AG GA SULT1A2 ENST00000526384 3 A/V gCT/gTC ENSG00000197165 ENST00000526384.1:c.491_492delinsTC ENSP00000435358.1:p.Ala164Val MELIQDISRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKVPGIPSGMETLKNTPAPRLLKTHLPLALLPQTLLDQKVKVVYVARNAKDVAVSYYHFYHMAKVYPHPGTWESFLEKFMAGEVSYGSWYQHVQEWWELSRTHPVLYLF missense 164 NA 0.0534405615378929 NA NA 185 0.136 NA NA 3.SULT1A2.ENST00000526384.missense.164A/V
chr16 28593449 28593450 AG GA SULT1A2 ENST00000534108 2 GW/GR ggCTgg/ggTCgg ENSG00000197165 ENST00000534108.5:c.576_577delinsTC ENSP00000433075.1:p.Trp193Arg MELIQDISRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKVPGIPSGVCVLGARGVEEDRAGASAHQTFPDPLLRDGDSEKHTSPTTPEDTPAPGSAPPDSVGSEGQGGLCCPQRKGCGGFLLPLLPHGQSVPSPWDLGKLPGEVHGWRSVLWVLVPARARVVGAEPHPPCSLPLL missense 192-193 NA 0.0534405615378929 NA NA 185 0.136 NA NA 4.SULT1A2.ENST00000534108.missense.192-193GW/GR
chr16 28593449 28593450 AG GA AC020765.6 ENST00000677940 NA A/V gCT/gTC ENSG00000288656 ENST00000677940.1:c.257_258delinsTC ENSP00000503077.1:p.Ala86Val MLAKLLCDQVVGAPIAVSAFYAGMSILQGKDDIFLDLKQKFWNTYMVVYVARNAKDVAVSYYHFYHMAKVYPHPGTWESFLEKFMAGEVSYGSWYQHVQEWWELSRTHPVLYLFYEDMKENPKREIQKILEFVGRSLPEETVDLMVEHTSFKEMKKNPMTNYTTVRREFMDHSISPFMRKGMAGDWKTTFTVAQNERFDADYAKKMAGC missense 86 NA 0.0 NA NA 185 0.136 NA NA 5.AC020765.6.ENST00000677940.missense.86A/V
chr16 28593472 28593473 T G SULT1A2 ENST00000335715 1 E/D gaA/gaC ENSG00000197165 ENST00000335715.9:c.468A>C ENSP00000338742.4:p.Glu156Asp MELIQDISRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKVPGIPSGMETLKNTPAPRLLKTHLPLALLPQTLLDQKVKVVYVARNAKDVAVSYYHFYHMAKVYPHPGTWESFLEKFMAGEVSYGSWYQHVQEWWELSRTHPVLYLFYEDMKENPKREIQKILEFVGRSLPEETVDLMVEHTSFKEMKKNPMTNYTTVRREFMDHSISPFMRKGMAGDWKTTFTVAQNERFDADYAKKMAGCSLSFRSEL missense 156 NA 0.0534405615378929 NA NA 195 0.121 0 NA 6.SULT1A2.ENST00000335715.missense.156E/D
chr16 28593472 28593473 T G SULT1A2 ENST00000395630 5 E/D gaA/gaC ENSG00000197165 ENST00000395630.5:c.468A>C ENSP00000378992.1:p.Glu156Asp MELIQDISRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKVPGIPSGMETLKNTPAPRLLKTHLPLALLPQTLLDQKVKVVYVARNAKDVAVSYYHFYHMAKVYPHPGTWESFLEKFMAGEVSYGSWYQHVQEWWELSRTHPVLYLFYEDMKENPKREIQKILEFVGRSLPEETVDLMVEHTSFKEMKKNPMTNYTTVRREFMDHSISPFMRKGMAGDWKTTFTVAQNERFDADYAKKMAGCSLSFRSEL missense 156 NA 0.0534405615378929 NA NA 195 0.121 0 NA 7.SULT1A2.ENST00000395630.missense.156E/D
chr16 28593472 28593473 T G SULT1A2 ENST00000526384 3 E/D gaA/gaC ENSG00000197165 ENST00000526384.1:c.468A>C ENSP00000435358.1:p.Glu156Asp MELIQDISRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKVPGIPSGMETLKNTPAPRLLKTHLPLALLPQTLLDQKVKVVYVARNAKDVAVSYYHFYHMAKVYPHPGTWESFLEKFMAGEVSYGSWYQHVQEWWELSRTHPVLYLF missense 156 NA 0.0534405615378929 NA NA 195 0.121 0 NA 8.SULT1A2.ENST00000526384.missense.156E/D
chr16 28593472 28593473 T G SULT1A2 ENST00000534108 2 K/Q Aag/Cag ENSG00000197165 ENST00000534108.5:c.553A>C ENSP00000433075.1:p.Lys185Gln MELIQDISRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKVPGIPSGVCVLGARGVEEDRAGASAHQTFPDPLLRDGDSEKHTSPTTPEDTPAPGSAPPDSVGSEGQGGLCCPQRKGCGGFLLPLLPHGQSVPSPWDLGKLPGEVHGWRSVLWVLVPARARVVGAEPHPPCSLPLL missense 185 NA 0.0534405615378929 NA NA 195 0.121 0 NA 9.SULT1A2.ENST00000534108.missense.185K/Q
chr16 28593472 28593473 T G AC020765.6 ENST00000677940 NA E/D gaA/gaC ENSG00000288656 ENST00000677940.1:c.234A>C ENSP00000503077.1:p.Glu78Asp MLAKLLCDQVVGAPIAVSAFYAGMSILQGKDDIFLDLKQKFWNTYMVVYVARNAKDVAVSYYHFYHMAKVYPHPGTWESFLEKFMAGEVSYGSWYQHVQEWWELSRTHPVLYLFYEDMKENPKREIQKILEFVGRSLPEETVDLMVEHTSFKEMKKNPMTNYTTVRREFMDHSISPFMRKGMAGDWKTTFTVAQNERFDADYAKKMAGC missense 78 NA 0.0 NA NA 195 0.121 0 NA 10.AC020765.6.ENST00000677940.missense.78E/D
16 changes: 16 additions & 0 deletions tests/test_data/fasta_generator/proximal_dnp/output.cut_off.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
>1
SFLEKFMAGEVSYGS
>2
SFLEKFMVGEVSYGS
>3
KLPGEVHGWRSVLWVL
>4
QLPGEVHGRRSVLWVL
>5
YPHPGTWESFLEKFM
>6
YPHPGTWDSFLEKFM
>7
PSPWDLGKLPGEVHG
>8
PSPWDLGQLPGEVHG
28 changes: 28 additions & 0 deletions tests/test_data/fasta_generator/proximal_dnp/output.cut_off.key
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
1:
- WT.1.SULT1A2.ENST00000335715.missense.164A/V
- WT.2.SULT1A2.ENST00000395630.missense.164A/V
- WT.3.SULT1A2.ENST00000526384.missense.164A/V
- WT.5.AC020765.6.ENST00000677940.missense.86A/V
2:
- MT.1.SULT1A2.ENST00000335715.missense.164A/V
- MT.2.SULT1A2.ENST00000395630.missense.164A/V
- MT.3.SULT1A2.ENST00000526384.missense.164A/V
- MT.5.AC020765.6.ENST00000677940.missense.86A/V
3:
- WT.4.SULT1A2.ENST00000534108.missense.192-193GW/GR
4:
- MT.4.SULT1A2.ENST00000534108.missense.192-193GW/GR
5:
- WT.6.SULT1A2.ENST00000335715.missense.156E/D
- WT.7.SULT1A2.ENST00000395630.missense.156E/D
- WT.8.SULT1A2.ENST00000526384.missense.156E/D
- WT.10.AC020765.6.ENST00000677940.missense.78E/D
6:
- MT.6.SULT1A2.ENST00000335715.missense.156E/D
- MT.7.SULT1A2.ENST00000395630.missense.156E/D
- MT.8.SULT1A2.ENST00000526384.missense.156E/D
- MT.10.AC020765.6.ENST00000677940.missense.78E/D
7:
- WT.9.SULT1A2.ENST00000534108.missense.185K/Q
8:
- MT.9.SULT1A2.ENST00000534108.missense.185K/Q
16 changes: 16 additions & 0 deletions tests/test_data/fasta_generator/proximal_dnp/output.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
>1
WESFLEKFMAGEVSYGSWY
>2
WDSFLEKFMVGEVSYGSWY
>3
LGKLPGEVHGWRSVLWVLVP
>4
LGQLPGEVHGRRSVLWVLVP
>5
KVYPHPGTWESFLEKFMAG
>6
KVYPHPGTWDSFLEKFMVG
>7
SVPSPWDLGKLPGEVHGWR
>8
SVPSPWDLGQLPGEVHGRR
28 changes: 28 additions & 0 deletions tests/test_data/fasta_generator/proximal_dnp/output.key
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
1:
- WT.1.SULT1A2.ENST00000335715.missense.164A/V
- WT.2.SULT1A2.ENST00000395630.missense.164A/V
- WT.3.SULT1A2.ENST00000526384.missense.164A/V
- WT.5.AC020765.6.ENST00000677940.missense.86A/V
2:
- MT.1.SULT1A2.ENST00000335715.missense.164A/V
- MT.2.SULT1A2.ENST00000395630.missense.164A/V
- MT.3.SULT1A2.ENST00000526384.missense.164A/V
- MT.5.AC020765.6.ENST00000677940.missense.86A/V
3:
- WT.4.SULT1A2.ENST00000534108.missense.192-193GW/GR
4:
- MT.4.SULT1A2.ENST00000534108.missense.192-193GW/GR
5:
- WT.6.SULT1A2.ENST00000335715.missense.156E/D
- WT.7.SULT1A2.ENST00000395630.missense.156E/D
- WT.8.SULT1A2.ENST00000526384.missense.156E/D
- WT.10.AC020765.6.ENST00000677940.missense.78E/D
6:
- MT.6.SULT1A2.ENST00000335715.missense.156E/D
- MT.7.SULT1A2.ENST00000395630.missense.156E/D
- MT.8.SULT1A2.ENST00000526384.missense.156E/D
- MT.10.AC020765.6.ENST00000677940.missense.78E/D
7:
- WT.9.SULT1A2.ENST00000534108.missense.185K/Q
8:
- MT.9.SULT1A2.ENST00000534108.missense.185K/Q
11 changes: 11 additions & 0 deletions tests/test_data/fasta_generator/proximal_dnp/proximal_variants.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
chromosome_name start stop reference variant amino_acid_change codon_change protein_position type main_somatic_variant
chr16 28593472 28593473 T G E/D gaA/gaC 156 somatic 1.SULT1A2.ENST00000335715.missense.164A/V
chr16 28593472 28593473 T G E/D gaA/gaC 156 somatic 2.SULT1A2.ENST00000395630.missense.164A/V
chr16 28593472 28593473 T G E/D gaA/gaC 156 somatic 3.SULT1A2.ENST00000526384.missense.164A/V
chr16 28593472 28593473 T G K/Q Aag/Cag 185 somatic 4.SULT1A2.ENST00000534108.missense.192-193GW/GR
chr16 28593472 28593473 T G E/D gaA/gaC 78 somatic 5.AC020765.6.ENST00000677940.missense.86A/V
chr16 28593449 28593450 AG GA A/V gCT/gTC 164 somatic 6.SULT1A2.ENST00000335715.missense.156E/D
chr16 28593449 28593450 AG GA A/V gCT/gTC 164 somatic 7.SULT1A2.ENST00000395630.missense.156E/D
chr16 28593449 28593450 AG GA A/V gCT/gTC 164 somatic 8.SULT1A2.ENST00000526384.missense.156E/D
chr16 28593449 28593450 AG GA GW/GR ggCTgg/ggTCgg 192-193 somatic 9.SULT1A2.ENST00000534108.missense.185K/Q
chr16 28593449 28593450 AG GA A/V gCT/gTC 86 somatic 10.AC020765.6.ENST00000677940.missense.78E/D
46 changes: 46 additions & 0 deletions tests/test_fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,52 @@ def test_multiple_proximal_variants_on_same_codon_results_in_stop_codon(self):
self.assertEqual(os.path.getsize(generate_fasta_output_file.name), 0)
self.assertEqual(os.path.getsize(generate_fasta_key_output_file.name), 0)

def test_proximal_dnp(self):
test_data_dir = os.path.join(self.test_data_dir, 'proximal_dnp')
generate_fasta_input_file = os.path.join(test_data_dir, 'input.tsv')
generate_fasta_proximal_variants_file = os.path.join(test_data_dir, 'proximal_variants.tsv')
generate_fasta_output_file = tempfile.NamedTemporaryFile()
generate_fasta_key_output_file = tempfile.NamedTemporaryFile()

generate_fasta_params = {
'input_file' : generate_fasta_input_file,
'epitope_length' : 10,
'flanking_sequence_length' : 9,
'output_file' : generate_fasta_output_file.name,
'output_key_file' : generate_fasta_key_output_file.name,
'proximal_variants_file' : generate_fasta_proximal_variants_file,
}
generator = FastaGenerator(**generate_fasta_params)

self.assertFalse(generator.execute())
expected_output_file = os.path.join(test_data_dir, 'output.fasta')
self.assertTrue(cmp(generate_fasta_output_file.name, expected_output_file))
expected_key_output_file = os.path.join(test_data_dir, 'output.key')
self.assertTrue(cmp(generate_fasta_key_output_file.name, expected_key_output_file))

def test_proximal_dnp_cut_off(self):
test_data_dir = os.path.join(self.test_data_dir, 'proximal_dnp')
generate_fasta_input_file = os.path.join(test_data_dir, 'input.tsv')
generate_fasta_proximal_variants_file = os.path.join(test_data_dir, 'proximal_variants.tsv')
generate_fasta_output_file = tempfile.NamedTemporaryFile()
generate_fasta_key_output_file = tempfile.NamedTemporaryFile()

generate_fasta_params = {
'input_file' : generate_fasta_input_file,
'epitope_length' : 8,
'flanking_sequence_length' : 7,
'output_file' : generate_fasta_output_file.name,
'output_key_file' : generate_fasta_key_output_file.name,
'proximal_variants_file' : generate_fasta_proximal_variants_file,
}
generator = FastaGenerator(**generate_fasta_params)

self.assertFalse(generator.execute())
expected_output_file = os.path.join(test_data_dir, 'output.cut_off.fasta')
self.assertTrue(cmp(generate_fasta_output_file.name, expected_output_file))
expected_key_output_file = os.path.join(test_data_dir, 'output.cut_off.key')
self.assertTrue(cmp(generate_fasta_key_output_file.name, expected_key_output_file))

def test_distance_from_start_works_as_expected(self):
generate_fasta_input_file = tempfile.NamedTemporaryFile()
generate_fasta_output_file = tempfile.NamedTemporaryFile()
Expand Down

0 comments on commit 02dd9d4

Please sign in to comment.