griffithlab · susannasiebert · Aug 20, 2024 · Aug 14, 2024
diff --git a/pvactools/lib/fasta_generator.py b/pvactools/lib/fasta_generator.py
@@ -31,15 +31,28 @@ def __init__(self, **kwargs):
         self.output_key_file            = kwargs['output_key_file']
         self.downstream_sequence_length = kwargs.pop('downstream_sequence_length', None)
         self.proximal_variants_file     = kwargs.pop('proximal_variants_file', None)
+        self.trim_invalid_characters    = kwargs.pop('trim_invalid_characters', False)
         self.proximal_variants          = self.parse_proximal_variants_file()
 
+    def invalid_characters(self):
+        return list('*X?')  # fresh list of the characters treated as invalid: '*', 'X' and '?'
 
     def contains_invalid_characters(self, sequence):
-        for character in ['*', 'X', '?']:
+        for character in self.invalid_characters():  # same character set that trim_sequence cuts on
             if character in sequence:
                 return True
         return False
 
+    def trim_sequence(self, sequence):
+        """Return the longest stretch of ``sequence`` that has no invalid character (rightmost stretch on ties)."""
+        # Cutting at one invalid character at a time can discard the longest clean stretch when several
+        # are present, so split on every invalid character first and only then choose a piece.
+        pieces = [sequence]
+        for character in self.invalid_characters():
+            pieces = [part for piece in pieces for part in piece.split(character)]
+        # max() keeps the first maximum; reversing makes ties go to the rightmost piece, as a single cut did.
+        return max(reversed(pieces), key=len)
+
     def position_out_of_bounds(self, position, sequence):
         return position > len(sequence)-1
 
@@ -313,7 +326,10 @@ def execute(self):
                 subsequence = subsequence[:-1]
 
             if self.contains_invalid_characters(subsequence):
-                continue
+                if self.trim_invalid_characters:
+                    subsequence = self.trim_sequence(subsequence)
+                else:
+                    continue
 
             if len(subsequence) < self.epitope_length:
                 continue

diff --git a/pvactools/lib/input_file_converter.py b/pvactools/lib/input_file_converter.py
@@ -678,5 +678,7 @@ def execute(self):
             output_rows = self.parse_arriba_file(starfusion_entries)
         elif os.path.isdir(self.input_file):
             output_rows = self.parse_agfusion_files(starfusion_entries)
+        if not os.path.exists(self.input_file):
+            raise Exception("Input file {} doesn't exist. Aborting.".format(self.input_file))
         tsv_writer.writerows(output_rows)
         writer.close()
diff --git a/pvactools/tools/pvacfuse/generate_protein_fasta.py b/pvactools/tools/pvacfuse/generate_protein_fasta.py
@@ -69,7 +69,8 @@ def generate_fasta(args, downstream_sequence_length, temp_dir, save_tsv_file):
         'epitope_length'            : 0,
         'output_file'               : fasta_file,
         'output_key_file'           : fasta_key_file,
-        'downstream_sequence_length': downstream_sequence_length
+        'downstream_sequence_length': downstream_sequence_length,
+        'trim_invalid_characters'   : True,
     }
     fasta_generator = FusionFastaGenerator(**generate_fasta_params)
     fasta_generator.execute()

diff --git a/pvactools/tools/pvacfuse/run.py b/pvactools/tools/pvacfuse/run.py
@@ -227,17 +227,17 @@ def main(args_input = sys.argv[1:]):
                     output_file = os.path.join(per_epitope_output_dir, "{}.all_epitopes.final.tsv".format(args.sample_name))
                     append_columns(intermediate_output_file, "{}.tsv".format(input_file), output_file)
                     output_files.append(output_file)
-                if epitope_length == max(epitope_lengths):
-                    # copy fasta to output dir
-                    fasta_file = os.path.join(output_dir, "{}.fasta".format(args.sample_name))
-                    shutil.copy(input_file, fasta_file)
-                    run_arguments['fasta'] = fasta_file
-                    # generate and copy net_chop fasta to output dir if specified
-                    if args.net_chop_method:
-                        epitope_flank_length = 9
-                        (net_chop_fasta, _) = generate_fasta(args, output_dir, epitope_length, epitope_flank_length, net_chop_fasta=True)
-                        run_arguments['net_chop_fasta'] = net_chop_fasta
             if len(output_files) > 0:
+                # copy fasta to output dir
+                (input_file, per_epitope_output_dir) = generate_fasta(args, output_dir, max(epitope_lengths))
+                fasta_file = os.path.join(output_dir, "{}.fasta".format(args.sample_name))
+                shutil.copy(input_file, fasta_file)
+                run_arguments['fasta'] = fasta_file
+                # generate and copy net_chop fasta to output dir if specified
+                if args.net_chop_method:
+                    epitope_flank_length = 9
+                    (net_chop_fasta, _) = generate_fasta(args, output_dir, max(epitope_lengths), epitope_flank_length, net_chop_fasta=True)
+                    run_arguments['net_chop_fasta'] = net_chop_fasta
                 all_epitopes_file = os.path.join(output_dir, "{}.all_epitopes.tsv".format(args.sample_name))
                 filtered_file = os.path.join(output_dir, "{}.filtered.tsv".format(args.sample_name))
                 #!!! make below call to create_net_class_report
@@ -246,9 +246,9 @@ def main(args_input = sys.argv[1:]):
             else:
                 print("\nNo processable fusions found. Aborting.\n")
         elif len(prediction_algorithms) == 0:
-            print("No MHC class {} prediction algorithms chosen. Skipping MHC class I predictions.".format(mhc_class))
+            print("No MHC class {} prediction algorithms chosen. Skipping MHC class {} predictions.".format(mhc_class, mhc_class))
         elif len(alleles) == 0:
-            print("No MHC class {} alleles chosen. Skipping MHC class II predictions.".format(mhc_class))
+            print("No MHC class {} alleles chosen. Skipping MHC class {} predictions.".format(mhc_class, mhc_class))
 
     if len(class_i_prediction_algorithms) > 0 and len(class_i_alleles) > 0 and len(class_ii_prediction_algorithms) > 0 and len(class_ii_alleles) > 0:
         print("Creating combined reports")

diff --git a/tests/test_data/pvacfuse_generate_protein_fasta/output_with_invalid_characters.fasta b/tests/test_data/pvacfuse_generate_protein_fasta/output_with_invalid_characters.fasta
@@ -0,0 +1,2 @@
+>1.PTEN-ENSG00000200891(21548),MED6P1(31892).ENST00000371953-..frameshift_fusion.42
+PPCRPHPTPLCSSSCSSSRHHPHHPSPSLHPSSSISPLHGPCTCSPCSFPP
diff --git a/tests/test_fasta_generator.py b/tests/test_fasta_generator.py
@@ -20,6 +20,10 @@ def setUpClass(cls):
     def test_source_compiles(self):
         self.assertTrue(py_compile.compile(self.executable))
 
+    def test_trim_invalid_characters(self):
+        for raw, trimmed in [('K?TTASVKERREILSELGKCVAGKEFRVERTPLPSAPVLPELTAIPLT', 'TTASVKERREILSELGKCVAGKEFRVERTPLPSAPVLPELTAIPLT'), ('TTASVKERREILSELGKCVAGKEFRVERTPLPSAPVLPELTAI?PLT', 'TTASVKERREILSELGKCVAGKEFRVERTPLPSAPVLPELTAI')]:  # '?' near the start, then near the end
+            self.assertEqual(FastaGenerator.__new__(FastaGenerator).trim_sequence(raw), trimmed)
+
     def test_input_file_with_peptide_sequence_length_17_generates_expected_file(self):
         generate_fasta_input_file      = os.path.join(self.test_data_dir, 'input.tsv')
         generate_fasta_output_file     = tempfile.NamedTemporaryFile()

diff --git a/tests/test_pvacfuse_generate_protein_fasta.py b/tests/test_pvacfuse_generate_protein_fasta.py
@@ -67,5 +67,6 @@ def test_arriba_tsv_with_invalid_character(self):
             generate_protein_fasta_output_file.name,
             '-d', 'full'
         ], shell=False))
+        expected_output_file = os.path.join(self.test_data_dir, 'output_with_invalid_characters.fasta')
+        self.assertTrue(cmp(generate_protein_fasta_output_file.name, expected_output_file))
         os.unlink("{}.manufacturability.tsv".format(generate_protein_fasta_output_file.name))
-        self.assertEqual(os.path.getsize(generate_protein_fasta_output_file.name), 0)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		>1.PTEN-ENSG00000200891(21548),MED6P1(31892).ENST00000371953-..frameshift_fusion.42
		PPCRPHPTPLCSSSCSSSRHHPHHPSPSLHPSSSISPLHGPCTCSPCSFPP