Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle invalid pVACfuse characters by trimming sequence #1130

Merged
merged 1 commit into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions pvactools/lib/fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,28 @@ def __init__(self, **kwargs):
self.output_key_file = kwargs['output_key_file']
self.downstream_sequence_length = kwargs.pop('downstream_sequence_length', None)
self.proximal_variants_file = kwargs.pop('proximal_variants_file', None)
self.trim_invalid_characters = kwargs.pop('trim_invalid_characters', False)
self.proximal_variants = self.parse_proximal_variants_file()

def invalid_characters(self):
    """Return the characters that are not allowed in a peptide sequence.

    A new list is built on every call, so callers may modify the result
    without affecting later calls.
    """
    return list('*X?')

def contains_invalid_characters(self, sequence):
    """Report whether `sequence` contains any invalid character.

    The set of invalid characters comes from `self.invalid_characters()`
    so that this check and `trim_sequence` always agree on what is invalid.

    Args:
        sequence: The peptide sequence string to inspect.

    Returns:
        True if at least one invalid character occurs in `sequence`,
        False otherwise (including for an empty sequence).
    """
    return any(
        character in sequence
        for character in self.invalid_characters()
    )

def trim_sequence(self, sequence):
    """Return the longest stretch of `sequence` free of invalid characters.

    The sequence is cut at every character listed by
    `self.invalid_characters()` and the longest remaining piece is
    returned. When several pieces share the maximum length, the one
    closest to the end of the sequence is kept, so a single invalid
    character sitting exactly in the middle keeps the trailing half.

    Args:
        sequence: The peptide sequence string to trim.

    Returns:
        The longest substring of `sequence` containing no invalid
        character. A sequence without invalid characters is returned
        unchanged; an empty sequence yields an empty string.
    """
    invalid = self.invalid_characters()
    if not invalid:
        return sequence

    # Collapse every invalid character onto one delimiter so that a single
    # split yields all clean runs at once.
    delimiter = invalid[0]
    for character in invalid[1:]:
        sequence = sequence.replace(character, delimiter)
    clean_runs = sequence.split(delimiter)

    # max() keeps the first maximal element it meets; scanning from the end
    # therefore resolves ties in favour of the later run.
    return max(reversed(clean_runs), key=len)

def position_out_of_bounds(self, position, sequence):
    """Report whether `position` lies past the last index of `sequence`.

    Only the upper bound is checked; negative positions are never
    reported as out of bounds.
    """
    last_index = len(sequence) - 1
    return position > last_index

Expand Down Expand Up @@ -313,7 +326,10 @@ def execute(self):
subsequence = subsequence[:-1]

if self.contains_invalid_characters(subsequence):
continue
if self.trim_invalid_characters:
subsequence = self.trim_sequence(subsequence)
else:
continue

if len(subsequence) < self.epitope_length:
continue
Expand Down
2 changes: 2 additions & 0 deletions pvactools/lib/input_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,5 +678,7 @@ def execute(self):
output_rows = self.parse_arriba_file(starfusion_entries)
elif os.path.isdir(self.input_file):
output_rows = self.parse_agfusion_files(starfusion_entries)
if not os.path.exists(self.input_file):
raise Exception("Input file {} doesn't exist. Aborting.".format(self.input_file))
tsv_writer.writerows(output_rows)
writer.close()
3 changes: 2 additions & 1 deletion pvactools/tools/pvacfuse/generate_protein_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def generate_fasta(args, downstream_sequence_length, temp_dir, save_tsv_file):
'epitope_length' : 0,
'output_file' : fasta_file,
'output_key_file' : fasta_key_file,
'downstream_sequence_length': downstream_sequence_length
'downstream_sequence_length': downstream_sequence_length,
'trim_invalid_characters' : True,
}
fasta_generator = FusionFastaGenerator(**generate_fasta_params)
fasta_generator.execute()
Expand Down
24 changes: 12 additions & 12 deletions pvactools/tools/pvacfuse/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,17 +227,17 @@ def main(args_input = sys.argv[1:]):
output_file = os.path.join(per_epitope_output_dir, "{}.all_epitopes.final.tsv".format(args.sample_name))
append_columns(intermediate_output_file, "{}.tsv".format(input_file), output_file)
output_files.append(output_file)
if epitope_length == max(epitope_lengths):
# copy fasta to output dir
fasta_file = os.path.join(output_dir, "{}.fasta".format(args.sample_name))
shutil.copy(input_file, fasta_file)
run_arguments['fasta'] = fasta_file
# generate and copy net_chop fasta to output dir if specified
if args.net_chop_method:
epitope_flank_length = 9
(net_chop_fasta, _) = generate_fasta(args, output_dir, epitope_length, epitope_flank_length, net_chop_fasta=True)
run_arguments['net_chop_fasta'] = net_chop_fasta
if len(output_files) > 0:
# copy fasta to output dir
(input_file, per_epitope_output_dir) = generate_fasta(args, output_dir, max(epitope_lengths))
fasta_file = os.path.join(output_dir, "{}.fasta".format(args.sample_name))
shutil.copy(input_file, fasta_file)
run_arguments['fasta'] = fasta_file
# generate and copy net_chop fasta to output dir if specified
if args.net_chop_method:
epitope_flank_length = 9
(net_chop_fasta, _) = generate_fasta(args, output_dir, max(epitope_lengths), epitope_flank_length, net_chop_fasta=True)
run_arguments['net_chop_fasta'] = net_chop_fasta
all_epitopes_file = os.path.join(output_dir, "{}.all_epitopes.tsv".format(args.sample_name))
filtered_file = os.path.join(output_dir, "{}.filtered.tsv".format(args.sample_name))
#!!! make below call to create_net_class_report
Expand All @@ -246,9 +246,9 @@ def main(args_input = sys.argv[1:]):
else:
print("\nNo processable fusions found. Aborting.\n")
elif len(prediction_algorithms) == 0:
print("No MHC class {} prediction algorithms chosen. Skipping MHC class I predictions.".format(mhc_class))
print("No MHC class {} prediction algorithms chosen. Skipping MHC class {} predictions.".format(mhc_class, mhc_class))
elif len(alleles) == 0:
print("No MHC class {} alleles chosen. Skipping MHC class II predictions.".format(mhc_class))
print("No MHC class {} alleles chosen. Skipping MHC class {} predictions.".format(mhc_class, mhc_class))

if len(class_i_prediction_algorithms) > 0 and len(class_i_alleles) > 0 and len(class_ii_prediction_algorithms) > 0 and len(class_ii_alleles) > 0:
print("Creating combined reports")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>1.PTEN-ENSG00000200891(21548),MED6P1(31892).ENST00000371953-..frameshift_fusion.42
PPCRPHPTPLCSSSCSSSRHHPHHPSPSLHPSSSISPLHGPCTCSPCSFPP
4 changes: 4 additions & 0 deletions tests/test_fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ def setUpClass(cls):
def test_source_compiles(self):
    """Check that the script at `self.executable` byte-compiles."""
    compiled_path = py_compile.compile(self.executable)
    self.assertTrue(compiled_path)

def test_trim_invalid_characters(self):
    """Check trimming around an invalid character near either end.

    Each case is run on a bare `FastaGenerator` created with `__new__`,
    which bypasses `__init__`.
    """
    cases = [
        # Invalid character near the start: the leading residue is dropped.
        (
            'K?TTASVKERREILSELGKCVAGKEFRVERTPLPSAPVLPELTAIPLT',
            'TTASVKERREILSELGKCVAGKEFRVERTPLPSAPVLPELTAIPLT',
        ),
        # Invalid character near the end: the trailing residues are dropped.
        (
            'TTASVKERREILSELGKCVAGKEFRVERTPLPSAPVLPELTAI?PLT',
            'TTASVKERREILSELGKCVAGKEFRVERTPLPSAPVLPELTAI',
        ),
    ]
    for raw_sequence, expected in cases:
        generator = FastaGenerator.__new__(FastaGenerator)
        self.assertEqual(generator.trim_sequence(raw_sequence), expected)

def test_input_file_with_peptide_sequence_length_17_generates_expected_file(self):
generate_fasta_input_file = os.path.join(self.test_data_dir, 'input.tsv')
generate_fasta_output_file = tempfile.NamedTemporaryFile()
Expand Down
3 changes: 2 additions & 1 deletion tests/test_pvacfuse_generate_protein_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,6 @@ def test_arriba_tsv_with_invalid_character(self):
generate_protein_fasta_output_file.name,
'-d', 'full'
], shell=False))
expected_output_file = os.path.join(self.test_data_dir, 'output_with_invalid_characters.fasta')
self.assertTrue(cmp(generate_protein_fasta_output_file.name, expected_output_file))
os.unlink("{}.manufacturability.tsv".format(generate_protein_fasta_output_file.name))
self.assertEqual(os.path.getsize(generate_protein_fasta_output_file.name), 0)
Loading