-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathvcf2introgress.py
executable file
·183 lines (161 loc) · 7.58 KB
/
vcf2introgress.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python
"""
Converts `.vcf` file to INTROGRESS input files (4 files). Splits data into
three categories: parental1, parental2 and admixed based on cluster assignment
(provided in separate file; e.g. STRUCTURE output) and given threshold, and
outputs data for loc that exceed a certain frequency difference between the
two 'parental' categories.
Note: not yet properly tested. also see similar `vcf_ancestry_matrix.py`
script. I use the formatted CLUMPP output (`clumpp_K2.out.csv`) from the
`structure_mp` wrapper as assignment file (max. of 2 clusters).
"""
import sys
import argparse
import vcf
__author__ = 'Pim Bongaerts'
__copyright__ = 'Copyright (C) 2016 Pim Bongaerts'
__license__ = 'GPL'
ADMIX_LABEL = "ADMIX"
COL_INDIV = 0
COL_POP = 1
COL_ASSIGN = 2
ERROR_FLAG = "ERROR"
def read_assignment_file(
assignment_filename, assign_cut_off, include_parentals):
""" Assess assignment of each individual and store in dict/list """
assignment_file = open(assignment_filename, 'r')
par1_indivs = []
par2_indivs = []
admixed_indivs = []
indiv_pops = {}
for line in assignment_file:
cols = line.split(',')
indiv_pops[cols[COL_INDIV]] = cols[COL_POP]
if float(cols[COL_ASSIGN]) > float(assign_cut_off):
# Parental individuals 1
par1_indivs.append(cols[COL_INDIV])
elif float(cols[COL_ASSIGN]) < (1 - float(assign_cut_off)):
# Parental individuals 2
par2_indivs.append(cols[COL_INDIV])
else:
# Admixed individuals
admixed_indivs.append(cols[COL_INDIV])
# Include parentals to admixed individuals if include flag is set
if include_parentals:
final_admixed_indivs = par1_indivs + admixed_indivs + par2_indivs
else:
final_admixed_indivs = admixed_indivs
return par1_indivs, par2_indivs, final_admixed_indivs, indiv_pops
def output_individuals(record, list_of_indivs, output_file):
""" Outputs genotypes of all indivs for current SNP to output_file """
genotypes_for_snp = []
for indiv in list_of_indivs:
genotype = record.genotype(indiv).gt_bases
if genotype:
introgress_genotype = '{0}/{1}'.format(genotype[0], genotype[2])
else:
introgress_genotype = 'NA/NA'
genotypes_for_snp.append(introgress_genotype)
concat_genotypes_for_snp = ','.join(genotypes_for_snp)
output_file.write('{}\n'.format(concat_genotypes_for_snp))
def calculate_allele_frequency(record, indivs):
""" Calculate allele frequency """
# Create list of all alleles in population
alleles = []
for indiv in indivs:
genotype = record.genotype(indiv)['GT']
if genotype:
alleles.append(genotype[0])
alleles.append(genotype[2])
concat_alleles = ''.join(alleles)
# Assess whether there are more than two alleles
unique_alleles = ''.join(set(concat_alleles))
if len(concat_alleles) == 0:
error_message = 'Error - only missing data for'
print('{0}: {1}_{2}'.format(error_message, record.CHROM, record.POS))
allele_freq = ERROR_FLAG
elif len(unique_alleles) > 2:
error_message = 'Error - locus has more than 2 alleles'
print('{0}: {1}_{2}'.format(error_message, record.CHROM, record.POS))
allele_freq = ERROR_FLAG
else:
allele_freq = float(concat_alleles.count('1') / len(concat_alleles))
return allele_freq
def allele_frequency_difference(record, par1_indivs,
par2_indivs, freq_diff_cut_off):
""" Calculates absolute difference in allele frequencies """
p1_allele_freq = calculate_allele_frequency(record, par1_indivs)
p2_allele_freq = calculate_allele_frequency(record, par2_indivs)
if p1_allele_freq == ERROR_FLAG or p2_allele_freq == ERROR_FLAG:
return ERROR_FLAG
else:
return abs(p1_allele_freq - p2_allele_freq)
def main(vcf_filename, assignment_filename, assign_cut_off, freq_diff_cut_off,
output_prefix, include_parentals):
# Assess assignment of individuals in dataset and store in lists
(par1_indivs, par2_indivs, admixed_indivs,
indiv_pops) = read_assignment_file(assignment_filename, assign_cut_off,
include_parentals)
# Open output files
parental1_file = open('{}_par1'.format(output_prefix), 'w')
parental2_file = open('{}_par2'.format(output_prefix), 'w')
admixed_file = open('{}_admix'.format(output_prefix), 'w')
loci_file = open('{}_loci'.format(output_prefix), 'w')
# Write output file headers
loci_file.write('locus,type,lg,marker pos.\n')
admixed_file.write(
'{}\n'.format(
','.join(
['ADMIX'] *
len(admixed_indivs))))
admixed_file.write('{}\n'.format(','.join(admixed_indivs)))
# Iterate over loci in VCF file and output to files
output_count = 0
vcf_reader = vcf.Reader(open(vcf_filename, 'r'))
for record in vcf_reader:
# Calculate absolute difference in allele frequencies
freq_diff = allele_frequency_difference(record, par1_indivs,
par2_indivs,
freq_diff_cut_off)
# Only include locus if above threshold frequency_diff_cut_off
if freq_diff is not ERROR_FLAG and freq_diff > float(
freq_diff_cut_off):
# Output locus information (CHROM as lg and POS as marker pos.)
marker = '{0}_{1}'.format(record.CHROM, record.POS)
loci_file.write('{0},C,{1},{1}.{2}\n'.format(marker, record.CHROM,
record.POS))
# Output genotypes of individuals to different files
output_individuals(record, par1_indivs, parental1_file)
output_individuals(record, par2_indivs, parental2_file)
output_individuals(record, admixed_indivs, admixed_file)
output_count += 1
# Close output files
parental1_file.close()
parental2_file.close()
admixed_file.close()
loci_file.close()
# Summary message
print('Outputted {0} loci'.format(output_count))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('vcf_filename', metavar='vcf_file',
help='input file with SNP data (`.vcf`)')
parser.add_argument('assignment_filename', metavar='assignment_file',
help='text file (tsv or csv) with assignment \
values for each individual (max. 2 clusters); e.g. a \
reformatted STRUCTURE output file')
parser.add_argument('assign_cut_off', metavar='assign_cut_off', type=float,
help='min. assignment value for an individual to be \
included in the allele frequency calculation (i.e. \
putative purebred)')
parser.add_argument('freq_cut_off', metavar='freq_cut_off', type=float,
help='min. allele frequency difference between the 2 \
clusters for a locus to be included in the output')
parser.add_argument('output_prefix', metavar='output_prefix',
help='prefix for output files')
parser.add_argument('--include', '-i', action='store_true',
help='set this flag if parental pops need to be \
included in output')
args = parser.parse_args()
main(args.vcf_filename, args.assignment_filename, args.assign_cut_off,
args.freq_cut_off, args.output_prefix, args.include)