-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathvcf2hapmatrix.py
executable file
·108 lines (91 loc) · 3.91 KB
/
vcf2hapmatrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
"""
Converts `.vcf` file to Tag Haplotype Matrix (with Chrom), with order of
individuals as indicated in optional file.
Note: not yet properly tested. SNPs of same CHROM (first column) in `.vcf`
should be grouped together/sequentially, and all individuals need to be listed
in order_file.
"""
import sys
import vcf
import argparse
__author__ = 'Pim Bongaerts'
__copyright__ = 'Copyright (C) 2016 Pim Bongaerts'
__license__ = 'GPL'
VCF_HEADER = '#CHROM'
def list_from_indivorder_file(indivorder_filename):
""" Initialize list of indvs from textfile or popfile """
indv_order = []
indivorder_file = open(indivorder_filename, 'r')
for line in indivorder_file:
indv_order.append(line.rstrip().split()[0]) # ignore second col
return indv_order
def list_from_vcf_file(vcf_filename):
""" Initialize list of indvs from VCF file """
vcf_file = open(vcf_filename, 'r')
for line in vcf_file:
# Find VCF header and extract names and order of individuals
if line[0:len(VCF_HEADER)] == VCF_HEADER:
cols = line.split()
indv_order = cols[9:]
break
vcf_file.close()
return indv_order
def get_haplotypes_output(indv_order, haplotypes):
""" Generate output line with haplotypes (order as in indv_order) """
haplotypes_list = []
for indiv in indv_order:
haplotype_a = ''.join(haplotypes[indiv][0])
haplotype_b = ''.join(haplotypes[indiv][1])
# Only include genotype when no missing data present
if '.' not in haplotype_a:
haplotypes_list.append('{0}/{1}'.format(haplotype_a, haplotype_b))
else:
haplotypes_list.append('')
# Join all genotypes into single tab-separated string
haplotypes_output = '{0}\n'.format('\t'.join(haplotypes_list))
return haplotypes_output
def main(vcf_filename, indivorder_filename):
# Initialize list with order of individuals
if not indivorder_filename:
# Use order in VCF file if separate file not given
indv_order = list_from_vcf_file(vcf_filename)
else:
# Use order as indicated in separate file
indv_order = list_from_indivorder_file(indivorder_filename)
# Output header
print('Chr\t{0}'.format('\t'.join(indv_order)))
# Iterate over loci/snps for each indv in VCF file
previous_chrom = ""
haplotypes = {}
vcf_reader = vcf.Reader(open(vcf_filename, 'r'))
for record in vcf_reader:
# Output haplotype when reaching new CHROM
if previous_chrom != '' and record.CHROM != previous_chrom:
haplotypes_output = get_haplotypes_output(indv_order, haplotypes)
print('{0}\t{1}'.format(previous_chrom, haplotypes_output), end='')
haplotypes.clear()
# Store all SNP genotypes in dict (appending when same CHROM)
for indiv in indv_order:
if record.genotype(indiv).gt_bases:
snp_genotypes = record.genotype(indiv).gt_bases \
.replace('|', '/').split("/")
else:
snp_genotypes = ['.', '.']
if indiv not in haplotypes:
# Create dict item (indiv = key) with a list for each hap
haplotypes[indiv] = {}
haplotypes[indiv][0] = []
haplotypes[indiv][1] = []
haplotypes[indiv][0].append(snp_genotypes[0])
haplotypes[indiv][1].append(snp_genotypes[1])
previous_chrom = record.CHROM
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('vcf_filename', metavar='vcf_file',
help='input file with SNP data (`.vcf`)')
parser.add_argument('-o', '--order_file', metavar='order_file',
help='text file with preferred output order of \
individuals')
args = parser.parse_args()
main(args.vcf_filename, args.order_file)