-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathHapMap_to_PLINK.py
executable file
·77 lines (66 loc) · 2.18 KB
/
HapMap_to_PLINK.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
"""Convert from the HMP format from T3 into PLINK PED files. Will order markers
according to a supplied PLINK Map file. Takes two agruments:
1) T3 hmp file
2) PLINK MAP
"""
import sys
def store_alleles(vcf):
"""Reads a VCF and stores a dictionary of the reference and alternate
alleles for a SNP. SNP IDs are keys, and tuples of (ref, alt) are values."""
states = {}
with open(vcf, 'r') as f:
for line in f:
if line.startswith('#'):
continue
else:
tmp = line.strip().split()
states[tmp[2]] = (tmp[3], tmp[4])
return states
def parse_hapmap(hmp):
"""Parse the HapMap file and store it as a nested dictionary. The first key
is the sample name and the second key is the SNP name."""
hap_dat = {}
with open(hmp, 'r') as f:
for index, line in enumerate(f):
if index == 0:
samples = line.strip().split('\t')[11:]
for s in samples:
hap_dat[s] = {}
else:
tmp = line.strip().split('\t')
snpid = tmp[0]
for samp, geno in zip(samples, tmp[11:]):
if geno == 'NN':
call = '00'
else:
call = geno
hap_dat[samp].update({snpid: (call[0], call[1])})
return hap_dat
def generate_ped(ped_data, maporder):
"""Generate the PED in the proper order."""
ped = []
for sample in sorted(ped_data):
fid = '0'
pid = '0'
mid = '0'
sex = '0'
pheno = '-9'
geno = []
for snp in maporder:
geno += list(ped_data[sample][snp])
ped.append([fid, sample, pid, mid, sex, pheno] + geno)
return ped
def main(hmp, plinkmap):
"""Main function."""
hap = parse_hapmap(hmp)
# Get the map order
order = []
with open(plinkmap, 'r') as f:
for line in f:
order.append(line.strip().split()[1])
pedfile = generate_ped(hap, order)
for row in pedfile:
sys.stdout.write('\t'.join(row) + '\n')
return
main(sys.argv[1], sys.argv[2])