-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbeagle2tg
executable file
·151 lines (120 loc) · 2.88 KB
/
beagle2tg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/perl
use warnings;
use strict;
use fralib;
use File::Basename;
use Getopt::Long;
use Pod::Usage;
=head1 NAME
beagle2tg
=head1 SYNOPSIS
beagle2tg [options] <beagle-file>
-h help
<beagle-file>
a single-spaced delimited text file with first column as 'I', an identifier header.
2 columns represent each sample, with each column representing an allele (ACGT).
The first row is the sample-id, second row is the affection status 'A' for each allele. 'M' stands for marker rows.
Example:
beagle2tg geno.phased
=head1 DESCRIPTION
=cut
#option variables
my $help;
#initialize options
Getopt::Long::Configure ('bundling');
if(!GetOptions ('h'=>\$help) || scalar(@ARGV)!=1)
{
if ($help)
{
pod2usage(-verbose => 2);
}
else
{
pod2usage(1);
}
}
## input file
my $ifile = $ARGV[0];
## check if input is TG; exit if it is
if(isTg($ifile))
{
die "Input file is a TG file. Please use a BEAGLE output file: $!";
}
open (INPUT, $ifile) || die "Cannot open $ifile: $!";
## output file
my($name, $path, $ext) = fileparse($ifile, '\..*');
my $ofile = "$name.tg";
open (OUTPUT, ">$ofile") || die "Cannot open $ofile: $!";
# variables
my $headerProcessed = 0;
my %samplelist; # checks for dups
my @samples; # stores samplelists in the same order as input file
my $allele1 = '';
# read Mk file
while (<INPUT>)
{
s/\r?\n?$//;
my @fields = split(/ /, $_);
## need the 1st row unique; cos 2 cols for 1 sample
## do not need 2nd row (affection)
if($headerProcessed == 0)
{
my $numsamples = (scalar(@fields) - 2)/2; ## this is for checking later
for my $sample (@fields)
{
#print "|$sample|"; #debug
if($sample =~ m/(id|I)\b/)
{
next;
}
else
{
if(!exists($samplelist{$sample}))
{
$samplelist{$sample} = -1;
push(@samples,$sample);
} ## if sample does not exists in samplelist, push into list
else
{
# do nothing
}
} ## if the split field doesnt match headers 'id' or 'I'
} ## for loop through split @fields
if($numsamples != ($#samples + 1))
{
die "Number of samples: ".($#samples+1)." not equal to number of sample-column-pairs: $numsamples in BEAGLE file.: $!";
}
$headerProcessed++;
} ## if first row header
elsif($headerProcessed == 1)
{
$headerProcessed++;
## print headers in OUTPUT
print OUTPUT "snp-id";
for my $i (@samples)
{
print OUTPUT "\t$i";
}
print OUTPUT "\n";
} ## if second row header - affection status dun need
else
{
## do not need first I column; NOTE $i begins with 2
## prints marker column
print OUTPUT "$fields[1]";
for(my $i=2; $i<@fields; $i++)
{
if($i % 2 == 0)
{
$allele1 = $fields[$i];
}
else
{
print OUTPUT "\t$allele1$fields[$i]";
}
} # for looping from second column onwards
print OUTPUT "\n";
} ## else non-header row
}
close(INPUT);
close(OUTPUT);