-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgff_filter_longest.pl
106 lines (89 loc) · 1.85 KB
/
gff_filter_longest.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env perl
use strict;
my $a = scalar(@ARGV);
print "$a---";
die "perl $0 <ncbi.gff3> <out.longestID> <out.ncbi_longest.gff3>\n" unless (@ARGV == 3);
my $gff = shift;
my $otab = shift;
my $ogff = shift;
## read GFF and get ID info
my %mRNA2cds;
my %gene2mRNA;
my %cdslen;
open GFF, $gff or die $!;
while(<GFF>){
chomp;
next if /^#/;
my @t = split /\t/;
next unless ($t[8] =~ /ID=/ && $t[8] =~ /Parent=/);
$t[8] =~ /ID=([^;]+)/;
my $id1 = $1;
$t[8] =~ /Parent=([^;]+)/;
my $id2 = $1;
if($t[2] eq "mRNA"){
$gene2mRNA{$id2}{$id1} = 1;
}
if($t[2] eq "CDS"){
$mRNA2cds{$id2} = $id1;
## get cds length;
$cdslen{$id1} += $t[4] - $t[3] + 1;
}
}
close GFF;
## get longest cds and mRNA ID;
open O1, ">$otab" or die $!;
my %lmRNA;
foreach my $geneID (keys %gene2mRNA) {
my @mRNA = keys %{$gene2mRNA{$geneID}};
my $len = 0;
my $lmRNA;
foreach my $mRNA (@mRNA){
my $cds = $mRNA2cds{$mRNA};
next if (!exists $cdslen{$cds});
my $lcds = $cdslen{$cds};
if($lcds > $len){
$len = $lcds;
$lmRNA = $mRNA;
}
}
next if $len == 0;
print O1 "$geneID\t$lmRNA\t$mRNA2cds{$lmRNA}\n";
$lmRNA{$lmRNA} = $mRNA2cds{$lmRNA};
}
close O1;
## filter gff to keep longest cds and mRNA;
open O2, ">$ogff" or die $!;
open GFF, $gff or die $!;
while(my $line = <GFF>){
chomp $line ;
next if $line =~ /^#/;
my @t = split /\t/, $line;
next unless ( @t == 9);
my $id1 = "NA";
my $id2 = "NA";
if($t[8] =~ /ID=([^;]+)/){
$id1 = $1;
}
if($t[8] =~ /Parent=([^;]+)/){
$id2 = $1;
}
my $out = $line ;
my $flag = 0;
if($t[2] eq "gene"){
next unless exists $gene2mRNA{$id1};
$flag = 1;
}
if($t[2] eq "mRNA"){
next unless exists $lmRNA{$id1};
$flag = 1;
}
if($t[2] =~ /CDS/i || $t[2] =~ /exon/i || $t[2] =~ /UTR/i ){
next unless exists $lmRNA{$id2};
$flag = 1;
}
if($flag == 1){
print O2 "$out\n";
}
}
close GFF;
close O2;