好吧,该到最后的时刻了,识别novel可变剪接的类型,首先要在原理上判断类型的情况,把类型画出来比较好一些:
以上即为大致的分类,不过有两点要说明的:intron retained 1用tophat的结果无法实现,这个看覆盖度即可;另外还有两个分类:intergenic和other,顾名思义基因间区域内的和无法识别的,当然也可以按照其他分类,就不多说了,就以此为例吧:
#!/usr/bin/env perl use warnings; use strict; my %hash; open GTF,$ARGV[0] or die $!; while(<GTF>) { chomp; my @tmp = split; my ($a,$b) = (split /-/,$tmp[2])[0,1]; $hash{$tmp[0]}{$tmp[1]}{'pn'} = $tmp[3]; $hash{$tmp[0]}{$tmp[1]}{'rs'} = $a; $hash{$tmp[0]}{$tmp[1]}{'re'} = $b; $hash{$tmp[0]}{$tmp[1]}{'type'} = $tmp[4]; my @a = split /,/,$tmp[5]; my @b = split /,$tmp[6]; @{$hash{$tmp[0]}{$tmp[1]}{'start'}} = @a; @{$hash{$tmp[0]}{$tmp[1]}{'end'}} = @b; } close GTF; open NOVEL,$ARGV[1] or die $!; print "#chr\tsplice_start\tsplice_end\tsplice_strand\tsplice_reads\tsplice_type\tgene_id\tgene_strand\tgene_type\n"; while(<NOVEL>) { chomp; my @tmp = split; my $js = $tmp[1] + $tmp[5]; my $je = $tmp[2] - $tmp[6] + 1; my ($chr,$read,$jpn) = @tmp[0,3,4]; my $flag = 0; foreach my $t(keys %{$hash{$chr}}) { if($je < $hash{$chr}{$t}{'rs'} or $js > $hash{$chr}{$t}{'re'}) { next; }else{ $flag = 1; if($js < $hash{$chr}{$t}{'rs'} and $je >= $hash{$chr}{$t}{'rs'}) { if($hash{$chr}{$t}{'pn'} eq '+') { if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\tOther\t$t\t+\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\t5UTR\t$t\t+\tCoding\n"; } }else{ if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\tOther\t$t\t-\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\t3UTR\t$t\t-\tCoding\n"; } } last; }elsif($js <= $hash{$chr}{$t}{'re'} and $je > $hash{$chr}{$t}{'re'}){ if($hash{$chr}{$t}{'pn'} eq '+') { if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\tOther\t$t\t+\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\t3UTR\t$t\t+\tCoding\n"; } }else{ if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\tOther\t$t\t-\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\t5UTR\t$t\t-\tCoding\n"; } } last; }else{ my ($js_stat,$je_stat) = (0,0); foreach my $i(@{$hash{$chr}{$t}{'end'}}) { $js_stat = 1 if($js eq $i); } foreach my $j(@{$hash{$chr}{$t}{'start'}}) { $je_stat = 1 if($je eq $j); } if($js_stat eq 1 and $je_stat eq 1) { if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\tES\t$t\t$hash{$chr}{$t}{'pn'}\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\tES\t$t\t$hash{$chr}{$t}{'pn'}\tCoding\n"; } }elsif($js_stat eq 1){ if($hash{$chr}{$t}{'pn'} eq '+') { if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\t3S\t$t\t+\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\t3S\t$t\t+\tCoding\n"; } }else{ if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\t5S\t$t\t-\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\t5S\t$t\t-\tCoding\n"; } } }elsif($je_stat eq 1){ if($hash{$chr}{$t}{'pn'} eq '+') { if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\t5S\t$t\t+\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\t5S\t$t\t+\tCoding\n"; } }else{ if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\t3S\t$t\t-\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\t3S\t$t\t-\tCoding\n"; } } }else{ my $f2 = 0; for(my $i = 0; $i < @{$hash{$chr}{$t}{'end'}}; $i ++) { if($js >= $hash{$chr}{$t}{'start'}->[$i] and $je <= $hash{$chr}{$t}{'end'}->[$i]) { $f2 = 1; if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\tIR\t$t\t$hash{$chr}{$t}{'pn'}\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\tIR\t$t\t$hash{$chr}{$t}{'pn'}\tCoding\n"; } last; } } if($f2 eq 0) { if($hash{$chr}{$t}{'type'} eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\tOther\t$t\t$hash{$chr}{$t}{'pn'}\tNonCoding\n"; }else{ print "$chr\t$js\t$je\t$jpn\t$read\tOther\t$t\t$hash{$chr}{$t}{'pn'}\tCoding\n"; } } } last; } } } if($flag eq 0) { print "$chr\t$js\t$je\t$jpn\t$read\tIntergenic\t.\t.\t.\n"; } } close NOVEL;