从NCBI基因组数据中获得cds，pep和geneID对应表

在做基因组相关分析时，我们常常需要从基因组中提取cds，并翻译成相应的pep序列。此脚本，以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件，快速获得cds序列，pep序列，RNA,Protein和gene的对应关系表等相关文件。
A perl script which deals with ncbi raw data,and from which get cds,pep and gene,mRNA and protein ID list.
使用方法如下：
perl $0 @H_403_14@gff_file@H_403_14@genomes_fa_file @H_403_14@file_prefix
#! /usr/bin/perl -w

=head1 #######################################
	
=head1 name
	grep_cds_pep_from_ncbi_genomes_datas.pl

=head1 description
	deal with ncbi raw data,mRNA and protein ID list.

=head1 example
	perl  $0 ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz  mole
	perl  $0 gff_file genomes_fa_file  file_prefix

=head1 author
	original from Xiangfeng Li,xflee0608@163.com
		##2014-4-19/21##

=head1 #######################################
=cut

use strict;
die `pod2text $0` unless @ARGV==3;
my ($gff,$fa,$prefix)=@ARGV;

##deal gff file##
$gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||die):(open GFF,$gff||die);
my (%mrna,%cds,%pep);
while(<GFF>){
	chomp;
	next if(/^#/);
	my @p=split/\t/,$_;
	my @q=split/;/,$p[8];
	my ($rna,$pep,$nt,$gene);
	my $chr=$p[0];
	if($p[2] eq "mRNA"){
		($rna=$q[0])=~s/ID=//;
		($nt=$q[1])=~s/Name=//;
		($gene=$q[-3])=~s/gene=//;
		$mrna{$chr}{$rna}{strand}=$p[6];
		$cds{$rna}=[$gene,$nt];
	}
        if($p[2] eq "CDS"){
                ($pep=$q[1])=~s/Name=//;
                ($rna=$q[2])=~s/Parent=//;
                push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]];
                $pep{$rna}=$pep;
        }
}
close GFF;

##get id list##
my %anno;
my $id_gene_cds_pep=$prefix."_id_gene_cds_pep.lst";
open ID,">",$id_gene_cds_pep||die;
foreach my $i(sort keys %pep){
	if($cds{$i}){
		my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i};
		print ID $out,"\n";
		($anno{$i}=$out)=~s/\t/\|/g;
	}
}
warn "create file:$id_gene_cds_pep\n";
close ID;

##deal fa file##
my %max;
$fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||die):(open FA,$fa||die);
my $raw_cds=$prefix."_raw_cds";
open CDS1,">$raw_cds"||die;
my ($start,$end);
$/=">";<FA>;$/="\n";
while(<FA>){
        my $name=$1 if(/(\S+)/);
	my $info=(split/\|/,$name)[-1];
        $/=">";
        my $seq=<FA>;
        $/="\n";
        $seq=~s/>|\n+//g;
	my $scaf=$mrna{$info};
	foreach my $k(sort keys %$scaf){
		next if(! exists $scaf->{$k}{nt});
		my @p=@{$scaf->{$k}{nt}};
		my $strand=$$scaf{$k}{strand};
		my $get;
		@p=sort{$a->[0]<=>$b->[0]}@p;
		my $loc1=$p[0][0];
		my $loc2=$p[-1][1];
		my ($get_len,$add,$out,$gene);
		if(exists $anno{$k}){
			$add=$anno{$k}; 
			$gene=(split/\|/,$add)[1];
		}else{next;}
		foreach(@p){
			($start,$end)=@$_[0,1];
			$get.=uc(substr($seq,$start-1,$end-$start+1));
		}
		if($strand eq "+"){
			$get_len=length$get;
			$get=~s/([A-Z]{50})/$1\n/g;
			chop($get) unless($get_len%50);
			$out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n";
			push @{$max{$gene}},[$get_len,$out];
			print CDS1 $out;
		}
		if($strand eq "-"){
			$get=&reverse_complement($get);
			$get_len=length$get;
			$get=~s/([A-Z]{50})/$1\n/g;
			chop($get) unless($get_len%50);
			$out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n";
			push @{$max{$gene}},$out];
			print CDS1 $out;
		}
	}
}
warn "create file:$raw_cds\n";
close FA;
close CDS1;

##get max transcript##
my $filter_cds=$prefix."_filter_cds";
open CDS2,">$filter_cds"||die;
my @a;
foreach my $j(keys %max){
	my @trans=@{$max{$j}};
	@trans=sort{$a->[0] cmp $b->[0]}@trans;
	push @a,$trans[-1][1];
}
my @a_new;
foreach(@a){
	my $r=$1 if(/^>rna(\d+)/);
	push @a_new,[$r,$_];
}
my @cds_sort=map{$_->[1]}
		sort{$a->[0] <=> $b->[0]}@a_new;
print CDS2 $_ for@cds_sort;
close CDS2;
warn "create file:$filter_cds\n";

##get raw pep sequences##
my $raw_pep=$prefix."_raw_pep";
open PEP1,$raw_pep||die;
my @raw_pep=&cds2pep($raw_cds);
print PEP1 $_ for@raw_pep;
close PEP1;
warn "create file:$raw_pep\n";

##get filter pep sequences##
my $filter_pep=$prefix."_filter_pep";
open PEP2,">$filter_pep"||die;
my @filter_pep=&cds2pep($filter_cds);
print PEP2 $_ for@filter_pep;
close PEP2;
warn "create file:$filter_pep\n";

##add label for cds and pep of filter##
my $label=uc($prefix);
open IN1,$filter_cds||die;
my $filter_cds_label=$prefix."_filter_cds_label";
open OUT1,$filter_cds_label||die;
while(<IN1>){
	chomp;
	if(/^>/){
		my @a=split/\|/,$_,2;
		my $name=$a[0]."_$label";
		print OUT1"$name |$a[1]\n";
	}else{print OUT1 "$_\n";}
}
close IN1;
close OUT1;
warn "create file:$filter_cds_label\n";

open IN2,$filter_pep||die;
my $filter_pep_label=$prefix."_filter_pep_label";
open OUT2,$filter_pep_label||die;
while(<IN2>){
        chomp;
        if(/^>/){
                my @a=split/\|/,2;
                my $name=$a[0]."_$label";
                print OUT2 "$name |$a[1]\n";
        }else{print OUT2 "$_\n";}
}
close IN2;
close OUT2;
warn "create file:$filter_pep_label\n";

##timing##
my $time=times;
my $time_out=sprintf "%.2f",$time/60;
print "##########\nElapsed Time :$time_out mins\n##########\n";

##subroutine##
sub reverse_complement{
	my ($seq)=shift;
	$seq=reverse$seq;
	$seq=~tr/AaGgCcTt/TtCcGgAa/;
	return $seq;
}

##subroutine##
sub cds2pep{
	my $file=shift;
	my %code = (
                        "standard" =>
                                {
                                'GCA' => 'A','GCC' => 'A','GCG' => 'A','GCT' => 'A',# Alanine
                                'TGC' => 'C','TGT' => 'C',# Cysteine
                                'GAC' => 'D','GAT' => 'D',# Aspartic Aci
                                'GAA' => 'E','GAG' => 'E',# Glutamic Aci
                                'TTC' => 'F','TTT' => 'F',# Phenylalanin
                                'GGA' => 'G','GGC' => 'G','GGG' => 'G','GGT' => 'G',# Glycine
                                'CAC' => 'H','CAT' => 'H',# Histidine
                                'ATA' => 'I','ATC' => 'I','ATT' => 'I',# Isoleucine
                                'AAA' => 'K','AAG' => 'K',# Lysine
                                'CTA' => 'L','CTC' => 'L','CTG' => 'L','CTT' => 'L','TTA' => 'L','TTG' => 'L',# Leucine
                                'ATG' => 'M',# Methionine
                                'AAC' => 'N','AAT' => 'N',# Asparagine
                                'CCA' => 'P','CCC' => 'P','CCG' => 'P','CCT' => 'P',# Proline
                                'CAA' => 'Q','CAG' => 'Q',# Glutamine
                                'CGA' => 'R','CGC' => 'R','CGG' => 'R','CGT' => 'R','AGA' => 'R','AGG' => 'R',# Arginine
                                'TCA' => 'S','TCC' => 'S','TCG' => 'S','TCT' => 'S','AGC' => 'S','AGT' => 'S',# Serine
                                'ACA' => 'T','ACC' => 'T','ACG' => 'T','ACT' => 'T',# Threonine
                                'GTA' => 'V','GTC' => 'V','GTG' => 'V','GTT' => 'V',# Valine
                                'TGG' => 'W',# Tryptophan
                                'TAC' => 'Y','TAT' => 'Y',# Tyrosine
                                'TAA' => 'U','TAG' => 'U','TGA' => 'U'                                              # Stop
                                }
                        ## more translate table could be added here in future
                        ## more translate table could be added here in future
                        ## more translate table could be added here in future
        );
	open IN,$file||die;
	$/=">";<IN>;$/="\n";
	my @results_set;
	while(<IN>){
		my $info=$_;
        	my @a=split/\s+/,$info;
        	$/=">";my $seq=<IN>;$/="\n";
        	$seq=~s/\n|>//g;
        	my $len=length$seq;
        	my $info_out=join " ",@a[0..($#a-1)];
        	my ($pep_out,$triplet);
        	for(my $i=0;$i<$len;$i+=3){
                	$triplet=substr($seq,3);
                	next if(length$triplet!=3);
                	if(exists $code{standard}{$triplet}){
                        	$pep_out.=$code{standard}{$triplet};
                	}else{$pep_out.="X"}
        	}
        	$pep_out=~s/U$// if($pep_out=~/U$/);
        	my $pep_len=length$pep_out;
        	$pep_out=~s/([A-Z]{50})/$1\n/g;
        	chop($pep_out) unless($pep_len%50);
        	my $results= ">$info_out length=$pep_len\n$pep_out\n";
		push @results_set,$results;
	}
	return @results_set;
}

__END__
从NCBI基因组数据中获得cds，pep和geneID对应表

猜你在找的Perl相关文章