1.下载数据:
#wget -c https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
#wget -c https://ftp.ncbi.nlm.nih.gov/genbank/livelists/gi2acc_mapping/gi2acc_lmdb.db.gz
#wget -c https://ftp.ncbi.nlm.nih.gov/genbank/livelists/gi2acc_mapping/gi2accession.py
#wget -c https://ftp.ncbi.nih.gov/blast/db/FASTA/nr.gz
#wget -c https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz
#快速下载方法
/share/work/biosoft/aspera/latest/cli/bin/ascp -v -k 1 -T -l 400m -i ~/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/genbank/livelists/gi2acc_mapping/gi2acc_lmdb.db.gz ./
/share/work/biosoft/aspera/latest/cli/bin/ascp -v -k 1 -T -l 400m -i ~/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/pub/taxonomy/accession2taxid/prot.accession2taxid.gz ./
/share/work/biosoft/aspera/latest/cli/bin/ascp -v -k 1 -T -l 400m -i ~/asperaweb_id_dsa.openssh anonftp@ftp.ncbi.nlm.nih.gov:/blast/db/FASTA/nr.gz ./
2.写脚本根据 prot.accession2taxid.gz 文件中的蛋白ID,分类信息对所有的蛋白序列nr库进行分类
0 | BCT | Bacteria | |
1 | INV | Invertebrates | |
2 | MAM | Mammals | |
3 | PHG | Phages | |
4 | PLN | Plants and Fungi | |
5 | PRI | Primates | |
6 | ROD | Rodents | |
7 | SYN | Synthetic and Chimeric | |
8 | UNA | Unassigned | No species nodes should inherit this division assignment |
9 | VRL | Viruses | |
10 | VRT | Vertebrates | |
11 | ENV | Environmental samples | Anonymous sequences cloned directly from the environment |
代码如下:
perl /share/work/huangls/piplines/01.script/split_taxid_ncbiv2.pl division.dmp nodes.dmp prot.accession2taxid.gz nr.gz ./
#ls *fa|while read a;do b=${a%%.fa};echo mv "$a ${b}_nr.fa";done
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in INV_nr.fa -dbtype prot -title INV_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in PLN_nr.fa -dbtype prot -title PLN_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in MAM_nr.fa -dbtype prot -title MAM_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in PHG_nr.fa -dbtype prot -title PHG_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in PRI_nr.fa -dbtype prot -title PRI_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in ROD_nr.fa -dbtype prot -title ROD_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in SYN_nr.fa -dbtype prot -title SYN_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in UNA_nr.fa -dbtype prot -title UNA_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in VRL_nr.fa -dbtype prot -title VRL_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in VRT_nr.fa -dbtype prot -title VRT_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in ENV_nr.fa -dbtype prot -title ENV_nr -parse_seqids
/share/work/biosoft/blast/ncbi-blast-2.6.0+/bin/makeblastdb -in BCT_nr.fa -dbtype prot -title BCT_nr -parse_seqids
perl代码, 此代码需要320G以上的内存,运行时间约3天
split_taxid_ncbiv2.pl
#The gi_taxid_nucl.dmp is about 160 MB and contains two columns: the nucleotide's gi and taxid.
#The gi_taxid_prot.dmp is about 17 MB and contains two columns: the protein's gi and taxid.
#Divisions file (division.dmp):
# division id -- taxonomy database division id
# division cde -- GenBank division code (three characters)
# division name -- e.g. BCT, PLN, VRT, MAM, PRI...
# comments
#0 | BCT | Bacteria | |
#1 | INV | Invertebrates | |
#2 | MAM | Mammals | |
#3 | PHG | Phages | |
#4 | PLN | Plants and Fungi | |
#5 | PRI | Primates | |
#6 | ROD | Rodents | |
#7 | SYN | Synthetic and Chimeric | |
#8 | UNA | Unassigned | No species nodes should inherit this division assignment |
#9 | VRL | Viruses | |
#10 | VRT | Vertebrates | |
#11 | ENV | Environmental samples | Anonymous sequences cloned directly from the environment |
#nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
#fields:
# tax_id -- node id in GenBank taxonomy database
# parent tax_id -- parent node id in GenBank taxonomy database
# rank -- rank of this node (superkingdom, kingdom, ...)
# embl code -- locus-name prefix; not unique
# division id -- see division.dmp file
# inherited div flag (1 or 0) -- 1 if node inherits division from parent
# genetic code id -- see gencode.dmp file
# inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
# mitochondrial genetic code id -- see gencode.dmp file
# inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
# GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
# hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
# comments -- free-text comments and citations
die "perl $0 <division.dmp> <nodes.dmp> <gi_taxid_n/p.dmp> <nr.fa/nt.fa> <od>" unless(@ARGV==5);
use Math::BigFloat;
use Bio::SeqIO;
use Bio::Seq;
use Data::Dumper;
use PerlIO::gzip;
use FileHandle;
use Cwd qw(abs_path getcwd);
if ($ARGV[3]=~/gz$/){
open $Fa, "<:gzip", "$ARGV[3]" or die "$!";
}else{
open $Fa, "$ARGV[3]" or die "$!";
}
my$od=$ARGV[4];
$od||=getcwd;
$od=abs_path($od);
unless(-d $od){ mkdir $od;}
my $in = Bio::SeqIO->new(-fh => $Fa ,
-format => 'Fasta');
my %division2name=();
if ($ARGV[0]=~/gz$/){
open IN, "<:gzip", "$ARGV[0]" or die "$!";
}else{
open IN,"$ARGV[0]" or die "$!";
}
while(<IN>){
chomp;
my($taxid,$name)=split(/\s+\|\s+/);
$division2name{$taxid}=$name;
}
close(IN);
print Dumper(\%division2name);
#die;
my%fout=();
for my$i (keys %division2name){
my$FO=FileHandle->new(">$od/$division2name{$i}.fa");
my $out = Bio::SeqIO->new(-fh => $FO , -format => 'Fasta');
$fout{$i}=$out;
}
print "$ARGV[0] readed\n";
#print Dumper(\%fout);
#print Dumper(\%division2name);
if ($ARGV[1]=~/gz$/){
open IN, "<:gzip", "$ARGV[1]" or die "$!";
}else{
open IN,"$ARGV[1]" or die "$!";
}
my%taxid2division=();
while(<IN>){
chomp;
my@tmp=split(/\|/);
#for($i=0;$i<@tmp;$i++){
# $tmp[$i]=~s/^\s+|\s+$//g;
#}
$tmp[0]=~s/^\s+|\s+$//g;
$tmp[4]=~s/^\s+|\s+$//g;
$taxid2division{$tmp[0]}=$tmp[4];
#last if $.>100;
}
close(IN);
print "$ARGV[1] readed\n";
my %gi2taxid=();
my %prot2gi=();
if ($ARGV[2]=~/gz$/){
open IN, "<:gzip", "$ARGV[2]" or die "$!";
}else{
open IN,"$ARGV[2]" or die "$!";
}
while(<IN>){
chomp;
my@tmp=split(/\s+/);
$gi2taxid{$tmp[3]}=$tmp[2];
$prot2gi{$tmp[1]}=$tmp[3]
#last if $.>100;
}
close(IN);
print "$ARGV[2] readed\n";
#print Dumper(\%gi2taxid);
$out="nr.gi.fa.gz";
open my $GZ ,"| gzip >$out" or die $!;
my$out_nr = Bio::SeqIO->new(-fh => $GZ , -format => 'fasta');
while( my $seq = $in->next_seq()){
my $id=$seq->id;
my $sequence=$seq->seq;
my $desc=$seq->desc;
#my ($gi)=($id=~/gi\|(\d+)\|ref\|/);
if( exists $prot2gi{$id}){
my $s=Bio::Seq->new(-seq=>$sequence,
-id=>"gi|$prot2gi{$id}|ref|$id|",
-desc=>$desc
);
$out_nr->write_seq($s);
my$gi=$prot2gi{$id};
if(exists($gi2taxid{$gi}) and exists($taxid2division{$gi2taxid{$gi}})){
$fout{$taxid2division{$gi2taxid{$gi}}}->write_seq($s);
}else{
print "unknown tax for gi: $gi\n";
}
}else{
print "unknown prot id :$id\n";
}
}
$out_nr->close();
$in->close();
for my $i (keys %fout){
$fout{$i}->close();
}
如果觉得我的文章对您有用,请随意打赏。你的支持将鼓励我继续创作!