有时候们会需要基因的启动子序列,这里有脚本可以提取基因组所有基因的启动子序列。
脚本运行命令:
perl gene_promoter.pl -fa Donkey_Hic_genome.20180408.fa -gff Donkey_Hic_genome.20180408.gff3 -out gene_promoter.fa -n 2000
其中 -fa 后跟基因组染色体序列;-gff 后跟基因组gff文件;-n后跟数字,表示要提取基因上游多少bp的序列。
脚本代码:
#!/usr/bin/perl -w
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use Config::General;
use Cwd qw(abs_path getcwd);
use FindBin qw($Bin $Script);
use File::Basename qw(basename dirname);
use Bio::SeqIO;
use Bio::Seq;
my $version = "1.3";
## prepare parameters #######################################################################
## -------------------------------------------------------------------------------------------
## GetOptions
my %opts;
GetOptions(\%opts, "gff=s","fa=s", "out=s", "n=s","h");
if(!defined($opts{out}) || !defined($opts{gff}) || || !defined($opts{fa}) ||defined($opts{h}))
{
print <<"Usage End.";
Description:
$version:lefse analysis
Usage
Forced parameter:
-gff gff file <infile> must be given
-out outdir <outfile> must be given -n num <int>
-fa genome fasta file <infile> must be given
Other parameter:
-h Help document
Usage End.
exit;
}
$opts{n} ||= 2000; my $n = $opts{n};
my $in = Bio::SeqIO->new(-file => "$opts{fa}" , -format => 'Fasta');
my %fasta;
while ( my $seq = $in->next_seq() ) {
my($id,$sequence)=($seq->id,$seq->seq);
$fasta{$id}=$sequence;
}
open(IN,"$opts{gff}") ||die "open file $opts{gff} faild.\n";
open(OUT,">$opts{out}") ||die "open file $opts{out} faild.\n";
while(<IN>){
next if(/^#/);
my @line = split ("\t",$_);
if($line[2] eq "gene"){
$line[8] =~ /ID=([^;]*);/;
my $name = $1;
if($line[6] eq "+"){
my $gene = substr( $fasta{$line[0]},$line[3]-$n-1, $n);
print OUT ">$name\n$gene\n";
}elsif($line[6] eq "-"){
my $gene = substr( $fasta{$line[0]},$line[4], $n);
$gene = &reverse_complement_IUPAC($gene);
print OUT ">$name\n$gene\n";
}
}
}
close(OUT);
close(IN);
sub reverse_complement_IUPAC {
my $dna = shift;
# reverse the DNA sequence
my $revcomp = reverse($dna);
# complement the reversed DNA sequence
$revcomp =~ tr/ABCDGHMNRSTUVWXYabcdghmnrstuvwxy/TVGHCDKNYSAABWXRtvghcdknysaabwxr/;
return $revcomp;
}
sub reverse_complement {
my $dna = shift;
# reverse the DNA sequence
my $revcomp = reverse($dna);
# complement the reversed DNA sequence
$revcomp =~ tr/ACGTacgt/TGCAtgca/;
return $revcomp;
}
如果觉得我的文章对您有用,请随意打赏。你的支持将鼓励我继续创作!