如果某fq_clean文件的其中一端出现了错误,我们手里还持有他的原始数据,那我们就可以用以下方法处理
1,首先提取clean文件另一端的id,我用了python脚本
import gzip
import argparse
def extract_gene_names(input_fastq_gz, output_file):
with gzip.open(input_fastq_gz, 'rt') as fastq_gz, open(output_file, 'w') as out_file:
line_count = 0
for line in fastq_gz:
line_count += 1
if line_count % 4 == 1:
gene_name = line.strip().split()[0]
out_file.write(gene_name + '\n')
def main():
parser = argparse.ArgumentParser(description='Extract gene names from a gzipped fastq file.')
parser.add_argument('input_fastq_gz', help='Path to the input fastq.gz file')
parser.add_argument('output_file', help='Path to the output file containing gene names')
args = parser.parse_args()
extract_gene_names(args.input_fastq_gz, args.output_file)
if __name__ == '__main__':
main()
用法如下
usage: get_id_fqgz.py [-h] input_fastq_gz output_file
2,使用perl脚本根据这个id文件和原始数据生成所需的cleandata
use Bio::SeqIO; use Bio::Seq; use Term::ProgressBar; if (@ARGV == 3) { my ($id_file, $fq1_file, $out1_file) = @ARGV; filter_fastq($id_file, $fq1_file, $out1_file); } elsif (@ARGV == 5) { my ($id_file, $fq1_file, $fq2_file, $out1_file, $out2_file) = @ARGV; filter_fastq($id_file, $fq1_file, $out1_file, $fq2_file, $out2_file); } else { die "Usage: perl $0 <id> <fq1> [<fq2>] <OUT1> [<OUT2>]\n"; } sub filter_fastq { my ($id_file, $fq1_file, $out1_file, $fq2_file, $out2_file) = @_; my %keep; open my $ID, $id_file or die "Cannot open ID file: $id_file\n"; while (<$ID>) { chomp; next if /^#/; my @tmp = split(/\s+/); my ($cleaned_id) = $tmp[0] =~ /([^@]+)/; # 移除 "@" 符号 $keep{$cleaned_id} = 1; } close($ID); process_fastq_file($fq1_file, $out1_file, \%keep, "Read 1"); if (defined $fq2_file && defined $out2_file) { process_fastq_file($fq2_file, $out2_file, \%keep, "Read 2"); } } sub process_fastq_file { my ($input_file, $output_file, $keep_ref, $read_label) = @_; open my $FQ, "zcat $input_file |" or die "Cannot open input file: $input_file\n"; open my $GZ, "| gzip >$output_file" or die "Cannot open output file: $output_file\n"; my $fq_in = Bio::SeqIO->new(-fh => $FQ, -format => 'fastq'); my $fq_out = Bio::SeqIO->new(-fh => $GZ, -format => 'fastq'); my $total_seqs = `zcat $input_file | grep -c "^@"`; # Count total sequences in the input file chomp($total_seqs); my $progress = Term::ProgressBar->new({name => $read_label, count => $total_seqs, ETA => 'linear'}); $progress->minor(0); # Disabling minor updates for better performance my $count = 0; while (my $seq_obj = $fq_in->next_seq()) { my $id = $seq_obj->id; my ($cleaned_id) = $id =~ /([^@]+)/; # 移除 "@" 符号 if (exists $keep_ref->{$cleaned_id}) { $fq_out->write_seq($seq_obj); } $count++; $progress->update($count); } $progress->update($total_seqs); # Ensure the progress bar reaches 100% close($FQ); close($GZ); }
没有开头那几个模块请自行百度安装,用法如下,【】中的是可选参数
Usage: perl raw_to_clean_byid.pl <id> <fq1> [<fq2>] <OUT1> [<OUT2>]
第二步的脚本优化自https://www.omicsclass.com/article/1267
如果觉得我的文章对您有用,请随意打赏。你的支持将鼓励我继续创作!