nr数据库在分类的时候有一个文件,即accession2taxid用来把accession number分类到taxid
但是由于和这个文件极大,直接比对速度相当慢,就写了以下两个脚本
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include "uthash.h"
struct my_struct {
char accession[50];
int taxid;
UT_hash_handle hh;
};
struct thread_arg {
char input_filename[256];
char output_filename[256];
};
void add_accession(struct my_struct **s, char *accession, int taxid) {
struct my_struct *new_item;
HASH_FIND_STR(*s, accession, new_item); // Search to see if accession already exists
if (new_item == NULL) {
new_item = (struct my_struct*)malloc(sizeof(struct my_struct)); // Corrected malloc with casting
strcpy(new_item->accession, accession);
new_item->taxid = taxid;
HASH_ADD_STR(*s, accession, new_item); // Add to hash table
}
}
void *process_file(void *arg) {
struct thread_arg *targ = (struct thread_arg *)arg;
struct my_struct *hash_table = NULL;
printf("Processing file: %s\n", targ->input_filename);
FILE *file = fopen(targ->input_filename, "r");
if (file == NULL) {
perror("Failed to open file");
return NULL;
}
char accession[50];
int taxid;
while (fscanf(file, "%49s %d", accession, &taxid) == 2) {
add_accession(&hash_table, accession, taxid);
}
fclose(file);
FILE *fp = fopen(targ->output_filename, "wb");
if (!fp) {
perror("Failed to open file for writing");
return NULL;
}
struct my_struct *current_item, *tmp;
int count = 0;
HASH_ITER(hh, hash_table, current_item, tmp) {
fwrite(current_item->accession, sizeof(current_item->accession), 1, fp);
fwrite(¤t_item->taxid, sizeof(current_item->taxid), 1, fp);
count++;
}
printf("Written %d entries to %s\n", count, targ->output_filename);
fclose(fp);
HASH_ITER(hh, hash_table, current_item, tmp) {
HASH_DEL(hash_table, current_item);
free(current_item);
}
return NULL;
}
int main() {
pthread_t threads[26];
struct thread_arg args[26];
const char *basepath = "/share/backup01/database/blastdb/part_nr/accession2taxid_box/";
for (int i = 0; i < 26; i++) {
sprintf(args[i].input_filename, "%sfile_%c.txt", basepath, 'a' + i);
sprintf(args[i].output_filename, "%sdata_%c.bin", basepath, 'a' + i);
if (access(args[i].input_filename, F_OK) != -1) {
pthread_create(&threads[i], NULL, process_file, &args[i]);
} else {
printf("File does not exist: %s\n", args[i].input_filename);
threads[i] = 0;
}
}
for (int i = 0; i < 26; i++) {
if (threads[i] != 0) {
pthread_join(threads[i], NULL);
}
}
return 0;
}
这个脚本用于处理accession2taxid文件,拆分成多个
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <cctype>
#define MAX_THREADS 50
struct thread_arg {
char filename[256];
char **queries;
int num_queries;
};
void *thread_func(void *arg) {
struct thread_arg *targ = (struct thread_arg *)arg;
char command[1024];
for (int i = 0; i < targ->num_queries; i++) {
sprintf(command, "grep '%s' %s", targ->queries[i], targ->filename);
system(command);
}
return NULL;
}
int main(int argc, char *argv[]) {
pthread_t threads[MAX_THREADS];
struct thread_arg args[MAX_THREADS] = {0};
const char *basepath = "/share/backup01/database/blastdb/part_nr/accession2taxid_box/";
if (argc < 2) {
fprintf(stderr, "Usage: %s [query1] [query2] ...\n", argv[0]);
return 1;
}
int current_thread = 0;
for (int i = 1; i < argc && current_thread < MAX_THREADS; i++) {
char letter = tolower(argv[i][0]);
if (letter < 'a' || letter > 'z') continue;
int index = current_thread++;
sprintf(args[index].filename, "%sfile_%c.txt", basepath, letter);
args[index].queries = (char **)malloc(sizeof(char *));
args[index].num_queries = 1;
pthread_create(&threads[index], NULL, thread_func, &args[index]);
}
for (int i = 0; i < current_thread; i++) {
pthread_join(threads[i], NULL);
free(args[i].queries);
}
return 0;
}
这个脚本用于快速比对,用法类似
script seq1 seq2
如果觉得我的文章对您有用,请随意打赏。你的支持将鼓励我继续创作!