nr数据库在分类的时候有一个文件,即accession2taxid用来把accession number分类到taxid
但是由于这个文件极大,直接比对速度相当慢,就写了以下两个脚本
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <pthread.h> #include "uthash.h" struct my_struct { char accession[50]; int taxid; UT_hash_handle hh; }; struct thread_arg { char input_filename[256]; char output_filename[256]; }; void add_accession(struct my_struct **s, char *accession, int taxid) { struct my_struct *new_item; HASH_FIND_STR(*s, accession, new_item); // Search to see if accession already exists if (new_item == NULL) { new_item = (struct my_struct*)malloc(sizeof(struct my_struct)); // Corrected malloc with casting strcpy(new_item->accession, accession); new_item->taxid = taxid; HASH_ADD_STR(*s, accession, new_item); // Add to hash table } } void *process_file(void *arg) { struct thread_arg *targ = (struct thread_arg *)arg; struct my_struct *hash_table = NULL; printf("Processing file: %s\n", targ->input_filename); FILE *file = fopen(targ->input_filename, "r"); if (file == NULL) { perror("Failed to open file"); return NULL; } char accession[50]; int taxid; while (fscanf(file, "%49s %d", accession, &taxid) == 2) { add_accession(&hash_table, accession, taxid); } fclose(file); FILE *fp = fopen(targ->output_filename, "wb"); if (!fp) { perror("Failed to open file for writing"); return NULL; } struct my_struct *current_item, *tmp; int count = 0; HASH_ITER(hh, hash_table, current_item, tmp) { fwrite(current_item->accession, sizeof(current_item->accession), 1, fp); fwrite(¤t_item->taxid, sizeof(current_item->taxid), 1, fp); count++; } printf("Written %d entries to %s\n", count, targ->output_filename); fclose(fp); HASH_ITER(hh, hash_table, current_item, tmp) { HASH_DEL(hash_table, current_item); free(current_item); } return NULL; } int main() { pthread_t threads[26]; struct thread_arg args[26]; const char *basepath = "/share/backup01/database/blastdb/part_nr/accession2taxid_box/"; for (int i = 0; i < 26; i++) { sprintf(args[i].input_filename, "%sfile_%c.txt", basepath, 'a' + i); sprintf(args[i].output_filename, 
"%sdata_%c.bin", basepath, 'a' + i); if (access(args[i].input_filename, F_OK) != -1) { pthread_create(&threads[i], NULL, process_file, &args[i]); } else { printf("File does not exist: %s\n", args[i].input_filename); threads[i] = 0; } } for (int i = 0; i < 26; i++) { if (threads[i] != 0) { pthread_join(threads[i], NULL); } } return 0; }
这个脚本用于处理accession2taxid文件拆分成的多个子文件:并行读取 file_a.txt ~ file_z.txt,按accession去重后分别写成二进制文件 data_a.bin ~ data_z.bin
/*
 * Looks up each command-line query in the per-letter accession file that
 * matches the query's first letter (file_<letter>.txt), running one grep
 * per query in its own thread. Matches are printed by grep on stdout.
 *
 * Usage: script query1 query2 ...
 */
#define _POSIX_C_SOURCE 200809L

#include <ctype.h>
#include <errno.h>
#include <pthread.h>
#include <spawn.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>

#define MAX_THREADS 50

extern char **environ;

/* Per-thread work description: the file to search and the patterns to find. */
struct thread_arg {
    char filename[256];
    char **queries;     /* array of num_queries pointers; strings are borrowed */
    int num_queries;
};

/*
 * Runs "grep -e <pattern> <filename>" and waits for it to finish.
 *
 * grep is spawned directly with an explicit argument vector instead of going
 * through a shell, so the pattern cannot inject shell syntax and there is no
 * fixed-size command buffer to overflow. The pattern is still interpreted by
 * grep as a regular expression, as before.
 *
 * Returns 0 if grep was started and reaped, -1 otherwise.
 */
static int run_grep(const char *pattern, const char *filename)
{
    /* posix_spawnp takes char *const[]; it does not modify the strings. */
    char *const grep_argv[] = {
        (char *)"grep", (char *)"-e", (char *)pattern, (char *)filename, NULL
    };
    pid_t pid;
    int status;

    int rc = posix_spawnp(&pid, "grep", NULL, NULL, grep_argv, environ);
    if (rc != 0) {
        fprintf(stderr, "Failed to run grep: %s\n", strerror(rc));
        return -1;
    }
    while (waitpid(pid, &status, 0) == -1) {
        if (errno != EINTR) {
            perror("waitpid");
            return -1;
        }
    }
    return 0;
}

/*
 * Thread entry point. arg points to a struct thread_arg; every query in it
 * is searched for in targ->filename. Always returns NULL.
 */
void *thread_func(void *arg)
{
    struct thread_arg *targ = arg;

    for (int i = 0; i < targ->num_queries; i++) {
        run_grep(targ->queries[i], targ->filename);
    }
    return NULL;
}

/*
 * Starts one lookup thread per query whose first character is a letter,
 * up to MAX_THREADS, then waits for all of them.
 */
int main(int argc, char *argv[])
{
    pthread_t threads[MAX_THREADS];
    struct thread_arg args[MAX_THREADS] = {0};
    const char *basepath = "/share/backup01/database/blastdb/part_nr/accession2taxid_box/";

    if (argc < 2) {
        fprintf(stderr, "Usage: %s [query1] [query2] ...\n", argv[0]);
        return 1;
    }

    int current_thread = 0;
    for (int i = 1; i < argc; i++) {
        if (current_thread >= MAX_THREADS) {
            fprintf(stderr, "Too many queries: ignoring the last %d (limit is %d)\n",
                    argc - i, MAX_THREADS);
            break;
        }

        /* Cast to unsigned char: tolower on a negative char is undefined. */
        char letter = (char)tolower((unsigned char)argv[i][0]);
        if (letter < 'a' || letter > 'z') {
            fprintf(stderr, "Skipping query '%s': must start with a letter\n", argv[i]);
            continue;
        }

        struct thread_arg *targ = &args[current_thread];
        snprintf(targ->filename, sizeof targ->filename, "%sfile_%c.txt", basepath, letter);

        targ->queries = malloc(sizeof *targ->queries);
        if (targ->queries == NULL) {
            perror("malloc");
            break;
        }
        /* The slot must point at the query before the thread reads it. */
        targ->queries[0] = argv[i];
        targ->num_queries = 1;

        int rc = pthread_create(&threads[current_thread], NULL, thread_func, targ);
        if (rc != 0) {
            fprintf(stderr, "Failed to start thread for '%s': %s\n", argv[i], strerror(rc));
            free(targ->queries);
            targ->queries = NULL;
            continue;   /* slot is reused by the next query */
        }
        current_thread++;
    }

    for (int i = 0; i < current_thread; i++) {
        pthread_join(threads[i], NULL);
        free(args[i].queries);
    }
    return 0;
}
这个脚本用于快速比对,用法类似
script seq1 seq2
如果觉得我的文章对您有用,请随意打赏。你的支持将鼓励我继续创作!