蛋白质比对结果快速分类taxid

蛋白质比对结果快速分类taxid的c脚本

nr数据库在做物种分类时会用到一个文件,即accession2taxid,它用来把accession number映射到对应的taxid

但是由于这个文件极大,直接比对速度相当慢,所以写了以下两个脚本

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include "uthash.h"

/* One hash-table entry: an accession string and the taxid stored for it. */
struct my_struct {
    /* Hash key: the accession as a NUL-terminated string (at most 49 chars). */
    char accession[50];
    /* Value associated with the accession. */
    int taxid;
    /* uthash bookkeeping handle; required for the struct to be hashable. */
    UT_hash_handle hh;
};

/* Per-thread work description handed to process_file(). */
struct thread_arg {
    /* Path of the text file the thread reads. */
    char input_filename[256];
    /* Path of the binary file the thread writes. */
    char output_filename[256];
};

/*
 * Insert an accession -> taxid pair into the uthash table *s.
 *
 * s         - address of the table head; updated when the table changes.
 * accession - NUL-terminated key; must fit in my_struct.accession.
 * taxid     - value to store for the key.
 *
 * If the accession is already present the table is left untouched, so the
 * first taxid seen for a key wins. Keys that do not fit in the inline key
 * buffer, and allocation failures, are reported on stderr and the pair is
 * skipped instead of corrupting memory.
 */
void add_accession(struct my_struct **s, char *accession, int taxid) {
    struct my_struct *new_item = NULL;
    size_t key_len = strlen(accession);

    /* The key is stored inline, so it has to fit including its terminator. */
    if (key_len >= sizeof(new_item->accession)) {
        fprintf(stderr, "Accession too long, skipped: %s\n", accession);
        return;
    }

    HASH_FIND_STR(*s, accession, new_item);  /* is the key already stored? */
    if (new_item != NULL) {
        return;
    }

    /*
     * calloc zero-fills the entry, so the unused tail of the key buffer is
     * well defined when the whole buffer is later written out as a record.
     */
    new_item = (struct my_struct *)calloc(1, sizeof(*new_item));
    if (new_item == NULL) {
        perror("Failed to allocate hash entry");
        return;
    }

    memcpy(new_item->accession, accession, key_len + 1);
    new_item->taxid = taxid;
    HASH_ADD_STR(*s, accession, new_item);
}

/*
 * Thread entry point: convert one text file of "accession taxid" pairs into
 * a binary file of fixed-size records.
 *
 * arg points to a struct thread_arg naming the input and output files.
 * Pairs are read with fscanf until the first token pair that does not parse,
 * de-duplicated through add_accession(), and then written as one record per
 * entry: the accession buffer (zero padded to its full size) followed by the
 * taxid as a native int.
 *
 * Always returns NULL; failures are reported on stderr. The in-memory table
 * is released on every path, including when the output cannot be written.
 */
void *process_file(void *arg) {
    struct thread_arg *targ = (struct thread_arg *)arg;
    struct my_struct *hash_table = NULL;
    struct my_struct *current_item, *tmp;
    char accession[50];
    int taxid;
    int count = 0;
    int write_ok = 1;
    FILE *file;
    FILE *fp;

    printf("Processing file: %s\n", targ->input_filename);
    file = fopen(targ->input_filename, "r");
    if (file == NULL) {
        perror("Failed to open file");
        return NULL;
    }

    while (fscanf(file, "%49s %d", accession, &taxid) == 2) {
        add_accession(&hash_table, accession, taxid);
    }
    if (ferror(file)) {
        perror("Failed to read file");
    }
    fclose(file);

    fp = fopen(targ->output_filename, "wb");
    if (fp == NULL) {
        perror("Failed to open file for writing");
    } else {
        HASH_ITER(hh, hash_table, current_item, tmp) {
            char record[sizeof(current_item->accession)];
            const char *nul = (const char *)memchr(current_item->accession,
                                                   '\0', sizeof(record));
            size_t key_len = nul ? (size_t)(nul - current_item->accession)
                                 : sizeof(record);

            /*
             * Emit the key through a zeroed buffer so the padding after the
             * terminator never contains leftover heap bytes.
             */
            memset(record, 0, sizeof(record));
            memcpy(record, current_item->accession, key_len);

            if (fwrite(record, sizeof(record), 1, fp) != 1 ||
                fwrite(&current_item->taxid, sizeof(current_item->taxid), 1, fp) != 1) {
                perror("Failed to write record");
                write_ok = 0;
                break;
            }
            count++;
        }

        /* fclose flushes buffered data, so a failure here means data loss. */
        if (fclose(fp) != 0) {
            perror("Failed to close output file");
            write_ok = 0;
        }
        if (write_ok) {
            printf("Written %d entries to %s\n", count, targ->output_filename);
        }
    }

    /* Release the table regardless of whether the output succeeded. */
    HASH_ITER(hh, hash_table, current_item, tmp) {
        HASH_DEL(hash_table, current_item);
        free(current_item);
    }

    return NULL;
}

/*
 * Start one process_file() worker for each file_a.txt .. file_z.txt that
 * exists under the base directory, writing data_a.bin .. data_z.bin next to
 * them, and wait for all workers to finish.
 *
 * Returns 0 even if individual files are missing or a worker could not be
 * started; those cases are reported and skipped.
 */
int main() {
    enum { NUM_PARTS = 26 };
    pthread_t threads[NUM_PARTS];
    /*
     * pthread_t is an opaque type, so "was this thread started?" is tracked
     * separately instead of comparing the handle against 0.
     */
    int started[NUM_PARTS] = {0};
    struct thread_arg args[NUM_PARTS];
    const char *basepath = "/share/backup01/database/blastdb/part_nr/accession2taxid_box/";

    for (int i = 0; i < NUM_PARTS; i++) {
        snprintf(args[i].input_filename, sizeof(args[i].input_filename),
                 "%sfile_%c.txt", basepath, 'a' + i);
        snprintf(args[i].output_filename, sizeof(args[i].output_filename),
                 "%sdata_%c.bin", basepath, 'a' + i);

        if (access(args[i].input_filename, F_OK) == -1) {
            printf("File does not exist: %s\n", args[i].input_filename);
            continue;
        }

        /* pthread_create returns an error number rather than setting errno. */
        int rc = pthread_create(&threads[i], NULL, process_file, &args[i]);
        if (rc != 0) {
            fprintf(stderr, "Failed to start thread for %s: %s\n",
                    args[i].input_filename, strerror(rc));
            continue;
        }
        started[i] = 1;
    }

    /* Only join handles that pthread_create actually filled in. */
    for (int i = 0; i < NUM_PARTS; i++) {
        if (started[i]) {
            pthread_join(threads[i], NULL);
        }
    }

    return 0;
}

这个脚本用于处理已经按首字母拆分成多个文件(file_a.txt ~ file_z.txt)的accession2taxid数据:每个文件由一个线程读取并去重,然后写成对应的二进制文件(data_a.bin ~ data_z.bin)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
/* tolower() is declared in <ctype.h> for C; <cctype> exists only for C++. */
#ifdef __cplusplus
#include <cctype>
#else
#include <ctype.h>
#endif

/* Upper bound on the number of worker threads (and thread_arg slots) main() uses. */
#define MAX_THREADS 50

/* Per-thread work description handed to thread_func(). */
struct thread_arg {
    /* Path of the file that is searched with grep. */
    char filename[256];
    /* Array of num_queries pattern strings to search for. */
    char **queries;
    /* Number of entries in queries. */
    int num_queries;
};

/*
 * Build the shell command  grep -e '<pattern>' <filename>  in a freshly
 * allocated buffer. Every single quote inside the pattern is rewritten as
 * '\'' so the pattern can never terminate the quoting and inject shell
 * syntax. Returns NULL if the allocation fails; the caller frees the result.
 */
static char *build_grep_command(const char *pattern, const char *filename) {
    static const char head[] = "grep -e '";
    static const char tail[] = "' ";
    static const char escaped_quote[] = "'\\''";
    size_t quotes = 0;
    size_t total;
    const char *p;
    char *command;
    char *out;

    for (p = pattern; *p != '\0'; p++) {
        if (*p == '\'') {
            quotes++;
        }
    }

    /* Each quote grows from 1 byte to sizeof(escaped_quote) - 1 bytes. */
    total = (sizeof(head) - 1) + strlen(pattern)
          + quotes * (sizeof(escaped_quote) - 2)
          + (sizeof(tail) - 1) + strlen(filename) + 1;

    command = (char *)malloc(total);
    if (command == NULL) {
        return NULL;
    }

    out = command;
    memcpy(out, head, sizeof(head) - 1);
    out += sizeof(head) - 1;
    for (p = pattern; *p != '\0'; p++) {
        if (*p == '\'') {
            memcpy(out, escaped_quote, sizeof(escaped_quote) - 1);
            out += sizeof(escaped_quote) - 1;
        } else {
            *out++ = *p;
        }
    }
    memcpy(out, tail, sizeof(tail) - 1);
    out += sizeof(tail) - 1;
    strcpy(out, filename);

    return command;
}

/*
 * Thread entry point: run grep once per query against targ->filename.
 *
 * arg points to a struct thread_arg. Each query is passed to grep as a
 * pattern through system(), so matching lines are printed by grep itself.
 * The command is sized to fit the query (no fixed-size buffer to overflow),
 * the query is shell-quoted, and -e keeps a leading '-' from being parsed as
 * a grep option. Always returns NULL.
 */
void *thread_func(void *arg) {
    struct thread_arg *targ = (struct thread_arg *)arg;

    for (int i = 0; i < targ->num_queries; i++) {
        const char *query = targ->queries[i];
        char *command;

        if (query == NULL) {
            continue;
        }

        command = build_grep_command(query, targ->filename);
        if (command == NULL) {
            perror("Failed to build grep command");
            continue;
        }

        /* A non-zero grep status (e.g. no match) is not an error here. */
        if (system(command) == -1) {
            perror("Failed to run grep");
        }
        free(command);
    }

    return NULL;
}

int main(int argc, char *argv[]) {
    pthread_t threads[MAX_THREADS];
    struct thread_arg args[MAX_THREADS] = {0};
    const char *basepath = "/share/backup01/database/blastdb/part_nr/accession2taxid_box/";

    if (argc < 2) {
        fprintf(stderr, "Usage: %s [query1] [query2] ...\n", argv[0]);
        return 1;
    }

    int current_thread = 0;

    for (int i = 1; i < argc && current_thread < MAX_THREADS; i++) {
        char letter = tolower(argv[i][0]);
        if (letter < 'a' || letter > 'z') continue;

        int index = current_thread++;
        sprintf(args[index].filename, "%sfile_%c.txt", basepath, letter);
        args[index].queries = (char **)malloc(sizeof(char *));  
        args[index].num_queries = 1;

        pthread_create(&threads[index], NULL, thread_func, &args[index]);
    }

    for (int i = 0; i < current_thread; i++) {
        pthread_join(threads[i], NULL);
        free(args[i].queries);
    }

    return 0;
}

这个脚本用于快速比对,用法类似

script seq1 seq2

发表于 2024-08-09 15:02
阅读 ( 1002 )
分类：软件工具

蛋白质比对结果快速分类taxid

你可能感兴趣的文章

相关问题

0 条评论

作家榜 »