蛋白质比对结果快速分类taxid

蛋白质比对结果快速分类taxid的c脚本

nr数据库在分类的时候有一个文件,即accession2taxid用来把accession number分类到taxid

attachments-2024-08-4U6lHlWI66b5bdae509b2.png但是由于和这个文件极大,直接比对速度相当慢,就写了以下两个脚本

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include "uthash.h"

struct my_struct {
    char accession[50];
    int taxid;
    UT_hash_handle hh;
};

struct thread_arg {
    char input_filename[256];
    char output_filename[256];
};

void add_accession(struct my_struct **s, char *accession, int taxid) {
    struct my_struct *new_item;
    HASH_FIND_STR(*s, accession, new_item);  // Search to see if accession already exists
    if (new_item == NULL) {
        new_item = (struct my_struct*)malloc(sizeof(struct my_struct));  // Corrected malloc with casting
        strcpy(new_item->accession, accession);
        new_item->taxid = taxid;
        HASH_ADD_STR(*s, accession, new_item);  // Add to hash table
    }
}

void *process_file(void *arg) {
    struct thread_arg *targ = (struct thread_arg *)arg;
    struct my_struct *hash_table = NULL;

    printf("Processing file: %s\n", targ->input_filename);
    FILE *file = fopen(targ->input_filename, "r");
    if (file == NULL) {
        perror("Failed to open file");
        return NULL;
    }

    char accession[50];
    int taxid;
    while (fscanf(file, "%49s %d", accession, &taxid) == 2) {
        add_accession(&hash_table, accession, taxid);
    }
    fclose(file);

    FILE *fp = fopen(targ->output_filename, "wb");
    if (!fp) {
        perror("Failed to open file for writing");
        return NULL;
    }

    struct my_struct *current_item, *tmp;
    int count = 0;
    HASH_ITER(hh, hash_table, current_item, tmp) {
        fwrite(current_item->accession, sizeof(current_item->accession), 1, fp);
        fwrite(&current_item->taxid, sizeof(current_item->taxid), 1, fp);
        count++;
    }
    printf("Written %d entries to %s\n", count, targ->output_filename);
    fclose(fp);

    HASH_ITER(hh, hash_table, current_item, tmp) {
        HASH_DEL(hash_table, current_item);
        free(current_item);
    }

    return NULL;
}

int main() {
    pthread_t threads[26];
    struct thread_arg args[26];
    const char *basepath = "/share/backup01/database/blastdb/part_nr/accession2taxid_box/";

    for (int i = 0; i < 26; i++) {
        sprintf(args[i].input_filename, "%sfile_%c.txt", basepath, 'a' + i);
        sprintf(args[i].output_filename, "%sdata_%c.bin", basepath, 'a' + i);

        if (access(args[i].input_filename, F_OK) != -1) {
            pthread_create(&threads[i], NULL, process_file, &args[i]);
        } else {
            printf("File does not exist: %s\n", args[i].input_filename);
            threads[i] = 0;
        }
    }

    for (int i = 0; i < 26; i++) {
        if (threads[i] != 0) {
            pthread_join(threads[i], NULL);
        }
    }

    return 0;
}

这个脚本用于处理accession2taxid文件,拆分成多个


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <cctype>

#define MAX_THREADS 50

struct thread_arg {
    char filename[256];
    char **queries;
    int num_queries;
};

void *thread_func(void *arg) {
    struct thread_arg *targ = (struct thread_arg *)arg;
    char command[1024];
    
    for (int i = 0; i < targ->num_queries; i++) {
        sprintf(command, "grep '%s' %s", targ->queries[i], targ->filename);
        system(command);
    }

    return NULL;
}

int main(int argc, char *argv[]) {
    pthread_t threads[MAX_THREADS];
    struct thread_arg args[MAX_THREADS] = {0};
    const char *basepath = "/share/backup01/database/blastdb/part_nr/accession2taxid_box/";

    if (argc < 2) {
        fprintf(stderr, "Usage: %s [query1] [query2] ...\n", argv[0]);
        return 1;
    }

    int current_thread = 0;

    for (int i = 1; i < argc && current_thread < MAX_THREADS; i++) {
        char letter = tolower(argv[i][0]);
        if (letter < 'a' || letter > 'z') continue;

        int index = current_thread++;
        sprintf(args[index].filename, "%sfile_%c.txt", basepath, letter);
        args[index].queries = (char **)malloc(sizeof(char *));  
        args[index].num_queries = 1;

        pthread_create(&threads[index], NULL, thread_func, &args[index]);
    }

    for (int i = 0; i < current_thread; i++) {
        pthread_join(threads[i], NULL);
        free(args[i].queries);
    }

    return 0;
}


这个脚本用于快速比对,用法类似


script seq1 seq2


  • 发表于 2024-08-09 15:02
  • 阅读 ( 430 )
  • 分类:软件工具

你可能感兴趣的文章

相关问题

0 条评论

请先 登录 后评论
xun
xun

电路元件工程师

82 篇文章

作家榜 »

  1. omicsgene 701 文章
  2. 安生水 349 文章
  3. Daitoue 167 文章
  4. 生物女学霸 120 文章
  5. xun 82 文章
  6. 红橙子 78 文章
  7. rzx 75 文章
  8. CORNERSTONE 72 文章