There are actually plenty of ways to do this, basically zcat piped into awk, sed, or wc, but the speed leaves something to be desired. Take the zcat-plus-awk variant below, for example.
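A typical form of that one-liner would be something like the following (input.gz is just a placeholder for the actual file):

zcat input.gz | awk 'END{print NR}'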
awk should be the fastest of the three, yet a roughly 3 GB file still took close to 2 minutes, and there were still several 30 GB files left to test.
Sharpening the axe doesn't hold up the woodcutting, so I wrote a multithreaded line counter instead, and for speed I wrote it in C++.
#include <iostream>
#include <string>
#include <thread>
#include <vector>
#include <atomic>
#include <algorithm>
#include <chrono>
#include <zlib.h>
#include <sys/sysinfo.h>

std::atomic<unsigned long long> line_count(0);

// Count the '\n' bytes in one decompressed chunk and fold the result
// into the global total.
void count_lines(const std::string& chunk, int chunk_size) {
    unsigned long long local_count = 0;
    for (int i = 0; i < chunk_size; ++i) {
        if (chunk[i] == '\n') {
            ++local_count;
        }
    }
    line_count += local_count;
}

int main(int argc, char* argv[]) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <input_file>" << std::endl;
        return 1;
    }

    // Use about 75% of the hardware threads, but never fewer than one.
    unsigned int max_threads = std::max(
        1u, static_cast<unsigned int>(std::thread::hardware_concurrency() * 0.75));

    // Derive the chunk size from free RAM: take about a quarter of it and
    // split that across the threads. freeram is counted in mem_unit-byte
    // units, so multiply first; clamp the result to a sane range so gzread's
    // unsigned length parameter cannot overflow.
    struct sysinfo sys_info;
    sysinfo(&sys_info);
    size_t BUFFER_SIZE =
        static_cast<size_t>(sys_info.freeram) * sys_info.mem_unit / 4 / max_threads;
    BUFFER_SIZE = std::min(BUFFER_SIZE, static_cast<size_t>(256) << 20); // <= 256 MiB
    BUFFER_SIZE = std::max(BUFFER_SIZE, static_cast<size_t>(1) << 20);   // >= 1 MiB

    gzFile input_file = gzopen(argv[1], "rb");
    if (!input_file) {
        std::cerr << "Error opening file." << std::endl;
        return 1;
    }

    std::vector<std::thread> threads;
    std::string buffer(BUFFER_SIZE, 0);
    int bytes_read = 0;
    auto start_time = std::chrono::high_resolution_clock::now();

    // gzread decompresses sequentially; std::thread copies its by-value
    // arguments, so each worker gets its own copy of the chunk and the
    // buffer can be reused for the next read right away.
    while ((bytes_read = gzread(input_file, &buffer[0],
                                static_cast<unsigned>(BUFFER_SIZE))) > 0) {
        threads.push_back(std::thread(count_lines, buffer, bytes_read));
        if (threads.size() >= max_threads) {
            for (auto& t : threads) {
                t.join();
            }
            threads.clear();
        }
    }
    for (auto& t : threads) {
        t.join();
    }

    auto end_time = std::chrono::high_resolution_clock::now();
    auto duration =
        std::chrono::duration_cast<std::chrono::seconds>(end_time - start_time).count();

    gzclose(input_file);
    std::cout << "Number of lines: " << line_count << std::endl;
    std::cout << "Time taken: " << duration << " seconds" << std::endl;
    return 0;
}
g++ -std=c++11 -o gz_rownum gz_rownum.cpp -lz -pthread
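Then point it at a gzipped file (the filename here is just a placeholder), and it prints the line count plus the elapsed seconds:

./gz_rownum big_file.gz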