从wc -l说起---如何统计大文件的行数

问题引入

昨天工作上有一个任务,需要根据nginx日志做一些数据统计。由于日志文件很大,而且还在不断增大,如果只统计最近一小时以内的日志,就没必要把所有日志都扫一遍。我的初步思路是先用wc -l统计一下日志行数,然后根据当前时间估算出平均每分钟产生了多少条日志,这样就可以估算出一小时以内的日志条数,再用tail -n取出这些行就可以了。 但是发现wc -l其实也是有点慢的。于是从GNU镜像上把coreutils中wc的实现代码(http://mirrors.ustc.edu.cn/gnu/coreutils/coreutils-8.9.tar.gz)wget 下来看了一下。统计单个文件行数的内部实现是调用read(int filedes, char *buf, unsigned nbytes) 先把内容读入buffer,然后按字节统计换行符,在实现上做了一些细节优化,性能还是很好的。 但是不管怎么样还是要对所有字节都扫一遍。有没有更好的方式呢?

粗略统计文件行数

unix中struct stat记录了文件的各种元信息,但是其中没有文件行数,因此不能直接get到。

/*
 * File metadata record, quoted here for reference. It carries the file's
 * total size in bytes (st_size) but has no field holding a line count.
 */
struct stat {
    dev_t     st_dev;         /* ID of device containing file */
    ino_t     st_ino;         /* inode number */
    mode_t    st_mode;        /* file type and mode */
    nlink_t   st_nlink;       /* number of hard links */
    uid_t     st_uid;         /* user ID of owner */
    gid_t     st_gid;         /* group ID of owner */
    dev_t     st_rdev;        /* device ID (if special file) */
    off_t     st_size;        /* total size, in bytes */
    blksize_t st_blksize;     /* blocksize for filesystem I/O */
    blkcnt_t  st_blocks;      /* number of 512B blocks allocated */

    /* Since Linux 2.6, the kernel supports nanosecond
       precision for the following timestamp fields.
       For the details before Linux 2.6, see NOTES. */

    struct timespec st_atim;  /* time of last access */
    struct timespec st_mtim;  /* time of last modification */
    struct timespec st_ctim;  /* time of last status change */

#define st_atime st_atim.tv_sec      /* Backward compatibility */
#define st_mtime st_mtim.tv_sec
#define st_ctime st_ctim.tv_sec
};

但是有文件大小(st_size),这样我们可以根据这个信息,然后再读取文件中的m行,算出其大小m_size, 这样平均一行大小为m_size/m,估算的行数就是st_size*m/m_size。 这样速度就会快很多。 我写了下面的程序:

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <string.h>

/* Files smaller than this many bytes are counted exactly by main();
   it is also the size of the read buffer used by both counters. */
#define SMALL_FILE_MAX_SIZE 4096
/* Number of leading lines sampled to estimate the average line length. */
#define DEFAULT_LINE_NUM 100

/*
 * Print a one-line usage hint on standard output.
 *
 * argv  the program's argument vector; argv[0] is shown as the program name.
 */
void show_usage(char **argv) {
    const char *program = argv[0];

    printf("Usage: %s FILE\n", program);
}

/*
 * Count the newline characters in everything that can still be read from fd.
 *
 * fd     descriptor to read from until end of file
 * lines  out: number of '\n' bytes seen (reset to 0 on entry)
 *
 * Returns the result of the last read(): 0 when end of file was reached,
 * -1 when read() failed.
 */
int small_file_line_counter(int fd, size_t* lines) {
    char chunk[SMALL_FILE_MAX_SIZE] = {0};
    const char *cursor;
    const char *end;
    int nread;

    *lines = 0;
    for (;;) {
        nread = read(fd, chunk, SMALL_FILE_MAX_SIZE);
        if (nread <= 0) {
            break;
        }
        /* Hop from one newline to the next inside the bytes just read. */
        cursor = chunk;
        end = chunk + nread;
        while ((cursor = memchr(cursor, '\n', (size_t)(end - cursor))) != NULL) {
            *lines += 1;
            ++cursor;
        }
    }

    return nread;
}

/*
 * Estimate the number of lines in a large file by sampling its beginning.
 *
 * Reads from fd (expected to be positioned at the start of the file) until
 * DEFAULT_LINE_NUM newline-terminated lines have been seen or end of file
 * is reached. With a full sample, the average sampled line length is used
 * to extrapolate a line count from total_size. If end of file is reached
 * first, every newline has already been seen, so the exact count is
 * reported instead (this also avoids dividing by a zero sample).
 *
 * fd          descriptor to read from
 * total_size  size of the whole file in bytes (e.g. st_size)
 * lines       out: estimated (or exact, see above) number of lines;
 *             left untouched when an error is returned
 *
 * Returns 0 on success, -1 if read() fails.
 */
int large_file_line_counter(int fd, size_t total_size, size_t* lines) {
    char buffer[SMALL_FILE_MAX_SIZE];
    size_t sampled_lines = 0;
    size_t sampled_bytes = 0;   /* bytes belonging to complete sampled lines */
    size_t current_line = 0;    /* bytes of the line currently being scanned */
    int reached_eof = 0;
    double bytes_per_line;

    while (sampled_lines < DEFAULT_LINE_NUM) {
        ssize_t got = read(fd, buffer, sizeof buffer);
        ssize_t i;

        if (got < 0) {
            return -1;
        }
        if (got == 0) {
            reached_eof = 1;
            break;
        }
        for (i = 0; i < got && sampled_lines < DEFAULT_LINE_NUM; ++i) {
            ++current_line;
            if (buffer[i] == '\n') {
                sampled_bytes += current_line;
                current_line = 0;
                ++sampled_lines;
            }
        }
    }

    if (reached_eof) {
        /* The whole input was scanned: the newline count is exact. */
        *lines = sampled_lines;
        return 0;
    }

    /* Here sampled_lines == DEFAULT_LINE_NUM and sampled_bytes >= sampled_lines,
       so the average is at least 1 and the quotient cannot exceed total_size. */
    bytes_per_line = (double)sampled_bytes / (double)sampled_lines;
    *lines = (size_t)((double)total_size / bytes_per_line);
    return 0;
}

/*
 * Entry point: print the (possibly estimated) line count of one file.
 *
 * Expects exactly one argument, the file path. Files smaller than
 * SMALL_FILE_MAX_SIZE bytes are counted exactly; larger files are estimated
 * from a sample of their first lines. On success prints
 * "<path>\t<count>" to standard output and returns 0; on any failure prints
 * a message and exits with status 1.
 */
int main (int argc, char **argv)
{
    int fd = -1;
    int rc = 0;
    struct stat st;
    size_t line_count = 0;

    if (argc != 2) {
        show_usage(argv);
        exit(1);
    }
    if (stat(argv[1], &st) != 0) {
        fprintf(stderr, "Cannot get state of file: %s\n", argv[1]);
        exit(1);
    }

    fd = open(argv[1], O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "Cannot open file: %s\n", argv[1]);
        exit(1);
    }

    if (st.st_size < SMALL_FILE_MAX_SIZE) {
        rc = small_file_line_counter(fd, &line_count);
    } else {
        rc = large_file_line_counter(fd, (size_t)st.st_size, &line_count);
    }
    /* Release the descriptor on both the success and the failure path. */
    close(fd);

    if (rc == -1) {
        fprintf(stderr, "Get line count of file: %s failed!\n", argv[1]);
        exit(1);
    }

    /* %zu prints the full size_t value; casting to int would truncate
       counts above INT_MAX for very large files. */
    fprintf(stdout, "%s\t%zu\n", argv[1], line_count);

    return 0;
}

对于像nginx日志这种,准确率有90%+

[work@hkg02-pcf-csuours00:~/chenkang/codes/c]$./mylc /home/soft/resty/nginx/logs/access.js_mobojoy.conf_20170119.log
/home/soft/resty/nginx/logs/access.js_mobojoy.conf_20170119.log 35525363
[work@hkg02-pcf-csuours00:~/chenkang/codes/c]$wc -l /home/soft/resty/nginx/logs/access.js_mobojoy.conf_20170119.log
37404978 /home/soft/resty/nginx/logs/access.js_mobojoy.conf_20170119.log

目前的不足: 1. 估算每行的平均长度只是在前面读取100行,没有随机读取. 2. 参数不可配置。