从wc -l说起---如何统计大文件的行数
问题引入
昨天工作上有一个任务,需要根据nginx日志做一些数据统计。由于日志文件很大,而且还在不断增大,如果我只要统计一小时以内的日志,就没必要把所有日志都扫一遍。我的初步思路是先用wc -l统计一下日志行数,然后根据当前时间估算出平均每分钟产生了多少条日志,这样就可以估算出一小时以内的日志条数,然后用tail -n取出这些行就可以了。 但是发现wc -l其实也是有点慢的。从GNU镜像上把coreutils中wc的实现代码(http://mirrors.ustc.edu.cn/gnu/coreutils/coreutils-8.9.tar.gz)wget下来看了一下。统计单个文件的内部实现是调用read(int fd, void *buf, size_t nbytes)先把内容读入buffer,然后按字节统计,在实现上做了一些细节优化,性能还是很好的。 但是不管怎么样还是要对所有字节都扫一遍。有没有更好的方式呢?
粗略统计文件行数
unix中struct stat记录文件的所有信息,但是其中没有文件行数,因此不能直接get到。
/*
 * File metadata record filled in by stat().  It carries the file size in
 * bytes (st_size) but has no field holding the number of lines.
 */
struct stat {
dev_t st_dev; /* ID of device containing file */
ino_t st_ino; /* inode number */
mode_t st_mode; /* file type and mode */
nlink_t st_nlink; /* number of hard links */
uid_t st_uid; /* user ID of owner */
gid_t st_gid; /* group ID of owner */
dev_t st_rdev; /* device ID (if special file) */
off_t st_size; /* total size, in bytes */
blksize_t st_blksize; /* blocksize for filesystem I/O */
blkcnt_t st_blocks; /* number of 512B blocks allocated */
/* Since Linux 2.6, the kernel supports nanosecond
precision for the following timestamp fields.
For the details before Linux 2.6, see NOTES. */
struct timespec st_atim; /* time of last access */
struct timespec st_mtim; /* time of last modification */
struct timespec st_ctim; /* time of last status change */
/* Second-resolution aliases for the timespec fields above. */
#define st_atime st_atim.tv_sec /* Backward compatibility */
#define st_mtime st_mtim.tv_sec /* Backward compatibility */
#define st_ctime st_ctim.tv_sec /* Backward compatibility */
};
但是有文件大小(st_size),这样我们可以根据这个信息,再读取文件中的m行,算出其大小m_size,这样平均一行大小为m_size/m,估算的行数就是st_size*m/m_size。 这样速度就会快很多。 我写了下面的程序:
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <string.h>
/*
 * Size in bytes of the read buffer used by both counters; main() also uses it
 * as the threshold below which a file is counted exactly instead of estimated.
 */
#define SMALL_FILE_MAX_SIZE 4096
/* Number of complete lines sampled when estimating the average line length. */
#define DEFAULT_LINE_NUM 100
/*
 * Print a one-line usage summary to standard output.
 *
 * argv: the program's argument vector; only argv[0] (the program name) is read.
 */
void show_usage(char **argv) {
    const char *program_name = argv[0];

    printf("Usage: %s FILE\n", program_name);
}
/*
 * Count the newline characters in a file by reading it to the end in
 * SMALL_FILE_MAX_SIZE-byte chunks.
 *
 * fd:    descriptor to read from, starting at its current offset.
 * lines: out-parameter; reset to 0, then incremented once per '\n' byte seen.
 *
 * Returns the result of the last read() call: 0 once end-of-file has been
 * reached, or -1 if read() failed (*lines then holds the count so far).
 */
int small_file_line_counter(int fd, size_t* lines) {
    char chunk[SMALL_FILE_MAX_SIZE] = {0};
    const char *cursor;
    const char *chunk_end;
    int nread;

    *lines = 0;
    for (;;) {
        nread = read(fd, (void *)chunk, SMALL_FILE_MAX_SIZE);
        if (nread <= 0) {
            break;
        }
        chunk_end = chunk + nread;
        for (cursor = chunk; cursor != chunk_end; ++cursor) {
            if (*cursor == '\n') {
                *lines += 1;
            }
        }
    }
    return nread;
}
/*
 * Estimate the number of lines in a file from a sample taken at its start.
 *
 * Reads from fd until DEFAULT_LINE_NUM complete lines have been seen, computes
 * the average length in bytes of those lines (newline included) and
 * extrapolates: estimate = total_size / average_line_length.
 *
 * fd:         descriptor to read from, starting at its current offset.
 * total_size: size of the whole file in bytes.
 * lines:      out-parameter receiving the line count.
 *
 * If end-of-file is reached before the sample is full, everything up to EOF
 * has already been scanned, so the exact number of newlines seen is stored
 * instead of an estimate.  This also covers input that contains no newline at
 * all, where computing an average would divide by zero.
 *
 * Returns 0 on success, or -1 if read() fails (*lines is left untouched).
 */
int large_file_line_counter(int fd, size_t total_size, size_t* lines) {
    char buffer[SMALL_FILE_MAX_SIZE];
    size_t sampled_lines = 0;   /* complete lines seen so far */
    size_t sampled_bytes = 0;   /* bytes belonging to those complete lines */
    size_t partial_bytes = 0;   /* bytes of the line currently being read */
    double average_line_size;
    ssize_t nread;
    ssize_t i;

    while (sampled_lines < DEFAULT_LINE_NUM) {
        nread = read(fd, buffer, sizeof buffer);
        if (nread < 0) {
            return -1;
        }
        if (nread == 0) {
            /* Hit EOF before the sample filled up: the count is exact. */
            *lines = sampled_lines;
            return 0;
        }
        for (i = 0; i < nread && sampled_lines < DEFAULT_LINE_NUM; ++i) {
            ++partial_bytes;
            if (buffer[i] == '\n') {
                sampled_bytes += partial_bytes;
                partial_bytes = 0;
                ++sampled_lines;
            }
        }
    }

    /*
     * Every sampled line is at least one byte long (its newline), so the
     * average is >= 1 and the quotient never exceeds total_size; converting
     * it to size_t therefore cannot overflow, unlike the previous int cast.
     */
    average_line_size = (double)sampled_bytes / (double)sampled_lines;
    *lines = (size_t)((double)total_size / average_line_size);
    return 0;
}
/*
 * Print a line count for the file named by the single command-line argument.
 *
 * Files smaller than SMALL_FILE_MAX_SIZE bytes are counted with
 * small_file_line_counter(); all others go through large_file_line_counter().
 * The result is written to stdout as "<path>\t<count>\n".
 *
 * Exits with status 1 on a usage error, or when the file cannot be stat'ed,
 * opened or read; returns 0 otherwise.
 */
int main (int argc, char **argv)
{
    int fd = -1;
    int rc = 0;
    struct stat st;
    size_t line_count = 0;

    if (argc != 2) {
        show_usage(argv);
        exit(1);
    }
    if (stat(argv[1], &st) != 0) {
        fprintf(stderr, "Cannot get state of file: %s\n", argv[1]);
        exit(1);
    }
    fd = open(argv[1], O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "Cannot open file: %s\n", argv[1]);
        exit(1);
    }

    if (st.st_size < SMALL_FILE_MAX_SIZE) {
        rc = small_file_line_counter(fd, &line_count);
    } else {
        rc = large_file_line_counter(fd, (size_t)st.st_size, &line_count);
    }
    /* Release the descriptor on the failure path as well as on success. */
    close(fd);

    if (rc == -1) {
        fprintf(stderr, "Get line count of file: %s failed!\n", argv[1]);
        exit(1);
    }

    /*
     * %zu prints the full size_t value; "%d" with an (int) cast would wrap
     * for counts above INT_MAX.
     */
    fprintf(stdout, "%s\t%zu\n", argv[1], line_count);
    return 0;
}
对于像nginx日志这种,准确率有90%+
[work@hkg02-pcf-csuours00:~/chenkang/codes/c]$./mylc /home/soft/resty/nginx/logs/access.js_mobojoy.conf_20170119.log
/home/soft/resty/nginx/logs/access.js_mobojoy.conf_20170119.log 35525363
[work@hkg02-pcf-csuours00:~/chenkang/codes/c]$wc -l /home/soft/resty/nginx/logs/access.js_mobojoy.conf_20170119.log
37404978 /home/soft/resty/nginx/logs/access.js_mobojoy.conf_20170119.log
目前的不足: 1. 估算每行的平均长度只是在前面读取100行,没有随机读取. 2. 参数不可配置。