Unix/Linux编程实践教程–tail在OS X的实现

环境：OS X 10.12.4

tail和head的作用刚好相反，读出文件的最后n行。这个看起来好像比较简单，但是还是有很多需要考量的。大致考虑了下，我得出了三个方案。

Plan A

从头开始读，记录下每一个换行符的位置（定义一个变量记录了目前移动了几次），遇到EOF时，比如目前的换行符的个数为x，那么从第x - ( n - 1)个换行符的下一个字符开始打印至EOF即可。

优点

可以使用缓存，减少系统调用。
实现简单。

缺点

需要读取整个文件的所有内容。
记录每个换符的位置其实是比较费内存的，以前有一次接触过一个2个多G的文本文件，包含3000多万行。如果用一个int型数组来存这些个位置，大约需要114M内存，但int往往是不够的，假设3000万行，每个80个字符，总的字符数大约是24亿，超过signed int了。具体就不再缀述了。

Plan B

从尾开始读，每次向前跳1个字符，找到足够数据的换行符（不用存储换行符的位置），然后从那开始打印至EOF即可。

优点

不用存储换行符的位置，空间代价小。
实现简单。

缺点

无法使用缓存字符，需要频繁的系统调用。

Plan C

从尾开始读，每次向前跳1024个字符，然后向下查找至上个End，过程中记录换行符的位置。

优点

可以使用缓存，减少系统调用。

缺点

需要记录换行符的位置。（此时最多记录1024个换行符的位置即可）
实现相对复杂。（相对于前两个方案）

后来我仔细考虑了下，觉得可以将Plan B和Plan C结合一下，使用Plan B的方案实现tail，然后使用Plan C向前跳1024个字符的思想封装一个缓存模块，Plan B调用这个缓存模块获取上一个字符。这样的话，就可以结合它们的优点，同时消除缺点。

好，开始写缓存模块，可能大致是这样的。（和书中的utmplib.c类似）

xc_file_r.c

#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

char buf[BUFSIZ];
int cur = -1;
int num_sum = 0;
int fd = -1;

/* read only */
int xc_ropen(char *filename)
{
    /* open and then move to end */
    if((fd = open(filename,O_RDONLY)) == -1 || lseek(fd,0,SEEK_END) == -1){
        perror(filename);
        exit(1);
    }
    return fd;
}
int xc_rgetoffset(void)
{
    if(fd == -1)
        return -1;
    return lseek(fd,SEEK_CUR);
}
char xc_rreload(void)
{
    int bytes_read;
    static int pre_offset = -1;
    int offset;

    /* init */
    if(pre_offset == -1)
        pre_offset = xc_rgetoffset();
    /* the targer position is negative */
    if((offset = lseek(fd,-BUFSIZ,SEEK_CUR)) == -1)
        if(errno == EINVAL){ /* brace is needed to avoid `dangling else` */
            if((offset = lseek(fd,SEEK_SET)) == -1)
                perror("move"); 
        }else
            perror("move"); 
    num_sum = read(fd,buf,pre_offset - offset);
    cur = num_sum - 1;
    if(num_sum == 0)
        return EOF;
    if(num_sum == -1){
        perror("read");
        exit(1);
    }
    /* read option will move pointer to the next,so move back */
    lseek(fd,-num_sum,SEEK_CUR);
    pre_offset = offset;
    return buf[cur--];
}
char xc_rgetchar(void)
{
    if(fd == -1)
        return EOF;
    if(cur < 0)
        return xc_rreload();
    return buf[cur--];
}
void xc_rclose(void)
{
    if(fd != -1){
        if(close(fd) == -1){
            perror("close file");
            exit(1);
        }
        fd = -1;
    }
}

这时值得提的是，如果使用了lseek(fd,SEEK_END)之后，指针并不是指向最后一个字符，而是最后一个字符的下一个字符。还有就是lseek()的返回值是移动之后的位置相对于文件开始的位置的偏移（lseek() returns the resulting offset location as measured in bytes from the beginning of the file.），失败时返回-1。

还有一个问题就是如果现在的位置距离文件头还有不足BUFSIZ个字符，lseek()还向前移动BUFSIZ个字符时，lseek()会返回-1，并设置对应的errno为EINVAL。

以上说明均参照man 2 lseek得出。

然后主体程序的实现就是比较简单的了，从后向前读，计算换行符个数，数够了之后从那个点开始打印即可。

tail.c

#include <stdio.h>
#include <stdlib.h>

void xc_ropen(char *filename);
char xc_rgetchar(void);
void xc_rclose(void);
int xc_rgetoffset(void);

void xc_open(char *aFName);
int xc_getchar(void);
void xc_close(void);
void xc_moveto(int offset);


int main(int ac,char *av[])
{
    char ch;
    int lines = 10;
    int cnt = 0;
    int offset;
    int char_count = 0;

    while(--ac && (*++av)[0] == '-'){
        switch(*++av[0])
        {
            case 'n':
                lines = atoi(*++av);
                if(lines <= 0){
                    fprintf(stderr,"Usage: tail [-n number] file. \n");
                    exit(1);
                }
                break;
            case 'c':
                /* do not implement */
                break;
        }
    }
    xc_ropen(*av);
    while((ch = xc_rgetchar()) != EOF){
        char_count++;
        if(ch == '\n')
            cnt++;
        if(cnt == lines){
            /* print from here on */
            xc_open(*av);
            char_count--;
            xc_moveto(-char_count);
            while((ch = xc_getchar()) != EOF)
                putchar(ch);
            xc_close();
            break;
        }
    }
    xc_rclose();
    return 0;
}

因为最后需要顺序打印出需要打印的部分，所有使用了以前写的xc_file.c。

xc_file.c

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>

/* BUFSIZ define in stdio.h,commonly is 1024 */
static unsigned char chBuf[BUFSIZ];
static int fd = -1;
static char fName[BUFSIZ];
static int chCur;
static int chSum;

void xc_readFromStdin(void)
{
    /* define in unistd.h */
    fd = STDIN_FILENO;
}

void xc_open(char *aFName)
{
    if((fd = open(aFName,O_RDONLY)) == -1){
        perror(aFName);
        exit(1);
    }
    strcpy(fName,aFName); /* record which file is opened */
    chCur = chSum = 0;
}

int xc_reload(void)
{
    int bytes_read;
    if((bytes_read = read(fd,chBuf,BUFSIZ)) > 0){
        chCur = 0;
        chSum = bytes_read;
        return chBuf[chCur++];
    }else if(bytes_read == -1){
        perror(fName);
        exit(1);
    }else if (bytes_read == 0)
        return EOF;
}

int xc_getchar(void)
{
    if(fd == -1)
        return EOF;
    if(chSum == chCur)
        return xc_reload();
    return chBuf[chCur++];
}

void xc_close(void)
{
    if(fd != -1)
    {
        if(close(fd) == -1){
            perror(fName);
            exit(1);
        }
        fd = -1;
    }
}
void xc_moveto(int offset)
{
    if(fd == -1)
        return ;
    if(lseek(fd,offset,SEEK_END) == -1){
        perror("target position are illegal");
        exit(1);
    }
}

效果

Unix/Linux编程实践教程–tail在OS X的实现

Plan A

优点

缺点

Plan B

优点

缺点

Plan C

优点

缺点

猜你在找的Bash相关文章