我快速写了一个 C 程序,用于从一组 gzip 文件(包含约 50 万行)中提取第 i 行。这是我的 C 程序:
#include <stdio.h> #include <string.h> #include <stdlib.h> #include <errno.h> #include <zlib.h> /* compilation: gcc -o linesbyindex -Wall -O3 linesbyindex.c -lz */ #define MY_BUFFER_SIZE 10000000 static void extract(long int index,const char* filename) { char buffer[MY_BUFFER_SIZE]; long int curr=1; gzFile in=gzopen (filename,"rb"); if(in==NULL) { fprintf(stderr,"Cannot open \"%s\" %s.\n",filename,strerror(errno)); exit(EXIT_FAILURE); } while(gzread(in,buffer,MY_BUFFER_SIZE)!=-1 && curr<=index) { char* p=buffer; while(*p!=0) { if(curr==index) { fputc(*p,stdout); } if(*p=='\n') { ++curr; if(curr>index) break; } p++; } } gzclose(in); if(curr<index) { fprintf(stderr,"Not enough lines in %s (%ld)\n",curr); } } int main(int argc,char** argv) { int optind=2; char* p2; long int count=0; if(argc<3) { fprintf(stderr,"Usage: %s (count) files...\n",argv[0]); return EXIT_FAILURE; } count=strtol(argv[1],&p2,10); if(count<1 || *p2!=0) { fprintf(stderr,"bad number %s\n",argv[1]); return EXIT_SUCCESS; } while(optind< argc) { extract(count,argv[optind]); ++optind; } return EXIT_SUCCESS; }
作为对照测试,我用 Java 编写了以下等效代码:
import java.io.*; import java.util.zip.GZIPInputStream; public class GetLineByIndex{ private int index; public GetLineByIndex(int count){ this.index=count; } private String extract(File file) throws IOException { long curr=1; byte buffer[]=new byte[2048]; StringBuilder line=null; InputStream in=null; if(file.getName().toLowerCase().endsWith(".gz")){ in= (new GZIPInputStream(new FileInputStream(file))); }else{ in= (new FileInputStream(file)); } int nRead=0; while((nRead=in.read(buffer))!=-1) { int i=0; while(i<nRead) { if(buffer[i]=='\n') { ++curr; if(curr>this.index) break; } else if(curr==this.index) { if(line==null) line=new StringBuilder(500); line.append((char)buffer[i]); } i++; } if(curr>this.index) break; } in.close(); return (line==null?null:line.toString()); } public static void main(String args[]) throws Exception{ int optind=1; if(args.length<2){ System.err.println("Usage: program (count) files...\n"); return; } GetLineByIndex app=new GetLineByIndex(Integer.parseInt(args[0])); while(optind < args.length) { String line=app.extract(new File(args[optind])); if(line==null) { System.err.println("Not enough lines in "+args[optind]); } else { System.out.println(line); } ++optind; } return; } }
结果在同一台机器上,Java 程序(约 1'45")比 C 程序(约 2'15")快得多(我重复测试了好几次)。
怎么解释这个差异?
解决方法
Java 版本比 C 版本快,最可能的解释是 C 版本本身不正确。
修复 C 版本后,我得到了以下结果(与您所说的“Java 比 C 更快”相反):
Java 1.7 -client: 65 milliseconds (after JVM warmed up) Java 1.7 -server: 82 milliseconds (after JVM warmed up) gcc -O3: 37 milliseconds
任务是打印文件 word.gz 的第 200000 行。文件 word.gz 由对 /usr/share/dict/words 进行 gzip 压缩生成。
... static char buffer[MY_BUFFER_SIZE]; ... ssize_t len; while((len=gzread(in,MY_BUFFER_SIZE)) > 0 && curr<=index) { char* p=buffer; char* endp=buffer+len; while(p < endp) { ...