近期有一个需求:获取多个文件 md5 校验和判断是否存在重复文件,因为文件数量较多,有的文件还比较大,需要处理的文件还没有到位,我就考虑了一下效率的问题。目前我已知的 Golang 中获取 md5 校验和的方法有两个,这里直接给出实现源码。
package main
import (
	"crypto/md5"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"os"
)
// Command-line flags controlling the benchmark run.
var (
	// which selects the implementation under test: true runs aaa
	// (read the whole file, then md5.Sum), false runs bbb (io.Copy into
	// an md5 hash).
	which = flag.Bool("which", true, "true: ReadAll + md5.Sum; false: io.Copy into md5.New()")
	// path is the file whose MD5 checksum is computed.
	path = flag.String("path", "", "path of the file to checksum")
	// cnt is the number of times the selected implementation is run.
	cnt = flag.Int("cnt", 100, "number of times to compute the checksum")
)
func@H_404_5@ aaa() {
f,err := os.Open(*path)
if@H_404_5@ err != nil@H_404_5@ {
fmt.Println("Open"@H_404_5@,err)
return@H_404_5@
}
defer@H_404_5@ f.Close()
body,err := IoUtil.ReadAll(f)
if@H_404_5@ err != nil@H_404_5@ {
fmt.Println("ReadAll"@H_404_5@,err)
return@H_404_5@
}
md5.Sum(body)
//fmt.Printf("%x\n",md5.Sum(body))@H_404_5@
}
func@H_404_5@ bbb() {
f,err)
return@H_404_5@
}
defer@H_404_5@ f.Close()
md5hash := md5.New()
if@H_404_5@ _,err := io.Copy(md5hash,f); err != nil@H_404_5@ {
fmt.Println("Copy"@H_404_5@,err)
return@H_404_5@
}
md5hash.Sum(nil@H_404_5@)
//fmt.Printf("%x\n",md5hash.Sum(nil))@H_404_5@
}
func@H_404_5@ main() {
flag.Parse()
for@H_404_5@ i := 0@H_404_5@; i < *cnt; i++ {
if@H_404_5@ *which {
aaa()
} else@H_404_5@ {
bbb()
}
}
}
还有可供参考的获取 md5 校验和的 Shell 命令
md5 -- calculate a message-digest fingerprint (checksum) for a file
md5 [-pqrtx] [-s string] [file ...]
banjakukutekiiMac:shell panshiqu$ ls -an | grep by
-rw-r--r-- 1 501 20 7285957 11 17 16:14 by.out
banjakukutekiiMac:shell panshiqu$ cp by.out by2.out
banjakukutekiiMac:shell panshiqu$ cat by.out >> by2.out
banjakukutekiiMac:shell panshiqu$ ls -an | grep by
-rw-r--r-- 1 501 20 7285957 11 17 16:14 by.out
-rw-r--r-- 1 501 20 14571914 11 17 17:03 by2.out
下面效率展示
banjakukutekiiMac:shell panshiqu$ time ./gomd5 -cnt=1 -which=true -path="by.out"
real 0m0.027s
user 0m0.017s
sys 0m0.012s
banjakukutekiiMac:shell panshiqu$ time ./gomd5 -cnt=1 -which=true -path="by2.out"
real 0m0.048s
user 0m0.033s
sys 0m0.018s
banjakukutekiiMac:shell panshiqu$ time ./gomd5 -cnt=1 -which=false -path="by.out"
real 0m0.018s
user 0m0.012s
sys 0m0.004s
banjakukutekiiMac:shell panshiqu$ time ./gomd5 -cnt=1 -which=false -path="by2.out"
real 0m0.031s
user 0m0.024s
sys 0m0.005s
banjakukutekiiMac:shell panshiqu$ time md5 by.out
MD5 (by.out) = 9d79e19a00cef1ae1bb6518ca4adf9de
real 0m0.023s
user 0m0.019s
sys 0m0.006s
banjakukutekiiMac:shell panshiqu$ time md5 by2.out
MD5 (by2.out) = 0a029a460a20e8dcb00d032d6fab74c6
real 0m0.042s
user 0m0.037s
sys 0m0.009s
总结:
* 不管使用哪种方法,耗时都会随着文件变大而变长;上面的例子中文件大小翻倍,耗时也大约变为原来的 2 倍
* io.Copy 方法效率最高,建议大家这样使用