我正在开发一个程序,希望它能比较给定目录中的所有文件,识别出重复文件并把它们加入一个列表,然后把列表展示给用户,让用户确认是否要删除这些文件,确认之后再删除.但我现在严重卡住了.到目前为止,我已经能够递归列出所有文件,并且一直在尝试通过比较来找出重复文件.我很快意识到,要实现这个目标,需要比较文件的多个属性.并非所有文件都是文本文件,而我在网上找到的示例代码大多是比较文本的;我正在努力多了解二进制数据,因为目前我能想到的最好办法就是比较字节数组和文件名.具体来说,我想问:比较哪些属性最合适,才能既保证查找重复文件的准确性,又能处理规模合理的目录?如果不介意的话,能否告诉我该如何在我的代码中实现?希望我的问题不算太糟,非常感谢任何帮助.下面是我目前的代码;没错,其中有几个方法以及第二个文件是我在这个网站上找到的,以防你想知道.
附:如果代码中还残留着无关的变量,我很抱歉 – 发布之前我只是稍微清理了一下代码.
ListFilesInDir.java
import java.io.*;
import java.nio.file.Files;
import java.nio.file.attribute.*;
import java.security.*;
import java.util.*;

/**
 * Recursively lists a directory, flags every regular file that duplicates a
 * file seen earlier in the scan, and writes the flagged files to
 * {@code ListDuplicateFiles.txt} so the user can review them.
 *
 * <p>A file is flagged when its contents are byte-for-byte equal to a kept
 * file (per {@code CompareBinaries.fileContentsEquals}) or when its name
 * matches a kept file's name ignoring case. A name-only match does not prove
 * that the contents are identical, so the report must be reviewed before
 * anything is deleted.
 *
 * <p>All state is static, so only one scan can run at a time.
 */
public final class ListFilesInDir {

    /** Root directory of the current scan. */
    static File startingDir;
    /** Every entry (files and directories) found below the root, sorted. */
    static List<File> files;
    /** Entries of {@link #files} that are not regular files. */
    static List<File> dirs = new ArrayList<File>();
    /** Flagged duplicates keyed by their index in {@link #files}. */
    static TreeMap<Integer, File> duplicates = new TreeMap<Integer, File>();
    /** Flagged duplicates in the order they were found. */
    static ArrayList<File> duplicateList = new ArrayList<File>();
    /** Report file, created in the current working directory. */
    static File out = new File("ListDuplicateFiles.txt");
    /** Writer for {@link #out}; opened by the constructor, closed by {@link #writeDuplicateList()}. */
    static PrintWriter output;
    /** Running index of the entry being registered by {@link #getFiles(File)}. */
    static int key = 0;
    /** Regular files keyed by their index in {@link #files}. */
    static TreeMap<Integer, File> tMap = new TreeMap<Integer, File>();

    // No longer used by this class. Retained only so that any package-level
    // code that referenced these fields keeps compiling.
    static ArrayList<Integer> usedIndexes = new ArrayList<Integer>();
    static String tabString;
    static int num1 = 0;
    static int num2 = 0;
    static File value1 = null;
    static File value2 = null;
    static String path1 = null;
    static String name1 = null;
    static String path2 = null;
    static String name2 = null;

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the directory to scan
     * @throws FileNotFoundException if the directory does not exist or the
     *         report file cannot be created
     */
    public static void main(String[] args) throws FileNotFoundException {
        if (args.length < 1) {
            System.err.println("Usage: java ListFilesInDir <directory>");
            System.exit(1);
        }
        new ListFilesInDir(args[0]);
    }

    /**
     * Scans {@code string}, detects duplicates and writes the report.
     *
     * @param string path of the directory to scan
     * @throws FileNotFoundException if the directory does not exist or the
     *         report file cannot be created
     * @throws IllegalArgumentException if the path is not a readable directory
     */
    public ListFilesInDir(String string) throws FileNotFoundException {
        startingDir = new File(string);
        // Reset everything a previous run may have left behind.
        dirs = new ArrayList<File>();
        duplicates = new TreeMap<Integer, File>();
        duplicateList.clear();
        tMap.clear();
        key = 0;

        getFiles(startingDir);
        compareFiles();
        // Open the report only after the scan succeeded: a failed scan then
        // leaves no open writer behind, and a freshly created report cannot
        // show up in its own scan.
        output = new PrintWriter(out);
        writeDuplicateList();
    }

    /**
     * Lists everything below {@code root} into {@link #files} and registers
     * each entry in {@link #dirs} or {@link #tMap}.
     *
     * @param root directory to scan
     * @throws FileNotFoundException if {@code root} does not exist
     */
    public void getFiles(File root) throws FileNotFoundException {
        System.out.println("Adding files to list...");
        ListFilesInDir.files = getFileList(root);
        for (File file : files) {
            if (!file.isFile()) {
                System.out.println("Adding DIR: " + key + " name: " + file);
                dirs.add(file);
            } else {
                System.out.println("Adding FILE: " + key + " name: " + file);
                tMap.put(key, file);
            }
            key++;
        }
        System.out.println(dirs.size());
        System.out.println("Complete");
    }

    /**
     * Flags every regular file in {@link #files} that matches a file kept
     * earlier in the (sorted) list. The first file of each group is kept and
     * the later ones are recorded through {@link #addDuplicate(int, File)}.
     *
     * <p>{@link #files} is not modified, so the indexes used as keys in
     * {@link #duplicates} stay valid. Contents are compared only between files
     * of equal length.
     *
     * @throws FileNotFoundException never thrown here; kept for source
     *         compatibility with existing callers
     * @throws IllegalStateException if no file list has been built yet
     */
    public static void compareFiles() throws FileNotFoundException {
        if (files == null) {
            throw new IllegalStateException("getFiles must be called before compareFiles");
        }
        System.out.println("Preparing to compare files...");
        Map<Long, List<File>> keptBySize = new HashMap<Long, List<File>>();
        // CASE_INSENSITIVE_ORDER gives the same matching as equalsIgnoreCase.
        Set<String> keptNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);

        for (int index = 0; index < files.size(); index++) {
            File candidate = files.get(index);
            if (!candidate.isFile()) {
                continue; // directories have no contents to compare
            }
            String reason = matchKind(candidate, keptBySize, keptNames);
            if (reason != null) {
                addDuplicate(index, candidate);
                System.out.println("added(" + reason + "): " + index + ":" + candidate);
            } else {
                Long size = Long.valueOf(candidate.length());
                List<File> bucket = keptBySize.get(size);
                if (bucket == null) {
                    bucket = new ArrayList<File>();
                    keptBySize.put(size, bucket);
                }
                bucket.add(candidate);
                keptNames.add(candidate.getName());
            }
        }
        System.out.println("Complete");
    }

    /**
     * Decides whether {@code candidate} matches a kept file.
     *
     * @return {@code "binary"} for a content match, {@code "name"} for a
     *         case-insensitive name match, or {@code null} for no match
     */
    private static String matchKind(File candidate, Map<Long, List<File>> keptBySize,
            Set<String> keptNames) {
        List<File> sameSize = keptBySize.get(Long.valueOf(candidate.length()));
        if (sameSize != null) {
            for (File kept : sameSize) {
                if (CompareBinaries.fileContentsEquals(
                        kept.getAbsolutePath(), candidate.getAbsolutePath())) {
                    return "binary";
                }
            }
        }
        if (keptNames.contains(candidate.getName())) {
            return "name";
        }
        return null;
    }

    /**
     * Writes the numbered duplicate list and a summary line to {@link #output}
     * and to standard output, then closes {@link #output}.
     *
     * @throws IllegalStateException if {@link #output} has not been opened
     */
    public static void writeDuplicateList() {
        if (output == null) {
            throw new IllegalStateException("output writer has not been opened");
        }
        try {
            int printKey = 0;
            for (File file : duplicateList) {
                output.printf("%03d | %s\n", printKey, file);
                System.out.printf("%03d | %s\n", printKey, file);
                printKey++;
            }
            output.append(docsInfo());
            // PrintWriter never throws on write errors; checkError() flushes
            // and reports whether any write failed.
            if (output.checkError()) {
                System.err.println("Could not write duplicate list to " + out.getAbsolutePath());
            }
        } finally {
            output.close();
        }
        System.out.println("\n" + summary());
    }

    /**
     * Returns the summary line that is appended to the report.
     *
     * @return two newlines followed by the file and duplicate counts
     */
    static public String docsInfo() {
        return "\n\n" + summary();
    }

    /** Builds the "N files in DIR,M duplicate files." line shared by report and console. */
    private static String summary() {
        int regularFiles = 0;
        for (File entry : files) {
            if (entry.isFile()) {
                regularFiles++;
            }
        }
        return regularFiles + " files in " + startingDir.getAbsolutePath()
                + "," + duplicates.size() + " duplicate files.";
    }

    /**
     * Recursively lists everything below {@code file}, sorted by path.
     *
     * @param file directory to list
     * @return all files and directories below {@code file}
     * @throws FileNotFoundException if {@code file} does not exist
     * @throws IllegalArgumentException if {@code file} is null, not a
     *         directory, or not readable
     */
    static public List<File> getFileList(File file) throws FileNotFoundException {
        validateDir(file);
        List<File> result = getUnsortedFileList(file);
        Collections.sort(result);
        return result;
    }

    /**
     * Depth-first listing of {@code file}. A directory that cannot be listed
     * is reported on standard error and skipped.
     */
    static private List<File> getUnsortedFileList(File file) throws FileNotFoundException {
        List<File> result = new ArrayList<File>();
        File[] filesAndDirs = file.listFiles();
        if (filesAndDirs == null) {
            // listFiles() returns null on an I/O error or a non-directory.
            System.err.println("Skipping unreadable directory: " + file);
            return result;
        }
        for (File entry : filesAndDirs) {
            result.add(entry);
            // Do not follow symbolic links to directories: a link cycle would
            // recurse forever, and a linked file would be reported as a
            // duplicate of itself.
            if (entry.isDirectory() && !Files.isSymbolicLink(entry.toPath())) {
                result.addAll(getUnsortedFileList(entry));
            }
        }
        return result;
    }

    /**
     * Checks that {@code dir} is an existing, readable directory.
     *
     * @throws FileNotFoundException if {@code dir} does not exist
     * @throws IllegalArgumentException if {@code dir} is null, not a
     *         directory, or not readable
     */
    static private void validateDir(File dir) throws FileNotFoundException {
        if (dir == null) {
            throw new IllegalArgumentException("Directory is null!");
        }
        if (!dir.exists()) {
            throw new FileNotFoundException("Directory doesn't exist: " + dir);
        }
        if (!dir.isDirectory()) {
            throw new IllegalArgumentException(dir + " is not a directory!");
        }
        if (!dir.canRead()) {
            throw new IllegalArgumentException("Directory cannot be read: " + dir);
        }
    }

    /**
     * Records {@code file} as a duplicate unless index {@code i} is already
     * recorded.
     *
     * @param i index of {@code file} in {@link #files}
     * @param file the duplicate file
     * @throws FileNotFoundException never thrown here; kept for source
     *         compatibility with existing callers
     */
    public static void addDuplicate(int i, File file) throws FileNotFoundException {
        if (!duplicates.containsKey(i)) {
            duplicates.put(i, file);
            duplicateList.add(file);
        }
    }
}
CompareBinaries.java
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

/**
 * Byte-for-byte comparison of streams and files.
 */
public class CompareBinaries {

    /** Number of bytes compared per chunk. */
    private final static int BUFFSIZE = 1024;

    /**
     * Compares the remaining contents of two streams.
     *
     * <p>The streams are read to the first difference or to end of stream and
     * are not closed. The buffers are local to the call, so concurrent calls
     * do not interfere with each other.
     *
     * @param is1 first stream, may be {@code null}
     * @param is2 second stream, may be {@code null}
     * @return {@code true} if both arguments are the same reference (including
     *         both {@code null}) or both streams yield the same bytes;
     *         {@code false} if they differ, exactly one is {@code null}, or
     *         reading fails
     */
    public static boolean inputStreamEquals(InputStream is1, InputStream is2) {
        if (is1 == is2) {
            return true; // also covers the case where both are null
        }
        if (is1 == null || is2 == null) {
            return false;
        }
        byte[] buff1 = new byte[BUFFSIZE];
        byte[] buff2 = new byte[BUFFSIZE];
        try {
            while (true) {
                int filled1 = fill(is1, buff1);
                int filled2 = fill(is2, buff2);
                if (filled1 != filled2) {
                    return false; // one stream ended before the other
                }
                if (filled1 < BUFFSIZE) {
                    // Clear the unread tails so that bytes left over from the
                    // previous chunk cannot influence the comparison.
                    Arrays.fill(buff1, filled1, BUFFSIZE, (byte) 0);
                    Arrays.fill(buff2, filled2, BUFFSIZE, (byte) 0);
                }
                if (!Arrays.equals(buff1, buff2)) {
                    return false;
                }
                if (filled1 < BUFFSIZE) {
                    return true; // both streams reached EOF with equal data
                }
            }
        } catch (IOException e) {
            // Equality cannot be confirmed, so report "not equal".
            System.err.println("Stream comparison failed: " + e);
            return false;
        }
    }

    /**
     * Reads from {@code in} until {@code buf} is full or the stream ends.
     *
     * @return the number of bytes stored in {@code buf}
     */
    private static int fill(InputStream in, byte[] buf) throws IOException {
        int filled = 0;
        while (filled < buf.length) {
            int count = in.read(buf, filled, buf.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
        return filled;
    }

    /**
     * Compares the contents of two files. Files of different length are
     * reported as different without being opened.
     *
     * @param file1 first file
     * @param file2 second file
     * @return {@code true} if both files have identical contents;
     *         {@code false} if they differ or cannot be opened or read
     */
    public static boolean fileContentsEquals(File file1, File file2) {
        if (file1.length() != file2.length()) {
            return false;
        }
        // try-with-resources closes both streams even if one close() throws.
        try (InputStream is1 = new FileInputStream(file1);
                InputStream is2 = new FileInputStream(file2)) {
            return inputStreamEquals(is1, is2);
        } catch (IOException | SecurityException e) {
            System.err.println("Cannot compare " + file1 + " and " + file2 + ": " + e);
            return false;
        }
    }

    /**
     * Compares the contents of the files named {@code fn1} and {@code fn2}.
     *
     * @param fn1 path of the first file
     * @param fn2 path of the second file
     * @return {@code true} if both files have identical contents
     */
    public static boolean fileContentsEquals(String fn1, String fn2) {
        return fileContentsEquals(new File(fn1), new File(fn2));
    }
}
解决方法
您可以使用哈希函数来比较两个文件 – 位于不同文件夹中的两个文件可以具有相同的名称和属性(例如长度),但内容却不同.例如,您可以创建一个文本文件,把它复制到另一个文件夹,然后只改动副本内容中的一个字母.
哈希函数对文件内容进行一些巧妙的数学运算,得到一个数值;内容上哪怕只有微小的差异,最终也会得到两个截然不同的数值.
以md5散列函数为例,它能把任意长度的字节数组映射为一个16字节的数值.虽然理论上可以构造出md5相同但内容不同的两个文件,但这种概率非常低(而两个文件名称和大小相同但内容不同,则是概率相对较高的事件).
关键是,您可以建立一张文件内容md5值的表:每个文件的md5只需计算一次,之后就可以快速比较 – 如果md5不同,则可以100%确定文件不同.只有当md5相同时(内容不同的文件极不可能出现这种情况),才需要再做逐字节比较,以便100%确定两个文件相同.