Find duplicate files and remove them

I am writing a Python program to find and remove duplicate files from a folder.

I have multiple copies of mp3 files, and some other files as well. I am using the SHA-1 algorithm.

How can I find these duplicate files and remove them?


I wrote one in Python a while ago; you're welcome to use it.

import sys
import os
import hashlib


# Note: this is Python 2 code (it relies on the file() builtin and on
# tuple-unpacking lambda parameters, both of which were removed in Python 3).
check_path = (lambda filepath, hashes, p=sys.stdout.write:
    (lambda hash=hashlib.sha1(file(filepath).read()).hexdigest():
        ((hash in hashes) and (p('DUPLICATE FILE\n'
                                 '   %s\n'
                                 'of %s\n' % (filepath, hashes[hash])))
         or hashes.setdefault(hash, filepath)))())


scan = (lambda dirpath, hashes={}:
    map(lambda (root, dirs, files):
            map(lambda filename: check_path(os.path.join(root, filename), hashes),
                files),
        os.walk(dirpath)))


((len(sys.argv) > 1) and scan(sys.argv[1]))

import os
import hashlib


def remove_duplicates(dir):
    unique = []
    for filename in os.listdir(dir):
        # os.listdir() returns bare names, so join them with the directory
        path = os.path.join(dir, filename)
        if os.path.isfile(path):
            filehash = hashlib.md5(open(path, 'rb').read()).hexdigest()
            if filehash not in unique:
                unique.append(filehash)
            else:
                os.remove(path)

EDIT:

For MP3s, you may also be interested in this topic: Detect duplicate MP3 files with different bitrates and/or different ID3 tags?
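That question covers the harder case (the same audio re-encoded at different bitrates). For the simpler case where the audio bytes are identical and only the ID3 tags differ, a sketch like the one below can compare files by hashing everything except the tag blocks. The function name and the tag-handling details are illustrative assumptions, not taken from the linked answer, and the optional ID3v2 footer is ignored:

import hashlib

def tagless_mp3_hash(path):
    """Hash an MP3's byte stream with the ID3v1/ID3v2 tags stripped, so two
    copies that differ only in their tags hash the same. Files re-encoded
    at different bitrates still need an audio-level comparison."""
    with open(path, 'rb') as f:
        data = f.read()
    # ID3v2: 10-byte header at the start; bytes 6-9 hold the tag size as a
    # "synchsafe" integer (7 bits per byte), not counting the header itself.
    if data[:3] == b'ID3' and len(data) >= 10:
        tag_size = (data[6] << 21) | (data[7] << 14) | (data[8] << 7) | data[9]
        data = data[10 + tag_size:]
    # ID3v1: fixed 128-byte block at the very end, starting with 'TAG'.
    if len(data) >= 128 and data[-128:-125] == b'TAG':
        data = data[:-128]
    return hashlib.sha1(data).hexdigest()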

Recursive folders version:

This version uses the file size and a hash of the contents to find duplicates. You can pass it multiple paths; it will scan all paths recursively and report all duplicates found.

import sys
import os
import hashlib


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                for chunk in chunk_reader(open(full_path, 'rb')):
                    hashobj.update(chunk)
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print "Duplicate found: %s and %s" % (full_path, duplicate)
                else:
                    hashes[file_id] = full_path


if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print "Please pass the paths to check as parameters to the script"

Faster algorithm

If many "large" files need to be analysed (images, mp3s, PDF documents), it can be more interesting/faster to use the following comparison algorithm:

  1. A first, fast hash is computed on only the first N bytes of the file (the first block, 1 MB in the code below). This hash can tell with certainty that two files are different, but not that they are identical (limited accuracy, since only a limited amount of data is read from disk).

  2. If there is a collision in the first stage, a second, slower hash is computed; it is more accurate and is performed on the entire contents of the file.

Here is an implementation of this algorithm:

import os
import stat
import types
import hashlib


def Checksum(current_file_name, check_type='sha512', first_block=False):
    """Computes the hash for the given file. If first_block is True,
    only the first block of size size_block is hashed."""
    size_block = 1024 * 1024  # the size of the first block (here 1 MB)

    d = {'sha1': hashlib.sha1, 'md5': hashlib.md5, 'sha512': hashlib.sha512}

    if check_type not in d:
        raise Exception("Unknown checksum method")

    file_size = os.stat(current_file_name)[stat.ST_SIZE]
    with open(current_file_name, 'rb') as f:
        key = d[check_type]()
        while True:
            s = f.read(size_block)
            key.update(s)
            file_size -= size_block
            if len(s) < size_block or first_block:
                break
    return key.hexdigest().upper()


def find_duplicates(files):
    """Find duplicates among a set of files.
    The implementation uses two types of hashes:
    - a small and fast one on the first block of the file (1 MB),
    - and, in case of collision, a complete hash of the file. The complete
      hash is not computed twice.
    It flushes the files that seem to have the same content
    (according to the hash method) at the end.
    """

    print 'Analyzing', len(files), 'files'

    # this dictionary will receive the small hashes
    d = {}
    # this dictionary will receive the full hashes. It is filled
    # only in case of collision on the small hash (it then contains at
    # least two elements)
    duplicates = {}

    for f in files:

        # small hash, to be fast
        check = Checksum(f, first_block=True, check_type='sha1')

        if check not in d:
            # d[check] is a list of files that have the same small hash
            d[check] = [(f, None)]
        else:
            l = d[check]
            l.append((f, None))

            for index, (ff, checkfull) in enumerate(l):

                if checkfull is None:
                    # computes the full hash in case of collision
                    checkfull = Checksum(ff, first_block=False)
                    l[index] = (ff, checkfull)

                # for each new full hash computed, check if there is
                # a collision in the duplicates dictionary
                if checkfull not in duplicates:
                    duplicates[checkfull] = [ff]
                else:
                    duplicates[checkfull].append(ff)

    # prints the detected duplicates
    if len(duplicates) != 0:
        print
        print "The following files have the same sha512 hash"

        for h, lf in duplicates.items():
            if len(lf) == 1:
                continue
            print 'Hash value', h
            for f in lf:
                print '\t', f.encode('unicode_escape') if \
                        type(f) is types.UnicodeType else f
    return duplicates

The find_duplicates function takes a list of files. This way, it is also possible to compare two directories (for instance, to synchronize their contents better). Below is an example of a function that creates a list of files with the specified extensions and avoids descending into certain directories:

def getFiles(_path, extensions=['.png'],
             subdirs=False, avoid_directories=None):
    """Returns the list of files in the path '_path',
    with an extension in 'extensions'. 'subdirs' indicates whether
    the search should also be performed in the subdirectories.
    If extensions = [] or None, all files are returned.
    avoid_directories: if set, do not parse subdirectories that
    match any element of avoid_directories."""

    l = []
    extensions = [p.lower() for p in extensions] if extensions is not None \
        else None
    for root, dirs, files in os.walk(_path, topdown=True):

        for name in files:
            if extensions is None or len(extensions) == 0 or \
               os.path.splitext(name)[1].lower() in extensions:
                l.append(os.path.join(root, name))

        if not subdirs:
            while len(dirs) > 0:
                dirs.pop()
        elif avoid_directories is not None:
            for d in avoid_directories:
                if d in dirs:
                    dirs.remove(d)

    return l

This approach is handy for not parsing .svn paths, for instance, which would otherwise surely trigger colliding files in find_duplicates.
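For example, to compare everything under a project tree while skipping .svn (a usage sketch of the two functions above; the path is a placeholder):

files = getFiles('/path/to/project', extensions=[],
                 subdirs=True, avoid_directories=['.svn'])
duplicates = find_duplicates(files)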

Feedback is welcome.

import hashlib
import os
import sys


def read_chunk(fobj, chunk_size=2048):
    """ Files can be huge, so read them in chunks of bytes. """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def remove_duplicates(dir, hashfun=hashlib.sha512):
    # a built-in set is used here (the original used the deprecated sets.Set,
    # which behaves the same way in Python 2)
    unique = set()
    for filename in os.listdir(dir):
        filepath = os.path.join(dir, filename)
        if os.path.isfile(filepath):
            hashobj = hashfun()
            for chunk in read_chunk(open(filepath, 'rb')):
                hashobj.update(chunk)
            # the size of the hashobj is constant
            # print "hashfun: ", hashfun.__sizeof__()
            hashfile = hashobj.hexdigest()
            if hashfile not in unique:
                unique.add(hashfile)
            else:
                os.remove(filepath)


try:
    hashfun = hashlib.sha256
    remove_duplicates(sys.argv[1], hashfun)
except IndexError:
    print """Please pass a path to a directory with
duplicate files as a parameter to the script."""

Fastest algorithm: 100x performance increase compared to the accepted answer (really :))

The approaches in the other solutions are very cool, but they overlook an important property of duplicate files: they have the same file size. Calculating the expensive hash only on files with the same size saves a tremendous amount of CPU; performance comparisons are at the end, and here is the explanation.

Iterating on the solid answer given by @nosklo, and borrowing @Raffi's idea of taking a fast hash of just the beginning of each file and calculating the full hash only on collisions in the fast hash, here are the steps:

  1. Build a hash table of the files, where the file size is the key.
  2. For files with the same size, create a hash table with the hash of their first 1024 bytes; non-colliding elements are unique.
  3. For files with the same hash of the first 1k bytes, calculate the hash of the full contents; files with matching hashes are NOT unique.

The code:

#!/usr/bin/env python3
from collections import defaultdict
import hashlib
import os
import sys




def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    file_object = open(filename, 'rb')

    if first_chunk_only:
        hashobj.update(file_object.read(1024))
    else:
        for chunk in chunk_reader(file_object):
            hashobj.update(chunk)
    hashed = hashobj.digest()

    file_object.close()
    return hashed


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes_by_size = defaultdict(list)  # dict of size_in_bytes: [full_path_to_file1, full_path_to_file2, ]
    hashes_on_1k = defaultdict(list)  # dict of (hash1k, size_in_bytes): [full_path_to_file1, full_path_to_file2, ]
    hashes_full = {}   # dict of full_file_hash: full_path_to_file_string

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            # get all files that have the same size - they are the collision candidates
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                    hashes_by_size[file_size].append(full_path)
                except OSError:
                    # not accessible (permissions, etc.) - pass on
                    continue

    # For all files with the same file size, get their hash on the first 1024 bytes only
    for size_in_bytes, files in hashes_by_size.items():
        if len(files) < 2:
            continue    # this file size is unique, no need to spend CPU cycles on it

        for filename in files:
            try:
                # pass the hash constructor through so the same digest is used everywhere
                small_hash = get_hash(filename, first_chunk_only=True, hash=hash)
                # the key is the hash on the first 1024 bytes plus the size - to
                # avoid collisions on equal hashes in the first part of the file
                # credits to @Futal for the optimization
                hashes_on_1k[(small_hash, size_in_bytes)].append(filename)
            except OSError:
                # the file might have changed or become inaccessible by now
                continue

    # For all files with the same hash on the first 1024 bytes, get their hash on the full file - collisions will be duplicates
    for __, files_list in hashes_on_1k.items():
        if len(files_list) < 2:
            continue    # this hash of the first 1k bytes is unique, no need to spend CPU cycles on it

        for filename in files_list:
            try:
                full_hash = get_hash(filename, first_chunk_only=False, hash=hash)
                duplicate = hashes_full.get(full_hash)
                if duplicate:
                    print("Duplicate found: {} and {}".format(filename, duplicate))
                else:
                    hashes_full[full_hash] = filename
            except OSError:
                # the file might have changed or become inaccessible by now
                continue


if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")

And here's the fun part: the performance comparisons.

Baseline -

  • a directory with 1047 files (32 mp4, 1015 jpg), total size 5445.998 MiB; i.e. my phone's camera auto-upload directory :)
  • a small (but fully functional) processor: 1600 BogoMIPS, 1.2 GHz, 32 KB L1 and 256 KB L2 cache, /proc/cpuinfo:

Processor : Feroceon 88FR131 rev 1 (v5l), BogoMIPS : 1599.07

(i.e. my low-end NAS :)), running Python 2.7.11.

So, the output of @nosklo's very handy solution is:

root@NAS:InstantUpload# time ~/scripts/checkDuplicates.py
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg


real    5m44.198s
user    4m44.550s
sys     0m33.530s

And here is the version that filters on size first, then on the small hash, and finally computes the full hash when collisions are found:

root@NAS:InstantUpload# time ~/scripts/checkDuplicatesSmallHash.py . "/i-data/51608399/photo/Todor phone"
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg


real    0m1.398s
user    0m1.200s
sys     0m0.080s

Each version was run 3 times, to get the average of the time needed.

So v1 is (user + sys) 284 seconds, the other one is 2 seconds; quite a difference, huh :) With this increase, one could go to SHA512, or even fancier, and the performance penalty would be mitigated by the fewer calculations needed.
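Since get_hash() and check_for_duplicates() above both accept the hash constructor as a parameter, switching the digest is a one-line change at the call site. For example (a usage sketch of the script above):

# sketch: use SHA-512 for the comparison; the extra cost is paid only by the
# few files that survive the size and first-1k filters
check_for_duplicates(sys.argv[1:], hash=hashlib.sha512)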

Downsides:

  • More disk access than the other versions: every file is touched once for its size stats (cheap, but still disk IO), and every duplicate is opened twice (once for the small first-1k-bytes hash, and once for the full-contents hash)
  • More memory consumed, due to the hash tables kept in memory at runtime

@IanLee1521 has a nice solution here. It is very efficient because it first checks for duplicates based on the file size.

#! /usr/bin/env python

# Originally taken from:
# http://www.pythoncentral.io/finding-duplicate-files-with-python/
# Original Author: Andres Torres

# Adapted to only compute the md5sum of files with the same size

import argparse
import os
import sys
import hashlib


def find_duplicates(folders):
    """
    Takes in an iterable of folders and prints & returns the duplicate files
    """
    dup_size = {}
    for i in folders:
        # Iterate the folders given
        if os.path.exists(i):
            # Find the duplicated files and append them to dup_size
            join_dicts(dup_size, find_duplicate_size(i))
        else:
            print('%s is not a valid path, please verify' % i)
            return {}

    print('Comparing files with the same size...')
    dups = {}
    for dup_list in dup_size.values():
        if len(dup_list) > 1:
            join_dicts(dups, find_duplicate_hash(dup_list))
    print_results(dups)
    return dups


def find_duplicate_size(parent_dir):
    # Dups in format {size: [names]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parent_dir):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # Check to make sure the path is valid.
            if not os.path.exists(path):
                continue
            # Calculate sizes
            file_size = os.path.getsize(path)
            # Add or append the file path
            if file_size in dups:
                dups[file_size].append(path)
            else:
                dups[file_size] = [path]
    return dups


def find_duplicate_hash(file_list):
    print('Comparing: ')
    for filename in file_list:
        print('    {}'.format(filename))
    dups = {}
    for path in file_list:
        file_hash = hashfile(path)
        if file_hash in dups:
            dups[file_hash].append(path)
        else:
            dups[file_hash] = [path]
    return dups


# Joins two dictionaries
def join_dicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]


def hashfile(path, blocksize=65536):
    afile = open(path, 'rb')
    hasher = hashlib.md5()
    buf = afile.read(blocksize)
    while len(buf) > 0:
        hasher.update(buf)
        buf = afile.read(blocksize)
    afile.close()
    return hasher.hexdigest()


def print_results(dict1):
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('Duplicates Found:')
        print(
            'The following files are identical. The name could differ, but the'
            ' content is identical'
        )
        print('___________________')
        for result in results:
            for subresult in result:
                print('\t\t%s' % subresult)
            print('___________________')
    else:
        print('No duplicate files found.')


def main():
    parser = argparse.ArgumentParser(description='Find duplicate files')
    parser.add_argument(
        'folders', metavar='dir', type=str, nargs='+',
        help='A directory to parse for duplicates',
    )
    args = parser.parse_args()

    find_duplicates(args.folders)


if __name__ == '__main__':
    sys.exit(main())

To be on the safe side (automatically removing duplicates can be dangerous if something goes wrong!), here is what I use, based on @zalew's answer.

Please note that the md5-sum code is slightly different from @zalew's, because his code generated too many wrong duplicates (which is why I said automatically removing them is dangerous!).

import hashlib, os

unique = dict()
for filename in os.listdir('.'):
    if os.path.isfile(filename):
        filehash = hashlib.md5(open(filename, 'rb').read()).hexdigest()

        if filehash not in unique:
            unique[filehash] = filename
        else:
            print filename + ' is a duplicate of ' + unique[filehash]

Python has a standard library called filecmp for comparing files and directories.

It checks file sizes. It compares content in 8k chunks. It works on binary files.

It does not hash.

Python documentation for filecmp
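A minimal sketch of how it could be used for this problem, grouping files by size first and then letting filecmp.cmp confirm the candidates byte by byte (the helper name and the size pre-grouping are illustrative additions, not part of the filecmp API):

import filecmp
import os
from collections import defaultdict
from itertools import combinations

def find_duplicates_filecmp(path):
    """Group files by size, then confirm duplicates with filecmp.cmp."""
    by_size = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(path):
        for name in filenames:
            full = os.path.join(dirpath, name)
            if os.path.isfile(full):
                by_size[os.path.getsize(full)].append(full)
    for files in by_size.values():
        for a, b in combinations(files, 2):
            # shallow=False forces a content comparison, not just an os.stat() check
            if filecmp.cmp(a, b, shallow=False):
                print('Duplicate found:', a, 'and', b)

Note that the pairwise comparison is quadratic within each size group, so the hash-based approaches above scale better for large collections.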

I found 100% working code for recursively removing duplicate files inside a folder. Just replace the folder name in the clean method with your folder name.

import time
import os
import shutil
from hashlib import sha256


class Duplython:
    def __init__(self):
        self.home_dir = os.getcwd()
        self.File_hashes = []
        self.Cleaned_dirs = []
        self.Total_bytes_saved = 0
        self.block_size = 65536
        self.count_cleaned = 0

    def welcome(self) -> None:
        print('******************************************************************')
        print('****************        DUPLYTHON      ****************************')
        print('********************************************************************\n\n')
        print('----------------        WELCOME        ----------------------------')
        time.sleep(3)
        print('\nCleaning .................')
        return None

    def generate_hash(self, Filename: str) -> str:
        Filehash = sha256()
        try:
            with open(Filename, 'rb') as File:
                fileblock = File.read(self.block_size)
                while len(fileblock) > 0:
                    Filehash.update(fileblock)
                    fileblock = File.read(self.block_size)
                Filehash = Filehash.hexdigest()
            return Filehash
        except:
            return False

    def clean(self) -> None:
        all_dirs = [path[0] for path in os.walk('E:\\songs')]
        for path in all_dirs:
            os.chdir(path)
            All_Files = [file for file in os.listdir() if os.path.isfile(file)]
            for file in All_Files:
                filehash = self.generate_hash(file)
                if filehash not in self.File_hashes:
                    if filehash:
                        self.File_hashes.append(filehash)
                        # print(file)
                else:
                    byte_saved = os.path.getsize(file)
                    self.count_cleaned += 1
                    self.Total_bytes_saved += byte_saved
                    os.remove(file)
                    filename = file.split('/')[-1]
                    print(filename, '.. cleaned ')
        os.chdir(self.home_dir)

    def cleaning_summary(self) -> None:
        mb_saved = self.Total_bytes_saved / 1048576
        mb_saved = round(mb_saved, 2)
        print('\n\n--------------FINISHED CLEANING ------------')
        print('File cleaned  : ', self.count_cleaned)
        print('Total Space saved : ', mb_saved, 'MB')
        print('-----------------------------------------------')

    def main(self) -> None:
        self.welcome()
        self.clean()
        self.cleaning_summary()




if __name__ == '__main__':
    App = Duplython()
    App.main()