Deduplicate files

Peter J. Holzer 2023-11-11 17:23:34 +01:00
commit 1d68ce12ed
1 changed file with 71 additions and 0 deletions

dedup (new executable file, 71 lines)
#!/usr/bin/python3
from hashlib import sha256
from os import walk, lstat, link, unlink
from os.path import join
from sys import argv, stderr

def get_files():
    # Collect every file below argv[1] together with the stat fields needed
    # for deduplication, sorted by mtime so the oldest copy is kept.
    result = []
    for dir, subdirs, files in walk(argv[1]):
        for f in files:
            ff = join(dir, f)
            s = lstat(ff)
            result.append(
                { "filename": ff,
                  "mtime": s.st_mtime,
                  "dev": s.st_dev,
                  "ino": s.st_ino,
                  "size": s.st_size,
                })
    result.sort(key=lambda x: x["mtime"])
    return result

def get_hash(filename):
    # SHA-256 of the file contents, read in 64 KiB chunks.
    hash = sha256()
    with open(filename, "rb") as fh:
        for chunk in iter(lambda: fh.read(0x10000), b""):
            hash.update(chunk)
    return hash.hexdigest()

def dedup(files):
    # files_by_devino records every (device, inode) already kept, so existing
    # hardlinks are skipped. files_by_sizehash maps size -> {hash: file}; a
    # file with a so far unique size is stored under the key None and only
    # hashed once a second file of the same size turns up (hashing is lazy).
    files_by_devino = {}
    files_by_sizehash = {}
    for file in files:
        if (file["dev"], file["ino"]) in files_by_devino:
            print("%s already seen (%d/%d), skipping" % (file["filename"], file["dev"], file["ino"]),
                  file=stderr)
            continue
        try:
            size = file["size"]
            hash = None
            if size in files_by_sizehash:
                if None in files_by_sizehash[size]:
                    # First size collision: hash the previously stored file
                    # and re-file it under its real hash.
                    hash = get_hash(files_by_sizehash[size][None]["filename"])
                    files_by_sizehash[size][hash] = files_by_sizehash[size][None]
                    del files_by_sizehash[size][None]
                hash = get_hash(file["filename"])
                if hash in files_by_sizehash[size] and file["dev"] == files_by_sizehash[size][hash]["dev"]:
                    # Same content on the same device: replace this file with
                    # a hardlink to the copy seen first.
                    print("%s already seen (%s), replacing" % (file["filename"], hash),
                          file=stderr)
                    unlink(file["filename"])
                    link(files_by_sizehash[size][hash]["filename"], file["filename"])
                    continue
            else:
                files_by_sizehash[size] = {}
            print("%s is new (%d/%d, %s)" % (file["filename"], file["dev"], file["ino"], hash),
                  file=stderr)
            files_by_devino[(file["dev"], file["ino"])] = file
            files_by_sizehash[size][hash] = file
        except EOFError:
            # A read that ends prematurely aborts hashing; leave the file alone.
            print("%s is truncated, ignoring" % (file["filename"]),
                  file=stderr)

dedup(get_files())
# vim: tw=99 sw=4 expandtab
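
Usage: the script takes one directory on the command line, walks it, and replaces byte-identical regular files on the same device with hardlinks to the oldest copy. A minimal smoke test, assuming the file above is saved as an executable ./dedup in the current directory (the temporary directory and file names here are made up for illustration):

import os
import subprocess
import tempfile

with tempfile.TemporaryDirectory() as d:
    a = os.path.join(d, "a")
    b = os.path.join(d, "b")
    for p in (a, b):
        with open(p, "wb") as fh:
            fh.write(b"same content")  # two identical files, two inodes
    subprocess.run(["./dedup", d], check=True)
    # After deduplication both names should refer to the same inode.
    assert os.lstat(a).st_ino == os.lstat(b).st_ino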