#!/usr/bin/python3
# Originally added in commit 1d68ce12ed30691362fa39a4cf35aff910149377
# ("Deduplicate files", Peter J. Holzer, Sat Nov 11 17:23:34 2023 +0100).
"""Replace duplicate files below a directory with hard links.

Usage: dedup DIRECTORY

All regular files below DIRECTORY are examined in order of ascending mtime.
When a file has the same size and SHA-256 digest as a file seen earlier on
the same device, it is replaced by a hard link to that earlier file, so the
oldest copy is the one that is kept.  Progress is reported on stderr.
"""

from hashlib import sha256
from os import getpid, link, lstat, replace, unlink, walk
from os.path import join
from stat import S_ISREG
from sys import argv, stderr


def get_files(root=None):
    """Return a record for every regular file below *root*, oldest first.

    *root* defaults to the first command line argument.  Each record is a
    dict with the keys "filename", "mtime", "dev", "ino" and "size", filled
    from lstat().  The list is sorted by ascending mtime.
    """
    if root is None:
        root = argv[1]
    result = []
    for dirpath, _subdirs, names in walk(root):
        for name in names:
            path = join(dirpath, name)
            s = lstat(path)
            # Only regular files can be deduplicated.  lstat() does not follow
            # symlinks, so their size would not match the content that open()
            # reads, and opening a FIFO would block forever.
            if not S_ISREG(s.st_mode):
                continue
            result.append(
                {
                    "filename": path,
                    "mtime": s.st_mtime,
                    "dev": s.st_dev,
                    "ino": s.st_ino,
                    "size": s.st_size,
                }
            )
    result.sort(key=lambda x: x["mtime"])
    return result


def get_hash(filename):
    """Return the hex SHA-256 digest of the content of *filename*."""
    hasher = sha256()
    with open(filename, "rb") as fh:
        # Read in 64 KiB chunks so large files are never held in memory.
        for chunk in iter(lambda: fh.read(0x10000), b""):
            hasher.update(chunk)
    return hasher.hexdigest()


def _replace_with_link(source, target):
    """Atomically replace *target* with a hard link to *source*.

    The link is first created under a temporary name next to *target* and
    then renamed over it, so *target* always exists: if linking fails, the
    original file is left untouched.  Raises OSError on failure.
    """
    tmp = "%s.dedup-%d.tmp" % (target, getpid())
    link(source, tmp)
    try:
        replace(tmp, target)
    except BaseException:
        unlink(tmp)
        raise


def dedup(files):
    """Replace duplicates among *files* with hard links to the first copy.

    *files* is a list of records as returned by get_files(); it is processed
    in the given order, so the first file with a given content is kept.
    Files are hashed lazily: the first file of each (device, size) group is
    only hashed once a second file with the same device and size turns up.
    Files that cannot be read or linked are reported on stderr and skipped.
    """
    seen_inodes = set()
    # (dev, size) -> {digest: record}.  The key None stands for "not hashed
    # yet" and can only hold the first file of the group.  The device is part
    # of the key because hard links cannot cross devices.
    candidates = {}

    for file in files:
        name = file["filename"]
        inode = (file["dev"], file["ino"])
        if inode in seen_inodes:
            print("%s already seen (%d/%d), skipping" % (name, file["dev"], file["ino"]),
                  file=stderr)
            continue
        try:
            bucket = candidates.setdefault((file["dev"], file["size"]), {})
            digest = None
            if bucket:
                if None in bucket:
                    # A second file of this size: the first one needs a real
                    # digest now so the two can be compared.
                    first = bucket[None]
                    first_digest = get_hash(first["filename"])
                    del bucket[None]
                    bucket[first_digest] = first

                digest = get_hash(name)
                if digest in bucket:
                    print("%s already seen (%s), replacing" % (name, digest),
                          file=stderr)
                    _replace_with_link(bucket[digest]["filename"], name)
                    continue
            print("%s is new (%d/%d, %s)" % (name, file["dev"], file["ino"], digest),
                  file=stderr)
            seen_inodes.add(inode)
            bucket[digest] = file
        except EOFError:
            print("%s is truncated, ignoring" % (name),
                  file=stderr)
        except OSError as e:
            # One unreadable or vanished file must not abort the whole run.
            print("%s: %s, skipping" % (name, e),
                  file=stderr)


def main():
    """Deduplicate the directory tree named on the command line."""
    if len(argv) < 2:
        print("Usage: %s DIRECTORY" % argv[0], file=stderr)
        raise SystemExit(2)
    dedup(get_files(argv[1]))


if __name__ == "__main__":
    main()


# vim: tw=99 sw=4 expandtab