Deduplicate files
This commit is contained in:
commit
1d68ce12ed
|
@ -0,0 +1,71 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
from hashlib import sha256
from os import getpid, link, lstat, replace, unlink, walk
from os.path import join
from stat import S_ISREG
from sys import argv, stderr
|
||||||
|
|
||||||
|
def get_files():
    """Collect the regular files below the directory named by ``argv[1]``.

    The tree is traversed with ``walk`` and every entry is examined with
    ``lstat`` (symbolic links are therefore not followed).  Only regular
    files are kept: symlinks, FIFOs, sockets and device nodes are skipped,
    because reading them for hashing can block forever or describe a
    different object than the one ``lstat`` reported.

    Returns:
        A list of dicts with the keys ``"filename"``, ``"mtime"``, ``"dev"``,
        ``"ino"`` and ``"size"``, sorted by ascending ``mtime``.
    """
    result = []
    for dirpath, _subdirs, names in walk(argv[1]):
        for name in names:
            path = join(dirpath, name)
            info = lstat(path)
            if not S_ISREG(info.st_mode):
                # Not a plain file; it must never be hashed or re-linked.
                continue
            result.append(
                {
                    "filename": path,
                    "mtime": info.st_mtime,
                    "dev": info.st_dev,
                    "ino": info.st_ino,
                    "size": info.st_size,
                }
            )
    # Oldest first, so callers that process the list in order see the oldest
    # copy of any content before the newer ones.
    result.sort(key=lambda entry: entry["mtime"])
    return result
|
||||||
|
|
||||||
|
def get_hash(filename, chunk_size=0x10000):
    """Return the hexadecimal SHA-256 digest of the contents of *filename*.

    The file is opened in binary mode and read in pieces of *chunk_size*
    bytes, so the whole file is never held in memory at once.

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 64 KiB).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If *chunk_size* is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        # read(0) would return b"" at once and silently hash nothing.
        raise ValueError("chunk_size must be positive")
    digest = sha256()
    with open(filename, "rb") as fh:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _replace_with_link(source, target):
    """Atomically replace *target* with a hard link to *source*.

    The link is first created under a temporary name next to *target* and
    then renamed over it, so *target* is never missing: if linking fails the
    original file is left untouched.
    """
    temp = "%s.dedup-%d.tmp" % (target, getpid())
    link(source, temp)
    try:
        replace(temp, target)
    except OSError:
        # Do not leave the temporary link behind when the rename fails.
        unlink(temp)
        raise


def dedup(files):
    """Replace files with identical content by hard links to one copy.

    *files* is an iterable of dicts with the keys ``"filename"``, ``"dev"``,
    ``"ino"`` and ``"size"`` (the shape produced by ``get_files``).  Files are
    processed in the given order; the first file seen with a given content is
    kept and every later file on the same device with the same size and
    SHA-256 digest is replaced by a hard link to it.  Progress is reported on
    stderr.

    Hashing is lazy: the first file of a given device and size is stored
    under the key ``None`` and only hashed once a second file of that size
    shows up.
    """
    seen_inodes = set()
    # (dev, size) -> {digest or None: file dict}.  Keying by device as well as
    # size keeps candidates on different devices from overwriting each other;
    # hard links cannot cross devices anyway.
    files_by_sizehash = {}

    for file in files:
        devino = (file["dev"], file["ino"])
        if devino in seen_inodes:
            # Already a hard link to something that was processed earlier.
            print("%s already seen (%d/%d), skipping" % (file["filename"], file["dev"], file["ino"]),
                  file=stderr)
            continue
        try:
            digest = None
            candidates = files_by_sizehash.setdefault((file["dev"], file["size"]), {})
            if candidates:
                # A second file of this size: the first one now needs a hash.
                # It is popped before hashing so that an unreadable first file
                # is dropped instead of failing for every later file.
                pending = candidates.pop(None, None)
                if pending is not None:
                    candidates[get_hash(pending["filename"])] = pending

                digest = get_hash(file["filename"])
                original = candidates.get(digest)
                if original is not None:
                    print("%s already seen (%s), replacing" % (file["filename"], digest),
                          file=stderr)
                    _replace_with_link(original["filename"], file["filename"])
                    continue
            print("%s is new (%d/%d, %s)" % (file["filename"], file["dev"], file["ino"], digest),
                  file=stderr)
            seen_inodes.add(devino)
            candidates[digest] = file
        except EOFError:
            # NOTE(review): kept from the original; nothing visible in this
            # file appears to raise EOFError -- confirm whether it is needed.
            print("%s is truncated, ignoring" % (file["filename"]),
                  file=stderr)
        except OSError as err:
            # A vanished or unreadable file, or a failed link, must not abort
            # the whole run; the affected file is left as it was.
            print("%s failed (%s), ignoring" % (file["filename"], err),
                  file=stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: deduplicate the tree named by the first argument.
    # Without the check a missing argument surfaces as an IndexError
    # traceback from get_files().
    if len(argv) < 2:
        print("usage: %s DIRECTORY" % argv[0], file=stderr)
        raise SystemExit(2)
    dedup(get_files())
|
||||||
|
|
||||||
|
|
||||||
|
# vim: tw=99 sw=4 expandtab
|
Loading…
Reference in New Issue