Deduplicate files
This commit is contained in:
commit
1d68ce12ed
|
@ -0,0 +1,71 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
from hashlib import sha256
|
||||
from os import walk, lstat, link, unlink
|
||||
from os.path import join
|
||||
from sys import argv, stderr
|
||||
|
||||
def get_files():
    """Collect metadata for every regular file below ``argv[1]``.

    Walks the directory tree rooted at the first command-line argument and
    records, for each regular file found, its path together with the
    ``lstat`` fields that deduplication needs.

    Symbolic links and special files (FIFOs, sockets, device nodes) are
    skipped.  ``lstat`` describes a symlink itself while ``open`` follows it,
    so its recorded size and inode would not match the content that gets
    hashed, and replacing it with a hard link would change what it is.
    Opening a FIFO could block forever.

    Returns:
        A list of dicts with the keys ``filename``, ``mtime``, ``dev``,
        ``ino`` and ``size``, sorted by ascending ``mtime``.

    Raises:
        IndexError: if no directory was given on the command line.
        OSError: if a file cannot be ``lstat``-ed.
    """
    from stat import S_ISREG

    result = []
    for dirpath, _subdirs, names in walk(argv[1]):
        for name in names:
            path = join(dirpath, name)
            info = lstat(path)
            if not S_ISREG(info.st_mode):
                # Not a plain file: never a hard-link candidate.
                continue
            result.append(
                {
                    "filename": path,
                    "mtime": info.st_mtime,
                    "dev": info.st_dev,
                    "ino": info.st_ino,
                    "size": info.st_size,
                }
            )
    # Oldest first, so consumers that process the list in order meet the
    # oldest copy of any content before newer ones.
    result.sort(key=lambda entry: entry["mtime"])
    return result
|
||||
|
||||
def get_hash(filename):
    """Return the hex-encoded SHA-256 digest of the contents of *filename*.

    The file is read in 64 KiB pieces so that large files are never held in
    memory as a whole.
    """
    digest = sha256()
    with open(filename, "rb") as stream:
        while True:
            block = stream.read(0x10000)
            if block == b"":
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _replace_with_link(source, target):
    """Atomically replace *target* with a hard link to *source*.

    The link is first created under a temporary name in the directory of
    *target* and then renamed over it.  If creating the link fails, *target*
    is left untouched instead of having already been deleted.
    """
    from os import getpid, replace
    from os.path import dirname

    temp = join(dirname(target), ".dedup-%d.tmp" % getpid())
    link(source, temp)
    try:
        replace(temp, target)
    except OSError:
        # Do not leave the temporary link behind if the rename failed.
        unlink(temp)
        raise


def dedup(files):
    """Replace duplicate files with hard links to the first copy seen.

    Args:
        files: iterable of dicts with the keys ``filename``, ``dev``, ``ino``
            and ``size``.  Entries are processed in order; the first file
            with a given content on a given device is kept and later files
            with the same content are replaced by hard links to it.

    Files whose ``(dev, ino)`` pair was already registered are skipped, as
    they are already hard links to a kept file.  A file is only hashed once
    a second file with the same device and size turns up, so files with a
    unique size are never read.  Progress is reported on stderr.

    Raises:
        OSError: if a file cannot be read or a link cannot be created.  A
            failed replacement leaves the duplicate in place.
    """
    files_by_devino = {}
    # (dev, size) -> {sha256 hex digest, or None while not yet hashed: file}.
    # Keying by device as well keeps same-content files on different
    # devices from displacing each other's entry; hard links cannot cross
    # devices anyway.
    candidates = {}

    for entry in files:
        devino = (entry["dev"], entry["ino"])
        if devino in files_by_devino:
            print("%s already seen (%d/%d), skipping" % (entry["filename"], entry["dev"], entry["ino"]),
                  file=stderr)
            continue
        try:
            bucket_key = (entry["dev"], entry["size"])
            digest = None
            bucket = candidates.get(bucket_key)
            if bucket is None:
                # First file of this size on this device: defer hashing.
                bucket = candidates[bucket_key] = {}
            else:
                if None in bucket:
                    # The earlier file was stored unhashed; hash it now.
                    # Hash before moving so a failure keeps the entry.
                    first_digest = get_hash(bucket[None]["filename"])
                    bucket[first_digest] = bucket.pop(None)

                digest = get_hash(entry["filename"])
                original = bucket.get(digest)
                if original is not None:
                    print("%s already seen (%s), replacing" % (entry["filename"], digest),
                          file=stderr)
                    _replace_with_link(original["filename"], entry["filename"])
                    continue

            print("%s is new (%d/%d, %s)" % (entry["filename"], entry["dev"], entry["ino"], digest),
                  file=stderr)
            files_by_devino[devino] = entry
            bucket[digest] = entry
        except EOFError:
            # NOTE(review): nothing visible in this block raises EOFError;
            # handler kept for compatibility with the original behaviour.
            print("%s is truncated, ignoring" % (entry["filename"]),
                  file=stderr)
|
||||
|
||||
|
||||
# Script entry point: scan the tree named on the command line, then
# hard-link its duplicate files together.
dedup(files=get_files())
|
||||
|
||||
|
||||
# vim: tw=99 sw=4 expandtab
|
Loading…
Reference in New Issue