diff --git a/tests/test_check.py b/tests/test_check.py
index 0fec153..db8f72b 100644
--- a/tests/test_check.py
+++ b/tests/test_check.py
@@ -1,4 +1,5 @@
 from basetest import *
+from zfs_autobackup.BlockHasher import BlockHasher
 
 
 class TestZfsCheck(unittest2.TestCase):
@@ -17,24 +18,25 @@ class TestZfsCheck(unittest2.TestCase):
         # 959e6b58078f0cfd2fb3d37e978fda51820473ff whole_whole2
         # 309ffffba2e1977d12f3b7469971f30d28b94bd8 whole_whole2_partial
 
+        block_hasher=BlockHasher(count=1)
         self.assertEqual(
-            list(block_hash("tests/data/empty", count=1)),
+            list(block_hasher.generate("tests/data/empty")),
             []
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/partial", count=1)),
+            list(block_hasher.generate("tests/data/partial")),
             [(0, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1")]
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole", count=1)),
+            list(block_hasher.generate("tests/data/whole")),
             [(0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7")]
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"),
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798")
@@ -42,7 +44,7 @@ class TestZfsCheck(unittest2.TestCase):
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"), #whole
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798"), #whole2
@@ -50,16 +52,18 @@ class TestZfsCheck(unittest2.TestCase):
             ]
         )
 
+        block_hasher=BlockHasher(count=2)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=2)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "959e6b58078f0cfd2fb3d37e978fda51820473ff"), #whole_whole2
                 (1, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1") #partial
             ]
         )
 
+        block_hasher=BlockHasher(count=10)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=10)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "309ffffba2e1977d12f3b7469971f30d28b94bd8"), #whole_whole2_partial
             ])
diff --git a/zfs_autobackup/BlockHasher.py b/zfs_autobackup/BlockHasher.py
new file mode 100644
index 0000000..c0db08f
--- /dev/null
+++ b/zfs_autobackup/BlockHasher.py
@@ -0,0 +1,45 @@
+import hashlib
+
+
+class BlockHasher():
+    """This class was created to checksum huge files and blockdevices (TB's).
+    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.
+
+    The chunksize is count*bs (bs is the read blocksize from disk).
+
+    It's also possible to only read a certain percentage of blocks, to just check a sample.
+    """
+    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1):
+        self.count=count
+        self.bs=bs
+        self.hash_class=hash_class
+
+
+    def generate(self, fname):
+        """Generates checksums
+
+        yields (chunk_nr, hexdigest)
+
+        yields nothing for empty files.
+        """
+        with open(fname, "rb") as f:
+            hash = self.hash_class()
+            block_nr = 0
+            chunk_nr = 0
+            for block in iter(lambda: f.read(self.bs), b""):
+                hash.update(block)
+                block_nr = block_nr + 1
+                if block_nr % self.count == 0:
+                    yield (chunk_nr, hash.hexdigest())
+                    chunk_nr = chunk_nr + 1
+                    hash = self.hash_class()
+
+            # yield last (incomplete) chunk
+            if block_nr % self.count != 0:
+                yield (chunk_nr, hash.hexdigest())
+
+    # def compare(fname, generator):
+    #     """reads from generator and compares blocks"""
+    #
+    #     with open(fname, "rb") as f:
+    #         for (count, bs, chunk_nr, hexdigest) in generator:
\ No newline at end of file
diff --git a/zfs_autobackup/TreeHasher.py b/zfs_autobackup/TreeHasher.py
new file mode 100644
index 0000000..1241255
--- /dev/null
+++ b/zfs_autobackup/TreeHasher.py
@@ -0,0 +1,33 @@
+import os
+
+
+class TreeHasher():
+    """Uses BlockHasher recursively on a directory tree."""
+
+    def __init__(self, block_hasher):
+        self.block_hasher=block_hasher
+
+    def generate(self, start_path):
+        """Use BlockHasher on every file in a tree, yielding the results
+
+        Note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
+        It also ignores empty directories, symlinks and special files.
+        """
+
+        cwd=os.getcwd()
+        os.chdir(start_path)
+
+        def walkerror(e):
+            raise e
+
+        try:
+            for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
+                for f in filenames:
+                    file_path=os.path.join(dirpath, f)[2:]
+
+                    if (not os.path.islink(file_path)) and os.path.isfile(file_path):
+                        for (chunk_nr, hash) in self.block_hasher.generate(file_path):
+                            yield (file_path, chunk_nr, hash)
+        finally:
+            os.chdir(cwd)
+
diff --git a/zfs_autobackup/ZfsCheck.py b/zfs_autobackup/ZfsCheck.py
index 4cb9f7e..782be91 100644
--- a/zfs_autobackup/ZfsCheck.py
+++ b/zfs_autobackup/ZfsCheck.py
@@ -1,8 +1,10 @@
 from __future__ import print_function
+import time
 
 from signal import signal, SIGPIPE
 
-
+from .TreeHasher import TreeHasher
+from .BlockHasher import BlockHasher
 from .ZfsNode import ZfsNode
 from .util import *
 from .CliBase import CliBase
@@ -62,9 +64,11 @@ class ZfsCheck(CliBase):
 
         snapshot.mount(mnt)
 
+        tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs))
+
         self.debug("Hashing tree: {}".format(mnt))
         if not self.args.test:
-            for (file, block, hash) in block_hash_tree(mnt, count, bs):
+            for (file, block, hash) in tree_hasher.generate(mnt):
                 print("{}\t{}\t{}".format(file, block, hash))
                 sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect
 
@@ -113,14 +117,14 @@ class ZfsCheck(CliBase):
     def hash_volume(self, snapshot, count, bs):
         try:
             dev=self.activate_volume_snapshot(snapshot)
+            block_hasher=BlockHasher(count=count, bs=bs)
 
             self.debug("Hashing dev: {}".format(dev))
             if not self.args.test:
-                for (block, hash) in block_hash(dev, count, bs):
+                for (block, hash) in block_hasher.generate(dev):
                     print("{}\t{}".format(block, hash))
                     sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect
 
-
         finally:
             self.deacitvate_volume_snapshot(snapshot)
 
diff --git a/zfs_autobackup/util.py b/zfs_autobackup/util.py
index e072379..5c86041 100644
--- a/zfs_autobackup/util.py
+++ b/zfs_autobackup/util.py
@@ -1,5 +1,3 @@
-import hashlib
-
 # root@psyt14s:/home/psy/zfs_autobackup# ls -lh /home/psy/Downloads/carimage.zip
 # -rw-rw-r-- 1 psy psy 990M Nov 26 2020 /home/psy/Downloads/carimage.zip
 # root@psyt14s:/home/psy/zfs_autobackup# time sha1sum /home/psy/Downloads/carimage.zip
@@ -18,60 +16,6 @@ import hashlib
 import os
 import platform
 import sys
-import time
-
-
-
-def block_hash(fname, count=10000, bs=4096):
-    """This function was created to checksum huge files and blockdevices (TB's)
-    Instead of one sha1sum of the whole file, it generates sha1susms of chunks of the file.
-
-    yields sha1 hash of fname, per count blocks.
-    yields(chunk_nr, hexdigest)
-
-    yields nothing for empty files.
-
-    """
-
-    with open(fname, "rb") as f:
-        hash = hashlib.sha1()
-        block_nr = 0
-        chunk_nr = 0
-        for block in iter(lambda: f.read(bs), b""):
-            hash.update(block)
-            block_nr = block_nr + 1
-            if block_nr % count == 0:
-                yield (chunk_nr, hash.hexdigest())
-                chunk_nr = chunk_nr + 1
-                hash = hashlib.sha1()
-
-        # yield last (incomplete) block
-        if block_nr % count != 0:
-            yield (chunk_nr, hash.hexdigest())
-
-def block_hash_tree(start_path, count=10000, bs=4096):
-    """block_hash every file in a tree, yielding the results
-
-    note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
-    It also ignores empty directories, symlinks and special files.
-    """
-
-    cwd=os.getcwd()
-    os.chdir(start_path)
-
-    def walkerror(e):
-        raise e
-
-    try:
-        for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
-            for f in filenames:
-                file_path=os.path.join(dirpath, f)[2:]
-
-                if (not os.path.islink(file_path)) and os.path.isfile(file_path):
-                    for (chunk_nr, hash) in block_hash(file_path, count, bs):
-                        yield ( file_path, chunk_nr, hash )
-    finally:
-        os.chdir(cwd)
 
 
 def tmp_name(suffix=""):
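
Usage sketch (not part of the patch): a minimal illustration of the API the diff above introduces. The paths and the explicit count/bs values are invented for illustration; the imports mirror the test file and the new module layout.

    from zfs_autobackup.BlockHasher import BlockHasher
    from zfs_autobackup.TreeHasher import TreeHasher

    # Hash a single file or block device in chunks of count*bs bytes (10000*4096 here).
    block_hasher = BlockHasher(count=10000, bs=4096)
    for (chunk_nr, hexdigest) in block_hasher.generate("/tmp/example_file"):  # hypothetical path
        print("{}\t{}".format(chunk_nr, hexdigest))

    # Hash every regular file under a directory tree, reusing the same BlockHasher.
    tree_hasher = TreeHasher(block_hasher)
    for (file_path, chunk_nr, hexdigest) in tree_hasher.generate("/tmp/example_dir"):  # hypothetical path
        print("{}\t{}\t{}".format(file_path, chunk_nr, hexdigest))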