Mirror of https://github.com/psy0rz/zfs_autobackup.git
extract BlockHasher and TreeHasher classes

commit a2f85690a3
parent a807ec320e
@@ -1,4 +1,5 @@
 from basetest import *
+from zfs_autobackup.BlockHasher import BlockHasher
 
 
 class TestZfsCheck(unittest2.TestCase):
@@ -17,24 +18,25 @@ class TestZfsCheck(unittest2.TestCase):
         # 959e6b58078f0cfd2fb3d37e978fda51820473ff whole_whole2
         # 309ffffba2e1977d12f3b7469971f30d28b94bd8 whole_whole2_partial
 
+        block_hasher=BlockHasher(count=1)
 
         self.assertEqual(
-            list(block_hash("tests/data/empty", count=1)),
+            list(block_hasher.generate("tests/data/empty")),
             []
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/partial", count=1)),
+            list(block_hasher.generate("tests/data/partial")),
             [(0, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1")]
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole", count=1)),
+            list(block_hasher.generate("tests/data/whole")),
             [(0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7")]
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"),
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798")
@@ -42,7 +44,7 @@ class TestZfsCheck(unittest2.TestCase):
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"), #whole
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798"), #whole2
@@ -50,16 +52,18 @@ class TestZfsCheck(unittest2.TestCase):
             ]
         )
 
+        block_hasher=BlockHasher(count=2)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=2)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "959e6b58078f0cfd2fb3d37e978fda51820473ff"), #whole_whole2
                 (1, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1") #partial
             ]
         )
 
+        block_hasher=BlockHasher(count=10)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=10)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "309ffffba2e1977d12f3b7469971f30d28b94bd8"), #whole_whole2_partial
             ])
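The sha1 values in the comments above (e.g. 309ffffba… for whole_whole2_partial) appear to be whole-file sha1sums of the fixtures, so when count*bs covers the entire fixture, BlockHasher should yield a single chunk whose digest equals a plain hashlib digest. A small sanity sketch of that relationship (the size assumption and the script itself are mine, not part of the commit):

    import hashlib

    from zfs_autobackup.BlockHasher import BlockHasher

    # Assumption: the fixture is smaller than count*bs = 10*4096 bytes,
    # which the count=10 test above implies (it yields exactly one chunk).
    path = "tests/data/whole_whole2_partial"

    with open(path, "rb") as f:
        whole_file_digest = hashlib.sha1(f.read()).hexdigest()

    chunks = list(BlockHasher(count=10, bs=4096).generate(path))
    assert chunks == [(0, whole_file_digest)]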
zfs_autobackup/BlockHasher.py (new file, 45 lines)
@@ -0,0 +1,45 @@
+import hashlib
+
+
+class BlockHasher():
+    """This class was created to checksum huge files and blockdevices (TB's)
+    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.
+
+    The chunksize is count*bs (bs is the read blocksize from disk)
+
+    It's also possible to only read a certain percentage of blocks to just check a sample.
+    """
+    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1):
+        self.count=count
+        self.bs=bs
+        self.hash_class=hash_class
+
+
+    def generate(self, fname):
+        """Generates checksums
+
+        yields (chunk_nr, hexdigest)
+
+        yields nothing for empty files.
+        """
+        with open(fname, "rb") as f:
+            hash = self.hash_class()
+            block_nr = 0
+            chunk_nr = 0
+            for block in iter(lambda: f.read(self.bs), b""):
+                hash.update(block)
+                block_nr = block_nr + 1
+                if block_nr % self.count == 0:
+                    yield (chunk_nr, hash.hexdigest())
+                    chunk_nr = chunk_nr + 1
+                    hash = self.hash_class()
+
+            # yield last (incomplete) chunk
+            if block_nr % self.count != 0:
+                yield (chunk_nr, hash.hexdigest())
+
+    # def compare(fname, generator):
+    #     """reads from generators and compares blocks"""
+    #
+    #     with open(fname, "rb") as f:
+    #         for ( count, bs , chunk_nr, hexdigest) in input_generator:
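A minimal usage sketch of the new class, for orientation (the path and parameters are placeholders, not from the commit):

    from zfs_autobackup.BlockHasher import BlockHasher

    # one digest per chunk of count*bs bytes; 256 * 4096 bytes = 1 MiB here
    block_hasher = BlockHasher(count=256, bs=4096)

    for chunk_nr, hexdigest in block_hasher.generate("/tmp/some_big_file"):
        print("{}\t{}".format(chunk_nr, hexdigest))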
zfs_autobackup/TreeHasher.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+import os
+
+
+class TreeHasher():
+    """uses BlockHasher recursively on a directory tree"""
+
+    def __init__(self, block_hasher):
+        self.block_hasher=block_hasher
+
+    def generate(self, start_path):
+        """Use BlockHasher on every file in a tree, yielding the results
+
+        note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
+        It also ignores empty directories, symlinks and special files.
+        """
+
+        cwd=os.getcwd()
+        os.chdir(start_path)
+
+        def walkerror(e):
+            raise e
+
+        try:
+            for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
+                for f in filenames:
+                    file_path=os.path.join(dirpath, f)[2:]
+
+                    if (not os.path.islink(file_path)) and os.path.isfile(file_path):
+                        for (chunk_nr, hash) in self.block_hasher.generate(file_path):
+                            yield ( file_path, chunk_nr, hash )
+        finally:
+            os.chdir(cwd)
+
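And how the two classes compose, roughly as ZfsCheck does below (the mount point is a placeholder):

    from zfs_autobackup.BlockHasher import BlockHasher
    from zfs_autobackup.TreeHasher import TreeHasher

    tree_hasher = TreeHasher(BlockHasher(count=10000, bs=4096))

    # yields (relative file path, chunk_nr, hexdigest) for every regular file in the tree
    for file_path, chunk_nr, hexdigest in tree_hasher.generate("/mnt/some_mounted_snapshot"):
        print("{}\t{}\t{}".format(file_path, chunk_nr, hexdigest))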
zfs_autobackup/ZfsCheck.py
@@ -1,8 +1,10 @@
 from __future__ import print_function
 
+import time
 from signal import signal, SIGPIPE
 
+from .TreeHasher import TreeHasher
+from .BlockHasher import BlockHasher
 from .ZfsNode import ZfsNode
 from .util import *
 from .CliBase import CliBase
@@ -62,9 +64,11 @@ class ZfsCheck(CliBase):
 
         snapshot.mount(mnt)
 
+        tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs))
+
         self.debug("Hashing tree: {}".format(mnt))
         if not self.args.test:
-            for (file, block, hash) in block_hash_tree(mnt, count, bs):
+            for (file, block, hash) in tree_hasher.generate(mnt):
                 print("{}\t{}\t{}".format(file, block, hash))
                 sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect
 
@@ -113,14 +117,14 @@ class ZfsCheck(CliBase):
     def hash_volume(self, snapshot, count, bs):
         try:
             dev=self.activate_volume_snapshot(snapshot)
+            block_hasher=BlockHasher(count=count, bs=bs)
 
             self.debug("Hashing dev: {}".format(dev))
             if not self.args.test:
-                for (block, hash) in block_hash(dev, count, bs):
+                for (block, hash) in block_hasher.generate(dev):
                     print("{}\t{}".format(block, hash))
                     sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect
 
 
         finally:
             self.deacitvate_volume_snapshot(snapshot)
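The tab-separated lines printed above are presumably meant to be captured and compared between source and target (the commented-out compare() stub in BlockHasher.py points the same way). A rough sketch of such a comparison; the file names and the helper are illustrative only:

    def read_hash_lines(path):
        # each line is "<file>\t<chunk_nr>\t<hash>" (tree) or "<chunk_nr>\t<hash>" (volume)
        with open(path) as f:
            return [tuple(line.rstrip("\n").split("\t")) for line in f]

    source = read_hash_lines("source.hashes")   # illustrative capture of stdout on the source side
    target = read_hash_lines("target.hashes")

    for src, tgt in zip(source, target):
        if src != tgt:
            print("MISMATCH: {} != {}".format(src, tgt))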
zfs_autobackup/util.py
@@ -1,5 +1,3 @@
-import hashlib
-
 # root@psyt14s:/home/psy/zfs_autobackup# ls -lh /home/psy/Downloads/carimage.zip
 # -rw-rw-r-- 1 psy psy 990M Nov 26 2020 /home/psy/Downloads/carimage.zip
 # root@psyt14s:/home/psy/zfs_autobackup# time sha1sum /home/psy/Downloads/carimage.zip
@@ -18,60 +16,6 @@ import hashlib
 import os
 import platform
 import sys
-import time
-
-
-def block_hash(fname, count=10000, bs=4096):
-    """This function was created to checksum huge files and blockdevices (TB's)
-    Instead of one sha1sum of the whole file, it generates sha1susms of chunks of the file.
-
-    yields sha1 hash of fname, per count blocks.
-    yields(chunk_nr, hexdigest)
-
-    yields nothing for empty files.
-
-    """
-
-    with open(fname, "rb") as f:
-        hash = hashlib.sha1()
-        block_nr = 0
-        chunk_nr = 0
-        for block in iter(lambda: f.read(bs), b""):
-            hash.update(block)
-            block_nr = block_nr + 1
-            if block_nr % count == 0:
-                yield (chunk_nr, hash.hexdigest())
-                chunk_nr = chunk_nr + 1
-                hash = hashlib.sha1()
-
-        # yield last (incomplete) block
-        if block_nr % count != 0:
-            yield (chunk_nr, hash.hexdigest())
-
-
-def block_hash_tree(start_path, count=10000, bs=4096):
-    """block_hash every file in a tree, yielding the results
-
-    note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
-    It also ignores empty directories, symlinks and special files.
-    """
-
-    cwd=os.getcwd()
-    os.chdir(start_path)
-
-    def walkerror(e):
-        raise e
-
-    try:
-        for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
-            for f in filenames:
-                file_path=os.path.join(dirpath, f)[2:]
-
-                if (not os.path.islink(file_path)) and os.path.isfile(file_path):
-                    for (chunk_nr, hash) in block_hash(file_path, count, bs):
-                        yield ( file_path, chunk_nr, hash )
-    finally:
-        os.chdir(cwd)
-
-
 def tmp_name(suffix=""):