extract BlockHasher and TreeHasher classes

Edwin Eefting 2022-02-21 13:49:05 +01:00
parent a807ec320e
commit a2f85690a3
5 changed files with 97 additions and 67 deletions

View File: tests/test_zfscheck.py

@@ -1,4 +1,5 @@
 from basetest import *
+from zfs_autobackup.BlockHasher import BlockHasher

 class TestZfsCheck(unittest2.TestCase):
@@ -17,24 +18,25 @@ class TestZfsCheck(unittest2.TestCase):
         # 959e6b58078f0cfd2fb3d37e978fda51820473ff whole_whole2
         # 309ffffba2e1977d12f3b7469971f30d28b94bd8 whole_whole2_partial

+        block_hasher=BlockHasher(count=1)
         self.assertEqual(
-            list(block_hash("tests/data/empty", count=1)),
+            list(block_hasher.generate("tests/data/empty")),
             []
         )

         self.assertEqual(
-            list(block_hash("tests/data/partial", count=1)),
+            list(block_hasher.generate("tests/data/partial")),
             [(0, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1")]
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole", count=1)),
+            list(block_hasher.generate("tests/data/whole")),
             [(0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7")]
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"),
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798")
@@ -42,7 +44,7 @@ class TestZfsCheck(unittest2.TestCase):
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"), #whole
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798"), #whole2
@@ -50,16 +52,18 @@ class TestZfsCheck(unittest2.TestCase):
             ]
         )

+        block_hasher=BlockHasher(count=2)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=2)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "959e6b58078f0cfd2fb3d37e978fda51820473ff"), #whole_whole2
                 (1, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1") #partial
             ]
         )

+        block_hasher=BlockHasher(count=10)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=10)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "309ffffba2e1977d12f3b7469971f30d28b94bd8"), #whole_whole2_partial
             ])
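
Each expected digest above is simply the SHA-1 of the bytes in that chunk, so the fixtures are easy to cross-check by hand. A minimal sketch (not part of the commit; it assumes the tests/data fixture files are present and uses the default 4096-byte blocksize):

import hashlib

# with count=1 and bs=4096, chunk 0 of tests/data/whole is its first 4096 bytes
with open("tests/data/whole", "rb") as f:
    print(hashlib.sha1(f.read(4096)).hexdigest())
    # expected: 3c0bf91170d873b8e327d3bafb6bc074580d11b7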

View File: zfs_autobackup/BlockHasher.py

@@ -0,0 +1,45 @@
+import hashlib
+
+class BlockHasher():
+    """This class was created to checksum huge files and block devices (TBs).
+    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.
+    The chunk size is count*bs (bs is the blocksize used when reading from disk).
+    It's also possible to read only a certain percentage of the blocks, to just check a sample.
+    """
+
+    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1):
+        self.count=count
+        self.bs=bs
+        self.hash_class=hash_class
+
+    def generate(self, fname):
+        """Generates checksums.
+        Yields (chunk_nr, hexdigest) tuples.
+        Yields nothing for empty files.
+        """
+        with open(fname, "rb") as f:
+            hash = self.hash_class()
+            block_nr = 0
+            chunk_nr = 0
+            for block in iter(lambda: f.read(self.bs), b""):
+                hash.update(block)
+                block_nr = block_nr + 1
+                if block_nr % self.count == 0:
+                    yield (chunk_nr, hash.hexdigest())
+                    chunk_nr = chunk_nr + 1
+                    hash = self.hash_class()
+
+            # yield last (incomplete) chunk
+            if block_nr % self.count != 0:
+                yield (chunk_nr, hash.hexdigest())
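
As a standalone illustration (not part of this commit), the class can be used directly; this sketch assumes a file named bigfile exists in the current directory:

from zfs_autobackup.BlockHasher import BlockHasher

# with the defaults, each chunk covers 10000 blocks * 4096 bytes, roughly 40MB
block_hasher = BlockHasher(count=10000, bs=4096)
for (chunk_nr, hexdigest) in block_hasher.generate("bigfile"):
    print("{}\t{}".format(chunk_nr, hexdigest))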
+
+    # def compare(fname, generator):
+    #     """reads from a generator and compares chunks"""
+    #
+    #     with open(fname, "rb") as f:
+    #         for (count, bs, chunk_nr, hexdigest) in generator:
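
The commented-out compare() stub hints at the intended counterpart of generate(): re-read the file and check it against previously recorded digests. One hedged way to flesh that out, assuming it consumes the same (chunk_nr, hexdigest) tuples that generate() yields (the function name and signature here are hypothetical; the commit leaves this unimplemented):

from zfs_autobackup.BlockHasher import BlockHasher

def compare_hashes(block_hasher, fname, generator):
    # hypothetical sketch: re-hash fname and yield (chunk_nr, expected, actual)
    # for every chunk whose digest doesn't match the recorded one.
    # note: a truncated file (fewer chunks than recorded) is not detected by this simple zip()
    for ((expected_nr, expected), (actual_nr, actual)) in zip(generator, block_hasher.generate(fname)):
        if (expected_nr, expected) != (actual_nr, actual):
            yield (expected_nr, expected, actual)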

View File: zfs_autobackup/TreeHasher.py

@@ -0,0 +1,33 @@
+import os
+
+class TreeHasher():
+    """Uses a BlockHasher recursively on a directory tree."""
+
+    def __init__(self, block_hasher):
+        self.block_hasher=block_hasher
+
+    def generate(self, start_path):
+        """Use the BlockHasher on every file in a tree, yielding the results.
+        Note that it only checks the contents of actual files; it ignores metadata like permissions and mtimes.
+        It also ignores empty directories, symlinks and special files.
+        """
+        cwd=os.getcwd()
+        os.chdir(start_path)
+
+        def walkerror(e):
+            raise e
+
+        try:
+            for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
+                for f in filenames:
+                    file_path=os.path.join(dirpath, f)[2:]
+                    if (not os.path.islink(file_path)) and os.path.isfile(file_path):
+                        for (chunk_nr, hash) in self.block_hasher.generate(file_path):
+                            yield (file_path, chunk_nr, hash)
+        finally:
+            os.chdir(cwd)
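
Worth noting: generate() chdir()s into start_path and strips the leading "./" with [2:], so the yielded paths are relative, which keeps hash streams comparable between a source and a target tree. Combining the two classes then looks like this sketch (not part of the commit; it assumes a directory named mytree exists):

from zfs_autobackup.BlockHasher import BlockHasher
from zfs_autobackup.TreeHasher import TreeHasher

# hash every regular file under ./mytree, in chunks of 10000 blocks of 4096 bytes
tree_hasher = TreeHasher(BlockHasher(count=10000, bs=4096))
for (file_path, chunk_nr, hexdigest) in tree_hasher.generate("mytree"):
    print("{}\t{}\t{}".format(file_path, chunk_nr, hexdigest))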

View File: zfs_autobackup/ZfsCheck.py

@ -1,8 +1,10 @@
from __future__ import print_function
import time
from signal import signal, SIGPIPE
from .TreeHasher import TreeHasher
from .BlockHasher import BlockHasher
from .ZfsNode import ZfsNode
from .util import *
from .CliBase import CliBase
@@ -62,9 +64,11 @@ class ZfsCheck(CliBase):
         snapshot.mount(mnt)

+        tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs))
         self.debug("Hashing tree: {}".format(mnt))
         if not self.args.test:
-            for (file, block, hash) in block_hash_tree(mnt, count, bs):
+            for (file, block, hash) in tree_hasher.generate(mnt):
                 print("{}\t{}\t{}".format(file, block, hash))
                 sys.stdout.flush()  # important, to generate SIGPIPEs on ssh disconnect
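
The flushed, tab-separated output is what makes the check streamable over ssh: hash the same snapshot on two machines and compare the streams line by line. A hypothetical helper for consuming that output (the function name and framing are illustrative, not part of the commit):

import sys

def parse_tree_hashes(lines):
    # parse the "file<TAB>chunk_nr<TAB>hexdigest" lines that hash_tree() prints
    for line in lines:
        (file_path, chunk_nr, hexdigest) = line.rstrip("\n").split("\t")
        yield (file_path, int(chunk_nr), hexdigest)

# e.g. inspect a stream received on stdin:
for entry in parse_tree_hashes(sys.stdin):
    print(entry)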
@@ -113,14 +117,14 @@ class ZfsCheck(CliBase):
     def hash_volume(self, snapshot, count, bs):
         try:
             dev=self.activate_volume_snapshot(snapshot)
+            block_hasher=BlockHasher(count=count, bs=bs)

             self.debug("Hashing dev: {}".format(dev))
             if not self.args.test:
-                for (block, hash) in block_hash(dev, count, bs):
+                for (block, hash) in block_hasher.generate(dev):
                     print("{}\t{}".format(block, hash))
                     sys.stdout.flush()  # important, to generate SIGPIPEs on ssh disconnect
         finally:
             self.deacitvate_volume_snapshot(snapshot)
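
Since BlockHasher only open()s and read()s its argument, the same generate() works for block devices as for regular files; activate_volume_snapshot() just has to expose the zvol snapshot as a device node first. An illustrative sketch (the device path is hypothetical; zvol snapshots typically appear under /dev/zvol/<pool>/<volume>@<snapshot> once made visible):

from zfs_autobackup.BlockHasher import BlockHasher

# hash a (hypothetical) zvol snapshot device in ~40MB chunks
block_hasher = BlockHasher(count=10000, bs=4096)
for (chunk_nr, hexdigest) in block_hasher.generate("/dev/zvol/rpool/myvol@mysnap"):
    print("{}\t{}".format(chunk_nr, hexdigest))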

View File: zfs_autobackup/util.py

@@ -1,5 +1,3 @@
-import hashlib
-
 # root@psyt14s:/home/psy/zfs_autobackup# ls -lh /home/psy/Downloads/carimage.zip
 # -rw-rw-r-- 1 psy psy 990M Nov 26 2020 /home/psy/Downloads/carimage.zip
 # root@psyt14s:/home/psy/zfs_autobackup# time sha1sum /home/psy/Downloads/carimage.zip
@@ -18,60 +16,6 @@ import hashlib
 import os
 import platform
 import sys
-import time
-
-
-def block_hash(fname, count=10000, bs=4096):
-    """This function was created to checksum huge files and block devices (TBs).
-    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.
-    Yields sha1 hashes of fname, one per count blocks, as (chunk_nr, hexdigest) tuples.
-    Yields nothing for empty files.
-    """
-    with open(fname, "rb") as f:
-        hash = hashlib.sha1()
-        block_nr = 0
-        chunk_nr = 0
-        for block in iter(lambda: f.read(bs), b""):
-            hash.update(block)
-            block_nr = block_nr + 1
-            if block_nr % count == 0:
-                yield (chunk_nr, hash.hexdigest())
-                chunk_nr = chunk_nr + 1
-                hash = hashlib.sha1()
-
-        # yield last (incomplete) chunk
-        if block_nr % count != 0:
-            yield (chunk_nr, hash.hexdigest())
-
-
-def block_hash_tree(start_path, count=10000, bs=4096):
-    """block_hash every file in a tree, yielding the results.
-    Note that it only checks the contents of actual files; it ignores metadata like permissions and mtimes.
-    It also ignores empty directories, symlinks and special files.
-    """
-    cwd=os.getcwd()
-    os.chdir(start_path)
-
-    def walkerror(e):
-        raise e
-
-    try:
-        for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
-            for f in filenames:
-                file_path=os.path.join(dirpath, f)[2:]
-                if (not os.path.islink(file_path)) and os.path.isfile(file_path):
-                    for (chunk_nr, hash) in block_hash(file_path, count, bs):
-                        yield (file_path, chunk_nr, hash)
-    finally:
-        os.chdir(cwd)

 def tmp_name(suffix=""):