extract BlockHasher and TreeHasher classes

Edwin Eefting 2022-02-21 13:49:05 +01:00
parent a807ec320e
commit a2f85690a3
5 changed files with 97 additions and 67 deletions

View File: tests/test_zfscheck.py

@@ -1,4 +1,5 @@
 from basetest import *
+from zfs_autobackup.BlockHasher import BlockHasher

 class TestZfsCheck(unittest2.TestCase):
@@ -17,24 +18,25 @@ class TestZfsCheck(unittest2.TestCase):
         # 959e6b58078f0cfd2fb3d37e978fda51820473ff whole_whole2
         # 309ffffba2e1977d12f3b7469971f30d28b94bd8 whole_whole2_partial

+        block_hasher=BlockHasher(count=1)
         self.assertEqual(
-            list(block_hash("tests/data/empty", count=1)),
+            list(block_hasher.generate("tests/data/empty")),
             []
         )

         self.assertEqual(
-            list(block_hash("tests/data/partial", count=1)),
+            list(block_hasher.generate("tests/data/partial")),
             [(0, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1")]
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole", count=1)),
+            list(block_hasher.generate("tests/data/whole")),
             [(0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7")]
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"),
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798")
@@ -42,7 +44,7 @@ class TestZfsCheck(unittest2.TestCase):
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"), #whole
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798"), #whole2
@@ -50,16 +52,18 @@ class TestZfsCheck(unittest2.TestCase):
             ]
         )

+        block_hasher=BlockHasher(count=2)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=2)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "959e6b58078f0cfd2fb3d37e978fda51820473ff"), #whole_whole2
                 (1, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1") #partial
             ]
         )

+        block_hasher=BlockHasher(count=10)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=10)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "309ffffba2e1977d12f3b7469971f30d28b94bd8"), #whole_whole2_partial
             ])
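
Each expected digest above is simply the SHA-1 of the bytes in that chunk, so the fixtures are easy to cross-check by hand. A minimal sketch (not part of the commit; it assumes the tests/data fixture files are present and uses the default 4096-byte blocksize):

import hashlib

# with count=1 and bs=4096, chunk 0 of tests/data/whole is its first 4096 bytes
with open("tests/data/whole", "rb") as f:
    print(hashlib.sha1(f.read(4096)).hexdigest())
    # expected: 3c0bf91170d873b8e327d3bafb6bc074580d11b7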

View File: zfs_autobackup/BlockHasher.py

@@ -0,0 +1,45 @@
+import hashlib
+
+class BlockHasher():
+    """This class was created to checksum huge files and block devices (TBs).
+    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.
+    The chunk size is count*bs (bs is the blocksize used when reading from disk).
+    It's also possible to read only a certain percentage of the blocks, to just check a sample.
+    """
+
+    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1):
+        self.count=count
+        self.bs=bs
+        self.hash_class=hash_class
+
+    def generate(self, fname):
+        """Generates checksums.
+        Yields (chunk_nr, hexdigest) tuples.
+        Yields nothing for empty files.
+        """
+        with open(fname, "rb") as f:
+            hash = self.hash_class()
+            block_nr = 0
+            chunk_nr = 0
+            for block in iter(lambda: f.read(self.bs), b""):
+                hash.update(block)
+                block_nr = block_nr + 1
+                if block_nr % self.count == 0:
+                    yield (chunk_nr, hash.hexdigest())
+                    chunk_nr = chunk_nr + 1
+                    hash = self.hash_class()
+
+            # yield last (incomplete) chunk
+            if block_nr % self.count != 0:
+                yield (chunk_nr, hash.hexdigest())
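
As a standalone illustration (not part of this commit), the class can be used directly; this sketch assumes a file named bigfile exists in the current directory:

from zfs_autobackup.BlockHasher import BlockHasher

# with the defaults, each chunk covers 10000 blocks * 4096 bytes, roughly 40MB
block_hasher = BlockHasher(count=10000, bs=4096)
for (chunk_nr, hexdigest) in block_hasher.generate("bigfile"):
    print("{}\t{}".format(chunk_nr, hexdigest))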
+
+    # def compare(fname, generator):
+    #     """reads from a generator and compares chunks"""
+    #
+    #     with open(fname, "rb") as f:
+    #         for (count, bs, chunk_nr, hexdigest) in generator:
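
The commented-out compare() stub hints at the intended counterpart of generate(): re-read the file and check it against previously recorded digests. One hedged way to flesh that out, assuming it consumes the same (chunk_nr, hexdigest) tuples that generate() yields (the function name and signature here are hypothetical; the commit leaves this unimplemented):

from zfs_autobackup.BlockHasher import BlockHasher

def compare_hashes(block_hasher, fname, generator):
    # hypothetical sketch: re-hash fname and yield (chunk_nr, expected, actual)
    # for every chunk whose digest doesn't match the recorded one.
    # note: a truncated file (fewer chunks than recorded) is not detected by this simple zip()
    for ((expected_nr, expected), (actual_nr, actual)) in zip(generator, block_hasher.generate(fname)):
        if (expected_nr, expected) != (actual_nr, actual):
            yield (expected_nr, expected, actual)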

View File: zfs_autobackup/TreeHasher.py

@@ -0,0 +1,33 @@
+import os
+
+class TreeHasher():
+    """Uses a BlockHasher recursively on a directory tree."""
+
+    def __init__(self, block_hasher):
+        self.block_hasher=block_hasher
+
+    def generate(self, start_path):
+        """Use the BlockHasher on every file in a tree, yielding the results.
+        Note that it only checks the contents of actual files; it ignores metadata like permissions and mtimes.
+        It also ignores empty directories, symlinks and special files.
+        """
+        cwd=os.getcwd()
+        os.chdir(start_path)
+
+        def walkerror(e):
+            raise e
+
+        try:
+            for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
+                for f in filenames:
+                    file_path=os.path.join(dirpath, f)[2:]
+                    if (not os.path.islink(file_path)) and os.path.isfile(file_path):
+                        for (chunk_nr, hash) in self.block_hasher.generate(file_path):
+                            yield (file_path, chunk_nr, hash)
+        finally:
+            os.chdir(cwd)
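
Worth noting: generate() chdir()s into start_path and strips the leading "./" with [2:], so the yielded paths are relative, which keeps hash streams comparable between a source and a target tree. Combining the two classes then looks like this sketch (not part of the commit; it assumes a directory named mytree exists):

from zfs_autobackup.BlockHasher import BlockHasher
from zfs_autobackup.TreeHasher import TreeHasher

# hash every regular file under ./mytree, in chunks of 10000 blocks of 4096 bytes
tree_hasher = TreeHasher(BlockHasher(count=10000, bs=4096))
for (file_path, chunk_nr, hexdigest) in tree_hasher.generate("mytree"):
    print("{}\t{}\t{}".format(file_path, chunk_nr, hexdigest))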

View File: zfs_autobackup/ZfsCheck.py

@ -1,8 +1,10 @@
from __future__ import print_function
import time
from signal import signal, SIGPIPE
from .TreeHasher import TreeHasher
from .BlockHasher import BlockHasher
from .ZfsNode import ZfsNode
from .util import *
from .CliBase import CliBase
@@ -62,9 +64,11 @@ class ZfsCheck(CliBase):
         snapshot.mount(mnt)

+        tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs))
         self.debug("Hashing tree: {}".format(mnt))
         if not self.args.test:
-            for (file, block, hash) in block_hash_tree(mnt, count, bs):
+            for (file, block, hash) in tree_hasher.generate(mnt):
                 print("{}\t{}\t{}".format(file, block, hash))
                 sys.stdout.flush()  # important, to generate SIGPIPEs on ssh disconnect
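
The flushed, tab-separated output is what makes the check streamable over ssh: hash the same snapshot on two machines and compare the streams line by line. A hypothetical helper for consuming that output (the function name and framing are illustrative, not part of the commit):

import sys

def parse_tree_hashes(lines):
    # parse the "file<TAB>chunk_nr<TAB>hexdigest" lines that hash_tree() prints
    for line in lines:
        (file_path, chunk_nr, hexdigest) = line.rstrip("\n").split("\t")
        yield (file_path, int(chunk_nr), hexdigest)

# e.g. inspect a stream received on stdin:
for entry in parse_tree_hashes(sys.stdin):
    print(entry)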
@@ -113,14 +117,14 @@ class ZfsCheck(CliBase):
     def hash_volume(self, snapshot, count, bs):
         try:
             dev=self.activate_volume_snapshot(snapshot)
+            block_hasher=BlockHasher(count=count, bs=bs)

             self.debug("Hashing dev: {}".format(dev))
             if not self.args.test:
-                for (block, hash) in block_hash(dev, count, bs):
+                for (block, hash) in block_hasher.generate(dev):
                     print("{}\t{}".format(block, hash))
                     sys.stdout.flush()  # important, to generate SIGPIPEs on ssh disconnect
         finally:
             self.deacitvate_volume_snapshot(snapshot)
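
Since BlockHasher only open()s and read()s its argument, the same generate() works for block devices as for regular files; activate_volume_snapshot() just has to expose the zvol snapshot as a device node first. An illustrative sketch (the device path is hypothetical; zvol snapshots typically appear under /dev/zvol/<pool>/<volume>@<snapshot> once made visible):

from zfs_autobackup.BlockHasher import BlockHasher

# hash a (hypothetical) zvol snapshot device in ~40MB chunks
block_hasher = BlockHasher(count=10000, bs=4096)
for (chunk_nr, hexdigest) in block_hasher.generate("/dev/zvol/rpool/myvol@mysnap"):
    print("{}\t{}".format(chunk_nr, hexdigest))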

View File: zfs_autobackup/util.py

@@ -1,5 +1,3 @@
-import hashlib
-
 # root@psyt14s:/home/psy/zfs_autobackup# ls -lh /home/psy/Downloads/carimage.zip
 # -rw-rw-r-- 1 psy psy 990M Nov 26 2020 /home/psy/Downloads/carimage.zip
 # root@psyt14s:/home/psy/zfs_autobackup# time sha1sum /home/psy/Downloads/carimage.zip
@@ -18,60 +16,6 @@ import hashlib
 import os
 import platform
 import sys
-import time
-
-
-def block_hash(fname, count=10000, bs=4096):
-    """This function was created to checksum huge files and block devices (TBs).
-    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.
-    Yields sha1 hashes of fname, one per count blocks, as (chunk_nr, hexdigest) tuples.
-    Yields nothing for empty files.
-    """
-    with open(fname, "rb") as f:
-        hash = hashlib.sha1()
-        block_nr = 0
-        chunk_nr = 0
-        for block in iter(lambda: f.read(bs), b""):
-            hash.update(block)
-            block_nr = block_nr + 1
-            if block_nr % count == 0:
-                yield (chunk_nr, hash.hexdigest())
-                chunk_nr = chunk_nr + 1
-                hash = hashlib.sha1()
-
-        # yield last (incomplete) chunk
-        if block_nr % count != 0:
-            yield (chunk_nr, hash.hexdigest())
-
-
-def block_hash_tree(start_path, count=10000, bs=4096):
-    """block_hash every file in a tree, yielding the results.
-    Note that it only checks the contents of actual files; it ignores metadata like permissions and mtimes.
-    It also ignores empty directories, symlinks and special files.
-    """
-    cwd=os.getcwd()
-    os.chdir(start_path)
-
-    def walkerror(e):
-        raise e
-
-    try:
-        for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
-            for f in filenames:
-                file_path=os.path.join(dirpath, f)[2:]
-                if (not os.path.islink(file_path)) and os.path.isfile(file_path):
-                    for (chunk_nr, hash) in block_hash(file_path, count, bs):
-                        yield (file_path, chunk_nr, hash)
-    finally:
-        os.chdir(cwd)

 def tmp_name(suffix=""):