From bf2300beb9e1d85ead771af8076f53e479a7e335 Mon Sep 17 00:00:00 2001 From: kyle0r Date: Wed, 22 Nov 2023 04:48:30 +0000 Subject: [PATCH] Fixes #225 zfs-check: efficient handling of sparse files BlockHasher.py * hash_class is sourced from cli args instead of hardcoding it. * hash_factory() lays the groundwork to support arbitrary hash libs. * Detection of and use of xxhash lib. ZfsCheck.py * Implement new cli arg --hash. The choices for the arg are generated based on what is detected in the python env. * The input to --hash is validated against the arg choices. * Implemented helper method determine_algorithms_available(). This tries to pick a performant default with a fallback to sha1. * Detection of and use of xxhash lib. --- zfs_autobackup/BlockHasher.py | 18 ++++++++++++++---- zfs_autobackup/ZfsCheck.py | 22 +++++++++++++++++++++- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/zfs_autobackup/BlockHasher.py b/zfs_autobackup/BlockHasher.py index 0a37c90..ff7eaa6 100644 --- a/zfs_autobackup/BlockHasher.py +++ b/zfs_autobackup/BlockHasher.py @@ -1,6 +1,11 @@ import hashlib import os +xxhash = None +try: + import xxhash +except: + pass class BlockHasher(): """This class was created to checksum huge files and blockdevices (TB's) @@ -16,7 +21,7 @@ class BlockHasher(): """ - def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, skip=0): + def __init__(self, count=10000, bs=4096, hash_class=None, skip=0): self.count = count self.bs = bs self.chunk_size=bs*count @@ -28,6 +33,11 @@ class BlockHasher(): self.stats_total_bytes=0 + def hash_factory(self): + if self.hash_class in hashlib.algorithms_available: + return hashlib.new(self.hash_class) + if self.hash_class.startswith('xxh'): + return getattr(xxhash, self.hash_class)() def _seek_next_chunk(self, fh, fsize): """seek fh to next chunk and update skip counter. 
@@ -80,7 +90,7 @@ class BlockHasher(): return #read chunk - hash = self.hash_class() + hash = self.hash_factory() block_nr = 0 while block_nr != self.count: block=fh.read(self.bs) @@ -105,7 +115,7 @@ class BlockHasher(): try: checked = checked + 1 - hash = self.hash_class() + hash = self.hash_factory() f.seek(int(chunk_nr) * self.bs * self.count) block_nr = 0 for block in iter(lambda: f.read(self.bs), b""): @@ -124,4 +134,4 @@ class BlockHasher(): yield ( chunk_nr , hexdigest, 'ERROR: '+str(e)) except Exception as e: - yield ( '-', '-', 'ERROR: '+ str(e)) \ No newline at end of file + yield ( '-', '-', 'ERROR: '+ str(e)) diff --git a/zfs_autobackup/ZfsCheck.py b/zfs_autobackup/ZfsCheck.py index 10fe8ae..c904047 100644 --- a/zfs_autobackup/ZfsCheck.py +++ b/zfs_autobackup/ZfsCheck.py @@ -10,6 +10,13 @@ from .ZfsNode import ZfsNode from .util import * from .CliBase import CliBase +from hashlib import algorithms_available +from copy import copy +xxhash = None +try: + import xxhash +except: + pass class ZfsCheck(CliBase): @@ -20,7 +27,17 @@ class ZfsCheck(CliBase): self.node = ZfsNode(self.log, utc=self.args.utc, readonly=self.args.test, debug_output=self.args.debug_output) - self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size, skip=self.args.skip) + self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size, skip=self.args.skip, hash_class=self.args.hash) + + def determine_algorithms_available(self): + self.algorithms_available = copy(algorithms_available) + + if None != xxhash: + for value in ( 'xxh128', 'xxh32', 'xxh3_128', 'xxh3_64', 'xxh64' ): + self.algorithms_available.add(value) + self.hash_default = 'xxh3_64' + else: + self.hash_default = 'sha1' def get_parser(self): @@ -42,6 +59,9 @@ class ZfsCheck(CliBase): group.add_argument('--skip', '-s', metavar="NUMBER", default=0, type=int, help="Skip this number of chunks after every hash. 
%(default)s") + self.determine_algorithms_available() + group.add_argument('--hash', default=self.hash_default, + help="Specify the hashing algorithm to use", choices=sorted([item for item in self.algorithms_available])) return parser def parse_args(self, argv):