From 0d882ec031144cb46461f375eb09334e432a9fef Mon Sep 17 00:00:00 2001 From: Edwin Eefting Date: Tue, 22 Feb 2022 16:59:08 +0100 Subject: [PATCH] comparing input now functions --- zfs_autobackup/BlockHasher.py | 10 ++- zfs_autobackup/TreeHasher.py | 49 +++++++------- zfs_autobackup/ZfsCheck.py | 119 ++++++++++++++++++++++------------ 3 files changed, 107 insertions(+), 71 deletions(-) diff --git a/zfs_autobackup/BlockHasher.py b/zfs_autobackup/BlockHasher.py index 199ba5b..c891b44 100644 --- a/zfs_autobackup/BlockHasher.py +++ b/zfs_autobackup/BlockHasher.py @@ -8,6 +8,8 @@ class BlockHasher(): The chunksize is count*bs (bs is the read blocksize from disk) Its also possible to only read a certain percentage of blocks to just check a sample. + + Input and output generators are in the format ( chunk_nr, hexdigest ) """ def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1): @@ -39,7 +41,11 @@ class BlockHasher(): yield (chunk_nr, hash.hexdigest()) def compare(self, fname, generator): - """reads from generator and compares blocks, yields mismatches""" + """reads from generator and compares blocks + Yields mismatches in the form: ( chunk_nr, hexdigest, actual_hexdigest) + Yields errors in the form: ( chunk_nr, hexdigest, "message" ) + + """ try: checked = 0 @@ -49,7 +55,7 @@ class BlockHasher(): checked = checked + 1 hash = self.hash_class() - f.seek(chunk_nr * self.bs * self.count) + f.seek(int(chunk_nr) * self.bs * self.count) block_nr = 0 for block in iter(lambda: f.read(self.bs), b""): hash.update(block) diff --git a/zfs_autobackup/TreeHasher.py b/zfs_autobackup/TreeHasher.py index c243d08..b787475 100644 --- a/zfs_autobackup/TreeHasher.py +++ b/zfs_autobackup/TreeHasher.py @@ -3,7 +3,11 @@ import os class TreeHasher(): - """uses BlockHasher recursively on a directory tree""" + """uses BlockHasher recursively on a directory tree + + Input and output generators are in the format: ( relative-filepath, chunk_nr, hexdigest) + + """ def __init__(self, 
block_hasher): """ @@ -19,44 +23,37 @@ class TreeHasher(): It also ignores empty directories, symlinks and special files. """ - cwd=os.getcwd() - os.chdir(start_path) - def walkerror(e): raise e - try: - for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror): - for f in filenames: - file_path=os.path.join(dirpath, f)[2:] + for (dirpath, dirnames, filenames) in os.walk(start_path, onerror=walkerror): + for f in filenames: + file_path=os.path.join(dirpath, f) - if (not os.path.islink(file_path)) and os.path.isfile(file_path): - for (chunk_nr, hash) in self.block_hasher.generate(file_path): - yield ( file_path, chunk_nr, hash ) - finally: - os.chdir(cwd) + if (not os.path.islink(file_path)) and os.path.isfile(file_path): + for (chunk_nr, hash) in self.block_hasher.generate(file_path): + yield ( os.path.relpath(file_path,start_path), chunk_nr, hash ) def compare(self, start_path, generator): - """reads from generator and compares blocks, raises exception on error + """reads from generator and compares blocks + + yields mismatches in the form: ( relative_filename, chunk_nr, compare_hexdigest, actual_hexdigest ) + yields errors in the form: ( relative_filename, chunk_nr, compare_hexdigest, "message" ) + """ - cwd=os.getcwd() - os.chdir(start_path) count=0 - try: - def filter_file_name( file_name, chunk_nr, hexdigest): - return ( chunk_nr, hexdigest ) + def filter_file_name( file_name, chunk_nr, hexdigest): + return ( chunk_nr, hexdigest ) - for file_name, group_generator in itertools.groupby(generator, lambda x: x[0]): - count=count+1 - block_generator=itertools.starmap(filter_file_name, group_generator) - for ( chunk_nr, compare_hexdigest, actual_hexdigest) in self.block_hasher.compare(file_name, block_generator): - yield ( file_name, chunk_nr, compare_hexdigest, actual_hexdigest ) - finally: - os.chdir(cwd) + for file_name, group_generator in itertools.groupby(generator, lambda x: x[0]): + count=count+1 + 
block_generator=itertools.starmap(filter_file_name, group_generator) + for ( chunk_nr, compare_hexdigest, actual_hexdigest) in self.block_hasher.compare(os.path.join(start_path,file_name), block_generator): + yield ( file_name, chunk_nr, compare_hexdigest, actual_hexdigest ) diff --git a/zfs_autobackup/ZfsCheck.py b/zfs_autobackup/ZfsCheck.py index cf54d66..de982e2 100644 --- a/zfs_autobackup/ZfsCheck.py +++ b/zfs_autobackup/ZfsCheck.py @@ -14,17 +14,18 @@ class ZfsCheck(CliBase): def __init__(self, argv, print_arguments=True): - # NOTE: common options and parameters are in ZfsAuto + # NOTE: common options argument parsing are in CliBase super(ZfsCheck, self).__init__(argv, print_arguments) self.node = ZfsNode(self.log, readonly=self.args.test, debug_output=self.args.debug_output) + self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size) def get_parser(self): parser = super(ZfsCheck, self).get_parser() # positional arguments - parser.add_argument('snapshot', metavar='SNAPSHOT', default=None, nargs='?', help='Snapshot to checksum') + parser.add_argument('target', metavar='TARGET', default=None, nargs='?', help='Target to checksum. 
(can be blockdevice, directory or ZFS snapshot)') group = parser.add_argument_group('Hasher options') @@ -45,13 +46,13 @@ class ZfsCheck(CliBase): self.warning("TEST MODE - NOT DOING ANYTHING USEFULL") self.log.show_debug = True # show at least what we would do - if args.snapshot is None: - self.error("Please specify SNAPSHOT") + if args.target is None: + self.error("Please specify TARGET") sys.exit(1) return args - def hash_filesystem(self, snapshot, count, bs): + def generate_zfs_filesystem(self, snapshot, input_generator): """ recursively hash all files in this snapshot, using block_hash_tree() :type snapshot: ZfsDataset.ZfsDataset @@ -64,19 +65,16 @@ class ZfsCheck(CliBase): snapshot.mount(mnt) - tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs)) + tree_hasher=TreeHasher(self.block_hasher) self.debug("Hashing tree: {}".format(mnt)) if not self.args.test: - - # generator=tree_hasher.generate(mnt) - # tree_hasher.compare(mnt, generator) - - - for (file, block, hash) in tree_hasher.generate(mnt): - print("{}\t{}\t{}".format(file, block, hash)) - sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect - + if input_generator: + for i in tree_hasher.compare(mnt, input_generator): + yield i + else: + for i in tree_hasher.generate(mnt): + yield i finally: snapshot.unmount() @@ -119,24 +117,26 @@ class ZfsCheck(CliBase): clone = snapshot.zfs_node.get_dataset(clone_name) clone.destroy(deferred=True, verbose=False) - def hash_volume(self, snapshot, count, bs): + def generate_zfs_volume(self, snapshot, input_generator): try: dev=self.activate_volume_snapshot(snapshot) - block_hasher=BlockHasher(count=count, bs=bs) self.debug("Hashing dev: {}".format(dev)) if not self.args.test: - for (block, hash) in block_hasher.generate(dev): - print("{}\t{}".format(block, hash)) - sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect + if input_generator: + for i in self.block_hasher.compare(dev, input_generator): + yield i + else: + for i in 
self.block_hasher.generate(dev): + yield i finally: self.deacitvate_volume_snapshot(snapshot) - def run(self): - - snapshot = self.node.get_dataset(self.args.snapshot) + def generate_zfs_target(self, input_generator): + """specified target is a ZFS snapshot""" + snapshot = self.node.get_dataset(self.args.target) if not snapshot.exists: snapshot.error("Snapshot not found") sys.exit(1) @@ -147,15 +147,64 @@ class ZfsCheck(CliBase): dataset_type = snapshot.parent.properties['type'] - snapshot.verbose("Generating checksums...") - if dataset_type == 'volume': - self.hash_volume(snapshot, self.args.count, self.args.block_size) + return self.generate_zfs_volume(snapshot, input_generator) elif dataset_type == 'filesystem': - self.hash_filesystem(snapshot, self.args.count, self.args.block_size) + return self.generate_zfs_filesystem(snapshot, input_generator) else: raise Exception("huh?") + def generate(self, input_generator=None): + """generate checksums or compare (and generate error messages)""" + + if '@' in self.args.target: + self.verbose("Assuming target {} is ZFS snapshot.".format(self.args.target)) + return self.generate_zfs_target(input_generator) + elif os.path.isdir(self.args.target): + self.verbose("Target {} is directory, checking recursively.".format(self.args.target)) + return self.check_path(input_generator) + elif os.path.isfile(self.args.target): + self.verbose("Target {} is single file or blockdevice.".format(self.args.target)) + + def input_parser(self, file_name): + """parse input lines and generate items to use in compare functions""" + with open(file_name, 'r') as input_fh: + for line in input_fh: + i=line.rstrip().split("\t") + #ignores lines without tabs + if (len(i)>1): + yield i + + def run(self): + + try: + #run as generator + if self.args.check==None: + for i in self.generate(input_generator=None): + if len(i)==3: + print("{}\t{}\t{}".format(*i)) + else: + print("{}\t{}".format(*i)) + sys.stdout.flush() + #run as compare + else: + 
input_generator=self.input_parser(self.args.check) + for i in self.generate(input_generator): + if len(i)==4: + (file_name, chunk_nr, compare_hexdigest, actual_hexdigest)=i + self.log.error("{}\t{}\t{}\t{}".format(file_name, chunk_nr, compare_hexdigest, actual_hexdigest)) + else: + (chunk_nr, compare_hexdigest, actual_hexdigest) = i + self.log.error("{}\t{}\t{}".format(chunk_nr, compare_hexdigest, actual_hexdigest)) + + except Exception as e: + self.error("Exception: " + str(e)) + if self.args.debug: + raise + return 255 + except KeyboardInterrupt: + self.error("Aborted") + return 255 def cli(): import sys @@ -163,22 +212,6 @@ def cli(): sys.exit(ZfsCheck(sys.argv[1:], False).run()) - # block_hasher=BlockHasher() - - # if sys.argv[1]=="s": - # for ( fname, nr, hash ) in TreeHasher(block_hasher).generate("/usr/src/linux-headers-5.14.14-051414"): - # print("{}\t{}\t{}".format(fname, nr, hash)) - # - # if sys.argv[1]=="r": - # - # def gen(): - # for line in sys.stdin: - # ( fname, nr, hash)=line.rstrip().split('\t') - # yield (fname, int(nr), hash) - # - # TreeHasher(block_hasher).compare("/usr/src/linux-headers-5.14.14-051414", gen()) - - if __name__ == "__main__": cli()