mirror of https://github.com/psy0rz/zfs_autobackup.git (synced 2025-04-11 22:40:01 +03:00)
extract BlockHasher and TreeHasher classes
This commit is contained in:
parent
a807ec320e
commit
a2f85690a3
@ -1,4 +1,5 @@
 from basetest import *
+from zfs_autobackup.BlockHasher import BlockHasher


 class TestZfsCheck(unittest2.TestCase):
@ -17,24 +18,25 @@ class TestZfsCheck(unittest2.TestCase):
         # 959e6b58078f0cfd2fb3d37e978fda51820473ff whole_whole2
         # 309ffffba2e1977d12f3b7469971f30d28b94bd8 whole_whole2_partial

+        block_hasher=BlockHasher(count=1)

         self.assertEqual(
-            list(block_hash("tests/data/empty", count=1)),
+            list(block_hasher.generate("tests/data/empty")),
             []
         )

         self.assertEqual(
-            list(block_hash("tests/data/partial", count=1)),
+            list(block_hasher.generate("tests/data/partial")),
             [(0, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1")]
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole", count=1)),
+            list(block_hasher.generate("tests/data/whole")),
             [(0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7")]
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"),
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798")
@ -42,7 +44,7 @@ class TestZfsCheck(unittest2.TestCase):
         )

         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"), #whole
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798"), #whole2
@ -50,16 +52,18 @@ class TestZfsCheck(unittest2.TestCase):
             ]
         )

+        block_hasher=BlockHasher(count=2)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=2)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "959e6b58078f0cfd2fb3d37e978fda51820473ff"), #whole_whole2
                 (1, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1") #partial
             ]
         )

+        block_hasher=BlockHasher(count=10)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=10)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "309ffffba2e1977d12f3b7469971f30d28b94bd8"), #whole_whole2_partial
             ])
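For orientation (not part of the commit): with the default bs=4096, the count argument decides how many 4096-byte blocks are hashed together, which is what the count=1/2/10 cases above exercise. A minimal, Linux-friendly sketch with a throwaway file (the temp file and its size are made up for illustration):

    import tempfile
    from zfs_autobackup.BlockHasher import BlockHasher

    with tempfile.NamedTemporaryFile() as f:
        f.write(b"x" * (4096 * 2 + 100))  # two full 4096-byte blocks plus a partial tail
        f.flush()

        # count=1: one hash per block -> three (chunk_nr, hexdigest) tuples
        print(list(BlockHasher(count=1).generate(f.name)))

        # count=2: chunks of 2*4096 bytes -> two tuples, the second covering the partial tail
        print(list(BlockHasher(count=2).generate(f.name)))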
zfs_autobackup/BlockHasher.py (new file)
@ -0,0 +1,45 @@
import hashlib


class BlockHasher():
    """This class was created to checksum huge files and blockdevices (TB's)
    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.

    The chunksize is count*bs (bs is the read blocksize from disk)

    It's also possible to only read a certain percentage of blocks to just check a sample.
    """
    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1):
        self.count=count
        self.bs=bs
        self.hash_class=hash_class


    def generate(self, fname):
        """Generates checksums

        yields(chunk_nr, hexdigest)

        yields nothing for empty files.
        """
        with open(fname, "rb") as f:
            hash = self.hash_class()
            block_nr = 0
            chunk_nr = 0
            for block in iter(lambda: f.read(self.bs), b""):
                hash.update(block)
                block_nr = block_nr + 1
                if block_nr % self.count == 0:
                    yield (chunk_nr, hash.hexdigest())
                    chunk_nr = chunk_nr + 1
                    hash = self.hash_class()

            # yield last (incomplete) chunk
            if block_nr % self.count != 0:
                yield (chunk_nr, hash.hexdigest())

    # def compare(fname, generator):
    #     """reads from generators and compares blocks"""
    #
    #     with open(fname, "rb") as f:
    #         for (count, bs, chunk_nr, hexdigest) in input_generator:
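A hypothetical usage sketch (not part of the commit; file paths are made up): because generate() yields one (chunk_nr, hexdigest) tuple per count*bs bytes, two copies of a large file can be compared chunk by chunk instead of via a single whole-file hash:

    from zfs_autobackup.BlockHasher import BlockHasher

    hasher = BlockHasher(count=10000, bs=4096)  # one hash per 10000*4096 bytes
    for (left, right) in zip(hasher.generate("/tmp/copy_a.img"),
                             hasher.generate("/tmp/copy_b.img")):
        if left != right:
            print("chunk {} differs: {} != {}".format(left[0], left[1], right[1]))
    # note: zip() stops at the shorter stream, so files that may differ in size
    # still need a separate length check

A fresh hash object is started for every chunk, so a corrupted region only invalidates the chunk it falls in rather than the whole file's checksum.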
zfs_autobackup/TreeHasher.py (new file)
@ -0,0 +1,33 @@
import os


class TreeHasher():
    """uses BlockHasher recursively on a directory tree"""

    def __init__(self, block_hasher):
        self.block_hasher=block_hasher

    def generate(self, start_path):
        """Use BlockHasher on every file in a tree, yielding the results

        note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
        It also ignores empty directories, symlinks and special files.
        """

        cwd=os.getcwd()
        os.chdir(start_path)

        def walkerror(e):
            raise e

        try:
            for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
                for f in filenames:
                    file_path=os.path.join(dirpath, f)[2:]

                    if (not os.path.islink(file_path)) and os.path.isfile(file_path):
                        for (chunk_nr, hash) in self.block_hasher.generate(file_path):
                            yield ( file_path, chunk_nr, hash )
        finally:
            os.chdir(cwd)
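A hypothetical sketch (not part of the commit; the mount point is made up) showing TreeHasher combined with BlockHasher, which is how ZfsCheck.hash_tree uses them in the hunks below:

    from zfs_autobackup.BlockHasher import BlockHasher
    from zfs_autobackup.TreeHasher import TreeHasher

    tree_hasher = TreeHasher(BlockHasher(count=10000, bs=4096))
    for (file_path, chunk_nr, hexdigest) in tree_hasher.generate("/mnt/some_snapshot"):
        print("{}\t{}\t{}".format(file_path, chunk_nr, hexdigest))

Note that generate() temporarily chdir()s into start_path, so the yielded paths are relative to that directory and the generator should not be interleaved with other code that depends on the current working directory.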
@ -1,8 +1,10 @@
 from __future__ import print_function

 import time
 from signal import signal, SIGPIPE


+from .TreeHasher import TreeHasher
+from .BlockHasher import BlockHasher
 from .ZfsNode import ZfsNode
 from .util import *
 from .CliBase import CliBase
@ -62,9 +64,11 @@ class ZfsCheck(CliBase):

         snapshot.mount(mnt)

+        tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs))

         self.debug("Hashing tree: {}".format(mnt))
         if not self.args.test:
-            for (file, block, hash) in block_hash_tree(mnt, count, bs):
+            for (file, block, hash) in tree_hasher.generate(mnt):
                 print("{}\t{}\t{}".format(file, block, hash))
                 sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect

@ -113,14 +117,14 @@ class ZfsCheck(CliBase):
     def hash_volume(self, snapshot, count, bs):
         try:
             dev=self.activate_volume_snapshot(snapshot)
+            block_hasher=BlockHasher(count=count, bs=bs)

             self.debug("Hashing dev: {}".format(dev))
             if not self.args.test:
-                for (block, hash) in block_hash(dev, count, bs):
+                for (block, hash) in block_hasher.generate(dev):
                     print("{}\t{}".format(block, hash))
                     sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect


         finally:
             self.deacitvate_volume_snapshot(snapshot)

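The tab-separated lines printed by hash_tree and hash_volume are meant to be captured (typically over ssh, hence the explicit flush for SIGPIPE) and compared against the same listing produced elsewhere. A hypothetical sketch of such a comparison for volume output (file names are made up; the "<chunk>\t<hash>" format matches the print above):

    def read_hashes(path):
        """read one '<chunk_nr>\t<hexdigest>' pair per line into a dict"""
        with open(path) as f:
            return dict(line.rstrip("\n").split("\t", 1) for line in f if line.strip())

    left = read_hashes("/tmp/source.hashes")
    right = read_hashes("/tmp/target.hashes")
    for chunk in sorted(set(left) | set(right), key=int):
        if left.get(chunk) != right.get(chunk):
            print("chunk {} differs".format(chunk))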
@ -1,5 +1,3 @@
 import hashlib

 # root@psyt14s:/home/psy/zfs_autobackup# ls -lh /home/psy/Downloads/carimage.zip
 # -rw-rw-r-- 1 psy psy 990M Nov 26 2020 /home/psy/Downloads/carimage.zip
 # root@psyt14s:/home/psy/zfs_autobackup# time sha1sum /home/psy/Downloads/carimage.zip
@ -18,60 +16,6 @@ import hashlib
 import os
 import platform
 import sys
 import time



-def block_hash(fname, count=10000, bs=4096):
-    """This function was created to checksum huge files and blockdevices (TB's)
-    Instead of one sha1sum of the whole file, it generates sha1susms of chunks of the file.
-
-    yields sha1 hash of fname, per count blocks.
-    yields(chunk_nr, hexdigest)
-
-    yields nothing for empty files.
-
-    """
-
-    with open(fname, "rb") as f:
-        hash = hashlib.sha1()
-        block_nr = 0
-        chunk_nr = 0
-        for block in iter(lambda: f.read(bs), b""):
-            hash.update(block)
-            block_nr = block_nr + 1
-            if block_nr % count == 0:
-                yield (chunk_nr, hash.hexdigest())
-                chunk_nr = chunk_nr + 1
-                hash = hashlib.sha1()
-
-        # yield last (incomplete) block
-        if block_nr % count != 0:
-            yield (chunk_nr, hash.hexdigest())
-
-def block_hash_tree(start_path, count=10000, bs=4096):
-    """block_hash every file in a tree, yielding the results
-
-    note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
-    It also ignores empty directories, symlinks and special files.
-    """
-
-    cwd=os.getcwd()
-    os.chdir(start_path)
-
-    def walkerror(e):
-        raise e
-
-    try:
-        for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
-            for f in filenames:
-                file_path=os.path.join(dirpath, f)[2:]
-
-                if (not os.path.islink(file_path)) and os.path.isfile(file_path):
-                    for (chunk_nr, hash) in block_hash(file_path, count, bs):
-                        yield ( file_path, chunk_nr, hash )
-    finally:
-        os.chdir(cwd)


 def tmp_name(suffix=""):