Mirror of https://github.com/psy0rz/zfs_autobackup.git
extract BlockHasher and TreeHasher classes

commit a2f85690a3
parent a807ec320e
@@ -1,4 +1,5 @@
 from basetest import *
+from zfs_autobackup.BlockHasher import BlockHasher
 
 
 class TestZfsCheck(unittest2.TestCase):
@@ -17,24 +18,25 @@ class TestZfsCheck(unittest2.TestCase):
         # 959e6b58078f0cfd2fb3d37e978fda51820473ff whole_whole2
         # 309ffffba2e1977d12f3b7469971f30d28b94bd8 whole_whole2_partial
 
+        block_hasher=BlockHasher(count=1)
 
         self.assertEqual(
-            list(block_hash("tests/data/empty", count=1)),
+            list(block_hasher.generate("tests/data/empty")),
             []
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/partial", count=1)),
+            list(block_hasher.generate("tests/data/partial")),
             [(0, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1")]
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole", count=1)),
+            list(block_hasher.generate("tests/data/whole")),
             [(0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7")]
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"),
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798")
@@ -42,7 +44,7 @@ class TestZfsCheck(unittest2.TestCase):
         )
 
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=1)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "3c0bf91170d873b8e327d3bafb6bc074580d11b7"), #whole
                 (1, "2e863f1fcccd6642e4e28453eba10d2d3f74d798"), #whole2
@@ -50,16 +52,18 @@ class TestZfsCheck(unittest2.TestCase):
             ]
         )
 
+        block_hasher=BlockHasher(count=2)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=2)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "959e6b58078f0cfd2fb3d37e978fda51820473ff"), #whole_whole2
                 (1, "642027d63bb0afd7e0ba197f2c66ad03e3d70de1") #partial
             ]
         )
 
+        block_hasher=BlockHasher(count=10)
         self.assertEqual(
-            list(block_hash("tests/data/whole_whole2_partial", count=10)),
+            list(block_hasher.generate("tests/data/whole_whole2_partial")),
             [
                 (0, "309ffffba2e1977d12f3b7469971f30d28b94bd8"), #whole_whole2_partial
             ])
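The sha1 values in the comments above (e.g. 309ffffba… for whole_whole2_partial) appear to be whole-file sha1sums of the fixtures, so when count*bs covers the entire fixture, BlockHasher should yield a single chunk whose digest equals a plain hashlib digest. A small sanity sketch of that relationship (the size assumption and the script itself are mine, not part of the commit):

    import hashlib

    from zfs_autobackup.BlockHasher import BlockHasher

    # Assumption: the fixture is smaller than count*bs = 10*4096 bytes,
    # which the count=10 test above implies (it yields exactly one chunk).
    path = "tests/data/whole_whole2_partial"

    with open(path, "rb") as f:
        whole_file_digest = hashlib.sha1(f.read()).hexdigest()

    chunks = list(BlockHasher(count=10, bs=4096).generate(path))
    assert chunks == [(0, whole_file_digest)]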
zfs_autobackup/BlockHasher.py (new file, 45 lines)
@@ -0,0 +1,45 @@
+import hashlib
+
+
+class BlockHasher():
+    """This class was created to checksum huge files and blockdevices (TB's)
+    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.
+
+    The chunksize is count*bs (bs is the read blocksize from disk)
+
+    It's also possible to only read a certain percentage of blocks to just check a sample.
+    """
+    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1):
+        self.count=count
+        self.bs=bs
+        self.hash_class=hash_class
+
+
+    def generate(self, fname):
+        """Generates checksums
+
+        yields (chunk_nr, hexdigest)
+
+        yields nothing for empty files.
+        """
+        with open(fname, "rb") as f:
+            hash = self.hash_class()
+            block_nr = 0
+            chunk_nr = 0
+            for block in iter(lambda: f.read(self.bs), b""):
+                hash.update(block)
+                block_nr = block_nr + 1
+                if block_nr % self.count == 0:
+                    yield (chunk_nr, hash.hexdigest())
+                    chunk_nr = chunk_nr + 1
+                    hash = self.hash_class()
+
+            # yield last (incomplete) chunk
+            if block_nr % self.count != 0:
+                yield (chunk_nr, hash.hexdigest())
+
+    # def compare(fname, generator):
+    #     """reads from generators and compares blocks"""
+    #
+    #     with open(fname, "rb") as f:
+    #         for ( count, bs , chunk_nr, hexdigest) in input_generator:
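A minimal usage sketch of the new class, for orientation (the path and parameters are placeholders, not from the commit):

    from zfs_autobackup.BlockHasher import BlockHasher

    # one digest per chunk of count*bs bytes; 256 * 4096 bytes = 1 MiB here
    block_hasher = BlockHasher(count=256, bs=4096)

    for chunk_nr, hexdigest in block_hasher.generate("/tmp/some_big_file"):
        print("{}\t{}".format(chunk_nr, hexdigest))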
zfs_autobackup/TreeHasher.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+import os
+
+
+class TreeHasher():
+    """uses BlockHasher recursively on a directory tree"""
+
+    def __init__(self, block_hasher):
+        self.block_hasher=block_hasher
+
+    def generate(self, start_path):
+        """Use BlockHasher on every file in a tree, yielding the results
+
+        note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
+        It also ignores empty directories, symlinks and special files.
+        """
+
+        cwd=os.getcwd()
+        os.chdir(start_path)
+
+        def walkerror(e):
+            raise e
+
+        try:
+            for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
+                for f in filenames:
+                    file_path=os.path.join(dirpath, f)[2:]
+
+                    if (not os.path.islink(file_path)) and os.path.isfile(file_path):
+                        for (chunk_nr, hash) in self.block_hasher.generate(file_path):
+                            yield ( file_path, chunk_nr, hash )
+        finally:
+            os.chdir(cwd)
+
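And how the two classes compose, roughly as ZfsCheck does below (the mount point is a placeholder):

    from zfs_autobackup.BlockHasher import BlockHasher
    from zfs_autobackup.TreeHasher import TreeHasher

    tree_hasher = TreeHasher(BlockHasher(count=10000, bs=4096))

    # yields (relative file path, chunk_nr, hexdigest) for every regular file in the tree
    for file_path, chunk_nr, hexdigest in tree_hasher.generate("/mnt/some_mounted_snapshot"):
        print("{}\t{}\t{}".format(file_path, chunk_nr, hexdigest))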
zfs_autobackup/ZfsCheck.py
@@ -1,8 +1,10 @@
 from __future__ import print_function
 
+import time
 from signal import signal, SIGPIPE
 
+from .TreeHasher import TreeHasher
+from .BlockHasher import BlockHasher
 from .ZfsNode import ZfsNode
 from .util import *
 from .CliBase import CliBase
@@ -62,9 +64,11 @@ class ZfsCheck(CliBase):
 
         snapshot.mount(mnt)
 
+        tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs))
+
         self.debug("Hashing tree: {}".format(mnt))
         if not self.args.test:
-            for (file, block, hash) in block_hash_tree(mnt, count, bs):
+            for (file, block, hash) in tree_hasher.generate(mnt):
                 print("{}\t{}\t{}".format(file, block, hash))
                 sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect
 
@@ -113,14 +117,14 @@ class ZfsCheck(CliBase):
     def hash_volume(self, snapshot, count, bs):
         try:
             dev=self.activate_volume_snapshot(snapshot)
+            block_hasher=BlockHasher(count=count, bs=bs)
 
             self.debug("Hashing dev: {}".format(dev))
             if not self.args.test:
-                for (block, hash) in block_hash(dev, count, bs):
+                for (block, hash) in block_hasher.generate(dev):
                     print("{}\t{}".format(block, hash))
                     sys.stdout.flush() #important, to generate SIGPIPES on ssh disconnect
 
 
         finally:
             self.deacitvate_volume_snapshot(snapshot)
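The tab-separated lines printed above are presumably meant to be captured and compared between source and target (the commented-out compare() stub in BlockHasher.py points the same way). A rough sketch of such a comparison; the file names and the helper are illustrative only:

    def read_hash_lines(path):
        # each line is "<file>\t<chunk_nr>\t<hash>" (tree) or "<chunk_nr>\t<hash>" (volume)
        with open(path) as f:
            return [tuple(line.rstrip("\n").split("\t")) for line in f]

    source = read_hash_lines("source.hashes")   # illustrative capture of stdout on the source side
    target = read_hash_lines("target.hashes")

    for src, tgt in zip(source, target):
        if src != tgt:
            print("MISMATCH: {} != {}".format(src, tgt))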
zfs_autobackup/util.py
@@ -1,5 +1,3 @@
-import hashlib
-
 # root@psyt14s:/home/psy/zfs_autobackup# ls -lh /home/psy/Downloads/carimage.zip
 # -rw-rw-r-- 1 psy psy 990M Nov 26 2020 /home/psy/Downloads/carimage.zip
 # root@psyt14s:/home/psy/zfs_autobackup# time sha1sum /home/psy/Downloads/carimage.zip
@@ -18,60 +16,6 @@ import hashlib
 import os
 import platform
 import sys
-import time
-
-
-def block_hash(fname, count=10000, bs=4096):
-    """This function was created to checksum huge files and blockdevices (TB's)
-    Instead of one sha1sum of the whole file, it generates sha1susms of chunks of the file.
-
-    yields sha1 hash of fname, per count blocks.
-    yields(chunk_nr, hexdigest)
-
-    yields nothing for empty files.
-
-    """
-
-    with open(fname, "rb") as f:
-        hash = hashlib.sha1()
-        block_nr = 0
-        chunk_nr = 0
-        for block in iter(lambda: f.read(bs), b""):
-            hash.update(block)
-            block_nr = block_nr + 1
-            if block_nr % count == 0:
-                yield (chunk_nr, hash.hexdigest())
-                chunk_nr = chunk_nr + 1
-                hash = hashlib.sha1()
-
-        # yield last (incomplete) block
-        if block_nr % count != 0:
-            yield (chunk_nr, hash.hexdigest())
-
-
-def block_hash_tree(start_path, count=10000, bs=4096):
-    """block_hash every file in a tree, yielding the results
-
-    note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
-    It also ignores empty directories, symlinks and special files.
-    """
-
-    cwd=os.getcwd()
-    os.chdir(start_path)
-
-    def walkerror(e):
-        raise e
-
-    try:
-        for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
-            for f in filenames:
-                file_path=os.path.join(dirpath, f)[2:]
-
-                if (not os.path.islink(file_path)) and os.path.isfile(file_path):
-                    for (chunk_nr, hash) in block_hash(file_path, count, bs):
-                        yield ( file_path, chunk_nr, hash )
-    finally:
-        os.chdir(cwd)
-
-
 def tmp_name(suffix=""):