This commit is contained in:
Edwin Eefting 2022-03-07 19:34:13 +01:00
parent 28ed44b1c8
commit b68ca19e5f
3 changed files with 149 additions and 34 deletions

View File

@ -1,4 +1,6 @@
import hashlib
import os
from random import random
class BlockHasher():
@ -12,14 +14,45 @@ class BlockHasher():
Input and output generators are in the format ( chunk_nr, hexdigest )
"""
def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, coverage=1):
def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, skip=0):
self.count = count
self.bs = bs
self.chunk_size=bs*count
self.hash_class = hash_class
self.coverage=1
self.stats_total=0
self.stats_checked=0
# self.coverage=coverage
self.skip=skip
self._skip_count=0
self.stats_total_bytes=0
def _seek_next_chunk(self, fh, fsize):
"""seek fh to next chunk and update skip counter.
returns chunk_nr
return false it should skip the rest of the file"""
#ignore rempty files
if fsize==0:
return False
# need to skip chunks?
if self._skip_count > 0:
chunks_left = ((fsize - fh.tell()) // self.chunk_size) + 1
# not enough chunks left in this file?
if self._skip_count >= chunks_left:
# skip rest of this file
self._skip_count = self._skip_count - chunks_left
return False
else:
# seek to next chunk, reset skip count
fh.seek(self.chunk_size * self._skip_count, os.SEEK_CUR)
self._skip_count = self.skip
return fh.tell()//self.chunk_size
else:
# should read this chunk, reset skip count
self._skip_count = self.skip
return fh.tell() // self.chunk_size
def generate(self, fname):
    """Generate checksums for the chunks of a file.

    The file is read in blocks of ``self.bs`` bytes and ``self.count`` blocks
    are hashed together as one chunk. ``self._seek_next_chunk`` positions the
    file at the next chunk to read, or ends the file early.

    Args:
        fname: path of the file to read.

    Yields:
        ( chunk_nr, hexdigest ) for every chunk that was read, including an
        incomplete last chunk. Yields nothing for empty files.
    """
    with open(fname, "rb") as fh:
        # seek() returns the new absolute position, so this is the file size
        fsize = fh.seek(0, os.SEEK_END)
        fh.seek(0)

        while fh.tell() < fsize:
            chunk_nr = self._seek_next_chunk(fh, fsize)
            # 0 is a valid chunk number, so compare by identity
            if chunk_nr is False:
                return

            # read chunk
            hasher = self.hash_class()
            block_nr = 0
            while block_nr != self.count:
                block = fh.read(self.bs)
                if block == b"":
                    break
                hasher.update(block)
                block_nr = block_nr + 1

            # Nothing could be read at this position: stop instead of
            # yielding the digest of empty data (or looping forever).
            if block_nr == 0:
                return

            # complete chunks and the last (incomplete) chunk are both yielded
            yield (chunk_nr, hasher.hexdigest())
def compare(self, fname, generator):
def compare(self, fname, generator):
"""reads from generator and compares blocks
Yields mismatches in the form: ( chunk_nr, hexdigest, actual_hexdigest)
Yields errors in the form: ( chunk_nr, hexdigest, "message" )

View File

@ -20,10 +20,7 @@ class ZfsCheck(CliBase):
self.node = ZfsNode(self.log, readonly=self.args.test, debug_output=self.args.debug_output)
if self.args.check is None:
self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size)
else:
self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size, coverage=self.args.percentage)
self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size, skip=self.args.skip)
def get_parser(self):
@ -37,13 +34,13 @@ class ZfsCheck(CliBase):
group.add_argument('--block-size', metavar="BYTES", default=4096, help="Read block-size, default %(default)s",
type=int)
group.add_argument('--count', metavar="COUNT", default=int((100 * (1024 ** 2)) / 4096),
help="Hash chunks of COUNT blocks. Default %(default)s . (Chunk size is BYTES * COUNT) ", type=int) # 100MiB
help="Hash chunks of COUNT blocks. Default %(default)s . (CHUNK size is BYTES * COUNT) ", type=int) # 100MiB
group.add_argument('--check', '-c', metavar="FILE", default=None, const=True, nargs='?',
help="Read hashes from STDIN (or FILE) and compare them")
group.add_argument('--percentage', '-p', metavar="NUMBER", default=100, type=float,
help="Generate/compare only this percentage of hashes. Default %(default)s")
group.add_argument('--skip', '-s', metavar="NUMBER", default=0, type=float,
help="Skip this number of chunks after every hash. %(default)s")
return parser
@ -61,10 +58,9 @@ class ZfsCheck(CliBase):
self.verbose("Block size : {} bytes".format(args.block_size))
self.verbose("Block count : {}".format(args.count))
self.verbose("Effective chunk size : {} bytes".format(args.count*args.block_size))
self.verbose("Percentage to check : {} %".format(args.percentage))
self.verbose("Skip chunk count : {} (checks {:.2f}% of data)".format(args.skip, 100/(1+args.skip)))
self.verbose("")
args.percentage=args.percentage/100
return args
@ -216,28 +212,30 @@ class ZfsCheck(CliBase):
last_progress_time = time.time()
progress_checked = 0
progress_total = 0
progress_skipped = 0
line=input_fh.readline()
skip=0
while line:
i=line.rstrip().split("\t")
#ignores lines without tabs
if (len(i)>1):
if self.args.percentage==1 or self.args.percentage>random():
if skip==0:
progress_checked=progress_checked+1
yield i
progress_total=progress_total+1
skip=self.args.skip
else:
skip=skip-1
progress_skipped=progress_skipped+1
if self.args.progress and time.time() - last_progress_time > 1:
last_progress_time = time.time()
self.progress("Checked {}/{} hashes. ({:.2f}% coverage)".format(progress_checked, progress_total, (float(progress_checked)/progress_total)*100))
self.progress("Checked {} hashes (skipped {})".format(progress_checked, progress_skipped))
line=input_fh.readline()
self.verbose("Checked {}/{} hashes. ({:.2f}% coverage)".format(progress_checked, progress_total, (
float(progress_checked) / progress_total) * 100))
self.verbose("Checked {} hashes (skipped {})".format(progress_checked, progress_skipped))
def run(self):

View File

@ -0,0 +1,70 @@
import os.path
import os
import time
from random import random
# Scratch script with two parts: a timing loop for seek-to-end on an open file,
# followed by two experiments that simulate skipping items to reach a target coverage.
# NOTE(review): indentation is not visible in this view, so the nesting described
# in the comments below is presumed -- TODO confirm against the real file.
with open('test.py', 'rb') as fh:
# fsize = fh.seek(10000, os.SEEK_END)
# print(fsize)
# Timing part: take a start timestamp, run 1,000,000 iterations (presumably of the
# fh.seek(0, os.SEEK_END) call below), then print the elapsed seconds and the position.
start=time.time()
for i in range(0,1000000):
# fh.seek(0, 0)
fsize=fh.seek(0, os.SEEK_END)
# fsize=fh.tell()
# os.path.getsize('test.py')
print(time.time()-start)
print(fh.tell())
# NOTE(review): `sys` is not among the imports visible at the top of this file
# (os.path, os, time, random), so this call would raise NameError unless it is
# imported elsewhere. If it does run, nothing below this line is executed.
sys.exit(0)
# Shared state for the experiments below: counters start at 1, target coverage is 0.1.
checked=1
skipped=1
coverage=0.1
max_skip=0
skipinarow=0
# Experiment 1: per iteration, skip when random() is above `coverage`, print the
# running checked percentage, and track the longest run of consecutive skips.
# NOTE(review): no break is visible for this `while True` loop.
while True:
total=checked+skipped
skip=coverage<random()
if skip:
skipped = skipped + 1
print("S {:.2f}%".format(checked * 100 / total))
skipinarow = skipinarow+1
if skipinarow>max_skip:
max_skip=skipinarow
else:
skipinarow=0
checked=checked+1
print("C {:.2f}%".format(checked * 100 / total))
print(max_skip)
# Experiment 2: countdown-based skipping. While `skip` is positive an item is
# skipped and the counter decremented; otherwise an item is checked and (presumably
# in the same branch) `skip` is increased by ((1/coverage)-1) scaled by random()*2.
# NOTE(review): no break is visible for this `while True` loop either.
skip=0
while True:
total=checked+skipped
if skip>0:
skip=skip-1
skipped = skipped + 1
print("S {:.2f}%".format(checked * 100 / total))
else:
checked=checked+1
print("C {:.2f}%".format(checked * 100 / total))
#calc new skip
skip=skip+((1/coverage)-1)*(random()*2)
# print(skip)
if skip> max_skip:
max_skip=skip
print(max_skip)