diff --git a/zfs_autobackup/BlockHasher.py b/zfs_autobackup/BlockHasher.py index 255aadd..44bb8a6 100644 --- a/zfs_autobackup/BlockHasher.py +++ b/zfs_autobackup/BlockHasher.py @@ -1,4 +1,6 @@ import hashlib +import os +from random import random class BlockHasher(): @@ -12,14 +14,45 @@ class BlockHasher(): Input and output generators are in the format ( chunk_nr, hexdigest ) """ - def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, coverage=1): + def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, skip=0): self.count = count self.bs = bs + self.chunk_size=bs*count self.hash_class = hash_class - self.coverage=1 - self.stats_total=0 - self.stats_checked=0 + # self.coverage=coverage + self.skip=skip + self._skip_count=0 + + self.stats_total_bytes=0 + + + def _seek_next_chunk(self, fh, fsize): + """seek fh to next chunk and update skip counter. + returns chunk_nr + return false it should skip the rest of the file""" + + #ignore rempty files + if fsize==0: + return False + + # need to skip chunks? + if self._skip_count > 0: + chunks_left = ((fsize - fh.tell()) // self.chunk_size) + 1 + # not enough chunks left in this file? + if self._skip_count >= chunks_left: + # skip rest of this file + self._skip_count = self._skip_count - chunks_left + return False + else: + # seek to next chunk, reset skip count + fh.seek(self.chunk_size * self._skip_count, os.SEEK_CUR) + self._skip_count = self.skip + return fh.tell()//self.chunk_size + else: + # should read this chunk, reset skip count + self._skip_count = self.skip + return fh.tell() // self.chunk_size def generate(self, fname): """Generates checksums @@ -28,23 +61,37 @@ class BlockHasher(): yields nothing for empty files. """ - with open(fname, "rb") as f: - hash = self.hash_class() - block_nr = 0 - chunk_nr = 0 - for block in iter(lambda: f.read(self.bs), b""): - hash.update(block) - block_nr = block_nr + 1 - if block_nr % self.count == 0: - yield (chunk_nr, hash.hexdigest()) - chunk_nr = chunk_nr + 1 - hash = self.hash_class() + with os.open(fname, os.O_RDONLY) as fh: + print (os.lseek(fh, 0, os.SEEK_END)) + + + with os.openopen(fname, "rb") as fh: + + # print(os.path.getsize(fname)) + print(os.lseek(fh, 0, os.SEEK_END)) + + fsize = fh.seek(0, os.SEEK_END) + fh.seek(0) + + while fh.tell()1): - if self.args.percentage==1 or self.args.percentage>random(): + if skip==0: progress_checked=progress_checked+1 yield i - - progress_total=progress_total+1 + skip=self.args.skip + else: + skip=skip-1 + progress_skipped=progress_skipped+1 if self.args.progress and time.time() - last_progress_time > 1: last_progress_time = time.time() - self.progress("Checked {}/{} hashes. ({:.2f}% coverage)".format(progress_checked, progress_total, (float(progress_checked)/progress_total)*100)) + self.progress("Checked {} hashes (skipped {})".format(progress_checked, progress_skipped)) line=input_fh.readline() - self.verbose("Checked {}/{} hashes. ({:.2f}% coverage)".format(progress_checked, progress_total, ( - float(progress_checked) / progress_total) * 100)) + self.verbose("Checked {} hashes (skipped {})".format(progress_checked, progress_skipped)) def run(self): diff --git a/zfs_autobackup/test.py b/zfs_autobackup/test.py index e69de29..6c1e39b 100644 --- a/zfs_autobackup/test.py +++ b/zfs_autobackup/test.py @@ -0,0 +1,70 @@ +import os.path +import os +import time +from random import random + +with open('test.py', 'rb') as fh: + + # fsize = fh.seek(10000, os.SEEK_END) + # print(fsize) + + start=time.time() + for i in range(0,1000000): + # fh.seek(0, 0) + fsize=fh.seek(0, os.SEEK_END) + # fsize=fh.tell() + # os.path.getsize('test.py') + print(time.time()-start) + + + print(fh.tell()) + +sys.exit(0) + + + +checked=1 +skipped=1 +coverage=0.1 + +max_skip=0 + + +skipinarow=0 +while True: + total=checked+skipped + + skip=coveragemax_skip: + max_skip=skipinarow + else: + skipinarow=0 + checked=checked+1 + print("C {:.2f}%".format(checked * 100 / total)) + + print(max_skip) + +skip=0 +while True: + + total=checked+skipped + if skip>0: + skip=skip-1 + skipped = skipped + 1 + print("S {:.2f}%".format(checked * 100 / total)) + else: + checked=checked+1 + print("C {:.2f}%".format(checked * 100 / total)) + + #calc new skip + skip=skip+((1/coverage)-1)*(random()*2) + # print(skip) + if skip> max_skip: + max_skip=skip + + print(max_skip)