wip

2025-07-07 16:18:43 +03:00 · 2022-03-07 19:34:13 +01:00 · 2022-03-07 19:34:13 +01:00 · b68ca19e5f
commit b68ca19e5f
parent 28ed44b1c8
3 changed files with 149 additions and 34 deletions
--- a/zfs_autobackup/BlockHasher.py
+++ b/zfs_autobackup/BlockHasher.py
@ -1,4 +1,6 @@
 import hashlib
+import os
+from random import random


 class BlockHasher():
@ -12,14 +14,45 @@ class BlockHasher():
    Input and output generators are in the format ( chunk_nr, hexdigest )
    """

-    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, coverage=1):
+    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, skip=0):
        self.count = count
        self.bs = bs
+        self.chunk_size=bs*count  # bytes per chunk: count blocks of bs bytes
        self.hash_class = hash_class
-        self.coverage=1

-        self.stats_total=0
-        self.stats_checked=0
+        # self.coverage=coverage
+        self.skip=skip  # chunks to skip after every hashed chunk
+        self._skip_count=0  # chunks still to skip before the next hashed chunk
+
+        self.stats_total_bytes=0
+
+
+    def _seek_next_chunk(self, fh, fsize):
+        """Seek fh to the next chunk to hash and update the skip counter.
+        Returns the chunk number (fh.tell() // chunk_size) to read next, or
+        False if the rest of this file should be skipped."""
+
+        # ignore empty files
+        if fsize==0:
+            return False
+
+        # chunks still to be skipped (left over from an earlier call)?
+        if self._skip_count > 0:
+            chunks_left = ((fsize - fh.tell()) // self.chunk_size) + 1
+            # not enough chunks left in this file? (NOTE(review): +1 overcounts on exact multiples)
+            if self._skip_count >= chunks_left:
+                # skip rest of this file, keeping the unused skips for the next call
+                self._skip_count = self._skip_count - chunks_left
+                return False
+            else:
+                # seek past the skipped chunks, reset skip count
+                fh.seek(self.chunk_size * self._skip_count, os.SEEK_CUR)
+                self._skip_count = self.skip
+                return  fh.tell()//self.chunk_size
+        else:
+            # nothing to skip: read the chunk at the current position, reset skip count
+            self._skip_count = self.skip
+            return fh.tell() // self.chunk_size

    def generate(self, fname):
        """Generates checksums
@ -28,23 +61,37 @@ class BlockHasher():

        yields nothing for empty files.
        """
-        with open(fname, "rb") as f:
-            hash = self.hash_class()
-            block_nr = 0
-            chunk_nr = 0
-            for block in iter(lambda: f.read(self.bs), b""):
-                hash.update(block)
-                block_nr = block_nr + 1
-                if block_nr % self.count == 0:
-                    yield (chunk_nr, hash.hexdigest())
-                    chunk_nr = chunk_nr + 1
-                    hash = self.hash_class()
+        # Use the builtin open(): os.open() returns a bare integer descriptor,
+        # which is not a context manager and has no seek()/tell()/read().
+        with open(fname, "rb") as fh:
+
+            # The size is needed up front: _seek_next_chunk() uses it to decide
+            # whether the pending skip reaches past the end of this file.
+            fsize = fh.seek(0, os.SEEK_END)
+            fh.seek(0)
+
+            while fh.tell()<fsize:
+
+                chunk_nr=self._seek_next_chunk(fh, fsize)
+                # compare with "is": chunk number 0 is a valid result and must
+                # not be mistaken for the False that means "skip the rest"
+                if chunk_nr is False:
+                    return
+
+                # read one chunk: at most self.count blocks of self.bs bytes
+                hash = self.hash_class()
+                block_nr = 0
+                while block_nr != self.count:
+                    block=fh.read(self.bs)
+                    if block==b"":
+                        # end of file: the last chunk may be shorter
+                        break
+                    hash.update(block)
+                    block_nr = block_nr + 1

-            # yield last (incomplete) block
-            if block_nr % self.count != 0:
                yield (chunk_nr, hash.hexdigest())

-    def compare(self, fname, generator):
+    def  compare(self, fname, generator):
        """reads from generator and compares blocks
        Yields mismatches in the form: ( chunk_nr, hexdigest, actual_hexdigest)
        Yields errors in the form: ( chunk_nr, hexdigest, "message" )
--- a/zfs_autobackup/ZfsCheck.py
+++ b/zfs_autobackup/ZfsCheck.py
@ -20,10 +20,7 @@ class ZfsCheck(CliBase):

        self.node = ZfsNode(self.log, readonly=self.args.test, debug_output=self.args.debug_output)

-        if self.args.check is None:
-            self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size)
-        else:
-            self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size, coverage=self.args.percentage)
+        self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size, skip=self.args.skip)

    def get_parser(self):

@ -37,13 +34,13 @@ class ZfsCheck(CliBase):
        group.add_argument('--block-size', metavar="BYTES", default=4096, help="Read block-size, default %(default)s",
                           type=int)
        group.add_argument('--count', metavar="COUNT", default=int((100 * (1024 ** 2)) / 4096),
-                           help="Hash chunks of COUNT blocks. Default %(default)s . (Chunk size is BYTES * COUNT) ", type=int)  # 100MiB
+                           help="Hash chunks of COUNT blocks. Default %(default)s . (CHUNK size is BYTES * COUNT) ", type=int)  # 100MiB

        group.add_argument('--check', '-c', metavar="FILE", default=None, const=True, nargs='?',
                           help="Read hashes from STDIN (or FILE) and compare them")

-        group.add_argument('--percentage', '-p', metavar="NUMBER", default=100, type=float,
-                           help="Generate/compare only this percentage of hashes. Default %(default)s")
+        group.add_argument('--skip', '-s', metavar="NUMBER", default=0, type=int,
+                           help="Skip this number of chunks after every hash. Default %(default)s")  # int: counted down per chunk and multiplied into a seek offset

        return parser

@ -61,10 +58,9 @@ class ZfsCheck(CliBase):
        self.verbose("Block size           : {} bytes".format(args.block_size))
        self.verbose("Block count          : {}".format(args.count))
        self.verbose("Effective chunk size : {} bytes".format(args.count*args.block_size))
-        self.verbose("Percentage to check  : {} %".format(args.percentage))
+        self.verbose("Skip chunk count     : {} (checks {:.2f}% of data)".format(args.skip, 100/(1+args.skip)))
        self.verbose("")

-        args.percentage=args.percentage/100

        return args

@ -216,28 +212,30 @@ class ZfsCheck(CliBase):

        last_progress_time = time.time()
        progress_checked = 0
-        progress_total = 0
+        progress_skipped = 0

        line=input_fh.readline()
+        skip=0
        while line:
            i=line.rstrip().split("\t")
            #ignores lines without tabs
            if (len(i)>1):

-                if self.args.percentage==1 or self.args.percentage>random():
+                if skip==0:
                    progress_checked=progress_checked+1
                    yield i
-
-                progress_total=progress_total+1
+                    skip=self.args.skip
+                else:
+                    skip=skip-1
+                    progress_skipped=progress_skipped+1

                if self.args.progress and time.time() - last_progress_time > 1:
                    last_progress_time = time.time()
-                    self.progress("Checked {}/{} hashes. ({:.2f}% coverage)".format(progress_checked, progress_total, (float(progress_checked)/progress_total)*100))
+                    self.progress("Checked {} hashes (skipped {})".format(progress_checked, progress_skipped))

            line=input_fh.readline()

-        self.verbose("Checked {}/{} hashes. ({:.2f}% coverage)".format(progress_checked, progress_total, (
-                    float(progress_checked) / progress_total) * 100))
+        self.verbose("Checked {} hashes (skipped {})".format(progress_checked, progress_skipped))

    def run(self):

--- a/zfs_autobackup/test.py
+++ b/zfs_autobackup/test.py
@ -0,0 +1,70 @@
+import os.path
+import os
+import time
+from random import random
+
+with open('test.py', 'rb') as fh:
+    # micro-benchmark: time one million seek-to-end calls used to obtain the file size
+    # fsize = fh.seek(10000, os.SEEK_END)
+    # print(fsize)
+
+    start=time.time()
+    for i in range(0,1000000):
+        # fh.seek(0, 0)
+        fsize=fh.seek(0, os.SEEK_END)
+        # fsize=fh.tell()
+        # os.path.getsize('test.py')
+    print(time.time()-start)
+
+
+    print(fh.tell())
+# exit with status 0; SystemExit is raised directly because sys is not imported here
+raise SystemExit(0)
+
+# Never executed (the script exits above): two scratch simulations of skipping,
+# first by random chance per chunk, then by a counted skip of randomised length.
+checked=1
+skipped=1
+coverage=0.1
+
+max_skip=0
+
+
+skipinarow=0
+while True:
+    total=checked+skipped
+
+    skip=coverage<random()
+    if skip:
+        skipped = skipped + 1
+        print("S {:.2f}%".format(checked * 100 / total))
+
+        skipinarow = skipinarow+1
+        if skipinarow>max_skip:
+            max_skip=skipinarow
+    else:
+        skipinarow=0
+        checked=checked+1
+        print("C {:.2f}%".format(checked * 100 / total))
+
+    print(max_skip)
+
+skip=0
+while True:
+
+    total=checked+skipped
+    if skip>0:
+        skip=skip-1
+        skipped = skipped + 1
+        print("S {:.2f}%".format(checked * 100 / total))
+    else:
+        checked=checked+1
+        print("C {:.2f}%".format(checked * 100 / total))
+
+        #calc new skip
+        skip=skip+((1/coverage)-1)*(random()*2)
+        # print(skip)
+        if skip> max_skip:
+            max_skip=skip
+
+    print(max_skip)