zfs_autobackup 2.4: try to continue on non-fatal errors

This commit is contained in:
Edwin Eefting 2019-10-02 18:21:24 +02:00
parent c176b968a9
commit 54235f455a
2 changed files with 138 additions and 113 deletions

View File

@ -18,7 +18,7 @@ It has the following features:
* Supports resuming of interrupted transfers. (via the zfs extensible_dataset feature)
* Backups and snapshots can be named to prevent conflicts. (multiple backups from and to the same filesystems are no problem)
* Always creates a new snapshot before starting.
* Checks everything and aborts on errors.
* Checks everything but tries continue on non-fatal errors when possible. (Reports error-count when done)
* Ability to 'finish' aborted backups to see what goes wrong.
* Easy to debug and has a test-mode. Actual unix commands are printed.
* Keeps latest X snapshots remote and locally. (default 30, configurable)
@ -42,7 +42,7 @@ usage: zfs_autobackup [-h] [--ssh-source SSH_SOURCE] [--ssh-target SSH_TARGET]
[--debug]
backup_name target_path
ZFS autobackup v2.3
ZFS autobackup v2.4
positional arguments:
backup_name Name of the backup (you should set the zfs property
@ -108,6 +108,9 @@ optional arguments:
(still does all read-only operations)
--verbose verbose output
--debug debug output (shows commands that are executed)
When a filesystem fails, zfs_backup will continue and report the number of
failures at that end. Also the exit code will indicate the number of failures.
```
Backup example

View File

@ -13,18 +13,20 @@ import time
def error(txt):
print(txt, file=sys.stderr)
def verbose(txt):
if args.verbose:
print(txt)
def debug(txt):
if args.debug:
print(txt)
#fatal abort execution, exit code 255
def abort(txt):
error(txt)
sys.exit(255)
"""run a command. specifiy ssh user@host to run remotely"""
def run(cmd, input=None, ssh_to="local", tab_split=False, valid_exitcodes=[ 0 ], test=False):
@ -473,6 +475,15 @@ def zfs_get_unchanged_filesystems(ssh_to, filesystems):
#fugly..
failures=0
#something failed, but we try to continue with the rest
def failed(txt):
global failures
failures=failures+1
error("FAILURE: "+txt+"\n")
def zfs_autobackup():
############## data gathering section
@ -490,8 +501,7 @@ def zfs_autobackup():
#nothing todo
if not source_filesystems:
error("No source filesystems selected, please do a 'zfs set autobackup:{0}=true' on {1}".format(args.backup_name,args.ssh_source))
sys.exit(1)
abort("No source filesystems selected, please do a 'zfs set autobackup:{0}=true' on {1}".format(args.backup_name,args.ssh_source))
if args.ignore_replicated:
replicated_filesystems=zfs_get_unchanged_filesystems(args.ssh_source, source_filesystems)
@ -540,7 +550,6 @@ def zfs_autobackup():
### get eixsting source snapshots
verbose("Getting source snapshot-list from {0}".format(args.ssh_source))
source_snapshots=zfs_get_snapshots(args.ssh_source, source_filesystems, args.backup_name)
debug("Source snapshots:\n" + str(pprint.pformat(source_snapshots)))
@ -588,114 +597,117 @@ def zfs_autobackup():
#determine which snapshots to send for each filesystem
for source_filesystem in source_filesystems:
target_filesystem=args.target_path + "/" + lstrip_path(source_filesystem, args.strip_path)
try:
target_filesystem=args.target_path + "/" + lstrip_path(source_filesystem, args.strip_path)
if source_filesystem not in source_snapshots:
#this happens if you use --no-snapshot and there are new filesystems without snapshots
verbose("* Skipping source filesystem {0}, no snapshots found".format(source_filesystem))
else:
#incremental or initial send?
if target_filesystem in target_snapshots and target_snapshots[target_filesystem]:
#incremental mode, determine what to send and what is obsolete
#latest succesfully send snapshot, should be common on both source and target
latest_target_snapshot=target_snapshots[target_filesystem][-1]
if latest_target_snapshot not in source_snapshots[source_filesystem]:
#cant find latest target anymore. find first common snapshot and inform user
error_msg="Cant find latest target snapshot on source, did you destroy/rename it?"
error_msg=error_msg+"\nLatest on target : "+target_filesystem+"@"+latest_target_snapshot
error_msg=error_msg+"\nMissing on source: "+source_filesystem+"@"+latest_target_snapshot
found=False
for latest_target_snapshot in reversed(target_snapshots[target_filesystem]):
if latest_target_snapshot in source_snapshots[source_filesystem]:
error_msg=error_msg+"\nYou could solve this by rolling back to this common snapshot on target: "+target_filesystem+"@"+latest_target_snapshot
found=True
break
if not found:
error_msg=error_msg+"\nAlso could not find an earlier common snapshot to rollback to."
else:
if args.ignore_new:
verbose("* Skipping source filesystem {0}, target already has newer snapshots.".format(source_filesystem))
continue
raise(Exception(error_msg))
#send all new source snapshots that come AFTER the last target snapshot
latest_source_index=source_snapshots[source_filesystem].index(latest_target_snapshot)
send_snapshots=source_snapshots[source_filesystem][latest_source_index+1:]
#source snapshots that come BEFORE last target snapshot are obsolete
source_obsolete_snapshots[source_filesystem]=source_snapshots[source_filesystem][0:latest_source_index]
#target snapshots that come BEFORE last target snapshot are obsolete
latest_target_index=target_snapshots[target_filesystem].index(latest_target_snapshot)
target_obsolete_snapshots[target_filesystem]=target_snapshots[target_filesystem][0:latest_target_index]
if source_filesystem not in source_snapshots:
#this happens if you use --no-snapshot and there are new filesystems without snapshots
verbose("* Skipping source filesystem {0}, no snapshots found".format(source_filesystem))
else:
#initial mode, send all snapshots, nothing is obsolete:
latest_target_snapshot=None
send_snapshots=source_snapshots[source_filesystem]
target_obsolete_snapshots[target_filesystem]=[]
source_obsolete_snapshots[source_filesystem]=[]
#now actually send the snapshots
if not args.no_send:
#incremental or initial send?
if target_filesystem in target_snapshots and target_snapshots[target_filesystem]:
#incremental mode, determine what to send and what is obsolete
if send_snapshots and args.rollback and latest_target_snapshot:
#roll back any changes on target
debug("Rolling back target to latest snapshot.")
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "rollback", target_filesystem+"@"+latest_target_snapshot ])
#latest succesfully send snapshot, should be common on both source and target
latest_target_snapshot=target_snapshots[target_filesystem][-1]
if latest_target_snapshot not in source_snapshots[source_filesystem]:
#cant find latest target anymore. find first common snapshot and inform user
error_msg="Cant find latest target snapshot on source for '{}', did you destroy/rename it?".format(source_filesystem)
error_msg=error_msg+"\nLatest on target : "+target_filesystem+"@"+latest_target_snapshot
error_msg=error_msg+"\nMissing on source: "+source_filesystem+"@"+latest_target_snapshot
found=False
for latest_target_snapshot in reversed(target_snapshots[target_filesystem]):
if latest_target_snapshot in source_snapshots[source_filesystem]:
error_msg=error_msg+"\nYou could solve this by rolling back to this common snapshot on target: "+target_filesystem+"@"+latest_target_snapshot
found=True
break
if not found:
error_msg=error_msg+"\nAlso could not find an earlier common snapshot to rollback to."
else:
if args.ignore_new:
verbose("* Skipping source filesystem '{0}', target already has newer snapshots.".format(source_filesystem))
continue
raise(Exception(error_msg))
#send all new source snapshots that come AFTER the last target snapshot
latest_source_index=source_snapshots[source_filesystem].index(latest_target_snapshot)
send_snapshots=source_snapshots[source_filesystem][latest_source_index+1:]
#source snapshots that come BEFORE last target snapshot are obsolete
source_obsolete_snapshots[source_filesystem]=source_snapshots[source_filesystem][0:latest_source_index]
#target snapshots that come BEFORE last target snapshot are obsolete
latest_target_index=target_snapshots[target_filesystem].index(latest_target_snapshot)
target_obsolete_snapshots[target_filesystem]=target_snapshots[target_filesystem][0:latest_target_index]
else:
#initial mode, send all snapshots, nothing is obsolete:
latest_target_snapshot=None
send_snapshots=source_snapshots[source_filesystem]
target_obsolete_snapshots[target_filesystem]=[]
source_obsolete_snapshots[source_filesystem]=[]
#now actually send the snapshots
if not args.no_send:
if send_snapshots and args.rollback and latest_target_snapshot:
#roll back any changes on target
debug("Rolling back target to latest snapshot.")
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "rollback", target_filesystem+"@"+latest_target_snapshot ])
for send_snapshot in send_snapshots:
for send_snapshot in send_snapshots:
#resumable?
if target_filesystem in resumable_target_filesystems:
resume_token=resumable_target_filesystems.pop(target_filesystem)
else:
resume_token=None
#hold the snapshot we're sending on the source
if not args.no_holds:
zfs_hold_snapshot(ssh_to=args.ssh_source, snapshot=source_filesystem+"@"+send_snapshot)
zfs_transfer(
ssh_source=args.ssh_source, source_filesystem=source_filesystem,
first_snapshot=latest_target_snapshot, second_snapshot=send_snapshot,
ssh_target=args.ssh_target, target_filesystem=target_filesystem,
resume_token=resume_token
)
#hold the snapshot we just send to the target
zfs_hold_snapshot(ssh_to=args.ssh_target, snapshot=target_filesystem+"@"+send_snapshot)
#now that we succesfully transferred this snapshot, the previous snapshot is obsolete:
if latest_target_snapshot:
zfs_release_snapshot(ssh_to=args.ssh_target, snapshot=target_filesystem+"@"+latest_target_snapshot)
target_obsolete_snapshots[target_filesystem].append(latest_target_snapshot)
#resumable?
if target_filesystem in resumable_target_filesystems:
resume_token=resumable_target_filesystems.pop(target_filesystem)
else:
resume_token=None
#hold the snapshot we're sending on the source
if not args.no_holds:
zfs_release_snapshot(ssh_to=args.ssh_source, snapshot=source_filesystem+"@"+latest_target_snapshot)
source_obsolete_snapshots[source_filesystem].append(latest_target_snapshot)
#we just received a new filesytem?
else:
if args.clear_refreservation:
debug("Clearing refreservation to save space.")
zfs_hold_snapshot(ssh_to=args.ssh_source, snapshot=source_filesystem+"@"+send_snapshot)
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "set", "refreservation=none", target_filesystem ])
zfs_transfer(
ssh_source=args.ssh_source, source_filesystem=source_filesystem,
first_snapshot=latest_target_snapshot, second_snapshot=send_snapshot,
ssh_target=args.ssh_target, target_filesystem=target_filesystem,
resume_token=resume_token
)
#hold the snapshot we just send to the target
zfs_hold_snapshot(ssh_to=args.ssh_target, snapshot=target_filesystem+"@"+send_snapshot)
if args.clear_mountpoint:
debug("Setting canmount=noauto to prevent auto-mounting in the wrong place. (ignoring errors)")
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "set", "canmount=noauto", target_filesystem ], valid_exitcodes= [0, 1] )
#now that we succesfully transferred this snapshot, the previous snapshot is obsolete:
if latest_target_snapshot:
zfs_release_snapshot(ssh_to=args.ssh_target, snapshot=target_filesystem+"@"+latest_target_snapshot)
target_obsolete_snapshots[target_filesystem].append(latest_target_snapshot)
if not args.no_holds:
zfs_release_snapshot(ssh_to=args.ssh_source, snapshot=source_filesystem+"@"+latest_target_snapshot)
source_obsolete_snapshots[source_filesystem].append(latest_target_snapshot)
#we just received a new filesytem?
else:
if args.clear_refreservation:
debug("Clearing refreservation to save space.")
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "set", "refreservation=none", target_filesystem ])
latest_target_snapshot=send_snapshot
if args.clear_mountpoint:
debug("Setting canmount=noauto to prevent auto-mounting in the wrong place. (ignoring errors)")
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "set", "canmount=noauto", target_filesystem ], valid_exitcodes= [0, 1] )
latest_target_snapshot=send_snapshot
# failed, skip this source_filesystem
except Exception as e:
failed(str(e))
############## cleanup section
@ -730,23 +742,28 @@ def zfs_autobackup():
source_destroys=determine_destroy_list(source_obsolete_snapshots, args.keep_source)
if source_destroys:
verbose("Destroying old snapshots on source {0}:\n{1}".format(args.ssh_source, "\n".join(source_destroys)))
zfs_destroy_snapshots(ssh_to=args.ssh_source, snapshots=source_destroys)
try:
zfs_destroy_snapshots(ssh_to=args.ssh_source, snapshots=source_destroys)
except Exception as e:
failed(str(e))
target_destroys=determine_destroy_list(target_obsolete_snapshots, args.keep_target)
if target_destroys:
verbose("Destroying old snapshots on target {0}:\n{1}".format(args.ssh_target, "\n".join(target_destroys)))
zfs_destroy_snapshots(ssh_to=args.ssh_target, snapshots=target_destroys)
verbose("All done")
try:
zfs_destroy_snapshots(ssh_to=args.ssh_target, snapshots=target_destroys)
except Exception as e:
failed(str(e))
################################################################## ENTRY POINT
# parse arguments
import argparse
parser = argparse.ArgumentParser(description='ZFS autobackup v2.3')
parser = argparse.ArgumentParser(
description='ZFS autobackup v2.4',
epilog='When a filesystem fails, zfs_backup will continue and report the number of failures at that end. Also the exit code will indicate the number of failures.')
parser.add_argument('--ssh-source', default="local", help='Source host to get backup from. (user@hostname) Default %(default)s.')
parser.add_argument('--ssh-target', default="local", help='Target host to push backup to. (user@hostname) Default %(default)s.')
parser.add_argument('--keep-source', type=int, default=30, help='Number of days to keep old snapshots on source. Default %(default)s.')
@ -782,17 +799,22 @@ parser.add_argument('--debug', action='store_true', help='debug output (shows co
args = parser.parse_args()
if args.ignore_replicated and args.allow_empty:
print("Cannot use allow_empty with ignore_replicated.")
sys.exit(1)
abort("Cannot use allow_empty with ignore_replicated.")
try:
zfs_autobackup()
if not failures:
verbose("All operations completed succesfully.")
sys.exit(0)
else:
verbose("{} OPERATION(S) FAILED!".format(failures))
#exit with the number of failures.
sys.exit(min(255,failed))
except Exception as e:
if args.debug:
raise
else:
print("ABORTED")
print(str(e))
sys.exit(1)
abort("FATAL ERROR")