This commit is contained in:
Edwin Eefting 2019-02-17 22:57:47 +01:00
parent cae8ec3e70
commit 765dbf124a

View File

@ -7,7 +7,7 @@ import re
import traceback import traceback
import subprocess import subprocess
import pprint import pprint
import cStringIO # import cStringIO
import time import time
@ -23,7 +23,53 @@ def debug(txt):
print(txt) print(txt)
class ZfsNode: class TreeNode():
"""generic tree implementation, with parent/child and prev/next relations"""
def __init__(self, name, parent=None, next=None, prev=None, *args, **kwargs):
self.childs={}
self.name=name
self.parent=parent
if parent:
if name in parent.childs:
raise(Exception("parent {} already has child {}").format(parent.name, name))
parent.childs[name]=self
self.next=next
if next:
if next.prev:
raise(Exception("{} already has a previous item").format(next.name))
next.prev=self
self.prev=prev
if prev:
if prev.next:
raise(Exception("{} already has a next item").format(prev.name))
prev.next=self
def remove(self):
"""remove the item from other referenced TreeNodes. call before you actually delete a treeobject"""
if self.parent:
self.parent.childs.remove(self.name)
# let previous and next objects point to eachother
if self.next:
self.next.prev=self.prev
if self.prev:
self.prev.next=self.next
self.parent=None
self.next=None
self.prev=None
class ZfsNode(TreeNode):
"""an endpoint that contains zfs filesystems. """an endpoint that contains zfs filesystems.
contains lowlevel zfs wrappers for actual zfs commands on remote nodes via ssh (or local) contains lowlevel zfs wrappers for actual zfs commands on remote nodes via ssh (or local)
@ -31,8 +77,10 @@ class ZfsNode:
methods only accept and return simple dataset names, just like the real commands methods only accept and return simple dataset names, just like the real commands
""" """
def __init__(self, ssh_to): def __init__(self, ssh_to, *args, **kwargs):
"""ssh_to: server you want to ssh to. specify 'local' to just use local commands without ssh""" """ssh_to: server you want to ssh to. specify 'local' to just use local commands without ssh"""
super().__init__(*args, **kwargs)
self.ssh_to=ssh_to self.ssh_to=ssh_to
@ -44,10 +92,6 @@ class ZfsNode:
#use ssh? #use ssh?
if self.ssh_to != "local": if self.ssh_to != "local":
encoded_cmd.extend(["ssh", self.ssh_to]) encoded_cmd.extend(["ssh", self.ssh_to])
if args.ssh_cipher:
encoded_cmd.extend(["-c", args.ssh_cipher])
if args.compress:
encoded_cmd.append("-C")
#make sure the command gets all the data in utf8 format: #make sure the command gets all the data in utf8 format:
#(this is neccesary if LC_ALL=en_US.utf8 is not set in the environment) #(this is neccesary if LC_ALL=en_US.utf8 is not set in the environment)
@ -92,321 +136,20 @@ class ZfsNode:
return(ret) return(ret)
def zfs_get_selected_filesystems():
"""determine filesystems that should be backupped by looking at the special autobackup-property
return: list with dataset names
"""
#get all source filesystems that have the backup property class ZfsPool(TreeNode):
source_filesystems=self.run(tab_split=True, cmd=[ """a zfs pool"""
"zfs", "get", "-t", "volume,filesystem", "-o", "name,value,source", "-s", "local,inherited", "-H", "autobackup:"+args.backup_name def __init__(self, *kwargs, **args):
]) super().__init(*args, **kwargs)
#determine filesystems that should be actually backupped
selected_filesystems=[]
direct_filesystems=[]
for source_filesystem in source_filesystems:
(name,value,source)=source_filesystem
if value=="false":
verbose("Ignoring: {0} (disabled)".format(name))
else:
if source=="local":
selected_filesystems.append(name)
direct_filesystems.append(name)
verbose("Selected: {0} (direct selection)".format(name))
elif source.find("inherited from ")==0:
inherited_from=re.sub("^inherited from ", "", source)
if inherited_from in direct_filesystems:
selected_filesystems.append(name)
verbose("Selected: {0} (inherited selection)".format(name))
else:
verbose("Ignored: {0} (already a backup)".format(name))
else:
vebose("Ignored: {0} ({0})".format(source))
return(selected_filesystems)
def zfs_get_resumable_filesystems(filesystems):
"""determine filesystems that can be resumed via receive_resume_token (should be executed on target)"""
cmd=[ "zfs", "get", "-t", "volume,filesystem", "-o", "name,value", "-H", "receive_resume_token" ]
cmd.extend(filesystems)
resumable_filesystems=self.run(tab_split=True, cmd=cmd)
ret={}
for (resumable_filesystem,token) in resumable_filesystems:
if token!='-':
ret[resumable_filesystem]=token
return(ret)
def zfs_destroy_snapshots(snapshots):
"""deferred destroy list of snapshots (in @format). """
#zfs can only destroy one filesystem at once so we use xargs and stdin
self.run(test=args.test, input="\0".join(snapshots), cmd=
[ "xargs", "-0", "-n", "1", "zfs", "destroy", "-d" ]
)
def zfs_destroy(filesystems, recursive=False):
"""destroy list of filesystems """
cmd=[ "xargs", "-0", "-n", "1", "zfs", "destroy" ]
if recursive:
cmd.append("-r")
#zfs can only destroy one filesystem at once so we use xargs and stdin
self.run(test=args.test, input="\0".join(filesystems), cmd=cmd)
#simulate snapshots for --test option
#FIXME
test_snapshots={}
def zfs_create_snapshot(filesystems, snapshot):
"""create snapshot on multiple filesystems at once (atomicly)"""
cmd=[ "zfs", "snapshot" ]
for filesystem in filesystems:
cmd.append(filesystem+"@"+snapshot)
#in testmode we dont actually make changes, so keep them in a list to simulate
if args.test:
if not filesystem in test_snapshots:
test_snapshots[filesystem]=[]
test_snapshots[filesystem].append(snapshot)
run(ssh_to=ssh_to, tab_split=False, cmd=cmd, test=args.test)
def zfs_get_snapshots(filesystems):
"""get names of all snapshots for specified filesystems belonging to backup_name
return[filesystem_name]=[ "snashot1", "snapshot2", ... ]
"""
ret={}
if filesystems:
#TODO: get rid of ugly errors for non-existing target filesystems
cmd=[
"zfs", "list", "-d", "1", "-r", "-t" ,"snapshot", "-H", "-o", "name"
]
cmd.extend(filesystems)
snapshots=run(ssh_to=ssh_to, tab_split=False, cmd=cmd, valid_exitcodes=[ 0,1 ])
for snapshot in snapshots:
(filesystem, snapshot_name)=snapshot.split("@")
if re.match("^"+backup_name+"-[0-9]*$", snapshot_name):
if not filesystem in ret:
ret[filesystem]=[]
ret[filesystem].append(snapshot_name)
#also add any test-snapshots that where created with --test mode
if args.test:
if ssh_to in test_snapshots:
for filesystem in filesystems:
if filesystem in test_snapshots[ssh_to]:
if not filesystem in ret:
ret[filesystem]=[]
ret[filesystem].extend(test_snapshots[ssh_to][filesystem])
return(ret)
"""transfer a zfs snapshot from source to target. both can be either local or via ssh.
TODO:
buffering: specify buffer_size to use mbuffer (or alike) to apply buffering where neccesary
local to local:
local send -> local buffer -> local receive
local to remote and remote to local:
local send -> local buffer -> ssh -> remote buffer -> remote receive
remote send -> remote buffer -> ssh -> local buffer -> local receive
remote to remote:
remote send -> remote buffer -> ssh -> local buffer -> ssh -> remote buffer -> remote receive
TODO: can we string together all the zfs sends and recvs, so that we only need to use 1 ssh connection? should be faster if there are many small snaphots
"""
def zfs_transfer(ssh_source, source_filesystem, first_snapshot, second_snapshot,
ssh_target, target_filesystem, resume_token=None, buffer_size=None):
#### build source command
source_cmd=[]
if ssh_source != "local":
source_cmd.extend([ "ssh", ssh_source ])
if args.ssh_cipher:
source_cmd.extend(["-c", args.ssh_cipher])
if args.compress:
source_cmd.append("-C")
source_cmd.extend(["zfs", "send", ])
#only verbose in debug mode, lots of output
if args.debug:
source_cmd.append("-v")
if not first_snapshot:
txt="Initial transfer of "+source_filesystem+" snapshot "+second_snapshot
else:
txt="Incremental transfer of "+source_filesystem+" between snapshots "+first_snapshot+"..."+second_snapshot
if resume_token:
source_cmd.extend([ "-t", resume_token ])
verbose("RESUMING "+txt)
else:
source_cmd.append("-p")
if first_snapshot:
source_cmd.extend([ "-i", first_snapshot ])
if ssh_source != "local":
source_cmd.append("'" + source_filesystem + "@" + second_snapshot + "'")
else:
source_cmd.append(source_filesystem + "@" + second_snapshot)
verbose(txt)
#### build target command
target_cmd=[]
if ssh_target != "local":
target_cmd.extend([ "ssh", ssh_target ])
if args.ssh_cipher:
target_cmd.extend(["-c", args.ssh_cipher])
if args.compress:
target_cmd.append("-C")
target_cmd.extend(["zfs", "recv", "-u" ])
#also verbose in --verbose mode so we can see the transfer speed when its completed
if args.verbose or args.debug:
target_cmd.append("-v")
if args.resume:
target_cmd.append("-s")
if ssh_target!="local":
target_cmd.append("'" + target_filesystem + "'")
else:
target_cmd.append(target_filesystem)
#### make sure parent on target exists
parent_filesystem= "/".join(target_filesystem.split("/")[:-1])
run(ssh_to=ssh_target, cmd=[ "zfs", "create" ,"-p", parent_filesystem ], test=args.test)
### execute pipe
debug_txt="# "+source_cmd[0]+" '"+("' '".join(source_cmd[1:]))+"'" + " | " + target_cmd[0]+" '"+("' '".join(target_cmd[1:]))+"'"
if args.test:
debug("[TEST] "+debug_txt)
return
else:
debug(debug_txt)
source_proc=subprocess.Popen(source_cmd, env=os.environ, stdout=subprocess.PIPE)
target_proc=subprocess.Popen(target_cmd, env=os.environ, stdin=source_proc.stdout)
source_proc.stdout.close() # Allow p1 to receive a SIGPIPE if p2 exits.
target_proc.communicate()
if source_proc.returncode:
raise(subprocess.CalledProcessError(source_proc.returncode, source_cmd))
#zfs recv sometimes gives an exitcode 1 while the transfer was succesfull, therefore we ignore exit 1's and do an extra check to see if the snapshot is there.
if target_proc.returncode and target_proc.returncode!=1:
raise(subprocess.CalledProcessError(target_proc.returncode, target_cmd))
debug("Verifying if snapshot exists on target")
run(ssh_to=ssh_target, cmd=["zfs", "list", target_filesystem+"@"+second_snapshot ])
"""get filesystems that where already backupped to a target. """
def zfs_get_backupped_filesystems(ssh_to, backup_name, target_fs):
#get all target filesystems that have received or inherited the backup propert, under the target_fs tree
ret=run(ssh_to=ssh_to, tab_split=False, cmd=[
"zfs", "get", "-r", "-t", "volume,filesystem", "-o", "name", "-s", "received,inherited", "-H", "autobackup:"+backup_name, target_fs
])
return(ret)
"""get filesystems that where once backupped to target but are no longer selected on source
these are filesystems that are not in the list in target_filesystems.
this happens when filesystems are destroyed or unselected on the source.
"""
def get_stale_backupped_filesystems(ssh_to, backup_name, target_fs, target_filesystems):
backupped_filesystems=zfs_get_backupped_filesystems(ssh_to=ssh_to, backup_name=backup_name, target_fs=target_fs)
#determine backupped filesystems that are not in target_filesystems anymore
stale_backupped_filesystems=[]
for backupped_filesystem in backupped_filesystems:
if backupped_filesystem not in target_filesystems:
stale_backupped_filesystems.append(backupped_filesystem)
return(stale_backupped_filesystems)
now=time.time()
"""determine list of snapshot (in @format) to destroy, according to age"""
def determine_destroy_list(snapshots, days):
ret=[]
for filesystem in snapshots:
for snapshot in snapshots[filesystem]:
time_str=re.findall("^.*-([0-9]*)$", snapshot)[0]
if len(time_str)==14:
#new format:
time_secs=time.mktime(time.strptime(time_str,"%Y%m%d%H%M%S"))
else:
time_secs=int(time_str)
# verbose("time_secs"+time_str)
if (now-time_secs) > (24 * 3600 * days):
ret.append(filesystem+"@"+snapshot)
return(ret)
def lstrip_path(path, count):
return("/".join(path.split("/")[count:]))
class ZfsDataset: class ZfsDataset:
"""a generic zfs dataset""" """a generic zfs dataset"""
def __init__(name, parent, backup=false, created=false): def __init__(name, parent):
""" backup: should be backupped by zfs_autobackup """
created: is created by zfs_autobackup (and may be destroyed by it as well)
parent: parent dataset this belongs to (none is "root")
""" """
self.name=name self.name=name
self.parent=parent self.parent=parent
@ -426,267 +169,21 @@ class ZfsSnapshot(Dataset):
self.next_snapshot self.next_snapshot
class ZfsBackupSource(): # class ZfsBackupSource():
"""backup source. # """backup source.
#
contains high level backup source functions. # contains high level backup source functions.
#
these work with ZfsDataset and ZfsSnapshot objects. # these work with ZfsDataset and ZfsSnapshot objects.
#
""" # """
#
def __init__(self): # def __init__(self):
self.node=ZfsNode(ssh_to=args.ssh_to) # self.node=ZfsNode(ssh_to=args.ssh_to)
self.datasets={} # self.datasets={}
self.snapshots={} # self.snapshots={}
#
#
def refresh(): # def refresh():
"""refresh all data by calling various zfs commands""" # """refresh all data by calling various zfs commands"""
selected_filesystems=self.node.zfs_get_selected_filesystems() # selected_filesystems=self.node.zfs_get_selected_filesystems()
def zfs_autobackup():
############## data gathering section
if args.test:
args.verbose=True
verbose("RUNNING IN TEST-MODE, NOT MAKING ACTUAL BACKUP!")
### getting and determinging source/target filesystems
# get selected filesystem on backup source
verbose("Getting selected source filesystems for backup {0} on {1}".format(args.backup_name,args.ssh_source))
source_filesystems=zfs_get_selected_filesystems(args.ssh_source, args.backup_name)
#nothing todo
if not source_filesystems:
error("No filesystems source selected, please do a 'zfs set autobackup:{0}=true' on {1}".format(args.backup_name,args.ssh_source))
sys.exit(1)
# determine target filesystems
target_filesystems=[]
for source_filesystem in source_filesystems:
#append args.target_fs prefix and strip args.strip_path paths from source_filesystem
target_filesystems.append(args.target_fs + "/" + lstrip_path(source_filesystem, args.strip_path))
### creating snapshots
# this is one of the first things we do, so that in case of failures we still have snapshots.
#create new snapshot?
if not args.no_snapshot:
new_snapshot_name=args.backup_name+"-"+time.strftime("%Y%m%d%H%M%S")
verbose("Creating source snapshot {0} on {1} ".format(new_snapshot_name, args.ssh_source))
zfs_create_snapshot(args.ssh_source, source_filesystems, new_snapshot_name)
### get resumable transfers
resumable_target_filesystems={}
if args.resume:
verbose("Checking for aborted transfers that can be resumed")
resumable_target_filesystems=zfs_get_resumable_filesystems(args.ssh_target, target_filesystems)
debug("Resumable filesystems: "+str(pprint.pformat(resumable_target_filesystems)))
### get all snapshots of all selected filesystems on both source and target
verbose("Getting source snapshot-list from {0}".format(args.ssh_source))
source_snapshots=zfs_get_snapshots(args.ssh_source, source_filesystems, args.backup_name)
debug("Source snapshots: " + str(pprint.pformat(source_snapshots)))
target_snapshots={}
try:
verbose("Getting target snapshot-list from {0}".format(args.ssh_target))
target_snapshots=zfs_get_snapshots(args.ssh_target, target_filesystems, args.backup_name)
except subprocess.CalledProcessError:
verbose("(ignoring errors, probably initial backup for this filesystem)")
pass
debug("Target snapshots: " + str(pprint.pformat(target_snapshots)))
#obsolete snapshots that may be removed
source_obsolete_snapshots={}
target_obsolete_snapshots={}
############## backup section
#determine which snapshots to send for each filesystem
for source_filesystem in source_filesystems:
target_filesystem=args.target_fs + "/" + lstrip_path(source_filesystem, args.strip_path)
if source_filesystem not in source_snapshots:
#this happens if you use --no-snapshot and there are new filesystems without snapshots
verbose("Skipping source filesystem {0}, no snapshots found".format(source_filesystem))
else:
#incremental or initial send?
if target_filesystem in target_snapshots and target_snapshots[target_filesystem]:
#incremental mode, determine what to send and what is obsolete
#latest succesfully send snapshot, should be common on both source and target
latest_target_snapshot=target_snapshots[target_filesystem][-1]
if latest_target_snapshot not in source_snapshots[source_filesystem]:
#cant find latest target anymore. find first common snapshot and inform user
error="Cant find latest target snapshot on source, did you destroy it accidently? "+source_filesystem+"@"+latest_target_snapshot
for latest_target_snapshot in reversed(target_snapshots[target_filesystem]):
if latest_target_snapshot in source_snapshots[source_filesystem]:
error=error+"\nYou could solve this by rolling back to: "+target_filesystem+"@"+latest_target_snapshot;
break
raise(Exception(error))
#send all new source snapshots that come AFTER the last target snapshot
latest_source_index=source_snapshots[source_filesystem].index(latest_target_snapshot)
send_snapshots=source_snapshots[source_filesystem][latest_source_index+1:]
#source snapshots that come BEFORE last target snapshot are obsolete
source_obsolete_snapshots[source_filesystem]=source_snapshots[source_filesystem][0:latest_source_index]
#target snapshots that come BEFORE last target snapshot are obsolete
latest_target_index=target_snapshots[target_filesystem].index(latest_target_snapshot)
target_obsolete_snapshots[target_filesystem]=target_snapshots[target_filesystem][0:latest_target_index]
else:
#initial mode, send all snapshots, nothing is obsolete:
latest_target_snapshot=None
send_snapshots=source_snapshots[source_filesystem]
target_obsolete_snapshots[target_filesystem]=[]
source_obsolete_snapshots[source_filesystem]=[]
#now actually send the snapshots
if not args.no_send:
if send_snapshots and args.rollback and latest_target_snapshot:
#roll back any changes on target
debug("Rolling back target to latest snapshot.")
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "rollback", target_filesystem+"@"+latest_target_snapshot ])
for send_snapshot in send_snapshots:
#resumable?
if target_filesystem in resumable_target_filesystems:
resume_token=resumable_target_filesystems.pop(target_filesystem)
else:
resume_token=None
zfs_transfer(
ssh_source=args.ssh_source, source_filesystem=source_filesystem,
first_snapshot=latest_target_snapshot, second_snapshot=send_snapshot,
ssh_target=args.ssh_target, target_filesystem=target_filesystem,
resume_token=resume_token
)
#now that we succesfully transferred this snapshot, the previous snapshot is obsolete:
if latest_target_snapshot:
target_obsolete_snapshots[target_filesystem].append(latest_target_snapshot)
source_obsolete_snapshots[source_filesystem].append(latest_target_snapshot)
#we just received a new filesytem?
else:
if args.clear_refreservation:
debug("Clearing refreservation to save space.")
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "set", "refreservation=none", target_filesystem ])
if args.clear_mountpoint:
debug("Setting canmount=noauto to prevent auto-mounting in the wrong place. (ignoring errors)")
run(ssh_to=args.ssh_target, test=args.test, cmd=["zfs", "set", "canmount=noauto", target_filesystem ], valid_exitcodes= [0, 1] )
latest_target_snapshot=send_snapshot
############## cleanup section
#we only do cleanups after everything is complete, to keep everything consistent (same snapshots everywhere)
#find stale backups on target that have become obsolete
verbose("Getting stale filesystems and snapshots from {0}".format(args.ssh_target))
stale_target_filesystems=get_stale_backupped_filesystems(ssh_to=args.ssh_target, backup_name=args.backup_name, target_fs=args.target_fs, target_filesystems=target_filesystems)
debug("Stale target filesystems: {0}".format("\n".join(stale_target_filesystems)))
stale_target_snapshots=zfs_get_snapshots(args.ssh_target, stale_target_filesystems, args.backup_name)
debug("Stale target snapshots: " + str(pprint.pformat(stale_target_snapshots)))
target_obsolete_snapshots.update(stale_target_snapshots)
#determine stale filesystems that have no snapshots left (the can be destroyed)
#TODO: prevent destroying filesystems that have underlying filesystems that are still active.
stale_target_destroys=[]
for stale_target_filesystem in stale_target_filesystems:
if stale_target_filesystem not in stale_target_snapshots:
stale_target_destroys.append(stale_target_filesystem)
if stale_target_destroys:
if args.destroy_stale:
verbose("Destroying stale filesystems on target {0}:\n{1}".format(args.ssh_target, "\n".join(stale_target_destroys)))
zfs_destroy(ssh_to=args.ssh_target, filesystems=stale_target_destroys, recursive=True)
else:
verbose("Stale filesystems on {0}, use --destroy-stale to destroy:\n{1}".format(args.ssh_target, "\n".join(stale_target_destroys)))
#now actually destroy the old snapshots
source_destroys=determine_destroy_list(source_obsolete_snapshots, args.keep_source)
if source_destroys:
verbose("Destroying old snapshots on source {0}:\n{1}".format(args.ssh_source, "\n".join(source_destroys)))
zfs_destroy_snapshots(ssh_to=args.ssh_source, snapshots=source_destroys)
target_destroys=determine_destroy_list(target_obsolete_snapshots, args.keep_target)
if target_destroys:
verbose("Destroying old snapshots on target {0}:\n{1}".format(args.ssh_target, "\n".join(target_destroys)))
zfs_destroy_snapshots(ssh_to=args.ssh_target, snapshots=target_destroys)
verbose("All done")
################################################################## ENTRY POINT
# parse arguments
import argparse
parser = argparse.ArgumentParser(description='ZFS autobackup v2.1')
parser.add_argument('--ssh-source', default="local", help='Source host to get backup from. (user@hostname) Default %(default)s.')
parser.add_argument('--ssh-target', default="local", help='Target host to push backup to. (user@hostname) Default %(default)s.')
parser.add_argument('--ssh-cipher', default=None, help='SSH cipher to use (default %(default)s)')
parser.add_argument('--keep-source', type=int, default=30, help='Number of days to keep old snapshots on source. Default %(default)s.')
parser.add_argument('--keep-target', type=int, default=30, help='Number of days to keep old snapshots on target. Default %(default)s.')
parser.add_argument('backup_name', help='Name of the backup (you should set the zfs property "autobackup:backup-name" to true on filesystems you want to backup')
parser.add_argument('target_fs', help='Target filesystem')
parser.add_argument('--no-snapshot', action='store_true', help='dont create new snapshot (usefull for finishing uncompleted backups, or cleanups)')
parser.add_argument('--no-send', action='store_true', help='dont send snapshots (usefull to only do a cleanup)')
parser.add_argument('--resume', action='store_true', help='support resuming of interrupted transfers by using the zfs extensible_dataset feature (both zpools should have it enabled)')
parser.add_argument('--strip-path', default=0, type=int, help='number of directory to strip from path (use 1 when cloning zones between 2 SmartOS machines)')
parser.add_argument('--destroy-stale', action='store_true', help='Destroy stale backups that have no more snapshots. Be sure to verify the output before using this! ')
parser.add_argument('--clear-refreservation', action='store_true', help='Set refreservation property to none for new filesystems. Usefull when backupping SmartOS volumes. (recommended)')
parser.add_argument('--clear-mountpoint', action='store_true', help='Sets canmount=noauto property, to prevent the received filesystem from mounting over existing filesystems. (recommended)')
parser.add_argument('--rollback', action='store_true', help='Rollback changes on the target before starting a backup. (normally you can prevent changes by setting the readonly property on the target_fs to on)')
parser.add_argument('--compress', action='store_true', help='use compression during zfs send/recv')
parser.add_argument('--test', action='store_true', help='dont change anything, just show what would be done (still does all read-only operations)')
parser.add_argument('--verbose', action='store_true', help='verbose output')
parser.add_argument('--debug', action='store_true', help='debug output (shows commands that are executed)')
#note args is the only global variable we use, since its a global readonly setting anyway
args = parser.parse_args()
zfs_autobackup()