#!/bin/bash

# program to perform backups
# called by at.

# depends: lockfile-progs


# this program is intended to recover from situations where a snapshot
# failed to be sent, perhaps due to network problems.  any missing snapshots
# are sent, which means all of them if none have ever been sent.

# configuration table shared with the library routines sourced below
declare -A CONFIG
export CONFIG

# installation layout; everything except ETC is exported
ETC=/etc
PRE=/usr/local
ETCDIR=$ETC/snapsched
CFGFILE=$ETCDIR/config
INTCRONFILE_BASE=snapsched
CRONTAB=$ETC/cron.d/$INTCRONFILE_BASE
TMPBASE=/tmp/ssched
OSILENT=
export PRE ETCDIR CFGFILE INTCRONFILE_BASE CRONTAB TMPBASE OSILENT

# fold stderr into stdout so all output ends up in one stream
exec 2>&1

# pull in the snapsched library routines
source "$PRE/lib/snapsched/snapsched-funcs"

# send a single snapshot to the backup host via "btrfs send | ssh btrfs rec"
#
# globals:
#   SNAP_SENT_LST (read) - snapshots offered to btrfs send as clone
#                          sources (-c); anything up to the first space in
#                          an element is stripped before use
# arguments:
#   $1 - extra ssh option(s), e.g. -C; may be the null string
#   $2 - backup host
#   $3 - snapshot to send, as handed to "btrfs send"
#   $4 - receive directory on the backup host
# outputs:
#   status messages via omsg, plus the captured sender stderr and receiver
#   output, indented by four spaces, on stdout
# returns:
#   0 on success; 1 if either end of the pipeline exited non-zero, if the
#   captured output contains ERROR or Timeout, or if no temporary file
#   could be created
cp_snap()
{
	local SSH_C="$1"
	local BHOST="$2"
	local SVOL="$3"
	local RECV_DST="$4"
	local -a SARGS=()
	local SARGS_MSG=
	local C
	local ES=0
	local TMP_OUTPUT
	local -a PSTAT=()

	# send in the clones.  the arguments are kept in an array so that they
	# are handed to btrfs exactly as stored (no pathname expansion);
	# SARGS_MSG is the same list as text, for the failure message only
	for C in "${SNAP_SENT_LST[@]}" ; do
		SARGS+=(-c "${C#* }")
		SARGS_MSG+=" -c ${C#* }"
	done

	# if this is the first snap EVER to be sent, then SARGS is empty, so
	# the entire snap will be sent.  otherwise, previously sent snaps are
	# mentioned as possible clone sources, and btrfs can supposedly figure
	# out the parent relationship.  however, if it fails to do that because
	# the snapshots have been screwed with, then this will fail.

	omsg "  sending ${SVOL#*/} from $(hostname) to $BHOST ..."

	TMP_OUTPUT=$(mktemp /tmp/snapsched-snapsend-output.XXXX) || {
		omsg "Failure sending '${SVOL#*/}' from $(hostname) to $BHOST"
		omsg "unable to create a temporary file for the transfer output"
		return 1
	}

	# send the snapshot to backup server BHOST; the ssh keepalive settings
	# (2 probes, 300 seconds apart) give a 10 minute timeout.
	# both ends APPEND to the capture file: a truncating open on the
	# sender side keeps its own file offset and would overwrite whatever
	# the receiver has already logged, possibly hiding an ERROR line.
	# SSH_C is deliberately unquoted so a null value adds no argument.
	btrfs send "${SARGS[@]}" "$SVOL" 2>> "$TMP_OUTPUT" |
		ssh -x -o ServerAliveInterval=300 -o ServerAliveCountMax=2 \
			$SSH_C "$BHOST" btrfs rec "$RECV_DST" &>> "$TMP_OUTPUT"
	# must be captured before any other command resets PIPESTATUS
	PSTAT=("${PIPESTATUS[@]}")

	# for now, let's look at this
	sed 's/^/    /' "$TMP_OUTPUT"

	# PSTAT[0] is the btrfs send status, PSTAT[1] the ssh/receive status
	if [ "${PSTAT[0]}" -ne 0 ] || [ "${PSTAT[1]}" -ne 0 ] ||
		grep -Eq 'ERROR|Timeout' "$TMP_OUTPUT" ; then
		omsg "Failure sending '${SVOL#*/}' from $(hostname) to $BHOST"
		omsg "btrfs send args: $SARGS_MSG $SVOL"
		ES=1
	fi

	rm -f "$TMP_OUTPUT"

	return $ES
}

# check to see if the snapshot argument (string: "UUID <snap-path>")
# already exists on the backup host
#
# snapshots are compared by the part of their path that follows
# "<source-name>/"; the UUID is not compared (apparently it can change
# under certain conditions)
#
# globals:
#   RSNAPS (read) - "UUID <snap-path>" strings for snaps on the backup host
#   RX (read)     - the number of elements in RSNAPS
#   NSRC (read)   - source name, used only when $2 is not given
# arguments:
#   $1 - local snapshot string
#   $2 - source name (optional, defaults to $NSRC)
# returns:
#   0 if a matching snapshot is listed in RSNAPS, 1 otherwise
_already_sent()
{
	local S="$1"
	local SRC="${2:-$NSRC}"
	local -i X
	local WANT

	# SRC is quoted inside the pattern so that characters such as [ ] * ?
	# in a source name are matched literally instead of as a glob
	WANT=${S#*"$SRC"/}

	for ((X = 0; X < RX; X++)) ; do
		if [ "$WANT" = "${RSNAPS[X]#*"$SRC"/}" ] ; then
			# it matches, so yes, it's already been sent
			return 0
		fi
	done

	# no matches, so no, it hasn't been sent
	return 1
}


# clean up lockfiles on signal receipt
#
# hands every existing $SNAP_VAR/snapback_backups_${BHOST}_?.lck file to
# _ssched_lockfile_v
#
# globals: SNAP_VAR (read), BHOST (read)
_snapback_cleanup_locks()
{
	local L

	for L in "$SNAP_VAR"/snapback_backups_"${BHOST}"_?.lck ; do
		# when nothing matches, the glob is left as the literal pattern;
		# don't pass that bogus name on
		[ -e "$L" ] || [ -L "$L" ] || continue
		_ssched_lockfile_v "$L"
	done
}


if $OLDBASH ; then
# create the initial clone source list (variant without namerefs)
#
# any arguments are ignored; the global arrays are used directly:
#   RSNAPS (read)           - "UUID <snap-path>" strings from the backup host
#   SNAP_SENT_LST (written) - emptied, then filled with the local path of
#                             every RSNAPS entry that exists as a directory
#                             under CONFIG[SNAP_MOUNT_DIR]
_rsnaps2send()
{
	local I
	local LSNAP
	local HN

	# look the hostname up once rather than once per snapshot
	HN=$(hostname)

	# clear out the destination array which might have old values
	SNAP_SENT_LST=()

	for I in "${!RSNAPS[@]}" ; do
		# drop the leading "UUID " part
		LSNAP=${RSNAPS[$I]#* }
		# remove the leading path ending with this hostname
		# and replace it with CONFIG[SNAP_BASE_DIR]
		LSNAP=${CONFIG[SNAP_BASE_DIR]}${LSNAP#*"$HN"}
		if [ -d "${CONFIG[SNAP_MOUNT_DIR]}/$LSNAP" ] ; then
			SNAP_SENT_LST+=("$LSNAP")
		fi
	done
}
else
# create the initial clone source list
#
# arguments:
#   $1 - name of the destination array; emptied, then filled with the local
#        path of every source entry that exists as a directory under
#        CONFIG[SNAP_MOUNT_DIR]
#   $2 - name of the source array of "UUID <snap-path>" strings
_rsnaps2send()
{
	local -n DARR="$1" SARR="$2"
	local I
	local LSNAP
	local HN

	# look the hostname up once rather than once per snapshot
	HN=$(hostname)

	# clear out the destination array which might have old values
	DARR=()

	for I in "${!SARR[@]}" ; do
		# drop the leading "UUID " part
		LSNAP=${SARR[$I]#* }
		# remove the leading path ending with this hostname
		# and replace it with CONFIG[SNAP_BASE_DIR]
		LSNAP=${CONFIG[SNAP_BASE_DIR]}${LSNAP#*"$HN"}
		if [ -d "${CONFIG[SNAP_MOUNT_DIR]}/$LSNAP" ] ; then
			DARR+=("$LSNAP")
		fi
	done
}
fi


# main program, part 1: arguments, configuration, mounting the root
# volume and listing the local snapshots

# first arg is snapshot source
#echo "arguments for snapback are: " "$@"

NSRC="$1"
shift

# second optional arg is backup host #, currently 1 or 2
BNUM="${1:-}"

# validate NSRC arg, and read in config file
# no usage message, since this is called by cron.  supposedly.
_ssched_validate_nsrc NSRC "" ||
	exit 1

trap "_snapback_cleanup_locks;exit" int term

declare -i SX RX
declare -a SSNAPS RSNAPS

declare -a SNAP_SENT_LST
declare -a SNAP_XFER_LST
declare -a TPDIRS

# these two are not BACKUPHOST specific
FREQ=${CONFIG["SSRC_${NSRC}%BACKUPINT"]}
#FREQ=daily
MTYPE="${CONFIG[SSRC_${NSRC}%BACKUPMTYPE]}"
#MTYPE="host 1"

# _ssched_set_verbosity

omsg "$FREQ backup job for '$NSRC' on $(hostname) at $(date '+%F %T')"

# TODO: different SNAP_MOUNT_DIRs for different sources
_ssched_mount_rootvol "${CONFIG[SNAP_MOUNT_DIR]}" || {
	ES=$?
	omsg "Unable to mount root volume '${CONFIG[SNAP_MOUNT_DIR]}': $ES"
	if [ ! -d "${CONFIG[SNAP_MOUNT_DIR]}" ] ; then
		omsg "Directory '${CONFIG[SNAP_MOUNT_DIR]}' does not exist."
	elif [ "$ES" -eq 4 ] ; then
		omsg "Perhaps ${CONFIG[SNAP_MOUNT_DIR]} needs to be configured in /etc/fstab?  Snapsched requires that."
	fi
	exit 1
}

# don't let the mount point be unmounted in the middle of this.
# the cd is checked: if it fails we would be working from the wrong
# directory, so give the mount back and stop
cd "${CONFIG[SNAP_MOUNT_DIR]}" || {
	omsg "Unable to cd to '${CONFIG[SNAP_MOUNT_DIR]}'"
	_ssched_umount_rootvol "${CONFIG[SNAP_MOUNT_DIR]}"
	exit 1
}

# get the list of relevant snaps on this host
_ssched_bsub_list "$NSRC" "" "$FREQ" SSNAPS "${CONFIG[SNAP_MOUNT_DIR]}"
SX=${#SSNAPS[*]}

# nothing to do?  an empty list also has a null first element, so this
# single test covers the "SX is 0" case as well
if [ "$SX" -eq 0 ] || [ -z "${SSNAPS[0]}" ] ; then
	omsg "No snaps to send - SSNAPS[0]=NULL"
	# release the mount point the same way the normal exit path does
	cd - >/dev/null
	_ssched_umount_rootvol "${CONFIG[SNAP_MOUNT_DIR]}"
	exit 0
fi

# main program, part 2: for each configured backup host, take a
# concurrency slot (if BACKUPMTYPE asks for one), work out which local
# snapshots are missing on that host, send them, and release the slot.
# afterwards the mount point is released.
for BHOSTN in 1 2 ; do

	# if specified on the cmd line, only backup to that bhost number
	if [ "$BNUM" ] && [ "$BNUM" -ne "$BHOSTN" ] ; then
		continue
	fi

	omsg ""

	# no lockfile is held yet for this backup host
	GLOCK_FILE=

	BHOST=${CONFIG["SSRC_${NSRC}%BACKUPHOST$BHOSTN"]}
	# BHOST should be null if there isn't one defined for this BHOSTN
	if [ -z "$BHOST" ] ; then
		omsg "No BHOST configured for BHOSTN=$BHOSTN, skipping"
		continue
	fi

	BHOSTFS=${CONFIG["SSRC_${NSRC}%BACKUPHOST${BHOSTN}FS"]}
	BHOSTZ=${CONFIG["SSRC_${NSRC}%BACKUPHOST${BHOSTN}_Z"]}

	# is this NSRC configured for backup?
	if [ -z "$FREQ" ] ; then
		omsg "No backups configured for '$NSRC'."
		break
	fi

	# check to see if we can see the backup host
	# a laptop might be traveling, or backup host offline for maintenance
	ping -c 1 "$BHOST" &> /dev/null || {
		omsg "Unable to contact '$BHOST' backup host."
		continue
	}

	# this prog sends snaps of all intervals that need to be sent.  i think.
	#if [ "${CONFIG[SSRC_$NSRC%`_ssched_int2u $FREQ`_MSC]}" -eq 0 ] ; then
		# there's nothing to backup
		#exit 0
	#fi


	# mitigate the network traffic usage and the overhead of multiple backups
	# sending at the same time.  possible configs are:
	#    N simultaneous to the same backup host
	#       N simultaneous from the perspective of the backup host - other
	#         systems sending to that host besides this one
	#    N simultaneous
	#    N simultaneous per network segment
	#
	# MTYPE is "<type> <N>".  a slot is the lockfile ${LCK_FILE}<n>.lck
	# with n in 1..N; slot numbers are a single character, so N is
	# capped at 9

	LCK_FILE_BASE=$SNAP_VAR/snapback_backups_
	SLEEP_S=180

	# a missing, non-numeric or zero slot count is treated as 1
	MAXSLOTS=${MTYPE#* }
	[ "$MAXSLOTS" -ge 1 ] 2>/dev/null || MAXSLOTS=1

	mkdir -p "$(dirname "$LCK_FILE_BASE")" 2>/dev/null

	# type "backups": one pool of slots shared by all backup hosts
	LCK_FILE=$LCK_FILE_BASE
	case "${MTYPE% *}" in
		host)
			# type "host": a separate pool of slots per backup host
			LCK_FILE=${LCK_FILE_BASE}${BHOST}_
			;& # drop into next one
		backups)
			for ((;;)) ; do
				# count the slots that are currently taken
				NLOCKS=0
				for LCK in "${LCK_FILE}"?.lck ; do
					if [ -e "$LCK" ] ; then
						NLOCKS=$((NLOCKS + 1))
					fi
				done

				# all slots busy: wait 3 minutes and look again
				if [ "$NLOCKS" -ge "$MAXSLOTS" ] ; then
					omsg "Waiting $SLEEP_S secs for lockfile ${LCK_FILE}1.lck..."
					sleep $SLEEP_S
					continue
				fi

				# take the first free slot.  GLOCK_FILE is only set once
				# the lock has really been acquired, so a busy slot that
				# belongs to somebody else is never released by us later
				GLOCK_FILE=
				for ((I = 1; I <= MAXSLOTS && I <= 9; I++)) ; do
					SLOT_FILE=${LCK_FILE}${I}.lck
					if [ -e "$SLOT_FILE" ] ; then
						continue
					fi
					_ssched_lockfile_p "$SLOT_FILE" || {
						serr "Lockfile failure for '$SLOT_FILE'"
						exit 1
					}
					GLOCK_FILE=$SLOT_FILE
					break
				done
				if [ -z "$GLOCK_FILE" ] ; then
					# every slot got taken between counting and searching
					omsg "$NSRC: failed to find slot when expecting to.  NLOCKS=$NLOCKS"
					omsg "Waiting $SLEEP_S secs for ??? ..."
					sleep $SLEEP_S
					continue
				fi
				break
			done
			;;
	esac


	TBASE="${CONFIG["SNAP_MOUNT_DIR"]}/${CONFIG["SNAP_BASE_DIR"]}/$NSRC"

	BSDIR=$BHOSTFS/$(hostname)/$NSRC

	# whether or not to use compression with ssh.  generally makes xfers take
	# roughly 2.5 times longer on 1Gb ethernet or faster connections
	# NOTE(review): BHOSTZ is run as a command, so a null value counts as
	# "true" and turns compression on - confirm that is intended
	if $BHOSTZ ; then
		SSHZ_ARG=-C
	else
		SSHZ_ARG=
	fi

	# get the list of snaps on the backup host for this nsrc/intval
	RSNAPS=()
	_ssched_bsub_list "$NSRC" "$BHOST" "$FREQ" RSNAPS $BHOSTFS
	RX=${#RSNAPS[*]}

	SNAP_XFER_LST=()
	TPDIRS=()

	# cleans out SNAP_SENT_LST for us
	_rsnaps2send SNAP_SENT_LST RSNAPS

	# if oldbash, just create all three interval directories on backup host
	if $OLDBASH ; then
		TPDIRS=(monthly weekly daily)
	fi

	# create xfer list and possible target directories
	for SNAP in "${SSNAPS[@]}" ; do
		# this func uses RSNAPS and RX as global variables
		if _already_sent "$SNAP" "$NSRC" ; then
			continue
		fi
		SNAP_XFER_LST+=("${SNAP#* }")
		# if not oldbash, create interval dirs based on what snaps are
		# being sent.  we don't do hourlies, do we?  why it needs to be
		# sorted, i don't know.  the grep output is deliberately left
		# unquoted: each match becomes its own argument
		$OLDBASH ||
		_sort -u TPDIRS $(grep -Eo "hourly|daily|weekly|monthly" <<<"${SNAP#* }")
	done

	if [ "${#SNAP_XFER_LST[*]}" -gt 0 ] ; then
		if [ "${#SNAP_XFER_LST[*]}" -gt 1 ] ; then
			V=are
			S=s
		else
			V=is
			S=
		fi
		omsg "There $V ${#SNAP_XFER_LST[*]} snapshot$S to be sent to ${BHOST}:"

		# create target directories on backup host if needed
		MKDIR_OK=true
		for D in "${TPDIRS[@]}" ; do
			ssh -x "$BHOST" mkdir -p "$BSDIR/$D" || {
				omsg "ssh mkdir '${BHOST}:$BSDIR/$D' failed"
				# bail on this backup host: nothing is sent, and the
				# slot is released at the bottom of the loop
				MKDIR_OK=false
				break
			}
		done

		# send the snaps, stopping at the first failure
		if $MKDIR_OK ; then
			for SNAP in "${SNAP_XFER_LST[@]}" ; do
				# the target sub-directory is the name of the directory
				# that contains the snapshot
				D=$(basename "$(dirname "$SNAP")")
				# echo "cp_snap 'ssh -x' '$SSHZ_ARG' '$BHOST' '$SNAP' '$BSDIR/$D'"
				if cp_snap "$SSHZ_ARG" "$BHOST" "$SNAP" "$BSDIR/$D" ; then
					# later sends can use this one as a clone source
					SNAP_SENT_LST+=("$SNAP")
				else
					omsg -e "Sending failure for snap '$SNAP' BHOST='$BHOST:$BSDIR/$D'\n" "SNAP_SENT_LST=(${SNAP_SENT_LST[*]})\n"
					omsg -e "SNAP_XFER_LST=(${SNAP_XFER_LST[*]})\n"
					break
				fi
			done
		fi
	fi

	# give back the slot, but only if one was actually acquired above
	if [ -n "$GLOCK_FILE" ] ; then
		_ssched_lockfile_v "$GLOCK_FILE"
		GLOCK_FILE=
	fi
done

# release the mount point
cd - >/dev/null

_ssched_umount_rootvol "${CONFIG[SNAP_MOUNT_DIR]}"

# exit with the status of the unmount
exit
