#!/usr/bin/perl
# squasher.pl <path> <prefix> <size> <tree>
# Make multi-volume backup images of <path>, not to exceed <size>,
# volume images named <prefix>.#, using <tree> as workspace.
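# Example invocation (the paths and size here are hypothetical):
#   ./squasher.pl /backup/data vol 4294967295 /backup/linked_tree
# This would produce images vol.1, vol.2, ... from the files under
# /backup/data, using /backup/linked_tree as the hard-link workspace.
# Note that <tree> must live on the same filesystem as <path> (see
# below), and is best kept outside <path> itself.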
# Author: R. J. Brown rj@elilabs.com
# Date: Fri Oct 29 02:39:18 EDT 2004
# Copyright 2004 Elijah Laboratories Inc. ALL RIGHTS RESERVED WORLDWIDE
# License: GPL
# Download: http://www.elilabs.com/~rj/papers/squasher.pl
# This script attempts to make a set of dvd images (to back up a
# directory and its children) as squashed filesystems, such that each
# image is as nearly full as possible, without exceeding either the
# capacity of a single dvd-/+r or the maximum size of a squashfs
# filesystem.
# Since the directory tree rooted at <path> will potentially contain
# more information than can fit on a single dvd, this script attempts
# to find a way to divide the files across multiple dvds in such a way
# as to maximize the space utilization of each dvd.
# Since the ultimate intended purpose is to provide an easy-to-use,
# directly addressable form of off-line backup, the path/filenames
# are first sorted into lexicographic order. This way, each dvd will
# contain ordered path/filenames, making it easier to find the desired
# file in a multi-volume backup set.
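# The sorted, NUL-delimited file list is produced with a pipeline of
# the following form (see the open(FILES, ...) call below):
#   find <path> -print0 | sort -z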
# Since it is possible for a single file to exceed the capacity of a
# single dvd, even after it has been squashed, any such files that are
# detected will have their path/filenames output to a culls list on
# stderr. It will be necessary to use some other means to back up
# these files; they will not appear in the multi-volume dvd set, as
# they are not amenable to this format.
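# To capture the culls list, redirect stderr when invoking the
# script, for example (hypothetical arguments):
#   ./squasher.pl /backup/data vol 2> culls.list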
# The operation of this script is iterative, as it cannot be
# determined prior to the squashing performed by mksquashfs just how
# large the resultant squashed filesystem will be.
# An initial file set is determined by taking all the files up to a
# predetermined cumulative size. This list of files is piped into
# cpio, which produces a directory tree containing hard links to the
# specified files. Using hard links is much faster than actually
# copying the files, and takes up much less space, but it does require
# that the source tree and the destination tree be on the same logical
# filesystem.
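# The cpio invocation used below has the form:
#   ... | cpio -p0ldm <tree>
# where -p selects pass-through mode, -0 expects NUL-delimited names
# on stdin, -l hard-links instead of copying where possible, -d
# creates leading directories as needed, and -m preserves file
# modification times.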
# Once the linked tree is generated, mksquashfs is invoked to squash
# the filesystem from that linked tree. When mksquashfs finishes, the
# size of the resultant squashed filesystem is compared to the maximum
# allowable size, and the difference is computed. The result is used
# to determine a new estimated unsquashed filesystem size.
# The necessary files are then piped into cpio as above to add to the
# linked tree, such that the new estimated filesystem size is
# approached as closely as possible, but not exceeded.
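# As an illustration (made-up numbers): if <size> is 4294967295 bytes
# and the first squash comes out at 3000000000 bytes, the trial
# (unsquashed) tree size is grown by the difference, 1294967295
# bytes, on the assumption that the compression ratio achieved so far
# will roughly hold for the files added next:
#   $trial_size += $max_vol_size - $sq_vol_size;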
# The process then continues with another squashing operation. The
# result of this squashing operation is checked for size. If the
# squashed filesystem is still too small, the linked tree is grown
# some more.
# Only if the squashed filesystem is "near enough" to the maximum
# allowable size does the iterative size adjustment and subsequent
# squashing stop. Once this does occur, that squashed filesystem is
# saved as a dvd image, and the process continues to generate the next
# dvd image in the multi-volume set.
# There is an implementation restriction of mksquashfs and the
# squashfs filesystem in general: the entire squashed filesystem must
# fit in less than 4 GB (2^32 - 1 bytes). Since this is less than the
# total capacity of a dvd, this size limit is actually the dominating
# one, and not the maximum capacity of a dvd. Fortunately, the two
# sizes do not differ by a significant percentage.
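# Concretely: 2^32 - 1 = 4294967295 bytes, versus roughly 4.7e9
# bytes for a single-layer dvd-/+r, a difference of about 9%.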
# Because of the iterative approach taken in this script, and the
# compute-intensive nature of data compression in general, it takes
# about two hours per dvd image for this script to run on my AMD
# XP-2000+ machine with 1 GB of RAM and a 250 GB 3-way mirrored IDE
# RAID-1 storage system.
# I plan to run this script to produce monthly offline backups from my
# backup server. This server backs up about 10 machines. This works
# out to roughly 15 dvd images per month. Thus it is expected to take
# about 1 day of run time to produce all the dvd images for these
# systems each month. Fortunately, the backup server has no other
# duties except this, its nightly network backup run, serving the
# online backups via NFS, and purging expired backups.
# The resulting dvds burned from these squashed filesystems are
# directly mountable as type squashfs filesystems. The dvds are not
# burned in ISO format; they are burned directly from the squashed
# images in raw mode using growisofs.
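# For example (device and mount point names are hypothetical, and a
# squashfs-enabled kernel is assumed), a finished image can be burned
# raw and later mounted directly:
#   growisofs -Z /dev/dvd=vol.1
#   mount -t squashfs /dev/dvd /mnt/backup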
# For added security against unauthorized use of the dvds, I use dd to
# copy these squashed dvd images to a loop device through the crypto-api.
# The loop device is associated with a raw disk partition that is the
# size of a dvd. I then use this raw disk partition as the raw dvd
# image that is input to growisofs. This way, the dvds are encrypted
# with aes-256, and require a passphrase when they are mounted.
# Encrypted backups are a good idea: if anyone gets ahold of your
# backups, they own you!
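# A rough sketch of that encryption step (hypothetical device names;
# the exact losetup options depend on your cryptoloop/loop-AES
# setup):
#   losetup -e aes256 /dev/loop0 /dev/hda7  # dvd-sized raw partition
#   dd if=vol.1 of=/dev/loop0 bs=1M         # encrypted copy of the image
#   losetup -d /dev/loop0
#   growisofs -Z /dev/dvd=/dev/hda7         # burn the encrypted partition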
# It is too bad that mksquashfs does not have a provision to operate
# analogously to cpio and take a list of sorted filenames on stdin,
# producing a multi-volume set of squashed filesystems that each meet
# a maximum size constraint, but mksquashfs was apparently not
# designed with a multi-volume capability in mind. Rather, it seems
# to have been designed to squash a filesystem as quickly as possible,
# and as such, it works directly with lists of inodes until the entire
# squashed filesystem is generated, with no ability to determine
# incrementally what the resultant squashed size is at intermediate
# points along the way.
# This script is a kluge to work around that problem. :-(
# FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME
# I have this nagging thought in the back of my head that this script
# still suffers from a serious but hard-to-test bug. The iterative
# process that attempts to fill a volume to within $tolerance of its
# capacity will probably fail if the next file that could be added to
# the tree is too big to fit in the squashed filesystem, but without
# it, the squashed filesystem is too small to fulfill the $tolerance
# requirement. There needs to be an escape from this potentially
# endless loop. If the addition of only a single file to the tree
# would exceed the maximum volume size, then we must settle for not
# filling the squashed filesystem to within $tolerance of being full.
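# Numeric illustration (made-up sizes): with <size> = 4.3e9 and
# $tolerance = 0.9, a finished volume must squash to at least
# 3.87e9 bytes; if the current tree squashes to only 3.5e9 but the
# next candidate file would push the squashed size past 4.3e9, then
# no single addition can satisfy both constraints, and the loop
# would never terminate.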
# NOTE: I have attempted to solve the above problem by the use of the
# $added_at_least_one_file flag, but I have not yet tried to test this
# logic, other than to verify that it does not break the rest of the
# program in those cases where it does not apply.
# ----------------------------------------------------------------
use strict; # be picky about syntax
# local variables
my $file_name; # name of file we are currently working on
my $file_size; # its length
my $cum_size; # cumulative size so far
my $trial_size; # size we are growing the tree up to
my $vol_name; # name of current squashed volume
my $volno; # number of current squashed volume
my $sq_vol_size; # its size so far
my $more_to_go; # TRUE if we still have unprocessed files
my $added_at_least_one_file; # TRUE if we grew by at least one file
my $still_have_a_file; # TRUE if we still have a file from a previous iteration
# manifest constants
my $FALSE = 0; # logical truth values
my $TRUE = 1;
my $KB = 1000; # size abbreviations
my $MB = $KB * $KB;
my $GB = $KB * $MB;
my $tolerance = 0.9; # acceptable fraction of a full dvd
my $max_squash_size = 4 * 1024 * 1024 * 1024 - 1; # GOTCHA mksquashfs pukes if bigger !!!
# get command line stuff: squasher.pl <path> <prefix> <size> <tree>
my $path; # <path>
my $max_vol_size; # <size>
my $prefix; # <prefix>
my $linked_tree; # <tree>
my $nargs = @ARGV; # fetch arguments from command line
my $arg1;
my $arg2;
my $arg3;
my $arg4;
($arg1, $arg2, $arg3, $arg4) = @ARGV;
if ($nargs < 1 # sanity check
|| $nargs > 4
|| $arg1 eq "-?"
|| $arg1 eq "-h"
|| $arg1 eq "--help") {
print "squasher.pl <path> <prefix> <size> <tree>\n";
print "-- Must at least specify <path>!\n";
exit;
}
# process defaults
$path = $arg1;
$prefix = ( $arg2 ne "" ? $arg2 : "vol");
$max_vol_size = ( $arg3 ne "" && $arg3 < $max_squash_size ? $arg3 : $max_squash_size );
$linked_tree = ( $arg4 ne "" ? $arg4 : "linked_tree" );
# ready to go!
open(DATE, "date |")
|| die "Could not obtain date from system: $!";
my $started = <DATE>;
close(DATE);
print "\nsquasher.pl <path> <prefix> <size> <tree>\n";
print "\n Started $started";
print "\n <path> = $path;\n <prefix> = $prefix;\n <size> = $max_vol_size;\n <tree> = $linked_tree;\n";
open(FILES, "find $path -print0 | sort -z |") # lexicographically sorted list of filenames
|| die "CANNOT GET FILENAMES FROM $path: $!";
`rm -rf $prefix.*`; # get rid of any old squashed volume sets
$more_to_go = $TRUE; # we obviously have more; we haven't even started!
$still_have_a_file = $FALSE; # we don't have one yet
$volno = 0; # initialize the volume number
big_loop: while ($more_to_go) { # as long as we have unprocessed files...
$volno++;
$cum_size = 0;
$vol_name = "$prefix.$volno"; # determine the volume name
$sq_vol_size = 0;
$trial_size = 0;
$added_at_least_one_file = $TRUE; # white lie to force first iteration for a new volume
print "\n Working on $vol_name\n";
`rm -rf $linked_tree`; # uproot any old tree
`mkdir $linked_tree`; # plant a new one
$/ = "\000"; # set input record delimiter to NUL
volume_loop: while ($sq_vol_size < $tolerance * $max_vol_size # pack up a volume
&& $added_at_least_one_file
&& $more_to_go) {
$trial_size += $max_vol_size - $sq_vol_size; # since we have more room, increase our expectations
print "\n Growing tree to size $trial_size\n";
open(TREE, "| cpio -p0ldm $linked_tree 2>>$vol_name.log")
|| die "CANNOT OPEN PIPE TO cpio TO GROW LINKED TREE AT $linked_tree: $!";
$more_to_go = $FALSE; # TRUE if we still have unprocessed filenames
$added_at_least_one_file = $FALSE; # hasn't happened so far
growth_loop: while ($TRUE) { # one file at a time...
unless ($still_have_a_file) { # do we still have a file from last time around?
$_ = <FILES>; # no, get one
unless ($_) { # ain't no more
$still_have_a_file = $FALSE; # say one is not in waiting
$more_to_go = $FALSE; # and none are left in the list
last growth_loop; # get out of this while loop
}
chomp;
$file_name = $_; # get the file name
$file_size = -s $file_name; # and its size
}
if ($file_size > $max_vol_size) { # is this file alone too big for a single volume?
print STDERR "Skipping $file_name -- too big: $file_size\n"; # yes, say so
$still_have_a_file = $FALSE; # we just used the one we had
next growth_loop; # skip over it!
}
$cum_size += $file_size; # keep running total
if ($cum_size >= $trial_size) { # total too big?
$still_have_a_file = $TRUE; # yes, say we still have a filename we did not use yet
$more_to_go = $TRUE; # say we still have more files to do
last growth_loop; # get out of this while loop
}
print TREE "$file_name\000"; # no, grow the tree
$added_at_least_one_file = $TRUE; # remember that we added a file to the tree
$still_have_a_file = $FALSE; # we just used the one we had
} ### END growth_loop ###
close(TREE); # make sure cpio finished before we start squashing
# make a squashed filesystem from it.
print " Squashing\n";
`rm -rf $vol_name`;
`mksquashfs $linked_tree $vol_name`; # squash it
$sq_vol_size = -s $vol_name; # how big was the squashed filesystem?
print " Squashed filesystem size is $sq_vol_size\n";
die "!!! SQUASHED FILESYSTEM TOO BIG !!!" if $sq_vol_size > $max_vol_size; # too big? FIXME
} ### END volume_loop ###
print "\n Done with $vol_name!\n";
} ### END big_loop ###
print "\nDone making squashed images for $path\n";
`rm -rf $linked_tree`;
print "\nDone cleaning up.\n";
print "\nThank you for using squasher.pl!\n\n";