#!/usr/bin/perl

# squasher.pl <path> <prefix> <size> <tree>
#
# Make multi-volume backup images of <path>, not to exceed <size>,
# volume images named <prefix>.#, using <tree> as workspace.
#
# Author: R. J. Brown rj@elilabs.com
# Date: Fri Oct 29 02:39:18 EDT 2004
# Copyright 2004 Elijah Laboratories Inc. ALL RIGHTS RESERVED WORLDWIDE
# License: GPL
# Download: http://www.elilabs.com/~rj/papers/squasher.pl
#
# This script attempts to make a set of dvd images (to backup a
# directory and its children) as squashed filesystems, such that each
# such image is as nearly as full as possible, without exceeding
# either the capacity of a single dvd-/+r, or the maximum capacity of
# a squashfs filesystem.
#
# Since the directory tree rooted at <path> will potentially contain
# more information than can fit on a single dvd, this script attempts
# to find a way to divide the files across multiple dvds in such a way
# as to maximize the space utilization of each such dvd.
#
# Since the ultimate intended purpose is to provide an easy to use,
# directly addressable form of off-line backup, the path/filenames
# are first sorted into lexicographic order. This way, each dvd will
# contain ordered path/filenames, making it easier to find the desired
# file in a multi-volume backup set.
#
# Since it is possible for a single file to exceed the capacity of a
# single dvd, even after it has been squashed, if any such files are
# detected, they will have their path/filenames output to a culls list
# on stderr. It will be necessary to use some other means to backup
# these files; they will not appear in the multi-volume dvd set, as
# they are unamenable to this format.
#
# The operation of this script is iterative, as it cannot be
# determined prior to the squashing performed by mksquashfs just how
# large the resultant squashed filesystem will be.
#
# An initial file set is determined by taking all the files up to a
# predetermined cumulative size.
# This list of files is piped into cpio, which produces a directory
# tree containing hard links to the specified files. Using hard links
# is much faster than actually copying the files, and takes up much
# less space, but it does require that the source tree and the
# destination tree be on the same logical filesystem.
#
# Once the linked tree is generated, mksquashfs is invoked to squash
# the filesystem from that linked tree. When mksquashfs finishes, the
# size of the resultant squashed filesystem is compared to the maximum
# allowable size, and the difference is computed. The result is used
# to determine a new estimated unsquashed filesystem size.
#
# The necessary files are then piped into cpio as above to add to the
# linked tree, such that the new estimated filesystem size is
# approached as closely as possible, but not exceeded.
#
# The process then continues with another squashing operation. The
# result of this squashing operation is checked for size. If the
# squashed filesystem is still too small, the linked tree is grown
# some more.
#
# Only if the squashed filesystem is "near enough" to the maximum
# allowable size does the iterative size adjustment and subsequent
# squashing stop. Once this does occur, that squashed filesystem is
# saved as a dvd image, and the process continues to generate the next
# dvd image in the multi-volume set.
#
# There is an implementation restriction of mksquashfs and the
# squashfs filesystem in general: the entire squashed filesystem must
# fit in less than 4 GB (2^32-1). Since this is less than the total
# capacity of a dvd, this size limit is actually the dominating one,
# and not the maximum capacity of a dvd. Fortunately, the two sizes
# do not differ by a significant percentage.
# Because of the iterative approach taken in this script, and the
# compute-intensive nature of data compression in general, it takes
# about two hours per dvd image for this script to run on my AMD
# XP-2000+ machine with 1 GB of RAM and a 250 GB 3-way mirrored IDE
# RAID-1 storage system.
#
# I plan to run this script to produce monthly offline backups from my
# backup server. This server backs up about 10 machines. This works
# out to roughly 15 dvd images per month. Thus it is expected to take
# about 1 day of run time to produce all the dvd images for these
# systems each month. Fortunately, the backup server has no other
# duties except this, its nightly network backup run, serving the
# online backups via NFS, and purging expired backups.
#
# The resulting dvds burned from these squashed filesystems are
# directly mountable as type squashfs filesystems. The dvds are not
# burned in ISO format; they are burned directly from the squashed
# images in raw mode using growisofs.
#
# For added security against unauthorized use of the dvds, I use dd to
# copy these squashed dvd images to a loop device thru the crypto-api.
# The loop device is associated with a raw disk partition that is the
# size of a dvd. I then use this raw disk partition as the raw dvd
# image that is input to growisofs. This way, the dvds are encrypted
# with aes-256, and require a passphrase when they are mounted.
# Encrypted backups are a good idea, as if anyone gets ahold of your
# backups, they own you!
#
# It is too bad that mksquashfs does not have a provision to operate
# analogously to cpio and take a list of sorted filenames on stdin,
# producing a multi-volume set of squashed filesystems that each meet
# a maximum size constraint, but mksquashfs was apparently not
# designed with a multi-volume capability in mind.
# Rather, it seems to have been designed to squash a filesystem as
# quickly as possible, and as such, it works directly with lists of
# inodes until the entire squashed filesystem is generated, with no
# ability to determine incrementally what the resultant squashed size
# is at intermediate points along the way.
#
# This script is a kluge to work around that problem. :-(
#
# FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME
#
# I have this nagging thought in the back of my head that this script
# still suffers from a serious but hard to test bug. The iterative
# process that attempts to fill a volume to within $tolerance of its
# capacity will probably fail if the next file that could be added to
# the tree is too big to fit in the squashed filesystem, but without
# it, the squashed filesystem is too small to fulfill the $tolerance
# requirement. There needs to be an escape from this potentially
# endless loop. If the addition of only a single file to the tree
# would exceed the maximum volume size, then we must settle for not
# filling the squashed filesystem to within $tolerance of being full.
#
# NOTE: I have attempted to solve the above problem by the use of the
# $added_at_least_one_file flag, but I have not yet tried to test this
# logic, other than to verify that it does not break the rest of the
# program in those cases where it does not apply.
# ----------------------------------------------------------------

use strict;    # be picky about syntax
use warnings;  # and complain about dubious constructs

# local variables
my $file_name;               # name of file we are currently working on
my $file_size;               # its length
my $cum_size;                # cumulative size so far
my $trial_size;              # size we are growing the tree up to
my $vol_name;                # name of current squashed volume
my $volno;                   # number of current squashed volume
my $sq_vol_size;             # its size so far
my $more_to_go;              # TRUE if we still have unprocessed files
my $added_at_least_one_file; # TRUE if we grew by at least one file
my $still_have_a_file;       # TRUE if we still have a file from a previous iteration

# manifest constants
my $FALSE = 0;               # logical truth values
my $TRUE  = 1;

my $KB = 1000;               # size abbreviations
my $MB = $KB * $KB;
my $GB = $KB * $MB;

my $tolerance = 0.9;         # acceptable percentage of a full dvd

# GOTCHA: mksquashfs pukes if the image is bigger than 2^32-1 bytes !!!
my $max_squash_size = 4 * 1024 * 1024 * 1024 - 1;

# get command line stuff: squasher.pl <path> <prefix> <size> <tree>
my $path;         # <path>   root of the directory tree to back up
my $max_vol_size; # <size>   maximum size of one volume image
my $prefix;       # <prefix> volume images are named <prefix>.<n>
my $linked_tree;  # <tree>   workspace for the hard-linked tree

my $nargs = @ARGV;           # fetch arguments from command line
my ($arg1, $arg2, $arg3, $arg4) = @ARGV;

# Treat missing arguments as empty strings so the eq/ne/< tests
# below do not trip "uninitialized value" warnings.  The foreach
# loop variable aliases each scalar, so the assignment sticks.
foreach my $arg ($arg1, $arg2, $arg3, $arg4) {
    $arg = "" unless defined $arg;
}

if ($nargs < 1               # sanity check
    || $nargs > 4
    || $arg1 eq "-?"
    || $arg1 eq "-h"
    || $arg1 eq "--help") {
    print "squasher.pl <path> <prefix> <size> <tree>\n";
    print "-- Must at least specify <path>!\n";
    exit;
}

# process defaults
$path         = $arg1;
$prefix       = ( $arg2 ne "" ? $arg2 : "vol" );
$max_vol_size = ( $arg3 ne "" && $arg3 < $max_squash_size
                  ? $arg3
                  : $max_squash_size );
$linked_tree  = ( $arg4 ne "" ? $arg4 : "linked_tree" );

# ready to go!
# ---- main processing ----
# Read the lexicographically sorted filename list, and for each volume
# grow a hard-linked tree (via cpio) until squashing it yields an image
# within $tolerance of $max_vol_size, then move on to the next volume.
# NOTE(review): the bareword handles, 2-arg opens, and unquoted shell
# interpolation of $path/$prefix/$linked_tree below assume trusted,
# shell-safe arguments -- confirm before running on arbitrary paths.

open(DATE, "date |") || die "Could not obtain date from system: $!";
my $started = <DATE>;  # timestamp line for the banner below
close(DATE);

print "\nsquasher.pl <path> <prefix> <size> <tree>\n";
print "\n Started $started";
print "\n <path> = $path;\n <prefix> = $prefix;\n <size> = $max_vol_size;\n <tree> = $linked_tree;\n";

# lexicographically sorted list of filenames, NUL-delimited so that
# names containing whitespace or newlines survive the pipeline
open(FILES, "find $path -print0 | sort -z |")
    || die "CANNOT GET FILENAMES FROM $path: $!";

`rm -rf $prefix.*`;             # get rid of any old squashed volume sets

$more_to_go = $TRUE;            # we obviously have more; we haven't even started!
$still_have_a_file = $FALSE;    # we don't have one yet
$volno = 0;                     # initialize the volume number

big_loop:
while ($more_to_go) {           # as long as we have unprocessed files...

    $volno++;
    $cum_size = 0;
    $vol_name = "$prefix.$volno";       # determine the volume name
    $sq_vol_size = 0;
    $trial_size = 0;
    $added_at_least_one_file = $TRUE;   # white lie to force first iteration for a new volume

    print "\n Working on $vol_name\n";

    `rm -rf $linked_tree`;      # uproot any old tree
    `mkdir $linked_tree`;       # plant a new one

    $/ = "\000";                # set input record delimiter to NUL (matches find -print0)

  volume_loop:
    while ($sq_vol_size < $tolerance * $max_vol_size    # pack up a volume
           && $added_at_least_one_file
           && $more_to_go) {

        # since we have more room, increase our expectations
        $trial_size += $max_vol_size - $sq_vol_size;

        print "\n Growing tree to size $trial_size\n";

        # cpio -p0ldm: pass-through mode, NUL-delimited input, link
        # instead of copy, make leading directories, preserve mtimes
        open(TREE, "| cpio -p0ldm $linked_tree 2>>$vol_name.log")
            || die "CANNOT OPEN PIPE TO cpio TO GROW LINKED TREE AT $linked_tree: $!";

        $more_to_go = $FALSE;             # assume the list is exhausted until we read another name
        $added_at_least_one_file = $FALSE;# hasn't happened so far

      growth_loop:
        while ($TRUE) {                   # one file at a time...

            unless ($still_have_a_file) { # do we still have a file from last time around?
                $_ = <FILES>;             # no, get one
                unless ($_) {             # ain't no more
                    $still_have_a_file = $FALSE; # say one is not in waiting
                    $more_to_go = $FALSE;        # and none are left in the list
                    last growth_loop;            # get out of this while loop
                }
                chomp;                    # strip the NUL record delimiter
                $file_name = $_;          # get the file name
                $file_size = -s $file_name; # and its size
            }

            # is this file alone too big for a single volume?
            if ($file_size > $max_vol_size) {
                print STDERR "Skipping $file_name -- too big: $file_size\n"; # yes, say so (culls list)
                $still_have_a_file = $FALSE; # we just used the one we had
                next growth_loop;            # skip over it!
            }

            $cum_size += $file_size;      # keep running total

            if ($cum_size >= $trial_size) {     # total too big?
                $still_have_a_file = $TRUE;     # yes, say we still have a filename we did not use yet
                $more_to_go = $TRUE;            # say we still have more files to do
                last growth_loop;               # get out of this while loop
            }

            print TREE "$file_name\000";  # no, grow the tree
            $added_at_least_one_file = $TRUE; # remember that we added a file to the tree
            $still_have_a_file = $FALSE;      # we just used the one we had

        } ### END growth_loop ###

        close(TREE);    # make sure cpio finished before we start squashing

        # make a squashed filesystem from it.
        print " Squashing\n";
        `rm -rf $vol_name`;
        `mksquashfs $linked_tree $vol_name`;    # squash it
        $sq_vol_size = -s $vol_name;            # how big was the squashed filesystem?
        print " Squashed filesystem size is $sq_vol_size\n";

        # too big? FIXME -- see the header note: ideally we would back off
        # instead of dying here
        die "!!! SQUASHED FILESYSTEM TOO BIG !!!" if $sq_vol_size > $max_vol_size;

    } ### END volume_loop ###

    print "\n Done with $vol_name!\n";

} ### END big_loop ###

print "\nDone making squashed images for $path\n";

`rm -rf $linked_tree`;  # remove the workspace tree

print "\nDone cleaning up.\n";
print "\nThank you for using squasher.pl!\n\n";