#!/usr/bin/perl

###########################################################################
#
#  snatch - version 0.6 (The Man Who Couldn't Cry)
#
#  This script downloads the page images from various sites using a
#  modular framework.  The idea is to abstract the actual downloading
#  from the preparatory work so that by simply changing a couple of
#  commandline variables one can get the page images from different
#  websites.
#
#  There is one module (subroutine) for each supported image site.
#  Each module processes the commandline info and returns an array
#  of URLs to be downloaded by the main script.
#
#  Details on each module are in the comments before that module's
#  subroutine.
#
###########################################################################


use warnings;
use strict;
use diagnostics;


###########################################################################
#
# Prototypes
#
###########################################################################

# Forward declarations. The subroutines are defined at the bottom of this
# file but are called from the main script before those definitions, so
# their prototypes are declared here first. The definitions further down
# repeat the same prototypes.

# Helper for modules: returns the page limit after applying the -o offset.
sub module_set_limit ( $ );
# Helper for modules: takes a default format followed by the list of
# formats the module supports and returns the format to use.
sub module_check_format ( $@ );
# Prints the table of known modules (%sites) to stdout.
sub print_modules();
# Prints a line of text only when verbose mode is on.
sub print_v ( $ );
# Loads the module cache file into %sites.
sub read_module_cache();
# Regenerates the module cache file from the module directory.
sub update_module_cache();


###########################################################################
#
# Load Preferences
#
###########################################################################

# Settings shared with config.rc and with the site modules (package
# variable, so files run through "do" can see it).
our %config;

# Default set before the preferences are read, so config.rc may override it
$config{'format'} = "";


# Load preferences.
# The path is written as "./config.rc" because "do FILE" looks a bare
# file name up in @INC, and the current directory is no longer part of
# @INC since Perl 5.26. A missing preferences file is not an error, but a
# file that fails to compile or run is reported.
my $config_file = './config.rc';
if ( -r $config_file ) {
    do $config_file;
    warn "Could not load preferences from $config_file: $@" if $@;
}



###########################################################################
#
# Default Values
#
###########################################################################

# Start number
# (assigned after config.rc has been read, so a 'start' value from the
# preferences file is replaced here; only the -i option changes it later)
$config{'start'} = 1;


###########################################################################
#
# Load Modules
#
###########################################################################

# Short site name => descriptive name, filled in by the module cache
our %sites;

# Determine what modules are available
my $mod_dir = "modules/";
my $mod_cache = $mod_dir."modules.cache";

# Build the cache first if it is missing, unreadable or empty
if ( !-r $mod_cache || -z $mod_cache ) {
    update_module_cache();
}

read_module_cache();


###########################################################################
#
# Help Message
#
###########################################################################


# Set the usage message.
# Every option accepted by the argument parser is listed here, including
# the long aliases (--help, --user-agent) and the -e and --dump options.
my $usage = <<EOT;

Usage: $0 [options] site id

  -d dir     Save to directory "dir" (create dir if it doesn't exist).

  -e ext     File extension to use when saving renumbered files (see -r).

  -f format  Download format. Valid values are "pdf" or "image" (not all
             sites support downloading of both formats).

  -l         Generate a list of available modules and exit. (Other options
             ignored)

  -i start   Begin downloading at "start" (integer) page.

  -h         Print help and exit (other options ignored). Also --help.

  -o offset  End downloading at "start" + "offset" (see -i option). If not
             set, the selected module will determine the offset equal to
             the total number of pages in the book.

  -p proxy   Set the proxy string to "proxy". The string "off" turns the
             proxy feature off (e.g., if it's set in your config.rc file)

  -q         Quiet mode (verbose reporting OFF).

  -r         Renumber images sequentially.  By default, images are saved
             using the same filename as on the server.

  -u         Don't download pages; instead, print a list of URLs from
             which the pages may be downloaded (overrides -v). Note that
             some files may still be downloaded in order to generate
             the URLs.

  -ua uagent Set the user agent to "uagent". Also --user-agent.

  -v         Verbose mode (unless -u is selected).

  -w wait    Number of seconds to wait between the download of each file.


  site       Short form of the site to download from (i.e., which module
             to use).

  id         Unique ID of which book's images to download.


  --dump            Print the resulting settings and exit without
                    downloading anything.

  --update-cache    Update the site cache and exit. (Other options ignored)

EOT


###########################################################################
#
# Parse Command Arguments
#
###########################################################################

# The first argument is undefined when the script is run without any
# arguments; substitute an empty string so the comparisons below do not
# trigger "uninitialized value" warnings.
my $first_arg = defined $ARGV[0] ? $ARGV[0] : "";

# -l = print the list of available modules and exit
if ( $first_arg eq "-l" ) {
    print_modules();
    exit();
}

# --update-cache = update the list of available modules, print and exit
elsif ( $first_arg eq "--update-cache" ) {
    # Forget the entries loaded at startup so the table printed below
    # reflects only the regenerated cache
    %sites = ();
    update_module_cache();
    print "Module cache updated!\n";
    read_module_cache();
    print_modules();
    exit();
}

# -h | --help = print help and exit, without complaining about the
# number of arguments
elsif ( $first_arg eq "-h" || $first_arg eq "--help" ) {
    print $usage;
    exit();
}

# We need at least two arguments (site and id) -- so if there aren't that
# many, fail. Counting @ARGV also catches the case of no arguments at all.
if ( @ARGV < 2 ) {
    print "Not enough arguments\n";
    print $usage;
    exit;
}



my $dump = 0;         # set by --dump
my $noverbose = 0;    # set by -u; makes a later -v a no-op
our $idno;            # book ID (last positional argument)

# Options that take a value, mapped to the %config key that receives it.
# A value given on the commandline overrides the same key from config.rc.
my %value_option = (
    '-d'           => 'dir',        # output directory
    '-e'           => 'ext',        # file extension
    '-f'           => 'format',     # download format
    '-i'           => 'start',      # first page
    '-o'           => 'offset',     # number of pages
    '-p'           => 'proxy',      # proxy string
    '-ua'          => 'uagent',     # user agent string
    '--user-agent' => 'uagent',
    '-w'           => 'wait',       # seconds between downloads
);

# Loop through the commandline arguments and set variables as necessary.
# An index loop is used because value options consume the next argument.
for ( my $argnum = 0; $argnum <= $#ARGV; $argnum++ ) {
    my $arg = $ARGV[$argnum];

    # Option with a value: store the next argument and step over it
    if ( exists $value_option{$arg} ) {
        if ( $argnum == $#ARGV ) {
            print "Option $arg requires a value\n";
            print $usage;
            exit;
        }
        $config{ $value_option{$arg} } = $ARGV[ ++$argnum ];
    }

    # --dump = dump the settings without doing anything
    elsif ( $arg eq "--dump" ) {
        $dump = 1;
    }

    # -h | --help = Print help usage
    elsif ( $arg eq "-h" || $arg eq "--help" ) {
        print $usage;
        exit;
    }

    # -q = quiet mode
    elsif ( $arg eq "-q" ) {
        $config{'verbose'} = 0;
    }

    # -r = renumber files
    # overrides the $config{'renumber'} setting
    elsif ( $arg eq "-r" ) {
        $config{'renumber'} = 1;
    }

    # -u = print a list of URLs without downloading anything
    elsif ( $arg eq "-u" ) {
        $config{'verbose'} = 0;
        $config{'nodl'} = 1;
        $noverbose = 1;
    }

    # -v = verbose mode
    # overrides the $config{'verbose'} setting, unless -u was already seen.
    # The -u test lives inside the branch so that an ignored -v is still
    # consumed as an option instead of being taken for the site name.
    elsif ( $arg eq "-v" ) {
        $config{'verbose'} = 1 unless $noverbose;
    }

    # penultimate argument is the site module
    elsif ( !$config{'site'} ) {
        $config{'site'} = $arg;
    }

    # ultimate argument is the book ID
    else {
        $idno = $arg;
    }
}

# If the --dump flag was set, dump the variables and exit
if ( $dump ) {
    print "\n\n";
    foreach my $key ( sort ( keys(%config) ) ) {
      my $value = defined $config{$key} ? $config{$key} : "";
      print "$key: ".$value."\n";
    }
    exit();
}


# If the unique ID is not set, print usage and exit.
# Tested with defined/length so that an ID of "0" is accepted.
if ( !defined $idno || !length $idno ) {
  print $usage;
  exit();
}


###########################################################################
#
# Import UserAgent module
#
###########################################################################

# Use the LWP::UserAgent module for web stuff
use LWP::UserAgent;
use HTTP::Cookies;

# Set the cookie file: the configured one, or cookies.txt by default
my $cookiefile = "cookies.txt";
if ( $config{'cookiefile'} && length($config{'cookiefile'}) ) {
  $cookiefile = $config{'cookiefile'};
}

# Create a cookie jar
my $cookie_jar = HTTP::Cookies::Netscape->new(
    File => $cookiefile,
    Autosave => 0,
    hide_cookie2 => 1
);

# Use the configured useragent; otherwise pick a random one.
# The -ua option stores its value under 'uagent', so that key is consulted
# first; 'useragent' is still honoured for existing preference files.
my $uagent = $config{'uagent'} || $config{'useragent'};
if ( !$uagent ) {
    my @agents;
    if ( open(my $agents_fh, '<', 'uagents') ) {
        # One user agent string per line; blank lines are ignored
        @agents = grep { /\S/ } <$agents_fh>;
        close($agents_fh);
        # chomp returns the number of characters removed, not the string,
        # so it must be applied in place rather than assigned
        chomp(@agents);
    }
    else {
        warn "Can't open user agent list uagents: $!\n";
    }

    # rand(@agents) covers every index, including the last one
    $uagent = $agents[ rand(@agents) ] if @agents;
}

# Only pass an agent string when there is one, so that LWP falls back to
# its own default otherwise
my %ua_options = (
    cookie_jar => $cookie_jar,
    env_proxy  => 1,
    keep_alive => 1,
);
$ua_options{'agent'} = $uagent if $uagent;

our $ua = LWP::UserAgent->new(%ua_options);

# Remember the agent actually in use for the verbose report
$uagent = $ua->agent unless $uagent;

# Set proxy if config variable available
if ( $config{'proxy'} && $config{'proxy'} ne "off" ) {
    $ua->proxy('http',$config{'proxy'});
}


###########################################################################
#
# Output Directory
#
###########################################################################

# Fall back to the current directory when no output directory is set
$config{'dir'} = "./" unless $config{'dir'};


# Drop trailing whitespace, then make sure a non-empty directory name ends
# with a path separator: a backslash if the name contains one anywhere,
# a forward slash otherwise.
$config{'dir'} =~ s/\s*$//;
if ( length($config{'dir'}) && $config{'dir'} !~ m!/$! ) {
    if ( $config{'dir'} =~ m!\\! ) {
        # Collapse any run of trailing backslashes into a single one
        $config{'dir'} =~ s!\\+$!!;
        $config{'dir'} .= "\\";
    }
    else {
        $config{'dir'} .= "/";
    }
}


# Create the output directory (and any missing parents) if it isn't there
if ( !-e $config{'dir'} ) {
    # mkpath from File::Path makes directories recursively
    use File::Path qw( mkpath );

    print "Output directory does not exist\nMaking $config{'dir'}\n" if $config{'verbose'};
    mkpath($config{'dir'});
}


###########################################################################
#
# Print Information
#
###########################################################################

# In verbose mode, report the settings that are about to be used.
# The lines are collected first and then printed one by one in order.
if ( $config{'verbose'} ) {
    my @report = (
        "Verbose mode on\n",
        "User agent: $uagent\n",
    );

    # Optional settings are only mentioned when they are set
    push @report, "Using proxy: $config{'proxy'}\n" if $config{'proxy'};
    push @report, "Using cookie file: $config{'cookiefile'}\n" if $config{'cookiefile'};

    push @report,
        "Getting images from \"$sites{$config{'site'}}\" ($config{'site'})\n",
        "Saving files in $config{'dir'}\n";

    # Exactly one of the two format lines applies
    push @report, $config{'format'}
        ? "Detected $config{'format'} as desired format...\n"
        : "No preferred format detected; using module default...\n";

    push @report,
        "Got idno: $idno\n",
        "Starting with image $config{'start'}\n";

    push @report, "Limit set at $config{'high'}\n" if $config{'high'};

    print $_ for @report;
}


###########################################################################
#
# Select Module
#
###########################################################################

# URLs to download, and optional URLs to "touch" (HEAD request) before each
# download. Package variables, so the module run below can fill them in.
our @urls;
our @touchurls;


# Run the appropriate module. Each module should produce the list of URLs
# to be retrieved.
my $module = $mod_dir.$config{'site'}.".mod.pl";

# Stop with a clear message when the requested site has no module file,
# instead of silently downloading nothing
if ( !-r $module ) {
    die "Unknown site \"$config{'site'}\": cannot read module $module\n";
}

if ( $config{'verbose'} ) { print "Starting ".$config{'site'}." module ($module)\n"; }

# "do FILE" searches @INC unless the path starts with "/", "./" or "../",
# and the current directory is no longer in @INC since Perl 5.26, so make
# the relative path explicit.
my $module_path = $module =~ m{^\.{0,2}/} ? $module : "./$module";
do $module_path;

# Report a module that failed to compile or died while running
die "Module $module failed: $@" if $@;


###########################################################################
#
# Download Images
#
###########################################################################


# Loop through the URLs and download the files

# With -u, only print the list of URLs and stop
if ( $config{'nodl'} ) {
  print join("\n",@urls);
  exit();
}

# Number used for naming renumbered files; it advances for every URL, even
# when the download fails, so file numbers keep matching page positions
my $i = $config{'start'};
foreach my $url ( @urls ) {
  # If touch URLs were supplied, send a HEAD request to the next one before
  # each download; the response itself is not used
  if ( @touchurls ) {
    my $touchurl = shift(@touchurls);
    print_v("Touching $touchurl");
    $ua->head($touchurl);
  }

  print_v("Downloading file: $url");
  my $res = $ua->get($url);

  # Work out the local file name
  my $file;
  if ( $config{'renumber'} ) {
    # Sequential, zero-padded name plus the configured extension
    my $num = sprintf("%07d",$i);
    $file = "$config{'dir'}$num.$config{'ext'}";
    $i++;
  }
  elsif ( $url =~ m:([^/]+)$: ) {
    # Same name as on the server: everything after the last slash.
    # The match is applied to the loop variable itself and its result is
    # checked, so $1 is only used when it really comes from this URL.
    $file = $config{'dir'}.$1;
  }

  if ( !$res->is_success ) {
    print STDOUT "Could not download file (".$res->code."): $url\n";
  }
  elsif ( !defined $file ) {
    # e.g. a URL ending in a slash
    print STDOUT "Could not determine a file name for: $url\n";
  }
  elsif ( open(my $out, '>', $file) ) {
    if ( $config{'verbose'} ) { print "Writing $file\n"; }
    # Write the response body unchanged
    binmode($out,":raw");
    print {$out} $res->content;
    # Buffered write errors only show up when the handle is closed
    close($out) || print STDOUT "Could not finish writing $file: $!\n";
  }
  else {
    print STDOUT "Could not write $file: $!\n";
  }

  # Optional pause between downloads (-w)
  if ( $config{'wait'} ) {
    sleep($config{'wait'});
  }
}


###########################################################################
#
# subroutine: update_module_cache
#
# Update the modules cache for installing new (or updated) modules.
#
# Scans $mod_dir for files named *.mod.pl, reads the first line of each
# (expected form: "# shortname: Descriptive name") and rewrites $mod_cache
# as a small Perl file that assigns the entries to %sites. Modules whose
# first line contains "(defunct)" are only mentioned in a comment.
#
# Takes no arguments. Dies if the module directory cannot be read or the
# cache cannot be written; a module that cannot be read or whose first line
# does not have the expected form is skipped with a warning.
#
###########################################################################

sub update_module_cache ( ) {
  opendir(my $dir, $mod_dir) || die "Can't open module directory $mod_dir: $!";
  # Same naming scheme the main script uses to load a module
  # (<site>.mod.pl); sorted so the cache content is stable between runs
  my @modules = sort grep { /\.mod\.pl$/ } readdir($dir);
  closedir $dir;

  open(my $cache, '>', $mod_cache) || die "Can't open module cache $mod_cache: $!";

  print {$cache} "# This file contains an automatically generated cache of available modules -- DO NOT EDIT\n";
  print {$cache} "#\n";
  print {$cache} "# To update this file, run the following command:\n";
  print {$cache} "#\n";
  print {$cache} "#     perl snatch --update-cache\n\n";

  foreach my $module (@modules) {
    my $module_file = $mod_dir.$module;

    # Only the first line of the module is needed
    my $header;
    if ( open(my $mod, '<', $module_file) ) {
      $header = <$mod>;
      close($mod);
    }
    else {
      warn "Can't read module $module_file: $!\n";
      next;
    }

    # Empty file: nothing to describe
    next unless defined $header;
    # Strip the line ending (LF or CRLF) and any trailing blanks
    $header =~ s/\s+$//;

    # Capture into named variables and check the match, so values left
    # over from a previous module can never be written to the cache
    my ($shortform, $longform) = $header =~ /^#\s*(.+?)\s*:\s*(.+)$/;
    if ( !defined $shortform ) {
      warn "Skipping module $module_file: first line is not \"# short: Long name\"\n";
      next;
    }

    if ( $header =~ /\(defunct\)/  ) {
       print {$cache} "# Found defunct module: $shortform: $longform\n";
    }
    else {
      # The cache is executed as Perl code, so write both strings
      # single-quoted with backslashes and quotes escaped
      for my $text ($shortform, $longform) {
        $text =~ s/([\\'])/\\$1/g;
      }
      print {$cache} "\$sites{'$shortform'} = '$longform';\n";
    }
  }

  close($cache) || die "Can't write module cache $mod_cache: $!";
}


###########################################################################
#
# subroutine: read_module_cache
#
# Read the module cache and load modules
#
# Executes the cache file $mod_cache, whose statements fill %sites.
# Takes no arguments and returns nothing useful. An unreadable cache or a
# cache that fails to compile is reported with a warning.
#
###########################################################################

sub read_module_cache ( ) {
  if ( !-r $mod_cache ) {
    warn "Can't read module cache $mod_cache\n";
    return;
  }

  # "do FILE" searches @INC unless the path starts with "/", "./" or "../",
  # and the current directory is no longer in @INC since Perl 5.26, so make
  # the relative path explicit.
  my $path = $mod_cache =~ m{^\.{0,2}/} ? $mod_cache : "./$mod_cache";
  do $path;

  warn "Can't parse module cache $mod_cache: $@" if $@;
  return;
}


###########################################################################
#
# subroutine: print_modules
#
# Write the known modules to stdout as a two-column, tab-separated table:
# a heading, an underline, then one row per %sites entry (short name and
# description) ordered by short name.
#
###########################################################################

sub print_modules ( ) {
    # Heading and underline
    for my $heading ( "\nShort\tModule\n", "-----\t------\n" ) {
      print $heading;
    }

    # One row per module
    my @shortnames = sort keys %sites;
    for my $shortname (@shortnames) {
      print $shortname, "\t", $sites{$shortname}, "\n";
    }
}


###########################################################################
#
# subroutine: print_v
#
# Print the given text followed by a newline, but only when verbose mode
# is on and the script is not merely listing URLs ($config{'nodl'}).
#
###########################################################################

sub print_v ( $ ) {
  my ($text) = @_;
  my $chatty = $config{'verbose'} && !$config{'nodl'};

  print $text."\n" if $chatty;
}




##
##  The following subroutines are to help standardize module creation.
##


###########################################################################
#
# subroutine: module_set_limit
#
# A function to help modules set the limit of pages to retrieve
#
# Argument: the highest page the module could retrieve.
# Returns:  the last page to retrieve. When an offset was requested (-o),
#           this is start + offset - 1, but never more than the limit that
#           was passed in.
#
###########################################################################

sub module_set_limit ( $ ) {
  my $high = shift(@_);

  # Apply the requested offset. The comparison uses the resulting last
  # page rather than the bare offset, so a start page greater than 1
  # cannot push the limit past the value supplied by the module.
  if ( $config{'offset'} ) {
    my $last = $config{'start'} + $config{'offset'} - 1;
    $high = $last if $last < $high;
  }

  print_v("Getting pages $config{'start'} through $high");
  return $high;
}


###########################################################################
#
# subroutine: module_check_format
#
# A function to check if the desired format is valid for the module
#
# Arguments: the module's default format, followed by the list of formats
#            the module supports.
# Returns:   $config{'format'} if it is one of the supported formats,
#            otherwise the default. In verbose mode the choice is reported,
#            with a "not supported" note only when a format was actually
#            requested and had to be rejected.
#
###########################################################################

sub module_check_format ( $@ ) {
  my ($default,@formats) = @_;
  my $desired = $config{'format'};

  # No format requested ($config{'format'} starts out as an empty string):
  # quietly use the module default instead of reporting an unsupported,
  # empty format
  if ( !defined $desired || !length $desired ) {
    print_v("Using format: $default");
    return $default;
  }

  # Requested format is supported by the module
  if ( grep { $_ eq $desired } @formats ) {
    print_v("Using format: $desired");
    return $desired;
  }

  # Requested format is not supported: say so and fall back to the default
  print_v("Format not supported: $desired\nUsing format: $default");
  return $default;
}
