#!/usr/bin/perl -w

use LWP::Simple;
use SOAP::Lite;
use Digest::MD5 qw( md5_hex );
use YAML qw( LoadFile DumpFile );
use IO::File;
use strict;

# Command line: the Google API key is mandatory, the catalog path optional.
my $key = shift @ARGV;
die "usage: $0 <google_key> [<catalog.yaml>]\n" unless $key;

my $cat_file = shift @ARGV;
$cat_file = "catalog.yaml" unless $cat_file;

# File extension to harvest; used both in the "filetype:" query and as the
# suffix of the files written to disk.
my $type = "rtf";

# Search terms, consumed one at a time by the main loop.
my @words = ("the", "is", "of", "and");

# $catalog maps a truncated MD5 of a downloaded file to the URL it came from;
# $seen is the reverse lookup (URL -> MD5) used to skip known URLs.
my $catalog;
my $seen;

# Restore the catalog from a previous run when the file is readable,
# otherwise start with an empty one.
if (! -r $cat_file) {
    warn "Can't load catalog from $cat_file. Creating a new one...\n";
    $catalog = {};
}
else {
    warn "* Loading catalog from $cat_file ...\n";
    $catalog = LoadFile($cat_file);

    # Build the reverse index: URL -> MD5.
    for my $md5 (keys %$catalog) {
        $seen->{ $catalog->{$md5} } = $md5;
    }
}

# Loop state shared with the main loop below.
my $word;        # search term currently in use
my $done  = 0;   # becomes true once a stop signal has been received
my $start = 0; # $catalog->{START} || 0;

# A signal only raises the flag; the main loop checks it and winds down.
my $request_stop = sub { $done++ };
$SIG{$_} = $request_stop for qw( TERM HUP INT );

warn "* Initiating Google search service...\n";
my $wsdl   = "http://api.google.com/GoogleSearch.wsdl";
my $google = SOAP::Lite->service($wsdl);

# Main harvesting loop.
#
# For each search term, page through the Google results ten at a time,
# download every result whose URL ends in ".$type" and that has not been
# seen before, store it as "<first 16 hex digits of MD5>.$type" in the
# current directory, and record it in the catalog.  The catalog is written
# back to $cat_file after every page, so an interrupted run loses at most
# the page in progress.  The loop ends when a stop signal sets $done or
# when the list of search terms is exhausted.
until ($done) {
    if (not $word or $start >= 1000) { # Google doesn't return results > 1000
        $word  = shift @words;
        $start = 0;
        if ($word) {
            warn "* Now using search term '$word'\n";
        } else {
            warn "* Run out of search terms! Done.\n";
            exit;
        }
    }

    warn "* Querying Google for results $start + ...\n";

    # key, q, start, maxResults, filter, restrict, safeSearch,
    # lr, ie, oe
    my @params = ($key, "filetype:$type +$word", $start,
        10, 0, '', 0, '', '', '');
    my $result = $google->doGoogleSearch(@params);

    # A failed query or an empty page means there is nothing more to fetch
    # for this term.  Without this check the loop would keep issuing
    # queries for the same term until $start reached 1000.
    my $elements = ref($result)   ? $result->{resultElements} : undef;
    my @hits     = ref($elements) ? @$elements                : ();
    unless (@hits) {
        warn "* No results for '$word' at offset $start; skipping to next term.\n";
        undef $word;    # forces the next term on the following iteration
        next;
    }

    for my $item (@hits) {
        last if $done; # someone hit the stop button

        # Make sure the URL really ends in the wanted extension.  \Q..\E
        # keeps $type from being interpreted as a pattern.
        my $url = $item->{URL};
        next unless defined $url and $url =~ /\.\Q$type\E\z/;

        if ($seen->{$url}) { # already have it.
            warn "= $url\n";
            next;
        }

        warn "+ $url\n";
        my $data = get( $url );
        unless (defined $data and length $data) {
            warn "Can't load $url?\n";
            next;
        }

        # Name the file after its content so identical documents reached
        # through different URLs are stored only once.
        my $md5 = substr(md5_hex( $data ), 0, 16); # leave somewhat manageable filenames
        my $file = "$md5.$type";
        if (-r $file) { # Already have it.
            warn "| $url = $file\n";
            $seen->{$url} = $md5;   # don't download this URL again in this run
            next;
        }

        # Explicit mode argument instead of a ">$file" string.
        my $fh = IO::File->new($file, "w");
        unless ($fh) {
            warn "Can't write to $file??\n";
            next;
        }
        $fh->binmode;   # store the downloaded data untranslated

        warn "  -> $file\n";
        my $written = $fh->print($data);
        my $closed  = $fh->close;
        unless ($written and $closed) {
            # Remove the partial file; otherwise the -r check above would
            # treat it as already downloaded on a later run.
            warn "Error writing $file: $!\n";
            unlink $file;
            next;
        }

        $catalog->{$md5} = $url;
        $seen->{$url} = $md5;
    }

    warn "* Writing catalog...\n";
    DumpFile( $cat_file, $catalog );

    $start += 10 unless $done;
}

