#!/usr/bin/perl -w

#
#  TODO:
#
#     What's still unlikeable about this code is the awkward copying of state from 
#     the latest attempt object up into the url object, especially considering that a url attempt is
#     a subclass of a url.
#

#
#  Examples:
#
#        ./www-map.pl -l '.*.cs.fsu.edu' 'http://www.cs.fsu.edu/~langley/CNT5605-2011-Fall/'
#        ./www-map.pl -d 2 -r '.*.cs.fsu.edu' http://www.cs.fsu.edu
#        ./www-map.pl -l '.*.cs.fsu.edu' http://www.cs.fsu.edu > /tmp/abc
#        ./www-map.pl -r '/^http://.*\.cs\.fsu\.edu/.*$' 'http://www.cs.fsu.edu/~langley/'
#

use strict;
use warnings;

#
#  Turn this into nicer code
#
#
#    wwwmap            -->   the main bit
#    wwwmap::url       -->   do the URL objects
#    wwwmap::attempt   -->   attempts at retrieving a given URL
#


use Data::Dumper;
use Time::HiRes qw( gettimeofday tv_interval );


package wwwmap;
use Data::Dumper;
use Getopt::Std;
use DBI;

#
# defaults
#
my $version = "0.1";
my $verbose = 0;                   # verbosity
my $max_attempts = 3;              # safety measure, don't want to try too many times on a failing link
my $max_depth = 1;                 # safety measure, don't want to go too deep any one time
my $retrieval_restriction = '.*';  # safety measure, doesn't even leave the regular expression for any reason
my $retrieval_limit = '.*';        # safety measure, only leaves the regular expression to verify one level
my $max_urls = 1;                  # safety measure
my $verify_certificate = 1;        # check the chain of validity for presented certificates
my $believe_dubious_text_html = 1; # don't report files with unusual names being represented as content: text/html files instead of applications
my $dbfile = "/tmp/newdb.sqlite";  #

#
#  create an root url object
#

my $url0 = "http://www.google.com";


#
# now check options
#

my %opts;
getopts('f:scl:m:vn:d:r:', \%opts);

$verbose = 1 if defined($opts{v});                         # -v: Verbose mode
$max_attempts = $opts{n} if defined($opts{n});             # -n: Maximum number of attempts on one URL
$max_depth = $opts{d} if defined($opts{d});                # -d: Maximum tree depth
$max_urls = $opts{m} if defined($opts{m});                 # -m: Maximum number of urls to check (safety measure!)
$retrieval_restriction = $opts{r} if defined($opts{r});    # -r: A regular expression that restricts all retrievals to hostnames that meet that URL
$retrieval_limit = $opts{l} if defined($opts{l});          # -l: A regular expression that unlike -r also checks dependencies to verify 
                                                           #     outside links work (use of -r and -l are mutually exclusive since one says 
                                                           #     do not check outside links and the other says to do so)
$verify_certificate = 0 if defined($opts{c});              # -c: accept all certificates
$believe_dubious_text_html = 0 if defined($opts{s});       # -s: report files with unusual names being represented as 
                                                           #     content: text/html files instead of applications
$dbfile = $opts{f} if defined($opts{f});                   # -f: alternative name for dbfile (defaults to /tmp/newdb.sqlite)


if(defined($ARGV[0]))
{
    $url0 = $ARGV[0];
}
else
{
    print "Defaulting to '$url0' ... (probably not what you want!)\n";
}


#
# give the user some idea of what is going on!
#
print "Verbose mode is on\n" if $verbose;
print "Root url is '$url0'\n";

#
# structures that we will use
#
my %urls;
my %bad_urls;

$urls{ "$url0" } = wwwmap::url->new($url0,0);


#
# Okay, let's fire up the queue. It's now a queue of pending URLs; the attempt
# record, for good or bad, is now kept in the URL object's state.
#
my $queue = wwwmap::queue->new(
    {
     retrieval_restriction => defined($retrieval_restriction) ? "$retrieval_restriction" : undef,
     retrieval_limit => defined($retrieval_limit) ? "$retrieval_limit" : undef,
     max_urls => $max_urls
    }
);
$queue->add($url0);

#
#  Now for the database
#
my $database_handle;
db_init();


#
#  Now for the big loop
#
my $url;
while($url = $queue->remove())
{
    #
    #
    #
    print "Here's what is in the queue:\n";
    $queue->print();

    #
    # verify that we have not been able to visit this url
    #
    print "Attempt url '$url':\n";
    my $url_obj = $urls{ "$url" };
    if(!defined($url_obj))
    {
	print "I cannot find anything in the map for url '$url'!\n";
	next;
    }


    #
    # Rule #1 for processing a url: if it's been too many times around, then don't bother to try again
    #
    if($url_obj->{succeeded})
    {
	print "No, I have already succeeded with this one, let's get another one from the queue!\n";
	next;
    }
    if($url_obj->{failures} >= $max_attempts)
    {
	print "No, I have already failed $url_obj->{failures} time(s), that's it!\n";
	$bad_urls{ "$url" } = 1;
	next;
    }


    #****************************************************************************************
    # Other exclusion rules are located in wwwmap::queue since we don't want to waste space *
    # queuing URLs that clearly should not be enqueued                                      *
    #****************************************************************************************


    #
    # Okay, create a new attempt. 
    #
    my $attempt = wwwmap::attempt->new("$url",$url_obj->{head_only});
    push ( @{ $url_obj->{attempts} }, $attempt );

    if($attempt->{succeeded})
    {
	# okay, we had a success; mark it as such and then extract any
	# new urls that we might can use
	$url_obj->{succeeded} = 1;

	# gad. it is stupid to copy this information ftro mt oattemfrofrfrofrfrofrfrom 
	$url_obj->{elapsed} = $attempt->{elapsed};
	$url_obj->{attempt_count} = $url_obj->{failures} + 1;        # stupid, but, well, what else could I do?
	$url_obj->{redirect_url} = $attempt->{redirect_url};
	$url_obj->{response_code} = $attempt->{response_code};
	$url_obj->{website} = $attempt->{website};

	#
	#  see the "attempt" module's "visit()" routine to see where these are coming from
	#
	$url_obj->{last_modified} = $attempt->{last_modified};
	$url_obj->{etag} = $attempt->{etag};
	$url_obj->{content_type} = $attempt->{content_type};

	$url_obj->parse_response($attempt->{response});
	$url_obj->add_links($queue);
	$url_obj->db_insert;

	#
	# Okay, our object is no longer required!
	#
	$urls{"$url"} = "***FROZEN***";
    }
    else
    {
	# chalk up a failure and re-enqueue
	$url_obj->{failures}++;
	$queue->add("$url");
    }
}

print "State of URLs:\n";
foreach(sort(keys %urls))
{
    print "$_ :: $urls{$_}\n";
}

print "State of bad URLs:\n";
foreach(sort(keys %bad_urls))
{
    print "$_ :: $bad_urls{$_}\n";
}


#
#  Utilities that I am not sure where to put...
#
sub extract_website
{
    my $current = shift;
    my $website;

    if( $current->{url} =~ m{^https?://(.*?)/}i )
    {
	$website = $1;
	print "Setting website = $website\n";
    }
    else
    {
	print "Ooops, try #1 --> didn't get website from '$current->{url}' !\n";
    }

    if( !defined($website) )
    {
	if( $current->{url} =~ m{^https?://([^/?]*)}i )
	{
	    $website = $1;
	    print "Setting website = $website\n";
	}
	else
	{
	    print "Ooops, try #2 --> didn't get website from '$current->{url}' !\n";
	}
    }
    return $website;
}

sub extract_ssl
{
    my $current = shift;
    my $ssl = 0;
    if( $current->{url} =~ m{^https://(.*?)}i )
    {
	$ssl = 1;
    }
    return $ssl;
}

sub parse_response
{
    my $self = shift;
    my $response = shift;

#    print "Parsing response " . Dumper($response) . "\n";

    # first, determine if we this is something that we can use
    if($response->code == 200)
    {
	# we have an ordinary return, see what we can do with it
	print "That's an ordinary 200, let's work on it!\n";
	$self->process200($response);
    }
    elsif( ($response->code >= 200) && ($response->code <= 299) )
    {
	# well, we got some kinda weird response code, complain about it
	print "ANGRY ERROR! LWP::USERAGENT SHOULD NOT RECEIVE " . $response->code . " (url = '$self->{url}')\n";
    }
    elsif( ($response->code == 301) || ($response->code == 302) || ($response->code == 303) 
	|| ($response->code == 305) || ($response->code == 307) )
    {
	# we have a redirect, use the "Location" header to create a new child url
	$self->{links} = [ $response->header("location") ];
	print "Links set to " . Dumper($self->{links}) ." \n";
    }
    elsif( $response->code == 304 )
    {
	print "ANGRY ERROR! LWP::USERAGENT SHOULD NOT RECEIVE 304 'I HAVE NOT CHANGED' RESPONSES! (url = '$self->{url}')\n";
    }
    elsif( ($response->code >= 400) && ($response->code <= 499) )
    {
	print "Hey, we got a 'bad client request' response from the server for url '$self->{url}!\n";
    }
    elsif( ($response->code >= 500) && ($response->code <= 500) )
    {
	print "I received a 'server error' response for url '$self->{url}'\n";
    }
    else
    {
	print "I received an unknown response '" . $response->code . "' for url '$self->{url}'\n";
    }
}

#
# The "main" routine: when we get back a 200, we have work to do see what's going on here
#
sub process200
{
    my $self = shift;
    my $response = shift;

    if($response->header("content-type") =~ m{text/html}i)
    {
	# okay, this appears to be an ordinary body, but let's do a sanity check
	if(!$believe_dubious_text_html && ($self->{url} =~ /(\.pdf)|(\.ps)$/i) )
	{
	    $self->{dubious} = 1;
	}
	elsif($response->decoded_content)
	{
	    print "Okay, we have some content!\n";
	    
	    #
	    # title
	    #
	    my $ph = HTML::HeadParser->new;
	    $ph->parse($response->decoded_content);
	    $self->{title} = $ph->header('Title');
	    print "Title is '$self->{title}'\n";


#	    process_method($self,$response);
	    
	    #
	    # links --> using LinkExtor
	    #
#	    use HTML::LinkExtor;
#	    my $p = HTML::LinkExtor->new(undef,$self->{url});
#	    $p->parse($response->decoded_content);

	    # foreach($p->links)
	    # {
	    # 	print "*** HERE : " . Dumper($_) . "\n\n";
	    # 	my $x = @{$_}[2];
	    # 	if( (@{$_}[0] eq "a") && (@{$_}[1] eq "href") )
	    # 	{
	    # 	    push(@{$self->{links}},$x->as_string);
	    # 	}
	    # 	elsif( (@{$_}[0] eq "img") && (@{$_}[1] eq "src") )
	    # 	{
	    # 	    push(@{$self->{images}},$x->as_string);
	    # 	}
	    # 	elsif( (@{$_}[0] eq "iframe") && (@{$_}[1] eq "src") )
	    # 	{
	    # 	    push(@{$self->{iframes}},$x->as_string);
	    # 	}
	    # 	elsif( (@{$_}[0] eq "form") && (@{$_}[1] eq "action") )
	    # 	{
	    # 	    push(@{$self->{actions}},$x->as_string);
	    # 	}
	    # 	elsif( (@{$_}[0] eq "script") && (@{$_}[1] eq "src") )
	    # 	{
	    # 	    push(@{$self->{scripts}},$x->as_string);
	    # 	}
	    # 	elsif( (@{$_}[0] eq "link") && (@{$_}[1] eq "href") )
	    # 	{
	    # 	    push(@{$self->{links}},$x->as_string);
	    # 	}
	    # 	elsif( (@{$_}[0] eq "area") && (@{$_}[1] eq "href") )
	    # 	{
	    # 	    push(@{$self->{links}},$x->as_string);
	    # 	}
	    # 	else
	    # 	{
	    # 	    print "What's a @{$_}[0] / @{$_}[1] ?? \n";
	    # 	}
	    #}

	    #
	    # links --> using LinkExtractor
	    #
	    use HTML::LinkExtractor;
	    my $p = HTML::LinkExtractor->new(undef,$self->{url});
	    $p->parse(\$response->decoded_content);
	    print Dumper($p->links);
	    foreach(@{$p->links})
	    {
		if(!defined($_))
		{
		    next;
		}
		print "What ... " . Dumper($_);
		if(defined(%{$_}->{href}))
		{
		    print "OKay..\n";
		    my $x = %{$_}->{href};
		    push(@{$self->{links}},$x->as_string);
		}
	    }
	}
    }
}


#
#  Create new SQLITE database
#
sub db_init
{
    unlink($dbfile);
    $database_handle = DBI->connect("dbi:SQLite:dbname=${dbfile}","","");

    $database_handle->do(
	"CREATE TABLE url (
   id             integer primary key,
   url            text,
   succeeded      boolean,
   content_type   text,
   title          text,
   last_modified  date,
   etag           text,
   elapsed        real,
   attempt_count  int,
   response_code  int,
   redirect_url   text,
   website        text,
   depth          int
)");
    
    $database_handle->do(
	"CREATE TABLE link (
   id           integer primary key,
   parent       text,
   link         text,
   description  text
)"
	);
    
    $database_handle->do(
	"CREATE TABLE form (
   id           integer primary key,
   parent       text,
   action       text,
   method       text,
   name         text
)"
	);

    $database_handle->do(
	"CREATE TABLE bad_link (
   id           integer primary key,
   parent       text,
   link         text
)"
	);
    
    $database_handle->do(
	"CREATE TABLE config (
   id           integer primary key,
   key          text,
   value        text
)"
	);
    
    $database_handle->do("INSERT INTO config (key,value) VALUES ('version','$version')");
    $database_handle->do("INSERT INTO config (key,value) VALUES ('max_attempts','$max_attempts')");
    $database_handle->do("INSERT INTO config (key,value) VALUES ('max_depth','$max_depth')");
    $database_handle->do("INSERT INTO config (key,value) VALUES ('retrieval_restriction','$retrieval_restriction')");
    $database_handle->do("INSERT INTO config (key,value) VALUES ('retrieval_limit','$retrieval_limit')");
    $database_handle->do("INSERT INTO config (key,value) VALUES ('max_urls','$max_urls')");
    $database_handle->do("INSERT INTO config (key,value) VALUES ('initial_url','$url0')");
    
}


sub db_insert
{
    my $current = shift;
    
    print "Adding '$current->{url}' to database...\n";
#    print Dumper($current);

    print "Doing basic 'url' information first...\n";
    
    my $url = $current->{url};
    my $keys = "(url";
    my $values = "('$current->{url}'";
    if(defined($current->{succeeded}))
    {
	$keys .= ",succeeded";
	$values .= ",$current->{succeeded}";
    }
    if(defined($current->{elapsed}))
    {
	$keys .= ",elapsed";
	$values .= ",$current->{elapsed}";
    }
    if(defined($current->{attempt_count}))
    {
	$keys .= ",attempt_count";
	$values .= ",$current->{attempt_count}";
    }
    if(defined($current->{response_code}))
    {
	$keys .= ",response_code";
	$values .= ",$current->{response_code}";
    }
    if(defined($current->{content_type}))
    {
        $keys .= ",content_type";
	$values .= ",'$current->{content_type}'";
    }
    if(defined($current->{redirect_url}))
    {
	$keys .= ",redirect_url";
	$values .= ",'$current->{redirect_url}'";
    }
    if(defined($current->{website}))
    {
	$keys .= ",website";
	$values .= ",'$current->{website}'";
    }
    if(defined($current->{title}))
    {
	$keys .= ",title";
	$values .= ",'$current->{title}'";
    }
    if(defined($current->{last_modified}))
    {
	$keys .= ",last_modified";
	$values .= ",'$current->{last_modified}'";
    }
    if(defined($current->{etag}))
    {
	$keys .= ",etag";
	$values .= ",'$current->{etag}'";
    }
    if(defined($current->{depth}))
    {
	$keys .= ",depth";
	$values .= ",$current->{depth}";
    }

#    print Dumper($current);

    my $stmt = "INSERT INTO url ${keys}) VALUES ${values})";
    print "stmt = $stmt\n";

    $database_handle->do("$stmt");


    if(defined($current->{links}))
    {
#	print Dumper($current);
	print "Doing link information second...\n";
	foreach(@{$current->{links}})
	{
	    print "Found link '$_'\n";
	    my $stmt = "INSERT INTO link (parent,link) VALUES ('$current->{url}','$_')";
	    print "stmt = $stmt\n";
	    $database_handle->do("$stmt");
	}
    }

    if(defined($current->{actions}))
    {
#	print Dumper($current);
	print "Doing form/action information third...\n";
	foreach(@{$current->{actions}})
	{
	    print "Found action '$_'\n";
	    my $stmt = "INSERT INTO form (parent,action) VALUES ('$current->{url}','$_')";
	    print "stmt = $stmt\n";
	    $database_handle->do("$stmt");
	}
    }

}


#
#   wwwmap::url
#
#     handle urls
#

package wwwmap::url;
use strict;
use warnings;
use base 'wwwmap';

sub new
{
    my $class = shift;        # magic
    my $url = shift;          # url
    my $depth = shift;        # how deep was this one


    # 
    # initialize the url
    #

    my $self = 
    {
	url => "$url",
	depth => $depth,

	attempt_count => 0,
	elapsed_time => 0.0,
	response_code => 0,
	content_type => "",
	redirect_url => undef,
	website => "",
	title => "",
	last_modified => "",
	etag => "",
	succeeded => 0,
	failures => 0,
	attempts => [ ],

	head_only => 0,

	links => [ ],
	images => [ ],
	iframes => [ ],
	actions => [ ],
	scripts => [ ],
    };
    if($depth > $max_depth)
    {
	$self->{head_only} = 1;
    }

    bless($self,$class);

    #
    # do a bit of caching of data
    #

    $self->{website} = $self->SUPER::extract_website($self->{url});
    $self->{ssl} = $self->SUPER::extract_ssl($self->{url});

    #
    # now finish up
    #

    return($self);
}


sub set
{
    my $self = shift;
    my $field = shift;
    my $value = shift;

    $self->{field} = $value;
}

sub get
{
    my $self = shift;
    my $field = shift;

    return($self->{field});
}

sub add_links
{
    my $self = shift;
    my $queue = shift;

    print "Looking at links... \n";
    foreach(@{$self->{links}})
    {
	print "Should I add $_ ? ...\n";
	if(defined($urls{"$_"}))
	{
	    print "    ... no, skipping old url '$_'\n";
	}
	else
	{
	    $urls{"$_"} = wwwmap::url->new("$_",$self->{depth}+1);
	    $queue->add("$_");
	}
    }
    print "...finished links\n";
}

#
#   wwwmap::attempt
#
#     handle attempts
#
package wwwmap::attempt;
use strict;
use warnings;
use LWP::UserAgent;
use Data::Dumper;
use Time::HiRes qw( gettimeofday tv_interval );

use base 'wwwmap';

sub new
{
    #
    #  Arguments: class, url, [head_only]
    #
    #     class = magic
    #     url = 
    #     [head_only] = [defaults to 0]
    #

    my ($class,$url,$head_only) = (undef, undef, 0);
    ($class,$url,$head_only) = @_;

    my $time = time();
    my $self;

    # 
    # initialize the url
    #

    $self = 
    {
	url => "$url",
	time => $time,
	succeeded => 0,
	head_only => $head_only
    };
    bless($self,$class);
    
    #
    # do a bit of caching of data
    #

    $self->{website} = $self->SUPER::extract_website($self->{url});
    $self->{ssl} = $self->SUPER::extract_ssl($self->{url});

    #
    # check to see if we make an immediate attempt to retrieve this URL
    #

    $self->visit;

    return($self);
}

sub visit
{
    my $self = shift;
    $self->{pending} = 0;
    
    my $useragent = LWP::UserAgent->new(
	(
	 agent => "wwwmap/0.1",
	 max_redirect => 0,
	 requests_redirectable => [ ],
	 timeout => 10
	)
	);

    my $response;
    my $t0 = [gettimeofday];
    if($self->{head_only})
    {
	$response = $useragent->head($self->{url});
    }
    else
    {
	$response = $useragent->get($self->{url});
    }
#    print Dumper($response);
    $self->{elapsed} = tv_interval($t0, [gettimeofday]);

    $self->{response_code} = $response->code;
    $self->{response} = $response;
    if($response->is_success)
    {
	$self->{succeeded} = 1;

	$self->{last_modified} = $response->{_headers}->{"last-modified"};
	$self->{etag} = $response->{_headers}->{etag};
	$self->{content_type} = $response->{_headers}->{"content-type"};

	print "***** LAST MODIFIED INFO *****";
#	print Dumper($response);
    }
    elsif( ($response->code == 301) || ($response->code == 302) || ($response->code == 303) 
	|| ($response->code == 305) || ($response->code == 307) )
    {
	#
	#  Unlike LWP, we count a redirect as a success
	#
	$self->{succeeded} = 1;
	$self->{redirect} = 1;
	$self->{redirect_url} = $response->{_headers}->{location};
	$self->{etag} = $response->{_headers}->{etag};
    }
    else
    {
	print "Retrieval error on $self->{url}\n";
    }

#    print "Okay, let's look at this 'attempt' object after a visit: " . Dumper($self) . "\n";
    return($self->{succeeded});
}


package wwwmap::queue;
use strict;
use Data::Dumper;

sub new
{
    my $class = shift;
    my $self = shift;

#    print Dumper($self);

    $self->{count} = 0;         # this is the current count
    $self->{enqueued} = 0;      # this is the total enqueued

    bless($self,$class);
    return($self);
}

sub add
{
    my $self = shift;
    my $item = shift;

    my $item_object = $urls{"$item"};

    #
    #  1. Check to see if the queue limit has been reached (this is a total of the lifetime enqueued, not the current
    #     number of elements in the queue .)
    #
    if($self->{enqueued} >= $self->{max_urls})
    {
	print "Queue has reached max items ($self->{max_urls}), not adding url '$item'\n";
	return;
    }

    #
    #  2. Check for an absolute restriction.
    #
    #
    if(defined($self->{retrieval_restriction}))
    {
	if($item_object->{website} =~ /$self->{retrieval_restriction}/)
	{
	    print "Restriction Check: URL '$item_object->{url}' with website = $item_object->{website} passed restriction '$self->{retrieval_restriction}'!\n";
	    $item_object->set("restricted",0);
	}
	else
	{
	    print "Restriction Check: URL $item_object->{url} with website = $item_object->{website} FAILED restriction '$self->{retrieval_restriction}'!\n";
	    $item_object->set("restricted",1);
	    return;
	}
    }

    #
    #   3. See if we are limiting existence checks.
    #
    if(defined($self->{retrieval_limit}))
    {
	if($item_object->{website} =~ /$self->{retrieval_limit}/)
	{
	    print "Limitation Check: URL $item_object->{url} with website = $item_object->{website} passed limit '$self->{retrieval_limit}'!\n";
	    $item_object->set("limited",0);
	}
	else
	{
	    print "Limitation Check: URL $item_object->{url} with website = $item_object->{website} FAILED limit '$self->{retrieval_limit}'!\n";
	    $item_object->set("limited",1);
	    $item_object->set("head_only",1);
	}
    }

    $self->{count}++;
    $self->{enqueued}++;

    if(defined($self->{items}))
    {
	push(@{$self->{items}},$item);
    }
    else
    {
	$self->{items} = [ $item ];
    }

}

sub remove
{
    my $self = shift;
    if($self->{count})
    {
	$self->{count}--;
	my $item = shift(@{$self->{items}});
	return($item);
    }
    else
    {
	return(undef);
    }
}

sub print
{
    my $self = shift;
    print "There are $self->{count} item(s) in the queue.\n";
    print "--------------------------------------------\n";
#    print Dumper(@{$self->{items}});
}


=comment


#
#  now work through the "pending" list
#
while(@pending && ($count_urls <= $max_urls))
{
    print "Starting!\n";
    my $current = pop(@pending);

#    print Dumper($current);

    print "visiting '$current->{url}'\n";
    $current->{attempts}++;

    $curl->setopt(CURLOPT_URL, $current->{url});
    
    my $response_body;
    $curl->setopt(CURLOPT_WRITEDATA,\$response_body);

    # first time through, just get the HEAD
    $curl->setopt(CURLOPT_NOBODY,1);
    $curl->setopt(CURLOPT_SSL_VERIFYPEER,$verify_certificate);

    my $t0 = [gettimeofday];
    my $ret = $curl->perform;
    my $elapsed = tv_interval($t0, [gettimeofday]);
    
    if ($ret == 0) 
    {
	print("HEAD Transfer Complete!!\n");

	$current->{visited} = 1;
	my $response_code = $curl->getinfo(CURLINFO_RESPONSE_CODE);

	$current->{content_type} = $curl->getinfo(CURLINFO_CONTENT_TYPE);
	$current->{content} = $response_body;
	$current->{elapsed} = $elapsed;
	$current->{response_code} = $response_code;

	if($response_body =~ m/^last-modified: (.*)\r$/mi)
	{
	    $current->{last_modified} = $1;
	    print "Found last_modified = '$current->{last_modified}'\n";
	}

	if($response_code == 200)
	{
	    # let's verify that this is actually html ?
#	    if( $response_body =~ m{^Content-Type: text/html}mi )
#	    {
#		$current->{content_type} = "text/html";
	    if($current->{content_type} eq "text/html")
	    {

		if( !$believe_dubious_text_html && ($current->{url} =~ /(\.pdf)|(\.ps)$/i) )
		{
		    $current->{dubious} = 1;
		}
		# now make sure that we aren't under a limitation before we process this page's links --> not an issue
		# with "retrieval_restriction" since we don't leave the website restriction at all
		elsif(defined($retrieval_limit))
		{
		    if($current->{website} =~ /$retrieval_limit/)
		    {
			# okay, we need to actual get the data
			print "$current->{website} passed a_href parsing limit '$retrieval_limit'!\n";

			$curl->setopt(CURLOPT_NOBODY,0);
			
			my $t0 = [gettimeofday];
			my $ret = $curl->perform;
			my $elapsed = tv_interval($t0, [gettimeofday]);
			
			my $response_code = $curl->getinfo(CURLINFO_HTTP_CODE);
			$current->{content} = $response_body;
			$current->{elapsed} = $elapsed;
			$current->{response_code} = $response_code;

			if ($ret == 0) 
			{
			    print("BODY Transfer Complete!!\n");

			    $current->{limited} = 0;
			    process_method($current,$response_body);
			    process_a_href($current,$response_body);
			}
			else
			{
			    print("BODY Transfer Failed!!\n");
			}
		    }
		    else
		    {
			$current->{limited} = 1;
			print "$current->{website} FAILED a_href parsing limit '$retrieval_limit'!\n";
		    }
		}
		else
		{
		    $current->{limited} = 0;
		    process_method($current,$response_body);
		    process_a_href($current,$response_body);
		}

		process_title($current,$response_body);
	    }
	    else
	    {
		if ($response_body =~ m{^Content-Type: (.*)\r$}mi)
		{
		    $current->{content_type} = "$1";
		    print "Found content_type = $1\n";
		}
		else
		{
		    $current->{content_type} = "unknown";
		    print "Did not find content-type in '$response_body'\n";
		}
	    }
	}
	if($response_code == 301)
	{
	    print "Received a redirect!\n";
	    $current->{content_type} = "redirect";
	    if($response_body =~ m/^Location: (.*)\r$/im)  # remember, headers have an extra \r at the end !
	    {
		my $new_url = $1;
		print "The new URL is '$new_url'\n";
		$current->{redirect_url} = "$new_url";
		print "Response body was '$response_body'\n";
		deal_with_url($new_url," [ REDIRECT ] ",$current,extract_website($current));
#		print Dumper($current);
	    }
	    else
	    {
		print "I could not parse out this redirect!\n";
		print "Response body was '$response_body'\n";
		$current->{redirect_url} = "[NOT KNOWN]";
	    }
	}
    } 
    else 
    {
	print STDERR "Error: $ret ".$curl->strerror($ret)." ".$curl->errbuf."\n";
    }
}

my %broken_link_cache;
print "\n";
print "\n";
print "Building cache of non-responsive URLs\n";
print "-------------------------------------\n";
foreach(keys %urls)
{
    my $current = $urls{$_};
    if(!$current->{visited} && !$current->{restricted} && !($current->{depth} > $max_depth))
    {
	print "\tFailed on all $current->{attempts} attempt(s) on $current->{url}, adding to cache\n ";
	$broken_link_cache{"$current->{url}"} = $current->{attempts};
    }
}

print "\n";
print "\n";
print "Final Report\n";
print "\n";
foreach(keys %urls)
{
    my $current = $urls{$_};
    print "\tFor URL '$current->{url}' at depth $current->{depth}: ";
    if(defined($current->{dubious}))
    {
	print "\t DUBIOUS $current->{url} --> reports to be text/html, but name doesn't make it likely!\n";
    }
    if($current->{visited})
    {
	print "Succeeded on first attempt (took $current->{elapsed} seconds)\n" if $current->{attempts} == 1;
	print "Succeeded on #$current->{attempts} attempt (took $current->{elapsed} seconds)\n" if $current->{attempts} > 1;
	print "\t\tResponse code was '$current->{response_code}'\n";
	print "\t\tContent type was '$current->{content_type}'\n";
	print "\t\tRedirected to URL '$current->{redirect_url}'\n" if defined($current->{redirect_url});
	print "\t\tWebsite is '$current->{website}'\n";
	print "\t\tTitle of the content is '$current->{title}'\n" if defined($current->{title});
	print "\t\tThis page links to the following pages:\n";
	print "\t\t---------------------------------------\n";
	if(defined($current->{links}))
	{
	    my @links = @{$current->{links}};
#	    print "****" . Dumper(@links);
	    foreach(@links)
	    {
		my $broken_flag = "";
		# check the "broken link" cache to see if this link is broken
		if(defined($broken_link_cache{"$_->[0]"}))
		{
		    $broken_flag = "*** appears broken: does not answer on $broken_link_cache{$_->[0]} attempt(s)";
		}
#		print Dumper("$_ \n");
		print "\t\t\t '$_->[0]'  ( '$_->[1]' ) $broken_flag\n";
	    }
	}
	else
	{
	    if($current->{limited})
	    {
		print "\t\t\t [ Retrieval was allowed by limiting regular expression '$retrieval_limit', but none of the child links were followed ]\n";
	    }
	    else
	    {
		print "\t\t\t [ NO LINKS FOUND ] \n";
	    }
	}
	if(defined($current->{forms}))
	{
	    print "\t\tThis page has these forms:\n";
	    print "\t\t--------------------------\n";
	    my @forms = @{$current->{forms}};
	    foreach(@forms)
	    {
		print "\t\t\t  '$_->[0]'  '$_->[1]' ( $_->[2] )\n";
	    }
	}
    }
    else
    {
	if($current->{restricted})
	{
	    print "Retrieval not allowed by restriction regular expression '$retrieval_restriction'\n";
	    next;
	}
	if($current->{depth} > $max_depth)
	{
	    print "LEAF, no attempts made\n";
	}
	else
	{
	    print "Failed on all $current->{attempts} attempt(s)\n" if ( !$current->{visited} );
	}
    }
}

# print Dumper(%urls);

#
#  Write it all to a new SQLITE database
#
unlink("/tmp/newdb.sqlite");
my $database_handle = DBI->connect("dbi:SQLite:dbname=/tmp/newdb.sqlite","","");

$database_handle->do(
"CREATE TABLE url (
   id             integer primary key,
   url            text,
   visited        boolean,
   content_type   text,
   title          text,
   last_modified  date,
   elapsed        real,
   attempts       int,
   response_code  int,
   redirect_url   text,
   website        text,
   depth          int
)");

$database_handle->do(
"CREATE TABLE a_href (
   id           integer primary key,
   parent       text,
   href         text,
   description  text
)"
    );

$database_handle->do(
"CREATE TABLE form (
   id           integer primary key,
   parent       text,
   action       text,
   method       text,
   name         text
)"
    );

$database_handle->do(
"CREATE TABLE config (
   id           integer primary key,
   key          text,
   value        text
)"
    );

$database_handle->do("INSERT INTO config (key,value) VALUES ('version','$version')");
$database_handle->do("INSERT INTO config (key,value) VALUES ('max_attempts','$max_attempts')");
$database_handle->do("INSERT INTO config (key,value) VALUES ('max_depth','$max_depth')");
$database_handle->do("INSERT INTO config (key,value) VALUES ('retrieval_restriction','$retrieval_restriction')");
$database_handle->do("INSERT INTO config (key,value) VALUES ('retrieval_limit','$retrieval_limit')");
$database_handle->do("INSERT INTO config (key,value) VALUES ('max_urls','$max_urls')");
$database_handle->do("INSERT INTO config (key,value) VALUES ('count_urls','$count_urls')");
$database_handle->do("INSERT INTO config (key,value) VALUES ('initial_url','$url0{url}')");


foreach(keys %urls)
{
    my $current = $urls{$_};

    print "Adding '$current->{url}' to database...\n";
    print Dumper($current);

    my $url = $current->{url};
    my $keys = "(url";
    my $values = "('$current->{url}'";
    if(defined($current->{visited}))
    {
	$keys .= ",visited";
	$values .= ",$current->{visited}";
    }
    if(defined($current->{elapsed}))
    {
	$keys .= ",elapsed";
	$values .= ",$current->{elapsed}";
    }
    if(defined($current->{attempts}))
    {
	$keys .= ",attempts";
	$values .= ",$current->{attempts}";
    }
    if(defined($current->{response_code}))
    {
	$keys .= ",response_code";
	$values .= ",$current->{response_code}";
    }
    if(defined($current->{content_type}))
    {
        $keys .= ",content_type";
	$values .= ",'$current->{content_type}'";
    }
    if(defined($current->{redirect_url}))
    {
	$keys .= ",redirect_url";
	$values .= ",'$current->{redirect_url}'";
    }
    if(defined($current->{website}))
    {
	$keys .= ",website";
	$values .= ",'$current->{website}'";
    }
    if(defined($current->{title}))
    {
	$keys .= ",title";
	$values .= ",'$current->{title}'";
    }
    if(defined($current->{last_modified}))
    {
	$keys .= ",last_modified";
	$values .= ",'$current->{last_modified}'";
    }
    if(defined($current->{depth}))
    {
	$keys .= ",depth";
	$values .= ",$current->{depth}";
    }

    my $stmt = "INSERT INTO url ${keys}) VALUES ${values})";
    print "stmt = $stmt\n";

    $database_handle->do("$stmt");
    
# 	if(defined($current->{links}))
# 	{
# 	    my @links = @{$current->{links}};
# #	    print "****" . Dumper(@links);
# 	    foreach(@links)
# 	    {
# #		print Dumper("$_ \n");
# 		print "\t\t\t '$_->[0]'  ( '$_->[1]' )\n";
# 	    }
# 	}
# 	else
# 	{
# 	    if($current->{limited})
# 	    {
# 		print "\t\t\t [ Retrieval was allowed by limiting regular expression '$retrieval_limit', but none of the child links were followed ]\n";
# 	    }
# 	    else
# 	    {
# 		print "\t\t\t [ NO LINKS FOUND ] \n";
# 	    }
# 	}
# 	if(defined($current->{forms}))
# 	{
# 	    print "\t\tThis page has these forms:\n";
# 	    print "\t\t--------------------------\n";
# 	    my @forms = @{$current->{forms}};
# 	    foreach(@forms)
# 	    {
# 		print "\t\t\t  '$_->[0]'  '$_->[1]' ( $_->[2] )\n";
# 	    }
# 	}
#     }
#     else
#     {
# 	if($current->{restricted})
# 	{
# 	    print "Retrieval not allowed by restriction regular expression '$retrieval_restriction'\n";
# 	    next;
# 	}
# 	if($current->{depth} > $max_depth)
# 	{
# 	    print "LEAF, no attempts made\n";
# 	}
# 	else
# 	{
# 	    print "Failed on all $current->{attempts} attempt(s)\n" if ( !$current->{visited} );
# 	}
#    }
}


$database_handle->disconnect;


sub deal_with_url
{
    my $url = shift;
    my $description = shift;
    my $current = shift;
    my $website = shift;
    
    # try to canonicalize
    my $new_url = canonicalize($current,$website,$url);
    print "... and I settle on '$new_url'\n";
    if(!defined($current->{links}))
    {
	$current->{links} = [ [ "$new_url" , "$description" ] ];
    }
    else
    {
	push(@{$current->{links}},[ "$new_url" , "$description" ]);
    }
#	print "Now, current looks like " . Dumper($current) . "\n";

    # 
    #  big breath now. clearly, if we have seen this URL, we don't have to do
    #  anything.
    # 

    if(defined($urls{$new_url}))
    {
	print "\t\t $new_url is not a new url... skipping\n";
	return;
    }

    # okay, start building up a new url

    my %urlN;
    $urlN{"url"} = "$new_url";
    $urlN{"visited"} = 0;
    $urlN{"attempts"} = 0;
    $urlN{"website"} = extract_website(\%urlN);
    $urlN{"ssl"} = extract_ssl(\%urlN);
    $urlN{"depth"} = $current->{depth} + 1;

    $urls{ "$new_url" } = \%urlN;

    # policy checks

    # okay, see if we are restricting all checks to a given hostname pattern
    if(defined($retrieval_restriction))
    {
	if($urlN{website} =~ /$retrieval_restriction/)
	{
	    print "Restriction Check: URL '$new_url' with website = $current->{website} passed restriction '$retrieval_restriction'!\n";
	    $urlN{restricted} = 0;
	}
	else
	{
	    print "Restriction Check: URL $current->{url} with website = $current->{website} FAILED restriction '$retrieval_restriction'!\n";
	    $urlN{restricted} = 1;
	    return;
	}
    }

    # okay, see if we are limiting all checks to no more than one level away from the current link
    if(defined($retrieval_limit))
    {
	if($current->{website} =~ /$retrieval_limit/)
	{
	    print "Limitation Check: URL current->{url} with website = $current->{website} passed limit '$retrieval_limit'!\n";
	    $urlN{limited} = 0;
	}
	else
	{
	    print "Limitation Check: URL $current->{url} with website = $current->{website} FAILED limit '$retrieval_limit'!\n";
	    $urlN{limited} = 1;
	    return;
	}
    }

    if($urlN{"depth"} <= $max_depth)
    {
	# add to pending?
	push(@pending,\%urlN);
	$count_urls++;
    }

}


sub process_a_href
{
    my $current = shift;   # this is a reference the current url hash
    my $body = shift;    # this is the body

    #
    # try to extract a useful website name from $current
    #
    my $website = extract_website($current);

    #
    # look for <a href=></a>
    #
    while( $body =~ m{<a href=(.*?)>(.*?)</a>}sig )
    {
	my $url = $1;
	my $label = $2;

	if(!defined($url))
	{
	    next;
	}

	print "\n\n";
	print "'Raw' url = '$url'\n";

	# okay, if this begins as a "#" address, it just points back to this page, so forget it
	if($url =~ m/^"?#/)
	{
	    print "Skipping local page '$url' url\n";
	    next;
	}

	# okay, is this really a href, or did we get other tags?
	if( $url =~ m/^"/ )
	{
	    if($url =~ m/^"(.*?)"/s)
	    {
		$url = $1;
	    }
	}
	else
	{
	    if($url =~ m/^(\S+)/s)
	    {
		$url = $1;
	    }
	}

	print "\n";
	print "... final '$url' (labeled as '$label')\n";;
	print "\n";

	deal_with_url($url,$label,$current,$website);

    }
}

sub process_method
{
    my $current = shift;   # this is a reference the current url hash
    my $body = shift;    # this is the body

    #
    # try to extract a useful website name from $current
    #
    my $website = extract_website($current);

    #
    # look for method
    #
    while( $body =~ m{<form\s(.*?)>\s}sig )
    {

	my $string = $1;
	my $action;
	my $method;
	my $name;

	# okay, look for various ways of specifying an action
	if($string =~ m/action="(.*?)"/si)
	{
	    $action = $1;
	}
	elsif($string =~ m/action=(.*?)\s/si)
	{
	    $action = $1;
	}
	else
	{
	    $action = "[ NO ACTION FOUND ]";
	}

	# okay, look for various ways of specifying a method
	if($string =~ m/method="(.*?)"/si)
	{
	    $method = $1;
	}
	elsif($string =~ m/method=(.*?)\s/si)
	{
	    $method = $1;
	}
	else
	{
	    $method = "[ NO METHOD FOUND ]";
	}

	# okay, look for various ways of specifying a name
	if($string =~ m/name="(.*?)"/si)
	{
	    $name = $1;
	}
	elsif($string =~ m/name=(.*?)\s/si)
	{
	    $name = $1;
	}
	else
	{
	    $name = "[ NO NAME FOUND ]";
	}

	print "GREAT! found in '$string' found name = '$name', method = '$method' , action = '$action' \n";
	my $new_action = canonicalize($current,$website,$action);

	if(!defined($current->{forms}))
	{
	    $current->{forms} = [ [ "$method" , "$new_action" , "$name" ] ];
	}
	else
	{
	    push(@{$current->{forms}},[ "$method" , "$new_action" , "$name" ]);
	}
	
    }
}

sub process_title
{
    my $current = shift;
    my $body = shift;

    if($body =~ m{<head.*?>.*<title>(.*?)</title>.*<body.*?>}si)
    {
	print "Found title: $1\n";
	$current->{title} = $1;
    }
    else
    {
	print "Could not find title!\n";
	$current->{title} = "UNKNOWN";
    }
}

sub canonicalize
{
    my $current = shift;   # this is a reference the current url hash
    my $website = shift;   # this is the extracted website
    my $url = shift;

    # check for "mailto:" address
    if($url =~ /^mailto:/si)
    {
	print "... '$url' is a pseudo url, a mailto!\n";
	return $url;
    }
    
    # does it start with http:? If so, no more
    if( $url =~ m/^https?:/si )
    {
	print "Well, I guess that's enough for now\n";
	return $url;
    }
    
    # does it start with / ? If so, then add the current website to front
    if( defined($website) && ($url =~ m{^/}si) )
    {
	my $ssl = "";
	$ssl = "s" if($current->{ssl});
	   
	# QUESTIONABLE POINT 1 --> should we keep the "https" info to fill this back in correctly?
	$url = "http${ssl}://${website}${url}";
	print "... rule 1 would change that to $url!\n";
	return $url;
    }

    #
    # okay, I am guess it is relative to the current path!
    # 
    # check to see that if we need to have a / in there
    #
    if($current->{url} =~ m{/$})
    {
	$url = "$current->{url}${url}";
    }
    else
    {
	$url = "$current->{url}/${url}";
    }
    print "... rule 2 would change that to $url!\n";
    return $url;
}


# sub process_method
# {
#     my $self = shift;
#     my $response = shift;

#     my $body = $response->decoded_content;

#     #
#     # try to extract a useful website name from $current
#     #
#     my $website = $self->{website};

#     #
#     # look for method
#     #
#     while( $body =~ m{<form\s(.*?)>\s}sig )
#     {
# 	my $string = $1;
# 	my $action;
# 	my $method;
# 	my $name;

# 	# okay, look for various ways of specifying an action
# 	if($string =~ m/action="(.*?)"/si)
# 	{
# 	    $action = $1;
# 	}
# 	elsif($string =~ m/action=(.*?)\s/si)
# 	{
# 	    $action = $1;
# 	}
# 	else
# 	{
# 	    $action = "[ NO ACTION FOUND ]";
# 	}

# 	# okay, look for various ways of specifying a method
# 	if($string =~ m/method="(.*?)"/si)
# 	{
# 	    $method = $1;
# 	}
# 	elsif($string =~ m/method=(.*?)\s/si)
# 	{
# 	    $method = $1;
# 	}
# 	else
# 	{
# 	    $method = "[ NO METHOD FOUND ]";
# 	}

# 	# okay, look for various ways of specifying a name
# 	if($string =~ m/name="(.*?)"/si)
# 	{
# 	    $name = $1;
# 	}
# 	elsif($string =~ m/name=(.*?)\s/si)
# 	{
# 	    $name = $1;
# 	}
# 	else
# 	{
# 	    $name = "[ NO NAME FOUND ]";
# 	}

# 	print "GREAT! found in '$string' found name = '$name', method = '$method' , action = '$action' \n";
# 	my $new_action = canonicalize($self,$website,$action);

# 	if(!defined($self->{forms}))
# 	{
# 	    $self->{forms} = [ [ "$method" , "$new_action" , "$name" ] ];
# 	}
# 	else
# 	{
# 	    push(@{$self->{forms}},[ "$method" , "$new_action" , "$name" ]);
# 	}
#     }
# }


# sub canonicalize
# {
#     my $self = shift;
#     my $website = shift;   # this is the extracted website
#     my $url = shift;

# #    print "Okay, now look at the headers in self: " . Dumper($self);
# #    print "*** url is '$url' ***\n";

#     # check for "mailto:" address
#     if($url =~ /^mailto:/si)
#     {
# 	print "... '$url' is a pseudo url, a mailto!\n";
# 	return $url;
#     }
    
#     # does it start with http:? If so, no more
#     if( $url =~ m/^https?:/si )
#     {
# 	print "Well, I guess that's enough for now\n";
# 	return $url;
#     }
    
#     # does it start with / ? If so, then add the current website to front
#     if( defined($website) && ($url =~ m{^/}si) )
#     {
# 	my $ssl = "";
# 	$ssl = "s" if($self->{ssl});
	   
# 	# QUESTIONABLE POINT 1 --> should we keep the "https" info to fill this back in correctly?
# 	$url = "http${ssl}://${website}${url}";
# 	print "... rule 1 would change that to $url!\n";
# 	return $url;
#     }

#     #
#     # okay, I am guess it is relative to the current path!
#     # 
#     # check to see that if we need to have a / in there
#     #
#     if($self->{url} =~ m{/$})
#     {
# 	$url = "$self->{url}${url}";
#     }
#     else
#     {
# 	$url = "$self->{url}/${url}";
#     }
#     print "... rule 2 would change that to $url!\n";
#     return $url;
# }

=cut