#!/usr/bin/perl -w # # TODO: # # What's still unlikeable about this code is the awkward copying of state from # the latest attempt object up into the url object, especially considering that a url attempt is # a subclass of a url. # # # Examples: # # ./www-map.pl -l '.*.cs.fsu.edu' 'http://www.cs.fsu.edu/~langley/CNT5605-2011-Fall/' # ./www-map.pl -d 2 -r '.*.cs.fsu.edu' http://www.cs.fsu.edu # ./www-map.pl -l '.*.cs.fsu.edu' http://www.cs.fsu.edu > /tmp/abc # ./www-map.pl -r '/^http://.*\.cs\.fsu\.edu/.*$' 'http://www.cs.fsu.edu/~langley/' # use strict; use warnings; # # Turn this into nicer code # # # wwwmap --> the main bit # wwwmap::url --> do the URL objects # wwwmap::attempt --> attempts at retrieving a given URL # use Data::Dumper; use Time::HiRes qw( gettimeofday tv_interval ); package wwwmap; use Data::Dumper; use Getopt::Std; use DBI; # # defaults # my $version = "0.1"; my $verbose = 0; # verbosity my $max_attempts = 3; # safety measure, don't want to try too many times on a failing link my $max_depth = 1; # safety measure, don't want to go too deep any one time my $retrieval_restriction = '.*'; # safety measure, doesn't even leave the regular expression for any reason my $retrieval_limit = '.*'; # safety measure, only leaves the regular expression to verify one level my $max_urls = 1; # safety measure my $verify_certificate = 1; # check the chain of validity for presented certificates my $believe_dubious_text_html = 1; # don't report files with unusual names being represented as content: text/html files instead of applications my $dbfile = "/tmp/newdb.sqlite"; # # # create an root url object # my $url0 = "http://www.google.com"; # # now check options # my %opts; getopts('f:scl:m:vn:d:r:', \%opts); $verbose = 1 if defined($opts{v}); # -v: Verbose mode $max_attempts = $opts{n} if defined($opts{n}); # -n: Maximum number of attempts on one URL $max_depth = $opts{d} if defined($opts{d}); # -d: Maximum tree depth $max_urls = $opts{m} if defined($opts{m}); # -m: Maximum number of urls to check (safety measure!) $retrieval_restriction = $opts{r} if defined($opts{r}); # -r: A regular expression that restricts all retrievals to hostnames that meet that URL $retrieval_limit = $opts{l} if defined($opts{l}); # -l: A regular expression that unlike -r also checks dependencies to verify # outside links work (use of -r and -l are mutually exclusive since one says # do not check outside links and the other says to do so) $verify_certificate = 0 if defined($opts{c}); # -c: accept all certificates $believe_dubious_text_html = 0 if defined($opts{s}); # -s: report files with unusual names being represented as # content: text/html files instead of applications $dbfile = $opts{f} if defined($opts{f}); # -f: alternative name for dbfile (defaults to /tmp/newdb.sqlite) if(defined($ARGV[0])) { $url0 = $ARGV[0]; } else { print "Defaulting to '$url0' ... (probably not what you want!)\n"; } # # give the user some idea of what is going on! # print "Verbose mode is on\n" if $verbose; print "Root url is '$url0'\n"; # # structures that we will use # my %urls; my %bad_urls; $urls{ "$url0" } = wwwmap::url->new($url0,0); # # Okay, let's fire up the queue. It's now a queue of pending URLs; the attempt # record, for good or bad, is now kept in the URL object's state. # my $queue = wwwmap::queue->new( { retrieval_restriction => defined($retrieval_restriction) ? "$retrieval_restriction" : undef, retrieval_limit => defined($retrieval_limit) ? "$retrieval_limit" : undef, max_urls => $max_urls } ); $queue->add($url0); # # Now for the database # my $database_handle; db_init(); # # Now for the big loop # my $url; while($url = $queue->remove()) { # # # print "Here's what is in the queue:\n"; $queue->print(); # # verify that we have not been able to visit this url # print "Attempt url '$url':\n"; my $url_obj = $urls{ "$url" }; if(!defined($url_obj)) { print "I cannot find anything in the map for url '$url'!\n"; next; } # # Rule #1 for processing a url: if it's been too many times around, then don't bother to try again # if($url_obj->{succeeded}) { print "No, I have already succeeded with this one, let's get another one from the queue!\n"; next; } if($url_obj->{failures} >= $max_attempts) { print "No, I have already failed $url_obj->{failures} time(s), that's it!\n"; $bad_urls{ "$url" } = 1; next; } #**************************************************************************************** # Other exclusion rules are located in wwwmap::queue since we don't want to waste space * # queuing URLs that clearly should not be enqueued * #**************************************************************************************** # # Okay, create a new attempt. # my $attempt = wwwmap::attempt->new("$url",$url_obj->{head_only}); push ( @{ $url_obj->{attempts} }, $attempt ); if($attempt->{succeeded}) { # okay, we had a success; mark it as such and then extract any # new urls that we might can use $url_obj->{succeeded} = 1; # gad. it is stupid to copy this information ftro mt oattemfrofrfrofrfrofrfrom $url_obj->{elapsed} = $attempt->{elapsed}; $url_obj->{attempt_count} = $url_obj->{failures} + 1; # stupid, but, well, what else could I do? $url_obj->{redirect_url} = $attempt->{redirect_url}; $url_obj->{response_code} = $attempt->{response_code}; $url_obj->{website} = $attempt->{website}; # # see the "attempt" module's "visit()" routine to see where these are coming from # $url_obj->{last_modified} = $attempt->{last_modified}; $url_obj->{etag} = $attempt->{etag}; $url_obj->{content_type} = $attempt->{content_type}; $url_obj->parse_response($attempt->{response}); $url_obj->add_links($queue); $url_obj->db_insert; # # Okay, our object is no longer required! # $urls{"$url"} = "***FROZEN***"; } else { # chalk up a failure and re-enqueue $url_obj->{failures}++; $queue->add("$url"); } } print "State of URLs:\n"; foreach(sort(keys %urls)) { print "$_ :: $urls{$_}\n"; } print "State of bad URLs:\n"; foreach(sort(keys %bad_urls)) { print "$_ :: $bad_urls{$_}\n"; } # # Utilities that I am not sure where to put... # sub extract_website { my $current = shift; my $website; if( $current->{url} =~ m{^https?://(.*?)/}i ) { $website = $1; print "Setting website = $website\n"; } else { print "Ooops, try #1 --> didn't get website from '$current->{url}' !\n"; } if( !defined($website) ) { if( $current->{url} =~ m{^https?://([^/?]*)}i ) { $website = $1; print "Setting website = $website\n"; } else { print "Ooops, try #2 --> didn't get website from '$current->{url}' !\n"; } } return $website; } sub extract_ssl { my $current = shift; my $ssl = 0; if( $current->{url} =~ m{^https://(.*?)}i ) { $ssl = 1; } return $ssl; } sub parse_response { my $self = shift; my $response = shift; # print "Parsing response " . Dumper($response) . "\n"; # first, determine if we this is something that we can use if($response->code == 200) { # we have an ordinary return, see what we can do with it print "That's an ordinary 200, let's work on it!\n"; $self->process200($response); } elsif( ($response->code >= 200) && ($response->code <= 299) ) { # well, we got some kinda weird response code, complain about it print "ANGRY ERROR! LWP::USERAGENT SHOULD NOT RECEIVE " . $response->code . " (url = '$self->{url}')\n"; } elsif( ($response->code == 301) || ($response->code == 302) || ($response->code == 303) || ($response->code == 305) || ($response->code == 307) ) { # we have a redirect, use the "Location" header to create a new child url $self->{links} = [ $response->header("location") ]; print "Links set to " . Dumper($self->{links}) ." \n"; } elsif( $response->code == 304 ) { print "ANGRY ERROR! LWP::USERAGENT SHOULD NOT RECEIVE 304 'I HAVE NOT CHANGED' RESPONSES! (url = '$self->{url}')\n"; } elsif( ($response->code >= 400) && ($response->code <= 499) ) { print "Hey, we got a 'bad client request' response from the server for url '$self->{url}!\n"; } elsif( ($response->code >= 500) && ($response->code <= 500) ) { print "I received a 'server error' response for url '$self->{url}'\n"; } else { print "I received an unknown response '" . $response->code . "' for url '$self->{url}'\n"; } } # # The "main" routine: when we get back a 200, we have work to do see what's going on here # sub process200 { my $self = shift; my $response = shift; if($response->header("content-type") =~ m{text/html}i) { # okay, this appears to be an ordinary body, but let's do a sanity check if(!$believe_dubious_text_html && ($self->{url} =~ /(\.pdf)|(\.ps)$/i) ) { $self->{dubious} = 1; } elsif($response->decoded_content) { print "Okay, we have some content!\n"; # # title # my $ph = HTML::HeadParser->new; $ph->parse($response->decoded_content); $self->{title} = $ph->header('Title'); print "Title is '$self->{title}'\n"; # process_method($self,$response); # # links --> using LinkExtor # # use HTML::LinkExtor; # my $p = HTML::LinkExtor->new(undef,$self->{url}); # $p->parse($response->decoded_content); # foreach($p->links) # { # print "*** HERE : " . Dumper($_) . "\n\n"; # my $x = @{$_}[2]; # if( (@{$_}[0] eq "a") && (@{$_}[1] eq "href") ) # { # push(@{$self->{links}},$x->as_string); # } # elsif( (@{$_}[0] eq "img") && (@{$_}[1] eq "src") ) # { # push(@{$self->{images}},$x->as_string); # } # elsif( (@{$_}[0] eq "iframe") && (@{$_}[1] eq "src") ) # { # push(@{$self->{iframes}},$x->as_string); # } # elsif( (@{$_}[0] eq "form") && (@{$_}[1] eq "action") ) # { # push(@{$self->{actions}},$x->as_string); # } # elsif( (@{$_}[0] eq "script") && (@{$_}[1] eq "src") ) # { # push(@{$self->{scripts}},$x->as_string); # } # elsif( (@{$_}[0] eq "link") && (@{$_}[1] eq "href") ) # { # push(@{$self->{links}},$x->as_string); # } # elsif( (@{$_}[0] eq "area") && (@{$_}[1] eq "href") ) # { # push(@{$self->{links}},$x->as_string); # } # else # { # print "What's a @{$_}[0] / @{$_}[1] ?? \n"; # } #} # # links --> using LinkExtractor # use HTML::LinkExtractor; my $p = HTML::LinkExtractor->new(undef,$self->{url}); $p->parse(\$response->decoded_content); print Dumper($p->links); foreach(@{$p->links}) { if(!defined($_)) { next; } print "What ... " . Dumper($_); if(defined(%{$_}->{href})) { print "OKay..\n"; my $x = %{$_}->{href}; push(@{$self->{links}},$x->as_string); } } } } } # # Create new SQLITE database # sub db_init { unlink($dbfile); $database_handle = DBI->connect("dbi:SQLite:dbname=${dbfile}","",""); $database_handle->do( "CREATE TABLE url ( id integer primary key, url text, succeeded boolean, content_type text, title text, last_modified date, etag text, elapsed real, attempt_count int, response_code int, redirect_url text, website text, depth int )"); $database_handle->do( "CREATE TABLE link ( id integer primary key, parent text, link text, description text )" ); $database_handle->do( "CREATE TABLE form ( id integer primary key, parent text, action text, method text, name text )" ); $database_handle->do( "CREATE TABLE bad_link ( id integer primary key, parent text, link text )" ); $database_handle->do( "CREATE TABLE config ( id integer primary key, key text, value text )" ); $database_handle->do("INSERT INTO config (key,value) VALUES ('version','$version')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('max_attempts','$max_attempts')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('max_depth','$max_depth')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('retrieval_restriction','$retrieval_restriction')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('retrieval_limit','$retrieval_limit')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('max_urls','$max_urls')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('initial_url','$url0')"); } sub db_insert { my $current = shift; print "Adding '$current->{url}' to database...\n"; # print Dumper($current); print "Doing basic 'url' information first...\n"; my $url = $current->{url}; my $keys = "(url"; my $values = "('$current->{url}'"; if(defined($current->{succeeded})) { $keys .= ",succeeded"; $values .= ",$current->{succeeded}"; } if(defined($current->{elapsed})) { $keys .= ",elapsed"; $values .= ",$current->{elapsed}"; } if(defined($current->{attempt_count})) { $keys .= ",attempt_count"; $values .= ",$current->{attempt_count}"; } if(defined($current->{response_code})) { $keys .= ",response_code"; $values .= ",$current->{response_code}"; } if(defined($current->{content_type})) { $keys .= ",content_type"; $values .= ",'$current->{content_type}'"; } if(defined($current->{redirect_url})) { $keys .= ",redirect_url"; $values .= ",'$current->{redirect_url}'"; } if(defined($current->{website})) { $keys .= ",website"; $values .= ",'$current->{website}'"; } if(defined($current->{title})) { $keys .= ",title"; $values .= ",'$current->{title}'"; } if(defined($current->{last_modified})) { $keys .= ",last_modified"; $values .= ",'$current->{last_modified}'"; } if(defined($current->{etag})) { $keys .= ",etag"; $values .= ",'$current->{etag}'"; } if(defined($current->{depth})) { $keys .= ",depth"; $values .= ",$current->{depth}"; } # print Dumper($current); my $stmt = "INSERT INTO url ${keys}) VALUES ${values})"; print "stmt = $stmt\n"; $database_handle->do("$stmt"); if(defined($current->{links})) { # print Dumper($current); print "Doing link information second...\n"; foreach(@{$current->{links}}) { print "Found link '$_'\n"; my $stmt = "INSERT INTO link (parent,link) VALUES ('$current->{url}','$_')"; print "stmt = $stmt\n"; $database_handle->do("$stmt"); } } if(defined($current->{actions})) { # print Dumper($current); print "Doing form/action information third...\n"; foreach(@{$current->{actions}}) { print "Found action '$_'\n"; my $stmt = "INSERT INTO form (parent,action) VALUES ('$current->{url}','$_')"; print "stmt = $stmt\n"; $database_handle->do("$stmt"); } } } # # wwwmap::url # # handle urls # package wwwmap::url; use strict; use warnings; use base 'wwwmap'; sub new { my $class = shift; # magic my $url = shift; # url my $depth = shift; # how deep was this one # # initialize the url # my $self = { url => "$url", depth => $depth, attempt_count => 0, elapsed_time => 0.0, response_code => 0, content_type => "", redirect_url => undef, website => "", title => "", last_modified => "", etag => "", succeeded => 0, failures => 0, attempts => [ ], head_only => 0, links => [ ], images => [ ], iframes => [ ], actions => [ ], scripts => [ ], }; if($depth > $max_depth) { $self->{head_only} = 1; } bless($self,$class); # # do a bit of caching of data # $self->{website} = $self->SUPER::extract_website($self->{url}); $self->{ssl} = $self->SUPER::extract_ssl($self->{url}); # # now finish up # return($self); } sub set { my $self = shift; my $field = shift; my $value = shift; $self->{field} = $value; } sub get { my $self = shift; my $field = shift; return($self->{field}); } sub add_links { my $self = shift; my $queue = shift; print "Looking at links... \n"; foreach(@{$self->{links}}) { print "Should I add $_ ? ...\n"; if(defined($urls{"$_"})) { print " ... no, skipping old url '$_'\n"; } else { $urls{"$_"} = wwwmap::url->new("$_",$self->{depth}+1); $queue->add("$_"); } } print "...finished links\n"; } # # wwwmap::attempt # # handle attempts # package wwwmap::attempt; use strict; use warnings; use LWP::UserAgent; use Data::Dumper; use Time::HiRes qw( gettimeofday tv_interval ); use base 'wwwmap'; sub new { # # Arguments: class, url, [head_only] # # class = magic # url = # [head_only] = [defaults to 0] # my ($class,$url,$head_only) = (undef, undef, 0); ($class,$url,$head_only) = @_; my $time = time(); my $self; # # initialize the url # $self = { url => "$url", time => $time, succeeded => 0, head_only => $head_only }; bless($self,$class); # # do a bit of caching of data # $self->{website} = $self->SUPER::extract_website($self->{url}); $self->{ssl} = $self->SUPER::extract_ssl($self->{url}); # # check to see if we make an immediate attempt to retrieve this URL # $self->visit; return($self); } sub visit { my $self = shift; $self->{pending} = 0; my $useragent = LWP::UserAgent->new( ( agent => "wwwmap/0.1", max_redirect => 0, requests_redirectable => [ ], timeout => 10 ) ); my $response; my $t0 = [gettimeofday]; if($self->{head_only}) { $response = $useragent->head($self->{url}); } else { $response = $useragent->get($self->{url}); } # print Dumper($response); $self->{elapsed} = tv_interval($t0, [gettimeofday]); $self->{response_code} = $response->code; $self->{response} = $response; if($response->is_success) { $self->{succeeded} = 1; $self->{last_modified} = $response->{_headers}->{"last-modified"}; $self->{etag} = $response->{_headers}->{etag}; $self->{content_type} = $response->{_headers}->{"content-type"}; print "***** LAST MODIFIED INFO *****"; # print Dumper($response); } elsif( ($response->code == 301) || ($response->code == 302) || ($response->code == 303) || ($response->code == 305) || ($response->code == 307) ) { # # Unlike LWP, we count a redirect as a success # $self->{succeeded} = 1; $self->{redirect} = 1; $self->{redirect_url} = $response->{_headers}->{location}; $self->{etag} = $response->{_headers}->{etag}; } else { print "Retrieval error on $self->{url}\n"; } # print "Okay, let's look at this 'attempt' object after a visit: " . Dumper($self) . "\n"; return($self->{succeeded}); } package wwwmap::queue; use strict; use Data::Dumper; sub new { my $class = shift; my $self = shift; # print Dumper($self); $self->{count} = 0; # this is the current count $self->{enqueued} = 0; # this is the total enqueued bless($self,$class); return($self); } sub add { my $self = shift; my $item = shift; my $item_object = $urls{"$item"}; # # 1. Check to see if the queue limit has been reached (this is a total of the lifetime enqueued, not the current # number of elements in the queue .) # if($self->{enqueued} >= $self->{max_urls}) { print "Queue has reached max items ($self->{max_urls}), not adding url '$item'\n"; return; } # # 2. Check for an absolute restriction. # # if(defined($self->{retrieval_restriction})) { if($item_object->{website} =~ /$self->{retrieval_restriction}/) { print "Restriction Check: URL '$item_object->{url}' with website = $item_object->{website} passed restriction '$self->{retrieval_restriction}'!\n"; $item_object->set("restricted",0); } else { print "Restriction Check: URL $item_object->{url} with website = $item_object->{website} FAILED restriction '$self->{retrieval_restriction}'!\n"; $item_object->set("restricted",1); return; } } # # 3. See if we are limiting existence checks. # if(defined($self->{retrieval_limit})) { if($item_object->{website} =~ /$self->{retrieval_limit}/) { print "Limitation Check: URL $item_object->{url} with website = $item_object->{website} passed limit '$self->{retrieval_limit}'!\n"; $item_object->set("limited",0); } else { print "Limitation Check: URL $item_object->{url} with website = $item_object->{website} FAILED limit '$self->{retrieval_limit}'!\n"; $item_object->set("limited",1); $item_object->set("head_only",1); } } $self->{count}++; $self->{enqueued}++; if(defined($self->{items})) { push(@{$self->{items}},$item); } else { $self->{items} = [ $item ]; } } sub remove { my $self = shift; if($self->{count}) { $self->{count}--; my $item = shift(@{$self->{items}}); return($item); } else { return(undef); } } sub print { my $self = shift; print "There are $self->{count} item(s) in the queue.\n"; print "--------------------------------------------\n"; # print Dumper(@{$self->{items}}); } =comment # # now work through the "pending" list # while(@pending && ($count_urls <= $max_urls)) { print "Starting!\n"; my $current = pop(@pending); # print Dumper($current); print "visiting '$current->{url}'\n"; $current->{attempts}++; $curl->setopt(CURLOPT_URL, $current->{url}); my $response_body; $curl->setopt(CURLOPT_WRITEDATA,\$response_body); # first time through, just get the HEAD $curl->setopt(CURLOPT_NOBODY,1); $curl->setopt(CURLOPT_SSL_VERIFYPEER,$verify_certificate); my $t0 = [gettimeofday]; my $ret = $curl->perform; my $elapsed = tv_interval($t0, [gettimeofday]); if ($ret == 0) { print("HEAD Transfer Complete!!\n"); $current->{visited} = 1; my $response_code = $curl->getinfo(CURLINFO_RESPONSE_CODE); $current->{content_type} = $curl->getinfo(CURLINFO_CONTENT_TYPE); $current->{content} = $response_body; $current->{elapsed} = $elapsed; $current->{response_code} = $response_code; if($response_body =~ m/^last-modified: (.*)\r$/mi) { $current->{last_modified} = $1; print "Found last_modified = '$current->{last_modified}'\n"; } if($response_code == 200) { # let's verify that this is actually html ? # if( $response_body =~ m{^Content-Type: text/html}mi ) # { # $current->{content_type} = "text/html"; if($current->{content_type} eq "text/html") { if( !$believe_dubious_text_html && ($current->{url} =~ /(\.pdf)|(\.ps)$/i) ) { $current->{dubious} = 1; } # now make sure that we aren't under a limitation before we process this page's links --> not an issue # with "retrieval_restriction" since we don't leave the website restriction at all elsif(defined($retrieval_limit)) { if($current->{website} =~ /$retrieval_limit/) { # okay, we need to actual get the data print "$current->{website} passed a_href parsing limit '$retrieval_limit'!\n"; $curl->setopt(CURLOPT_NOBODY,0); my $t0 = [gettimeofday]; my $ret = $curl->perform; my $elapsed = tv_interval($t0, [gettimeofday]); my $response_code = $curl->getinfo(CURLINFO_HTTP_CODE); $current->{content} = $response_body; $current->{elapsed} = $elapsed; $current->{response_code} = $response_code; if ($ret == 0) { print("BODY Transfer Complete!!\n"); $current->{limited} = 0; process_method($current,$response_body); process_a_href($current,$response_body); } else { print("BODY Transfer Failed!!\n"); } } else { $current->{limited} = 1; print "$current->{website} FAILED a_href parsing limit '$retrieval_limit'!\n"; } } else { $current->{limited} = 0; process_method($current,$response_body); process_a_href($current,$response_body); } process_title($current,$response_body); } else { if ($response_body =~ m{^Content-Type: (.*)\r$}mi) { $current->{content_type} = "$1"; print "Found content_type = $1\n"; } else { $current->{content_type} = "unknown"; print "Did not find content-type in '$response_body'\n"; } } } if($response_code == 301) { print "Received a redirect!\n"; $current->{content_type} = "redirect"; if($response_body =~ m/^Location: (.*)\r$/im) # remember, headers have an extra \r at the end ! { my $new_url = $1; print "The new URL is '$new_url'\n"; $current->{redirect_url} = "$new_url"; print "Response body was '$response_body'\n"; deal_with_url($new_url," [ REDIRECT ] ",$current,extract_website($current)); # print Dumper($current); } else { print "I could not parse out this redirect!\n"; print "Response body was '$response_body'\n"; $current->{redirect_url} = "[NOT KNOWN]"; } } } else { print STDERR "Error: $ret ".$curl->strerror($ret)." ".$curl->errbuf."\n"; } } my %broken_link_cache; print "\n"; print "\n"; print "Building cache of non-responsive URLs\n"; print "-------------------------------------\n"; foreach(keys %urls) { my $current = $urls{$_}; if(!$current->{visited} && !$current->{restricted} && !($current->{depth} > $max_depth)) { print "\tFailed on all $current->{attempts} attempt(s) on $current->{url}, adding to cache\n "; $broken_link_cache{"$current->{url}"} = $current->{attempts}; } } print "\n"; print "\n"; print "Final Report\n"; print "\n"; foreach(keys %urls) { my $current = $urls{$_}; print "\tFor URL '$current->{url}' at depth $current->{depth}: "; if(defined($current->{dubious})) { print "\t DUBIOUS $current->{url} --> reports to be text/html, but name doesn't make it likely!\n"; } if($current->{visited}) { print "Succeeded on first attempt (took $current->{elapsed} seconds)\n" if $current->{attempts} == 1; print "Succeeded on #$current->{attempts} attempt (took $current->{elapsed} seconds)\n" if $current->{attempts} > 1; print "\t\tResponse code was '$current->{response_code}'\n"; print "\t\tContent type was '$current->{content_type}'\n"; print "\t\tRedirected to URL '$current->{redirect_url}'\n" if defined($current->{redirect_url}); print "\t\tWebsite is '$current->{website}'\n"; print "\t\tTitle of the content is '$current->{title}'\n" if defined($current->{title}); print "\t\tThis page links to the following pages:\n"; print "\t\t---------------------------------------\n"; if(defined($current->{links})) { my @links = @{$current->{links}}; # print "****" . Dumper(@links); foreach(@links) { my $broken_flag = ""; # check the "broken link" cache to see if this link is broken if(defined($broken_link_cache{"$_->[0]"})) { $broken_flag = "*** appears broken: does not answer on $broken_link_cache{$_->[0]} attempt(s)"; } # print Dumper("$_ \n"); print "\t\t\t '$_->[0]' ( '$_->[1]' ) $broken_flag\n"; } } else { if($current->{limited}) { print "\t\t\t [ Retrieval was allowed by limiting regular expression '$retrieval_limit', but none of the child links were followed ]\n"; } else { print "\t\t\t [ NO LINKS FOUND ] \n"; } } if(defined($current->{forms})) { print "\t\tThis page has these forms:\n"; print "\t\t--------------------------\n"; my @forms = @{$current->{forms}}; foreach(@forms) { print "\t\t\t '$_->[0]' '$_->[1]' ( $_->[2] )\n"; } } } else { if($current->{restricted}) { print "Retrieval not allowed by restriction regular expression '$retrieval_restriction'\n"; next; } if($current->{depth} > $max_depth) { print "LEAF, no attempts made\n"; } else { print "Failed on all $current->{attempts} attempt(s)\n" if ( !$current->{visited} ); } } } # print Dumper(%urls); # # Write it all to a new SQLITE database # unlink("/tmp/newdb.sqlite"); my $database_handle = DBI->connect("dbi:SQLite:dbname=/tmp/newdb.sqlite","",""); $database_handle->do( "CREATE TABLE url ( id integer primary key, url text, visited boolean, content_type text, title text, last_modified date, elapsed real, attempts int, response_code int, redirect_url text, website text, depth int )"); $database_handle->do( "CREATE TABLE a_href ( id integer primary key, parent text, href text, description text )" ); $database_handle->do( "CREATE TABLE form ( id integer primary key, parent text, action text, method text, name text )" ); $database_handle->do( "CREATE TABLE config ( id integer primary key, key text, value text )" ); $database_handle->do("INSERT INTO config (key,value) VALUES ('version','$version')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('max_attempts','$max_attempts')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('max_depth','$max_depth')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('retrieval_restriction','$retrieval_restriction')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('retrieval_limit','$retrieval_limit')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('max_urls','$max_urls')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('count_urls','$count_urls')"); $database_handle->do("INSERT INTO config (key,value) VALUES ('initial_url','$url0{url}')"); foreach(keys %urls) { my $current = $urls{$_}; print "Adding '$current->{url}' to database...\n"; print Dumper($current); my $url = $current->{url}; my $keys = "(url"; my $values = "('$current->{url}'"; if(defined($current->{visited})) { $keys .= ",visited"; $values .= ",$current->{visited}"; } if(defined($current->{elapsed})) { $keys .= ",elapsed"; $values .= ",$current->{elapsed}"; } if(defined($current->{attempts})) { $keys .= ",attempts"; $values .= ",$current->{attempts}"; } if(defined($current->{response_code})) { $keys .= ",response_code"; $values .= ",$current->{response_code}"; } if(defined($current->{content_type})) { $keys .= ",content_type"; $values .= ",'$current->{content_type}'"; } if(defined($current->{redirect_url})) { $keys .= ",redirect_url"; $values .= ",'$current->{redirect_url}'"; } if(defined($current->{website})) { $keys .= ",website"; $values .= ",'$current->{website}'"; } if(defined($current->{title})) { $keys .= ",title"; $values .= ",'$current->{title}'"; } if(defined($current->{last_modified})) { $keys .= ",last_modified"; $values .= ",'$current->{last_modified}'"; } if(defined($current->{depth})) { $keys .= ",depth"; $values .= ",$current->{depth}"; } my $stmt = "INSERT INTO url ${keys}) VALUES ${values})"; print "stmt = $stmt\n"; $database_handle->do("$stmt"); # if(defined($current->{links})) # { # my @links = @{$current->{links}}; # # print "****" . Dumper(@links); # foreach(@links) # { # # print Dumper("$_ \n"); # print "\t\t\t '$_->[0]' ( '$_->[1]' )\n"; # } # } # else # { # if($current->{limited}) # { # print "\t\t\t [ Retrieval was allowed by limiting regular expression '$retrieval_limit', but none of the child links were followed ]\n"; # } # else # { # print "\t\t\t [ NO LINKS FOUND ] \n"; # } # } # if(defined($current->{forms})) # { # print "\t\tThis page has these forms:\n"; # print "\t\t--------------------------\n"; # my @forms = @{$current->{forms}}; # foreach(@forms) # { # print "\t\t\t '$_->[0]' '$_->[1]' ( $_->[2] )\n"; # } # } # } # else # { # if($current->{restricted}) # { # print "Retrieval not allowed by restriction regular expression '$retrieval_restriction'\n"; # next; # } # if($current->{depth} > $max_depth) # { # print "LEAF, no attempts made\n"; # } # else # { # print "Failed on all $current->{attempts} attempt(s)\n" if ( !$current->{visited} ); # } # } } $database_handle->disconnect; sub deal_with_url { my $url = shift; my $description = shift; my $current = shift; my $website = shift; # try to canonicalize my $new_url = canonicalize($current,$website,$url); print "... and I settle on '$new_url'\n"; if(!defined($current->{links})) { $current->{links} = [ [ "$new_url" , "$description" ] ]; } else { push(@{$current->{links}},[ "$new_url" , "$description" ]); } # print "Now, current looks like " . Dumper($current) . "\n"; # # big breath now. clearly, if we have seen this URL, we don't have to do # anything. # if(defined($urls{$new_url})) { print "\t\t $new_url is not a new url... skipping\n"; return; } # okay, start building up a new url my %urlN; $urlN{"url"} = "$new_url"; $urlN{"visited"} = 0; $urlN{"attempts"} = 0; $urlN{"website"} = extract_website(\%urlN); $urlN{"ssl"} = extract_ssl(\%urlN); $urlN{"depth"} = $current->{depth} + 1; $urls{ "$new_url" } = \%urlN; # policy checks # okay, see if we are restricting all checks to a given hostname pattern if(defined($retrieval_restriction)) { if($urlN{website} =~ /$retrieval_restriction/) { print "Restriction Check: URL '$new_url' with website = $current->{website} passed restriction '$retrieval_restriction'!\n"; $urlN{restricted} = 0; } else { print "Restriction Check: URL $current->{url} with website = $current->{website} FAILED restriction '$retrieval_restriction'!\n"; $urlN{restricted} = 1; return; } } # okay, see if we are limiting all checks to no more than one level away from the current link if(defined($retrieval_limit)) { if($current->{website} =~ /$retrieval_limit/) { print "Limitation Check: URL current->{url} with website = $current->{website} passed limit '$retrieval_limit'!\n"; $urlN{limited} = 0; } else { print "Limitation Check: URL $current->{url} with website = $current->{website} FAILED limit '$retrieval_limit'!\n"; $urlN{limited} = 1; return; } } if($urlN{"depth"} <= $max_depth) { # add to pending? push(@pending,\%urlN); $count_urls++; } } sub process_a_href { my $current = shift; # this is a reference the current url hash my $body = shift; # this is the body # # try to extract a useful website name from $current # my $website = extract_website($current); # # look for # while( $body =~ m{(.*?)}sig ) { my $url = $1; my $label = $2; if(!defined($url)) { next; } print "\n\n"; print "'Raw' url = '$url'\n"; # okay, if this begins as a "#" address, it just points back to this page, so forget it if($url =~ m/^"?#/) { print "Skipping local page '$url' url\n"; next; } # okay, is this really a href, or did we get other tags? if( $url =~ m/^"/ ) { if($url =~ m/^"(.*?)"/s) { $url = $1; } } else { if($url =~ m/^(\S+)/s) { $url = $1; } } print "\n"; print "... final '$url' (labeled as '$label')\n";; print "\n"; deal_with_url($url,$label,$current,$website); } } sub process_method { my $current = shift; # this is a reference the current url hash my $body = shift; # this is the body # # try to extract a useful website name from $current # my $website = extract_website($current); # # look for method # while( $body =~ m{