#!/usr/bin/perl
#
# Walk a directory tree, collect the hrefs found in every *.html file, and
# print the ones whose host is not americanprogress.org (or a subdomain).
#
# Usage: pass the directory to search as the first command-line argument.

use strict;
use warnings;

use lib '/home/slaniel/bin';
use lanielLib qw/get_absolute_urls_from_parse_tree find_kintera_url_in_file unique/;

use File::Find;
use HTML::TreeBuilder;
use URI;

# A plain truthiness test would reject a directory literally named "0",
# so check for defined-and-non-empty instead.
my $source_dir = shift @ARGV;
die "Must provide a directory to search for hrefs"
    unless defined $source_dir && length $source_dir;

my @all_hrefs;

find(
    {
        # Without no_chdir, File::Find changes into each directory while
        # $File::Find::name stays relative to the starting directory, so
        # the -f test and parse_file() below would miss every file when
        # $source_dir is a relative path.
        no_chdir => 1,
        wanted   => sub {
            my $this_file = $File::Find::name;
            return unless -f $this_file && $this_file =~ m{\.html$};

            my $tree = HTML::TreeBuilder->new;

            # parse_file() returns undef (and sets $!) when the file
            # cannot be opened; skip it rather than harvest an empty tree.
            if ( !$tree->parse_file($this_file) ) {
                warn "Could not parse $this_file: $!\n";
                $tree->delete;
                return;
            }

            # NOTE(review): find_kintera_url_in_file is presumed to return
            # the base URL against which this file's relative links are
            # resolved -- confirm against lanielLib.
            my $base_url = find_kintera_url_in_file($this_file);
            push @all_hrefs,
                get_absolute_urls_from_parse_tree( $tree, $base_url );

            # Release the parse tree explicitly so memory does not grow
            # with the number of files scanned.
            $tree->delete;
        },
    },
    $source_dir
);

# Strip out some junk URLs (javascript: and mailto: links).
my @no_js = grep { !m{^(javascript|mailto)}i } unique(@all_hrefs);

# We don't need the (possibly quite large) @all_hrefs list now, so
# delete it.
undef @all_hrefs;

print "External URLs:\n";
for my $href ( sort @no_js ) {
    my $url = URI->new($href);

    # Not every URI scheme class implements host(), and host() may return
    # undef for host-less URLs.  Treat both as "no host" so they are still
    # listed instead of killing the script or emitting a warning.
    my $host = $url->can('host') ? $url->host : undef;
    $host = '' unless defined $host;

    # Match the domain itself or any subdomain; anchoring on a label
    # boundary keeps look-alike hosts such as
    # "americanprogress.org.example.com" in the external list.
    next if $host =~ m{(?:\A|\.)americanprogress\.org\z}i;

    print $url, "\n";
}