#!/usr/bin/perl
use strict;
use warnings;
use HTML::TreeBuilder;
use WWW::Mechanize;
my $wiki_url = shift
|| die "Must provide a URL to download and measure";
my $contents = get_article_text( $wiki_url );
my $word_count = english_wc( $contents );
print $word_count, "\n";
sub english_wc {
# Count (our closer approximation of) English words.
# Perl's notion of a word includes underscores, and excludes
# apostrophes (inter alia).
my $str = shift;
# first char in word must be alphanumeric
my $initial_char_re = qr#[[:alpha:]]#;
# then we can allow apostrophes
my $middle_of_word_re = qr#['[:alpha:]]+#;
# final char must be alphanumeric
my $final_char_re = qr#[[:alpha:]]?#;
my $word_re = qr#${initial_char_re}(?:${middle_of_word_re}${final_char_re})?#;
my @words = ($str =~ m#($word_re)#g);
return scalar @words;
}
sub get_article_text {
# get the body column of a Wikipedia URL,
# format it nicely, and return it as a string.
my $wiki_url = shift;
my $mech = WWW::Mechanize->new;
$mech->agent_alias('Windows IE 6');
my $article_text = $mech->get($wiki_url)->content;
my $tree = HTML::TreeBuilder->new->parse( $article_text );
strip_toc($tree);
my @bodies = $tree->look_down( 'id', 'bodyContent' );
if( (my $numBodies = scalar @bodies) != 1 ) {
warn "Should have found 1 #bodyContent item; found $numBodies instead";
return undef;
}
my $body = (shift @bodies)->as_text;
# manually delete the tree, because Perl won't garbage-collect
# circular references properly:
# http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm#$h-%3Edelete()
$tree->delete()
# newline added because as_text() doesn't seem to add it
return "$body\n";
}
sub strip_toc {
# Remove the table of contents from a Wikipedia entry.
# Takes a parse tree as its argument.
#
# First some argument checking, using the closest Perl
# gets to type checking.
my $tree = shift;
my %valid_tree_types = (
"HTML::Element" => 1,
"HTML::TreeBuilder" => 1
);
my $valid_tree_types_str = join( ', ', keys %valid_tree_types );
unless( $valid_tree_types{ ref $tree } ) {
die "strip_toc() must be one of the following types: $valid_tree_types_str";
}
# now to the meat
if( $tree->attr("id") && ($tree->attr("id") eq "toc") ) {
$tree->detach();
}
# this bit of recursion borrowed from
# http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element/traverse.pm
foreach my $child_node ($tree->content_list) {
strip_toc( $child_node )
if ref $child_node;
}
}