#!/usr/bin/perl use strict; use warnings; use HTML::TreeBuilder; use WWW::Mechanize; my $wiki_url = shift || die "Must provide a URL to download and measure"; my $contents = get_article_text( $wiki_url ); my $word_count = english_wc( $contents ); print $word_count, "\n"; sub english_wc { # Count (our closer approximation of) English words. # Perl's notion of a word includes underscores, and excludes # apostrophes (inter alia). my $str = shift; # first char in word must be alphanumeric my $initial_char_re = qr#[[:alpha:]]#; # then we can allow apostrophes my $middle_of_word_re = qr#['[:alpha:]]+#; # final char must be alphanumeric my $final_char_re = qr#[[:alpha:]]?#; my $word_re = qr#${initial_char_re}(?:${middle_of_word_re}${final_char_re})?#; my @words = ($str =~ m#($word_re)#g); return scalar @words; } sub get_article_text { # get the body column of a Wikipedia URL, # format it nicely, and return it as a string. my $wiki_url = shift; my $mech = WWW::Mechanize->new; $mech->agent_alias('Windows IE 6'); my $article_text = $mech->get($wiki_url)->content; my $tree = HTML::TreeBuilder->new->parse( $article_text ); strip_toc($tree); my @bodies = $tree->look_down( 'id', 'bodyContent' ); if( (my $numBodies = scalar @bodies) != 1 ) { warn "Should have found 1 #bodyContent item; found $numBodies instead"; return undef; } my $body = (shift @bodies)->as_text; # manually delete the tree, because Perl won't garbage-collect # circular references properly: # http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm#$h-%3Edelete() $tree->delete() # newline added because as_text() doesn't seem to add it return "$body\n"; } sub strip_toc { # Remove the table of contents from a Wikipedia entry. # Takes a parse tree as its argument. # # First some argument checking, using the closest Perl # gets to type checking. my $tree = shift; my %valid_tree_types = ( "HTML::Element" => 1, "HTML::TreeBuilder" => 1 ); my $valid_tree_types_str = join( ', ', keys %valid_tree_types ); unless( $valid_tree_types{ ref $tree } ) { die "strip_toc() must be one of the following types: $valid_tree_types_str"; } # now to the meat if( $tree->attr("id") && ($tree->attr("id") eq "toc") ) { $tree->detach(); } # this bit of recursion borrowed from # http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element/traverse.pm foreach my $child_node ($tree->content_list) { strip_toc( $child_node ) if ref $child_node; } }