#!/usr/local/bin/perl -w
#
# For every article name listed (one per line) in the file ENTRIES in the
# current directory:
#   1. fetch the article's history page and find the highest revision number;
#   2. fetch the edit page for that revision;
#   3. write the contents of the edit page's <textarea>, prefixed by a
#      "#YEAR [[name]] REV=n" header line, to data/<name>.
#
# The script dies on the first HTTP error or file error it encounters.
# The data/ directory must already exist; this script does not create it.

use strict;
use warnings;
use LWP::UserAgent;

my $ua = LWP::UserAgent->new;

# Set attributes on UA
$ua->timeout(30);
$ua->agent("SJK downloading Year in Review: " . $ua->agent);
$ua->env_proxy();

# Fetch $url with the shared user agent and return the raw response body.
# Prints a progress line first; dies with the HTTP status line if the
# response is an error.
sub fetch_content {
    my ($url) = @_;

    print "Getting $url...\n";
    my $response = $ua->request(HTTP::Request->new('GET', $url));
    if ($response->is_error) {
        die $response->status_line . " \n";
    }
    return $response->content;
}

# Return the largest revision number mentioned in a history page, or 0 when
# no "Revision N: <a href=" marker is present. Only the first marker on each
# line is considered.
sub latest_revision {
    my ($history) = @_;

    my $maxrev = 0;
    for my $line (split /\n/, $history) {
        # Require at least one digit so an empty capture is never compared
        # numerically (which would warn under -w).
        next unless $line =~ /Revision ([0-9]+): <a href=/;
        my $rev = $1;
        $maxrev = $rev if $rev > $maxrev;
    }
    return $maxrev;
}

# fetch each article
open my $entries_fh, '<', 'ENTRIES' or die "can't open ENTRIES file: $!";

ENTRY:
while (my $entry = <$entries_fh>) {
    chomp $entry;

    # A blank line would request "id=" and then try to open the data/
    # directory itself for writing; skip it instead.
    next ENTRY if $entry eq '';

    # NOTE(review): $entry is interpolated into the URL and the output path
    # as-is; this assumes ENTRIES holds URL-safe names without path
    # separators -- confirm.
    my $history = fetch_content(
        "http://www.wikipedia.com/wiki.cgi?action=history&id=$entry");
    my $maxrev = latest_revision($history);

    my $page = fetch_content(
        "http://www.wikipedia.com/wiki.cgi?action=edit&revision=$maxrev&id=$entry");

    # The output file is created (and truncated) even when no textarea is
    # found, so there is one file in data/ per processed entry.
    open my $out_fh, '>', "data/$entry" or die "Can't open data/$entry: $!";

    # Greedy match with /s: captures everything from the first <textarea ...>
    # to the last </textarea> on the page, newlines included.
    if ($page =~ /<textarea[^>]*>(.*)<\/textarea>/s) {
        my $out = "#YEAR [[$entry]] REV=$maxrev\n" . $1;

        # NOTE(review): as written this replaces an apostrophe with an
        # apostrophe, i.e. it changes nothing. The pattern may originally
        # have been an HTML entity that was lost in transit -- confirm the
        # intended pattern before changing it.
        $out =~ s/'/\'/g;

        print {$out_fh} $out or die "Can't write data/$entry: $!";
    }
    else {
        warn "No <textarea> found for $entry (revision $maxrev); "
            . "data/$entry left empty\n";
    }

    # Buffered write errors only surface at close, so check it.
    close $out_fh or die "Can't close data/$entry: $!";
}

close $entries_fh;