User:Whobot/code
From Wikipedia, the free encyclopedia
### IMPORTANT ###
# This code is released into the public domain. CONTRIBUTIONS are
# welcome, but will also hereby be RELEASED TO THE PUBLIC DOMAIN.
# See the documentation distributed with this code for important
# warnings and caveats.
# Cloned from Pearle Wisebot, modifications by User:Who
#################

use strict;
use Time::HiRes;

# The following may be helpful in debugging character encoding problems.
#use utf8;
#use encoding 'utf8';

# Initialization
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;

print "\n";

# LWP::UserAgent is a library which allows us to create a "user agent"
# object that handles the low-level details of making HTTP requests.
$::ua = LWP::UserAgent->new(timeout => 300);
$::ua->agent("Whobot Wisebot/0.1");
$::ua->cookie_jar(HTTP::Cookies->new(file => "cookies.whobot.txt",
                                     autosave => 1));
$::ua->cookie_jar->load();

# Hot pipes: flush STDOUT after every print so progress is visible.
$| = 1;

# ---
# Disabled self-test: fetches the sandbox, appends a signature, posts.
# test();
#sub test
#{
#    my ($target, $text, $editTime, $startTime, $token);
#
#    $target = "Wikipedia:Sandbox";
#    ($text, $editTime, $startTime, $token) = getPage($target);
#    print $text;
#    $text .= "\nEat my electrons! -- Whobot\n";
#    print "---\n";
#    postPage ($target, $editTime, $startTime, $token, $text, "Test 008");
#    die ("Test complete.");
#}
# ---

interpretCommand(@ARGV);

# Dispatch one bot command.  Commands arrive either on the command line
# (@ARGV) or, for READ_COMMANDS, one per line on STDIN.  Each branch
# validates its argument count, then delegates to the worker sub.
sub interpretCommand
{
    my ($command, @arguments, $i, $line, $argument, @newArguments,
        $from, $to, $page, $pageCopy);

    ($command, @arguments) = @_;
    $command =~ s/\*\s*//;    # strip a leading wiki bullet marker
    myLog(`date /t`);
    myLog($command.": ".join(" ", @arguments)."\n");
    print `date /t`;
    print $command.": ".join(" ", @arguments)."\n";

    if ($command eq "POST_STDIN")
    {
        if ($arguments[2] ne "")
        {
            myLog("Too many arguments to POST_STDIN.\n");
            die("Too many arguments to POST_STDIN.\n");
        }
        postSTDIN($arguments[0], $arguments[1]);
    }
    elsif ($command eq "POST_STDIN_NULLOK")
    {
        if ($arguments[2] ne "")
        {
            myLog("Too many arguments to POST_STDIN.\n");
            die("Too many arguments to POST_STDIN.\n");
        }
        # Temporarily tolerate fetching a page with no existing text.
        $::nullOK = "yes";
        postSTDIN($arguments[0], $arguments[1]);
        $::nullOK = "no";
    }
    elsif ($command eq "MOVE_CONTENTS")
    {
        if ($arguments[3] ne "")
        {
            # "[[From]] -> [[To]] day" form: args are (from, "->", to, day).
            if (($arguments[4] eq "") and ($arguments[1] eq "->"))
            {
                moveCategoryContents($arguments[0], $arguments[2],
                                     $arguments[3], "");
                return();
            }
            else
            {
                myLog("Too many arguments to MOVE_CONTENTS.\n");
                die("Too many arguments to MOVE_CONTENTS.\n");
            }
        }
        moveCategoryContents($arguments[0], $arguments[1], "no", "yes",
                             $arguments[2]);
    }
    elsif ($command eq "MOVE_CONTENTS_INCL_CATS")
    {
        if ($arguments[3] ne "")
        {
            if (($arguments[4] eq "") and ($arguments[1] eq "->"))
            {
                moveCategoryContents($arguments[0], $arguments[2], "yes",
                                     "yes", $arguments[3]);
                return();
            }
            else
            {
                myLog("Too many arguments to MOVE_CONTENTS_INCL_CATS.\n");
                die("Too many arguments to MOVE_CONTENTS_INCL_CATS.\n");
            }
        }
        moveCategoryContents($arguments[0], $arguments[1], "yes", "yes",
                             $arguments[2], "");
    }
    elsif ($command eq "REMOVE_X_FROM_CAT")
    {
        if ($arguments[3] ne "")
        {
            myLog("Too many arguments to REMOVE_X_FROM_CAT.\n");
            die("Too many arguments to REMOVE_X_FROM_CAT.\n");
        }
        removeXFromCat($arguments[0], $arguments[1], $arguments[2], "");
    }
    elsif ($command eq "DEPOPULATE_CAT")
    {
        if ($arguments[1] ne "")
        {
            if (($arguments[2] eq "") and ($arguments[1] eq "special"))
            {
                depopulateCat($arguments[0], "special");
                # NOTE(review): control falls through to the plain
                # depopulateCat() call below; a return() here may have
                # been intended -- preserved as found.
            }
            else
            {
                myLog("Too many arguments to DEPOPULATE_CAT.\n");
                die("Too many arguments to DEPOPULATE_CAT.\n");
            }
        }
        depopulateCat($arguments[0]);
    }
    elsif ($command eq "PRINT_WIKITEXT")
    {
        if ($arguments[1] ne "")
        {
            myLog("Too many arguments to PRINT_WIKITEXT.\n");
            die("Too many arguments to PRINT_WIKITEXT.\n");
        }
        printWikitext($arguments[0]);
    }
    elsif ($command eq "ADD_CFD_TAG")
    {
        if ($arguments[1] ne "")
        {
            myLog("Too many arguments to ADD_CFD_TAG.\n");
            die("Too many arguments to ADD_CFD_TAG.\n");
        }
        addCFDTag($arguments[0]);
    }
    elsif ($command eq "ADD_CFDU_TAG")
    {
        if ($arguments[2] ne "")
        {
            myLog("Too many arguments to ADD_CFDU_TAG.\n");
            die("Too many arguments to ADD_CFDU_TAG.\n");
        }
        addCFDUTag($arguments[0], $arguments[1], "");
    }
    elsif ($command eq "REMOVE_CFD_TAG")
    {
        if ($arguments[1] ne "")
        {
            myLog("Too many arguments to REMOVE_CFD_TAG.\n");
            die("Too many arguments to REMOVE_CFD_TAG.\n");
        }
        removeCFDTag($arguments[0]);
    }
    elsif ($command eq "REMOVE_CFDU_TAG")
    {
        if ($arguments[1] ne "")
        {
            myLog("Too many arguments to REMOVE_CFDU_TAG.\n");
            die("Too many arguments to REMOVE_CFDU_TAG.\n");
        }
        removeCFDUTag($arguments[0]);
    }
    elsif ($command eq "ADD_TO_CAT")
    {
        if ($arguments[2] ne "")
        {
            myLog("Too many arguments to ADD_TO_CAT.\n");
            die("Too many arguments to ADD_TO_CAT.\n");
        }
        addToCat($arguments[0], $arguments[1], "");
    }
    elsif ($command eq "ADD_TO_CAT_NULL_OK")
    {
        if ($arguments[2] ne "")
        {
            myLog("Too many arguments to ADD_TO_CAT_NULL_OK.\n");
            die("Too many arguments to ADD_TO_CAT_NULL_OK.\n");
        }
        $::nullOK = "yes";
        addToCat($arguments[0], $arguments[1], "");
        $::nullOK = "no";
    }
    elsif ($command eq "TRANSFER_TEXT")
    {
        if ($arguments[2] ne "")
        {
            myLog("Too many arguments to TRANSFER_TEXT.\n");
            die("Too many arguments to TRANSFER_TEXT.\n");
        }
        transferText($arguments[0], $arguments[1]);
    }
    # DON'T USE THE BELOW COMMAND; IT'S NOT IMPLEMENTED PROPERLY YET.
    # elsif ($command eq "LIST_TO_CAT_CHECK")
    # {
    #     if ($arguments[2] ne "")
    #     {
    #         myLog ("Too many arguments to LIST_TO_CAT_CHECK.\n");
    #         die ("Too many arguments to LIST_TO_CAT_CHECK.\n");
    #     }
    #     listToCat($arguments[0], $arguments[1], "no");
    # }
    elsif ($command eq "CHANGE_CATEGORY")
    {
        if ($arguments[4] ne "")
        {
            myLog("Too many arguments to CHANGE_CATEGORY.\n");
            die("Too many arguments to CHANGE_CATEGORY.\n");
        }
        changeCategory($arguments[0], $arguments[1], $arguments[2],
                       $arguments[3]);
    }
    elsif ($command eq "CLEANUP_DATE")
    {
        if ($arguments[0] ne "")
        {
            myLog("Too many arguments to CLEANUP_DATE.\n");
            die("Too many arguments to CLEANUP_DATE.\n");
        }
        cleanupDate();
    }
    elsif ($command eq "OPENTASK_UPDATE")
    {
        if ($arguments[0] ne "")
        {
            myLog("Too many arguments to OPENTASK_UPDATE.\n");
            die("Too many arguments to OPENTASK_UPDATE.\n");
        }
        opentaskUpdate();
    }
    # DON'T USE THE BELOW COMMAND; IT'S NOT IMPLEMENTED PROPERLY YET.
    #elsif ($command eq "ENFORCE_CATEGORY_REDIRECTS_CHECK")
    #{
    #    enforceCategoryRedirects("no");
    #}
    # This command is for remedial cleanup only.
    #elsif ($command eq "INTERWIKI_LOOP")
    #{
    #    interwikiLoop();
    #}
    elsif ($command eq "ENFORCE_CATEGORY_INTERWIKI")
    {
        if ($arguments[1] ne "")
        {
            myLog("Too many arguments to ENFORCE_CATEGORY_INTERWIKI.\n");
            die("Too many arguments to ENFORCE_CATEGORY_INTERWIKI.\n");
        }
        enforceCategoryInterwiki($arguments[0]);
    }
    ## Broken due to recent changes on WP:CFD
    # elsif ($command eq "ENFORCE_CFD")
    # {
    #     enforceCFD();
    # }
    elsif ($command eq "STOP")
    {
        myLog("Stopped.");
        die("Stopped.");
    }
    elsif (($command eq "READ_COMMANDS") or ($command eq ""))
    {
        # Batch mode: each STDIN line is itself a command, with wiki
        # links ("[[Page name]]") converted to underscored page names.
        while (<STDIN>)
        {
            $line = $_;
            if ($line =~ m/READ_COMMANDS/)
            {
                myLog("interpretCommands(): Infinite loop!");
                die("interpretCommands(): Infinite loop!");
            }
            if ($line =~ m/^\s*$/)
            {
                next;    # skip blank lines
            }
            $line =~ s/\s+$//s;
            $line =~ s/\*\s*//;
            if ($line =~ m/\[\[:?(.*?)\]\] -> \[\[:?(.*?)\]\]/)
            {
                # "CMD [[From]] -> [[To]]" shorthand.
                $line =~ s/\[\[:?(.*?)\]\] -> \[\[:?(.*?)\]\]//;
                $from = $1;
                $to = $2;
                $line =~ s/\s*$//;
                $from =~ s/ /_/g;
                $to =~ s/ /_/g;
                interpretCommand($line, $from, $to);
            }
            else
            {
                # Replace every [[Page name]] with Page_name in place.
                # NOTE(review): the s/// below uses $pageCopy unquoted,
                # so regex metacharacters in a page name could prevent
                # the substitution and loop forever -- preserved as found.
                while ($line =~ m/\[\[:?(.*?)\]\]/)
                {
                    $line =~ m/\[\[:?(.*?)\]\]/;
                    $page = $1;
                    $pageCopy = $page;
                    $page =~ s/ /_/g;
                    $line =~ s/\[\[:?$pageCopy\]\]/$page/;
                }
                interpretCommand(split(" ", $line));
            }
            # unless (($line =~ m/TRANSFER_TEXT_CHECK/) or
            #         ($line =~ m/ENFORCE_CATEGORY_INTERWIKI/))
            unless ($line =~ m/TRANSFER_TEXT_CHECK/)
            {
                limit();
            }
        }
        myLog("Execution complete.\n");
        print("Execution complete.\n");
    }
    else
    {
        myLog("Unrecognized command '".$command."': ".join(" ", @arguments)."\n");
        die("Unrecognized command '".$command."': ".join(" ", @arguments));
    }
}

# Rate-limiting to avoid hosing the wiki server.  Sleeps for
# $::speedLimit seconds (minimum 10), printing a countdown to STDERR.
sub limit
{
    my ($i);

    # Min 30 sec unmarked, min 10 sec marked.
    # May be raised by retry() if load is heavy.
    ### ATTENTION ###
    # Increasing the speed of the bot to faster than 1 edit every 10
    # seconds violates English Wikipedia rules as of April, 2005, and
    # will cause your bot to be banned.  So don't change $normalDelay
    # unless you know what you are doing.  Other sites may have
    # similar policies, and you are advised to check before using your
    # bot at the default speed.
    #################
    if ($::speedLimit < 10)
    {
        $::speedLimit = 10;
    }
    $i = $::speedLimit;
    while ($i >= 0)
    {
        sleep(1);
        print STDERR "Sleeping $i seconds...\r";
        $i--;
    }
    print STDERR "                        \r";    # blank the countdown line
}

# perl whobot.pl POST_STDIN User:Whobot/categories-alpha "Update from 13 Oct 2004 database dump"
# Replace a page's text wholesale with whatever arrives on STDIN.
sub postSTDIN
{
    my ($text, $articleName, $comment, $editTime, $startTime, $junk, $token);

    $articleName = $_[0];
    $comment = $_[1];
    #urlSafe($articleName);

    while (<STDIN>)
    {
        $text .= $_;
    }
    if ($text =~ m/^\s*$/)
    {
        myLog("postSTDIN(): Null input.\n");
        die("postSTDIN(): Null input.\n");
    }
    # Fetch the page only for its timestamps and edit token; the text
    # itself is discarded and replaced.
    ($junk, $editTime, $startTime, $token) = getPage($articleName);
    if ($comment eq "")
    {
        $comment = "Automated post";
    }
    postPage($articleName, $editTime, $startTime, $token, $text, $comment);
}

# perl whobot.pl ADD_TO_CAT Page_name Category:Category_name sortkey
# Add one page to a category (delegates the wikitext edit to
# addCatToText) and save it.
sub addToCat
{
    my ($text, $articleName, $category, $editTime, $startTime, $comment,
        $status, @junk, $sortkey, $token);

    $articleName = $_[0];
    $category = $_[1];
    $sortkey = $_[2];
    #urlSafe($articleName);
    #urlSafe($category);

    ($text, $editTime, $startTime, $token) = getPage($articleName);
    $comment = "Add ${category} per [[WP:CFD]]";
    ($status, $text, @junk) = addCatToText($category, $text, $sortkey,
                                           $articleName);
    if ($status ne "success")
    {
        return();
    }
    postPage($articleName, $editTime, $startTime, $token, $text, $comment);
}

# Append a message to the bot's log file.
sub myLog
{
    open(LOG, ">>whobot-log.txt") || die "Could not append to log!";
    print LOG $_[0];
    close(LOG);
}

# Fetch a page's wikitext via its edit form.
# Returns ($text, $editTime, $startTime, $token).  Dies if the login
# cookie is no longer honored, or if the text/timestamp scrape fails
# (unless $::nullOK is "yes").
sub getPage
{
    my ($target, $request, $response, $reply, $text, $text2, $editTime,
        $startTime, $attemptStartTime, $attemptFinishTime, $token);

    $target = $_[0];
    if ($target =~ m/^\s*$/)
    {
        myLog("getPage: Null target.");
        die("getPage: Null target.");
    }
    # urlSafe ($target);

    # Monitor wiki server responsiveness
    $attemptStartTime = Time::HiRes::time();

    print "GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n";
    myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
    $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit");
    $response = $::ua->request($request);

    if ($response->is_success)
    {
        $reply = $response->content;

        # Monitor wiki server responsiveness
        $attemptFinishTime = Time::HiRes::time();
        retry("success", "getPage",
              sprintf("%.3f", $attemptFinishTime - $attemptStartTime));

        # This detects whether or not we're logged in.
        unless ($reply =~ m%<a href="/wiki/User_talk:Whobot">My talk</a>%)
        {
            # We've lost our identity.
            myLog("Wiki server is not recognizing me (1).\n---\n${reply}\n---\n");
            die("Wiki server is not recognizing me (1).\n");
        }

        # Scrape the edit box and the hidden form fields.
        #$reply =~ m%<textarea\s*tabindex='1'\s*accesskey=","\s*name="wpTextbox1"\s*rows='25'\s*cols='80'\s*>(.*?)</textarea>%s;
        $reply =~ m%<textarea\s*tabindex='1'\s*accesskey=","\s*name="wpTextbox1"\s*id="wpTextbox1"\s*rows='25'\s*cols='80'\s*>(.*?)</textarea>%s;
        $text = $1;
        $reply =~ m/value="(\d+)" name="wpEdittime"/;
        $editTime = $1;
        # Added 22 Aug 2005 to correctly handle articles that have
        # been undeleted
        $reply =~ m/value="(\d+)" name="wpStarttime"/;
        $startTime = $1;
        # Added 9 Mar 2005 after recent software change.
        $reply =~ m/value="(\w+)" name="wpEditToken"/;
        $token = $1;

        if (($text =~ m/^\s*$/) and ($::nullOK ne "yes"))
        {
            myLog("getPage($target): Null text!\n");
            myLog "\n---\n$reply\n---\n";
            die("getPage($target): Null text!\n");
        }
        if (($editTime =~ m/^\s*$/) and ($::nullOK ne "yes"))
        {
            myLog("getPage($target): Null time!\n");
            myLog "\n---\n$reply\n---\n";
            die("getPage($target): Null time!\n");
        }
        # Raw angle brackets mean the textarea scrape went wrong.
        if (($text =~ m/>/) or ($text =~ m/</))
        {
            print $text;
            myLog "\n---\n$text\n---\n";
            myLog("getPage($target): Bad text suck!\n");
            die("getPage($target): Bad text suck!\n");
        }

        # Change ( &quot; -> " ) etc.
        # This function is from HTML::Entities.
        decode_entities($text);

        # This may or may not actually work
        $::ua->cookie_jar->save();

        return ($text, $editTime, $startTime, $token);
    }
    else
    {
        myLog("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
        print("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
        # 50X HTTP errors mean there is a problem connecting to the wiki server
        if (($response->status_line =~ m/^500/)
            or ($response->status_line =~ m/^502/)
            or ($response->status_line =~ m/^503/))
        {
            return(retry("getPage", @_));
        }
        else
        {
            # Unhandled HTTP response
            die("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
        }
    }
}

# Submit new wikitext for a page via the edit form.
# $_[0] page name, $_[1] wpEdittime, $_[2] wpStarttime, $_[3] edit token,
# $_[4] new text, $_[5] edit summary (truncated to 200 chars),
# $_[6] "yes" to mark the edit minor.
# Dies on edit conflicts, deleted-article conflicts, and unexpected
# HTTP responses; returns the response body on success.
sub postPage
{
    my ($request, $response, $pageName, $textToPost, $summaryEntry,
        $editTime, $startTime, $actual, $expected, $attemptStartTime,
        $attemptFinishTime, $date, $editToken, $minor);

    $pageName = $_[0];
    $editTime = $_[1];
    $startTime = $_[2];
    $editToken = $_[3];
    $textToPost = $_[4];
    $summaryEntry = $_[5];    # Max 200 chars!
    $minor = $_[6];
    $summaryEntry = substr($summaryEntry, 0, 200);

    if ($pageName eq "")
    {
        myLog("postPage(): Empty pageName.\n");
        die("postPage(): Empty pageName.\n");
    }
    if ($summaryEntry eq "")
    {
        $summaryEntry = "Automated editing.";
    }

    # Monitor server responsiveness
    $attemptStartTime = Time::HiRes::time();

    if ($minor eq "yes")
    {
        $request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
            [wpTextbox1 => $textToPost,
             wpSummary => $summaryEntry,
             wpSave => "Save page",
             wpMinoredit => "on",
             wpEditToken => $editToken,
             wpStarttime => $startTime,
             wpEdittime => $editTime];
        # Optional: wpWatchthis
    }
    else
    {
        $request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
            [wpTextbox1 => $textToPost,
             wpSummary => $summaryEntry,
             wpSave => "Save page",
             wpEditToken => $editToken,
             wpStarttime => $startTime,
             wpEdittime => $editTime];
        # Optional: wpWatchthis, wpMinoredit
    }

    # ---
    ## If posts are failing, you can uncomment the below to see what
    ## HTTP request is being made.
    # myLog($request->as_string());
    # print $request->as_string();

    # Assume the worst; a prompt success via retry("success") will
    # bring the delay back down.
    $::speedLimit = 60 * 10;

    # print $::ua->request($request)->as_string;
    # ---

    myLog("POSTing...");
    print "POSTing...";
    # Pass request to the user agent and get a response back
    $response = $::ua->request($request);
    myLog("POSTed.\n");
    print "POSTed.\n";

    if ($response->content =~ m/Please confirm that really want to recreate this article./)
    {
        myLog($response->content."\n");
        die("Deleted article conflict! See log!");
    }

    # Check the outcome of the response
    if (($response->is_success) or ($response->is_redirect))
    {
        # Monitor server responsiveness
        $attemptFinishTime = Time::HiRes::time();
        retry("success", "postPage",
              sprintf("%.3f", $attemptFinishTime - $attemptStartTime));

        # A successful save normally answers with a 302 back to the page.
        $expected = "302 Moved Temporarily";
        $actual = $response->status_line;
        if (($expected ne $actual) and ($actual ne "200 OK"))
        {
            myLog("postPage(${pageName}, $editTime)#1 - expected =! actual\n");
            myLog($request->as_string());
            myLog("EXPECTED: '${expected}'\n");
            myLog(" ACTUAL: '${actual}'\n");
            die("postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n");
        }

        $expected = "http://en.wikipedia.org/wiki/${pageName}";
        $expected =~ s/\'/%27/g;
        $expected =~ s/\*/%2A/g;
        $expected = urlEncode($expected);
        $actual = $response->headers->header("Location");
        if (($expected ne $actual)
            and !(($actual eq "") and ($response->status_line eq "200 OK")))
        {
            myLog("postPage(${pageName}, $editTime)#2 - expected =! actual\n");
            myLog("EXPECTED: '${expected}'\n");
            myLog(" ACTUAL: '${actual}'\n");
            die("postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n");
        }
        if ($response->content =~ m/<h1 class="firstHeading">Edit conflict/)
        {
            myLog("Edit conflict on '$pageName' at '$editTime'!\n");
            die("Edit conflict on '$pageName' at '$editTime'!\n");
        }
        $::ua->cookie_jar->save();
        return ($response->content);
    }
    else
    {
        $date = `date /t`;
        $date =~ s/\n//g;
        myLog("Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n");
        # 50X HTTP errors mean there is a problem connecting to the wiki server
        if (($response->status_line =~ m/^500/)
            or ($response->status_line =~ m/^502/)
            or ($response->status_line =~ m/^503/))
        {
            print "Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n";
            return(retry("postPage", @_));
        }
        else
        {
            # Unhandled HTTP response
            die("Bad response to POST to $pageName at $date.\n".$response->status_line."\n");
        }
    }
}

# Die if $_[0] contains characters outside the allowed set.
sub urlSafe
{
    # This function is no longer called because the LWP::UserAgent and
    # HTTP::Request libraries handle character escaping.
    my ($text, $textCopy);

    $text = $_[0];
    $textCopy = $text;
    # & may not be included in this list!
    $textCopy =~ s%[\p{IsWord}\w\-,\(\):\/\'\.\;\!]*%%g;
    unless ($textCopy eq "")
    {
        myLog("urlSafe(): Bad character in ${text}: '${textCopy}'\n");
        die("urlSafe(): Bad character in ${text}: '${textCopy}'\n");
    }
}

# perl whobot.pl MOVE_CONTENTS_INCL_CATS Category:From_here Category:To_here CFDListingDay
# Move a category: transfer the category page text, then recategorize
# every member article (and, optionally, every subcategory).
sub moveCategoryContents
{
    my (@articles, $categoryFrom, $categoryTo, $article, $status, @subcats,
        $includeCategories, $subcat, @junk, $sortkey, $includeSortkey,
        $cfdlisting);

    # -- INITIALIZATION --
    $categoryFrom = $_[0];
    $categoryTo = $_[1];
    $includeCategories = $_[2];
    $includeSortkey = $_[3];
    $cfdlisting = $_[4];

    # Accept "[[:Category:Foo bar]]" style arguments.
    if ($categoryFrom =~ m/^\[\[:(Category:.*?)\]\]/)
    {
        $categoryFrom =~ s/^\[\[:(Category:.*?)\]\]/$1/;
        $categoryFrom =~ s/\s+/_/g;
    }
    if ($categoryTo =~ m/^\[\[:(Category:.*?)\]\]/)
    {
        $categoryTo =~ s/^\[\[:(Category:.*?)\]\]/$1/;
        $categoryTo =~ s/\s+/_/g;
    }
    $categoryFrom =~ s/^\[\[://;
    $categoryTo =~ s/^\[\[://;
    $categoryFrom =~ s/\]\]$//;
    $categoryTo =~ s/\]\]$//;

    unless (($categoryFrom =~ m/^Category:/) and ($categoryTo =~ m/^Category:/))
    {
        myLog("moveCategoryContents(): Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n");
        die("moveCategoryContents(): Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n");
    }

    transferText($categoryFrom, $categoryTo, $cfdlisting);

    # Subcategory transfer
    if ($includeCategories eq "yes")
    {
        @subcats = getSubcategories($categoryFrom);
        foreach $subcat (@subcats)
        {
            if ($subcat =~ m/^\s*$/)
            {
                next;
            }
            $subcat = urlDecode($subcat);
            print "changeCategory($subcat, $categoryFrom, $categoryTo) c\n";
            myLog("changeCategory($subcat, $categoryFrom, $categoryTo) c\n");
            changeCategory($subcat, $categoryFrom, $categoryTo, $cfdlisting);
            limit();
        }
    }

    # Article transfer
    @articles = getCategoryArticles($categoryFrom);
    foreach $article (reverse(@articles))
    # foreach $article (@articles)
    {
        #die "article name is $article";
        if ($article =~ m/^\s*$/)
        {
            next;
        }
        $article = urlDecode($article);
        print "changeCategory($article, $categoryFrom, $categoryTo) a\n";
        myLog("changeCategory($article, $categoryFrom, $categoryTo) a\n");
        changeCategory($article, $categoryFrom, $categoryTo, $cfdlisting);
        limit();
    }
}

# perl whobot.pl DEPOPULATE_CAT Category:To_be_depopulated
# Remove every member (subcategories first, then articles) from a
# category.  With $doSpecial set, skips the subcategory pass and
# feeds $doSpecial through to getCategoryArticles/removeXFromCat.
sub depopulateCat    #($category);
{
    my (@articles, $category, $article, $status, @subcats, $subcat, @junk,
        $doSpecial);

    $category = $_[0];
    $doSpecial = $_[1];

    if ($category =~ m/^\[\[:(Category:.*?)\]\]/)
    {
        $category =~ s/^\[\[:(Category:.*?)\]\]/$1/;
        $category =~ s/\s+/_/g;
    }
    if (!$doSpecial)
    {
        unless ($category =~ m/^Category:/)
        {
            myLog("depopulateCat(): Are you sure '$category' is a category?\n");
            die("depopulateCat(): Are you sure '$category' is a category?\n");
        }

        # Remove all subcategories
        @subcats = getSubcategories($category);
        foreach $subcat (@subcats)
        {
            $subcat = urlDecode($subcat);
            print "removeXFromCat($subcat, $category) c\n";
            myLog("removeXFromCat($subcat, $category) c\n");
            ($status, @junk) = removeXFromCat($subcat, $category);
            unless ($status == 0)
            {
                myLog("Status: $status\n");
                print "Status: $status\n";
            }
        }
    }

    # Remove all articles
    @articles = getCategoryArticles($category, $doSpecial);
    #foreach $article (reverse(@articles))
    foreach $article (@articles)
    {
        $article = urlDecode($article);
        print "removeXFromCat($article, $category, $doSpecial) a\n";
        myLog("removeXFromCat($article, $category, $doSpecial) a\n");
        ($status, @junk) = removeXFromCat($article, $category, $doSpecial);
        unless ($status == 0)
        {
            myLog("Status: $status\n");
            print "Status: $status\n";
        }
    }
}

# perl whobot.pl REMOVE_X_FROM_CAT Article_name Category:Where_the_article_is CFDListingDay
# Remove one page from a category.  Returns (0, $sortkey) on success,
# 1 if the page was not in the category (a null edit is made instead),
# 2 if the page is a redirect.
sub removeXFromCat
{
    my ($text, $articleName, $category, $editTime, $startTime, $comment,
        $catTmp, $sortkey, @junk, $token, $categoryUnd, $categoryHuman,
        $cfdlisting, $doSpecial);

    $articleName = $_[0];
    $category = $_[1];
    $cfdlisting = $_[2];
    $doSpecial = $_[3];
    if (!$doSpecial)
    {
        $doSpecial = $cfdlisting;
    }
    #urlSafe($articleName);
    #urlSafe($category);

    if (!$doSpecial)
    {
        unless ($category =~ m/^Category:\w+/)
        {
            # BUG FIX: these messages named the wrong sub ("addToCat").
            myLog("removeXFromCat(): Bad format on category.\n");
            die("removeXFromCat(): Bad format on category.\n");
        }
    }

    ($text, $editTime, $startTime, $token) = getPage($articleName);
    $comment = "Removed ${category} per [[Wikipedia:Categories_for_deletion/Log/${cfdlisting}|WP:CFD]]";
    #$comment = "test edits ${cfdlisting}";

    # Convert underscore to spaces; this is human-readable.
    $category =~ s/_/ /g;
    $categoryHuman = $category;
    # Build a match pattern tolerating whitespace after "Category:",
    # with regex metacharacters in the name escaped.
    $category =~ s/^Category://;
    # $category = "Category:\\s*\\Q".$category."\\E"; # THIS DOES NOT WORK
    $category = "Category:\\s*".$category;
    $category =~ s%\(%\\(%g;
    $category =~ s%\)%\\)%g;
    $category =~ s%\'%\\\'%g;
    $categoryUnd = $category;
    $categoryUnd =~ s/ /_/g;

    unless (($text =~ m/\[\[\s*${category}\s*\]\]/is)
            or ($text =~ m/\[\[\s*${category}\s*\|.*?\]\]/is)
            or ($text =~ m/\[\[\s*${categoryUnd}\s*\]\]/is)
            or ($text =~ m/\[\[\s*${categoryUnd}\s*\|.*?\]\]/is))
    {
        print "removeXFromCat(): $articleName is not in '$category'.\n";
        myLog("removeXFromCat(): $articleName is not in '$category'.\n");
        ### TEMPORARY ###
        ### These lines make category remove attempts trigger null
        ### edits.  This is useful if you have changed the category on
        ### a template, but due to a bug this does not actually move
        ### member articles until they are edited.
        ($text, @junk) = fixCategoryInterwiki($text);
        postPage($articleName, $editTime, $startTime, $token, $text,
                 "Mostly null edit to actually remove from ${categoryHuman}",
                 "yes");
        limit();
        ### TEMPORARY ###
        return(1);
    }
    if ($text =~ m/^\s*\#REDIRECT/is)
    {
        # BUG FIX: these messages named the wrong sub ("addToCat").
        print "removeXFromCat(): $articleName is a redirect!\n";
        myLog("removeXFromCat(): $articleName is a redirect!\n");
        return(2);
    }

    # Capture the sort key, if any, so the caller can reuse it.
    $text =~ m/\[\[\s*${category}\s*\|\s*(.*?)\]\]/is;
    $sortkey = $1;
    if ($sortkey eq "")
    {
        # BUG FIX: the original matched the underscored form here but
        # never assigned the capture to $sortkey.
        if ($text =~ m/\[\[\s*${categoryUnd}\s*\|\s*(.*?)\]\]/is)
        {
            $sortkey = $1;
        }
    }

    # Remove the page from the category and any trailing newline.
    $text =~ s/\[\[\s*${category}\s*\|?(.*?)\]\]\n?//isg;
    $text =~ s/\[\[\s*${categoryUnd}\s*\|?(.*?)\]\]\n?//isg;

    ($text, @junk) = fixCategoryInterwiki($text);
    postPage($articleName, $editTime, $startTime, $token, $text, $comment);
    return(0, $sortkey);
}

# perl whobot.pl PRINT_WIKITEXT Article_you_want_to_get
## Warning: Saves to a file in the current directory with the same name
## as the article, plus another file with the .html extention.
sub printWikitext
{
    my ($editTime, $startTime, $text, $target, $token);

    $target = $_[0];
    $target =~ s/^\[\[://;
    $target =~ s/\]\]$//;

    ($text, $editTime, $startTime, $token) = getPage($target);

    # Save the wikicode version to a file.
    # BUG FIX: the original did not check open() for failure.
    open(WIKITEXT, ">./${target}")
        or die "printWikitext(): Could not write './${target}'!";
    print WIKITEXT $text;
    close(WIKITEXT);

    # Save the HTML version to a file.
    print `wget http://en.wikipedia.org/wiki/${target} -O ./${target}.html`;
}

# Get a list of the names of articles in a given category.
# Get a list of the names of articles in a given category, scraped from
# the category page's HTML.  With $doSpecial set, lists pages via
# Special:Whatlinkshere for $target instead.  Returns URL-encoded page
# names; dies if the login cookie is no longer honored.
sub getCategoryArticles
{
    my ($target, $request, $response, $reply, $articles, $article,
        @articles, @articles1, @articles2, $attemptStartTime,
        $attemptFinishTime, $doSpecial);

    $target = $_[0];
    $doSpecial = $_[1];
    #urlSafe ($target);

    if (!$doSpecial)
    {
        unless ($target =~ m/^Category:/)
        {
            myLog("getCategoryArticles(): Are you sure '$target' is a category?\n");
            die("getCategoryArticles(): Are you sure '$target' is a category?\n");
        }
    }

    # Monitor wiki server responsiveness
    $attemptStartTime = Time::HiRes::time();

    # Create a request-object
    if (!$doSpecial)
    {
        print "GET http://en.wikipedia.org/wiki/${target}\n";
        myLog("GET http://en.wikipedia.org/wiki/${target}\n");
        $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}");
    }
    else
    {
        print "GET http://en.wikipedia.org/w/wiki.phtml?title=Special:Whatlinkshere&target=${target}&limit=200&offset=0\n";
        myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=Special:Whatlinkshere&target=${target}&limit=200&offset=0\n");
        # BUG FIX: the original URL string ended with "\n", which does
        # not belong in a request URL.
        $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=Special:Whatlinkshere&target=${target}&limit=200&offset=0");
    }
    $response = $::ua->request($request);

    if ($response->is_success)
    {
        # Monitor wiki server responsiveness
        $attemptFinishTime = Time::HiRes::time();
        retry("success", "getCategoryArticles",
              sprintf("%.3f", $attemptFinishTime - $attemptStartTime));
        $reply = $response->content;

        # This detects whether or not we're logged in.
        unless ($reply =~ m%<a href="/wiki/User_talk:Whobot">My talk</a>%)
        {
            # We've lost our identity.
            myLog("Wiki server is not recognizing me (2).\n---\n${reply}\n---\n");
            die("Wiki server is not recognizing me (2).\n");
        }

        # Trim the page down to the article-listing section, then pull
        # out the linked page names.
        $articles = $reply;
        $articles =~ s%^.*?<h2>Articles in category.*?</h2>%%s;
        $articles =~ s%<div class="printfooter">.*?$%%s;
        @articles1 = $articles =~ m%<li><a href="/wiki/(.*?)" title=%sg;
        @articles2 = $articles =~ m%px 0;"><a href="/wiki/(.*?)" title=%sg;
        # BUG FIX: was "my @articles = ...", which masked the earlier
        # declaration of @articles.
        @articles = (@articles1, @articles2);
        $::ua->cookie_jar->save();
        return @articles;
    }
    else
    {
        myLog("getCategoryArticles($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n");
        # 50X HTTP errors mean there is a problem connecting to the wiki server
        if (($response->status_line =~ m/^500/)
            or ($response->status_line =~ m/^502/)
            or ($response->status_line =~ m/^503/))
        {
            print "getCategoryArticles($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n";
            return(retry("getCategoryArticles", @_));
        }
        else
        {
            # Unhandled HTTP response
            die("getCategoryArticles($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n");
        }
    }
}

# Get a list of the names of subcategories of a given category.
# Get a list of the names of subcategories of a given category, scraped
# from the category page's HTML.  Returns an empty list when the page
# has no "Subcategories" section; dies if the login cookie is no longer
# honored.
sub getSubcategories
{
    my ($target, $request, $response, $reply, $subcats, $subcat, @subcats,
        $attemptStartTime, $attemptFinishTime);

    $target = $_[0];
    #urlSafe ($target);

    unless ($target =~ m/^Category:/)
    {
        myLog("getSubcategories(): Are you sure '$target' is a category?\n");
        die("getSubcategories(): Are you sure '$target' is a category?\n");
    }

    # Monitor wiki server responsiveness
    $attemptStartTime = Time::HiRes::time();

    # Create a request-object
    print "GET http://en.wikipedia.org/wiki/${target}\n";
    myLog("GET http://en.wikipedia.org/wiki/${target}\n");
    $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}");
    $response = $::ua->request($request);

    if ($response->is_success)
    {
        # Monitor wiki server responsiveness
        $attemptFinishTime = Time::HiRes::time();
        retry("success", "getSubcategories",
              sprintf("%.3f", $attemptFinishTime - $attemptStartTime));
        $reply = $response->content;

        # This detects whether or not we're logged in.
        unless ($reply =~ m%<a href="/wiki/User_talk:Whobot">My talk</a>%)
        {
            # We've lost our identity.
            myLog("Wikipedia is not recognizing me (3).\n---\n${reply}\n---\n");
            die("Wikipedia is not recognizing me (3).\n");
        }

        # Keep only the HTML between the Subcategories heading and the
        # Articles heading.
        $subcats = $reply;
        if ($subcats =~ m%^.*?<h2>Subcategories</h2>(.*?)<h2>Articles in category.*?</h2>.*?$%s)
        {
            $subcats =~ s%^.*?<h2>Subcategories</h2>(.*?)<h2>Articles in category.*?</h2>.*?$%$1%s;
        }
        else
        {
            # No subcategory section on the page.
            return ();
        }
        @subcats = $subcats =~ m%<li><a href="/wiki/(.*?)" title=%sg;
        $::ua->cookie_jar->save();
        return @subcats;
    }
    else
    {
        myLog("getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n");
        # 50X HTTP errors mean there is a problem connecting to the wiki server
        if (($response->status_line =~ m/^500/)
            or ($response->status_line =~ m/^502/)
            or ($response->status_line =~ m/^503/))
        {
            print "getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n";
            # BUG FIX: the original retried under the name
            # "getCategoryArticles"; retry as ourselves.
            return(retry("getSubcategories", @_));
        }
        else
        {
            # Unhandled HTTP response
            die("getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n");
        }
    }
}

# perl whobot.pl ADD_CFD_TAG Category:Category_name
# Prepend a {{cfd}} tag to a category page, unless it is already
# tagged or is a redirect.
sub addCFDTag
{
    my ($text, $category, $editTime, $startTime, $comment, $catTmp,
        @junk, $token);

    $category = $_[0];
    #urlSafe($category);

    unless ($category =~ m/^Category:\w+/)
    {
        myLog("addCFDTag(): Bad format on category.\n");
        die("addCFDTag(): Bad format on category.\n");
    }

    # The category page may legitimately have no text yet.
    $::nullOK = "yes";
    ($text, $editTime, $startTime, $token) = getPage($category);
    $::nullOK = "no";
    $comment = "Nominated for deletion or renaming";

    if (($text =~ m/\{\{cfd\}\}/is)
        or ($text =~ m/\{\{cfm/is)
        or ($text =~ m/\{\{cfr/is)
        or ($text =~ m/\{\{cfr-speedy/is))
    {
        print "addCFDTag(): $category is already tagged.\n";
        myLog("addCFDTag(): $category is already tagged.\n");
        return();
    }
    if ($text =~ m/^\s*\#REDIRECT/is)
    {
        print "addCFDTag(): $category is a redirect!\n";
        myLog("addCFDTag(): $category is a redirect!\n");
        return();
    }

    # Add the CFD tag to the beginning of the page.
    $text = "{{cfd}}\n".$text;
    ($text, @junk) = fixCategoryInterwiki($text);
    postPage($category, $editTime, $startTime, $token, $text, $comment);
}

# perl whobot.pl ADD_CFDU_TAG Category:Category_name
# Prepend an arbitrary tag ("{{$stuff}}") to a category page, unless
# it already carries a CFD-family tag or is a redirect.
sub addCFDUTag
{
    my ($text, $category, $editTime, $startTime, $comment, $catTmp,
        @junk, $token, $stuff);

    $category = $_[0];
    $stuff = $_[1];
    urlSafe($category);
    #urlSafe($stuff);

    unless ($category =~ m/^Category:\w+/)
    {
        myLog("addCFDUTag(): Bad format on category.\n");
        die("addCFDUTag(): Bad format on category.\n");
    }

    $::nullOK = "yes";
    ($text, $editTime, $startTime, $token) = getPage($category);
    $::nullOK = "no";
    $comment = "Nominated for deletion or renaming";
    #$comment = "Test edit";

    if (($text =~ m/\{\{cfd\}\}/is)
        or ($text =~ m/\{\{cfm/is)
        or ($text =~ m/\{\{cfr/is)
        or ($text =~ m/\{\{cfr-speedy/is))
    {
        print "addCFDUTag(): $category is already tagged.\n";
        myLog("addCFDUTag(): $category is already tagged.\n");
        return();
    }
    if ($text =~ m/^\s*\#REDIRECT/is)
    {
        print "addCFDUTag(): $category is a redirect!\n";
        myLog("addCFDUTag(): $category is a redirect!\n");
        return();
    }

    # Add the CFDU tag to the beginning of the page.
    $text = "{{".$stuff."}}\n".$text;
    # $text = $stuff;
    ($text, @junk) = fixCategoryInterwiki($text);
    postPage($category, $editTime, $startTime, $token, $text, $comment);
}

# perl whobot.pl REMOVE_CFD_TAG Category:Category_name
# Strip all CFD-family tags ({{cfd}}, {{cfr...}}, {{cfm...}},
# {{cfdu...}}, {{cfru...}}, {{cfr-speedy...}}) from a category page.
sub removeCFDTag
{
    my ($text, $category, $editTime, $startTime, $comment, $catTmp,
        @junk, $token);

    $category = $_[0];
    #urlSafe($category);

    unless ($category =~ m/^Category:\w+/)
    {
        myLog("removeCFDTag(): Bad format on category.\n");
        die("removeCFDTag(): Bad format on category.\n");
    }

    $::nullOK = "yes";
    ($text, $editTime, $startTime, $token) = getPage($category);
    $::nullOK = "no";
    $comment = "De-listed from [[Wikipedia:Categories for deletion]]";

    unless (($text =~ m/\{\{cfd\}\}/is)
            or ($text =~ m/\{\{cfm/is)
            or ($text =~ m/\{\{cfr/is)
            or ($text =~ m/\{\{cfr-speedy/is))
    {
        print "removeCFDTag(): $category is not tagged.\n";
        myLog("removeCFDTag(): $category is not tagged.\n");
        return();
    }
    if ($text =~ m/^\s*\#REDIRECT/is)
    {
        print "removeCFDTag(): $category is a redirect!\n";
        myLog("removeCFDTag(): $category is a redirect!\n");
        return();
    }

    # Remove the CFD tag.
    $text =~ s/{{cfd}}\s*//gis;
    $text =~ s/\{\{cfr.*?\}\}\s*//is;
    $text =~ s/\{\{cfm.*?\}\}\s*//is;
    $text =~ s/\{\{cfdu.*?\}\}\s*//is;
    $text =~ s/\{\{cfru.*?\}\}\s*//is;
    $text =~ s/\{\{cfr-speedy.*?\}\}\s*//is;
    ($text, @junk) = fixCategoryInterwiki($text);
    postPage($category, $editTime, $startTime, $token, $text, $comment);
}

# perl whobot.pl REMOVE_CFDU_TAG Category:Containing subs to remove CFDU
# Strip CFD-family tags from every subcategory of the given category.
sub removeCFDUTag    #($category);
{
    my (@articles, $category, $article, $status, @subcats, $subcat,
        @junk, $text, $editTime, $startTime, $comment, $catTmp, $token);

    $category = $_[0];
    if ($category =~ m/^\[\[:(Category:.*?)\]\]/)
    {
        $category =~ s/^\[\[:(Category:.*?)\]\]/$1/;
        $category =~ s/\s+/_/g;
    }
    unless ($category =~ m/^Category:/)
    {
        myLog("removeCFDUtag(): Are you sure '$category' is a category?\n");
        die("removeCFDUtag(): Are you sure '$category' is a category?\n");
    }

    # Remove the tag from all subcategories.
    @subcats = getSubcategories($category);
    foreach $subcat (@subcats)
    {
        $subcat = urlDecode($subcat);
        print "removeCFDTag($subcat, $category) c\n";
        myLog("removeCFDTag($subcat, $category) c\n");
        # NOTE(review): the extra arguments below are all undef at this
        # point and are ignored by removeCFDTag -- preserved as found.
        ($status, @junk) = removeCFDTag($subcat, $category, $editTime,
                                        $startTime, $token, $text, $comment);
        unless ($status == 0)
        {
            myLog("Status: $status\n");
            print "Status: $status\n";
        }
    }
}

# perl whobot.pl TRANSFER_TEXT Category:From_here Category:To_there
## Note that this code is called automatically whenever moving a
## category, so you probably don't need to call it yourself from the
## command line.
# Merge the text of one category page into another as part of a category
# move, tagging the source with a "moved" notice.  $cfdlisting is either
# a CFD log day (used in the edit summary link) or "speedy".
sub transferText
{
    my ($source, $destination, $sourceText, $destinationText, $sourceTime,
        $destinationTime, @sourceCategories, @destinationCategories,
        $category, $lastCategory, $sourceTextOrig, $destinationTextOrig,
        $comment, $sourceHuman, $destinationHuman, $noMergeFlag,
        $sourceToken, $destinationToken, $junk, $sourceStartTime,
        $destinationStartTime, $cfdlisting, $summaryText);

    $source = $_[0];
    $destination = $_[1];
    $cfdlisting = $_[2];

    if ($cfdlisting eq "speedy")
    {
        $comment = "Cleanup per [[Wikipedia:Category_renaming#Speedy_renaming_procedure|CFD Speedy rename]] (moving $source to $destination)";
    }
    else
    {
        $comment = "Cleanup per [[Wikipedia:Categories_for_deletion/Log/${cfdlisting}|WP:CFD]] (moving $source to $destination)";
    }

    # Make human-readable versions of these variables for use in edit summaries
    $sourceHuman = $source;
    $sourceHuman =~ s/_/ /g;
    $destinationHuman = $destination;
    $destinationHuman =~ s/_/ /g;

    unless (($source =~ m/^Category:/) and ($destination =~ m/^Category:/))
    {
        myLog ("transferText(): Are you sure these are categories? ".$source."/".$destination."\n");
        die ("transferText(): Are you sure these are categories? ".$source."/".$destination."\n");
    }

    ($sourceText, $sourceTime, $sourceStartTime, $sourceToken) = getPage($source);

    # Avoid double runs!
    # This text must be the same as that which is implanted below, and
    # it should be an HTML comment, so that it's invisible.
    if ($sourceText =~ m/<\!--WHOBOT-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->/)
    {
        return;
    }

    # Strip all CFD-family tags from the source before comparing/merging.
    $sourceTextOrig = $sourceText;
    $sourceText =~ s/{{cfd}}//;
    $sourceText =~ s/\{\{cfr.*?\}\}\s*//is;
    $sourceText =~ s/\{\{cfm.*?\}\}\s*//is;
    $sourceText =~ s/\{\{cfdu.*?\}\}\s*//is;
    $sourceText =~ s/\{\{cfru.*?\}\}\s*//is;
    $sourceText =~ s/\{\{cfr-speedy.*?\}\}\s*//is;
    $sourceText =~ s/^\s+//s;
    $sourceText =~ s/\s+$//s;

    # The destination page may not exist yet.
    $::nullOK = "yes";
    ($destinationText, $destinationTime, $destinationStartTime, $destinationToken) = getPage($destination);
    $::nullOK = "no";

    $destinationTextOrig = $destinationText;
    $destinationText =~ s/{{cfd}}//;
    $destinationText =~ s/\{\{cfm.*?\}\}\s*//is;
    $destinationText =~ s/\{\{cfr.*?\}\}\s*//is;
    $destinationText =~ s/\{\{cfdu.*?\}\}\s*//is;
    $destinationText =~ s/\{\{cfru.*?\}\}\s*//is;
    $destinationText =~ s/\{\{cfr-speedy.*?\}\}\s*//is;
    $destinationText =~ s/^\s+//s;
    $destinationText =~ s/\s+$//s;

    # To help keep things straight when we're in a loop.
    print STDOUT "\n----\n";

    if ($cfdlisting eq "speedy")
    {
        $summaryText = "[[Wikipedia:Category_renaming#Speedy_renaming_procedure|CFD Speedy rename]]";
    }
    else
    {
        $summaryText = "[[Wikipedia:Categories_for_deletion/Log/${cfdlisting}|WP:CFD]]";
    }

    if (($sourceText eq "") and ($destinationText ne ""))
    {
        # Nothing to merge; just leave a "moved" notice on the source.
        # The HTML comment must be the same as that above.
        $sourceText = "{{cfd}}\nThis category has been moved to [[:$destinationHuman]]. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on $summaryText.\n<!--WHOBOT-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->\n";
    }
    elsif (($sourceText ne "") and ($destinationText eq ""))
    {
        # Destination is empty: move the source text wholesale.
        $destinationText = $sourceText;
        # The HTML comment must be the same as that above.
        $sourceText = "{{cfd}}\nThis category has been moved to [[:$destinationHuman]]. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on $summaryText.\n<!--WHOBOT-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->\n";
    }
    elsif (($sourceText ne "") and ($destinationText ne ""))
    {
        # Both pages have text: concatenate the bodies and merge the two
        # category memberships, dropping duplicates.
        @sourceCategories = $sourceText =~ m/\[\[\s*(Category:.*?)\s*\]\]/gs;
        @destinationCategories = $destinationText =~ m/\[\[\s*(Category:.*?)\s*\]\]/gs;
        $sourceText =~ s/\[\[\s*(Category:.*?)\s*\]\]\s*//gs;
        $sourceText =~ s/^\s+//s;
        $sourceText =~ s/\s+$//s;
        $destinationText =~ s/\[\[\s*(Category:.*?)\s*\]\]\s*//gs;
        $destinationText =~ s/^\s+//s;
        $destinationText =~ s/\s+$//s;
        $destinationText = $sourceText."\n".$destinationText;
        $destinationText =~ s/^\s+//s;
        $destinationText =~ s/\s+$//s;
        # Sort so duplicates are adjacent, then skip repeats.
        foreach $category (sort (@sourceCategories, @destinationCategories))
        {
            if ($category eq $lastCategory)
            {
                next;
            }
            $destinationText .= "\n[[${category}]]";
            $lastCategory = $category;
        }
        # The HTML comment must be the same as that above.
        $sourceText = "{{cfd}}\nThis category has been moved to [[:$destinationHuman]]. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on [[Wikipedia:Categories_for_deletion/Log/${cfdlisting}|WP:CFD]].\n<!--WHOBOT-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->\n";
    }

    $sourceText =~ s/\n\s+\n/\n\n/sg;
    $destinationText =~ s/\n\s+\n/\n\n/sg;

    # You may need to futz with this, depending on the templates
    # currently in use.
    # BUGFIX: the cfru/cfdu alternatives used to be written m/\{\{cfru|/
    # and m/\{\{cfdu|/; the trailing '|' created an empty alternation
    # branch that matches ANY string, so this safety check could never
    # fire.
    unless (($sourceTextOrig =~ m/\{\{cfd/)
            or ($sourceTextOrig =~ m/\{\{cfr/)
            or ($sourceTextOrig =~ m/\{\{cfru/)
            or ($sourceTextOrig =~ m/\{\{cfdu/)
            or ($sourceTextOrig =~ m/\{\{cfr-speedy/)
            or ($sourceTextOrig =~ m/\{\{cfm/))
    {
        print STDOUT "FATAL ERROR: $source was not tagged {{cfd}}, {{cfm}}, {{cfr}}, {{cfdu}}, {{cfru}}, or {{cfr-speedy}}!\n";
        myLog("FATAL ERROR: $source was not tagged {{cfd}}, {{cfr}}, {{cfm}}, {{cfdu}}, {{cfru}}, or {{cfr-speedy}}!\n");
        die("FATAL ERROR: $source was not tagged {{cfd}}, {{cfr}}, {{cfm}}, {{cfdu}}, {{cfru}}, or {{cfr-speedy}}!\n");
    }

    if (($sourceText eq $sourceTextOrig) and ($destinationText eq $destinationTextOrig))
    {
        print STDOUT "No changes for $source and $destination.\n";
        return();
    }
    if ($destinationTextOrig =~ m/^\s*$/)
    {
        print "No merging was required from $source into $destination.\n";
        $noMergeFlag = "yes";
    }
    unless ($noMergeFlag eq "yes")
    {
        # A real merge happened; ask a human to double-check the result.
        $destinationText .= "{{pearle-manual-cleanup}}\n";
    }

    # Make sure category and interwiki links conform to style
    # guidelines.
    ($destinationText, $junk) = fixCategoryInterwiki($destinationText);

    # If we did have to change things around, print the changes and post them to the wiki.
    if ($sourceText ne $sourceTextOrig)
    {
        unless ($noMergeFlag eq "yes")
        {
            print STDOUT "SOURCE FROM:\n%%%${sourceTextOrig}%%%\nSOURCE TO:\n%%%${sourceText}%%%\n";
        }
        postPage ($source, $sourceTime, $sourceStartTime, $sourceToken, $sourceText, $comment);
    }
    if ($destinationText ne $destinationTextOrig)
    {
        unless ($noMergeFlag eq "yes")
        {
            print STDOUT "DESTINATION FROM:\n%%%${destinationTextOrig}%%%\nDESTINATION TO:\n%%%${destinationText}%%%\n";
        }
        postPage ($destination, $destinationTime, $destinationStartTime, $destinationToken, $destinationText, $comment);
    }
}

# Translate from HTTP URL encoding to the native character set.
sub urlDecode
{
    my ($input);
    $input = $_[0];
    # BUGFIX: the character class used to be [a-f|A-F|0-9], which also
    # treated a literal '|' as a "hex digit" and mis-decoded sequences
    # such as "%|0".  Only true hex-digit pairs are decoded now.
    $input =~ s/\%([0-9A-Fa-f][0-9A-Fa-f])/chr(hex($1))/eg;
    return ($input);
}

# Translate from the native character set to HTTP URL encoding.
sub urlEncode
{
    my ($char, $input, $output);
    $input = $_[0];
    $output = "";
    foreach $char (split("",$input))
    {
#       if ($char =~ m/[a-z|A-Z|0-9|\-_\.\!\~\*\'\(\)]/)
        # The below exclusions should conform to Wikipedia practice
        # (possibly non-standard)
        # BUGFIX: the class previously contained literal '|' characters
        # ([a-z|A-Z|...]), which let '|' pass through unencoded.
        if ($char =~ m/[a-zA-Z0-9\-_\.\*\/:]/)
        {
            $output .= $char;
        }
        elsif ($char eq " ")
        {
            $output .= "+";
        }
        else
        {
            # %HH where HH is the hex code of $char.
            # BUGFIX: %x -> %02x; characters below 0x10 used to produce
            # invalid one-digit escapes such as "%1".
            $output .= uc(sprintf("%%%02x", ord($char)));
        }
    }
    return ($output);
}

# perl whobot.pl CHANGE_CATEGORY Article_name Category:From Category:To CFDlistingDay
#
# Move one article from $categoryFrom to $categoryTo, preserving any sort
# key, and post the result.
sub changeCategory
{
    my ($articleName, $categoryFrom, $categoryTo, $editTime, $startTime,
        $text, $comment, $catTmp, $sortkey, $token, $junk,
        $categoryFromUnd, $cfdlisting);

    $articleName = $_[0];
    $categoryFrom = $_[1];
    $categoryTo = $_[2];
    $cfdlisting = $_[3];
    #urlSafe($articleName);
    #urlSafe($categoryFrom);
    #urlSafe($categoryTo);

    # BUGFIX: these error messages used to say "moveCategoryContents()",
    # misreporting which function failed.
    unless (($categoryFrom =~ m/^Category:/) and ($categoryTo =~ m/^Category:/))
    {
        myLog ("changeCategory(): Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n");
        die ("changeCategory(): Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n");
    }
    #die ($articleName ."does not exist");
    if ($articleName =~ m/^\s*$/)
    {
        myLog("changeCategory(): Null target.");
        die("changeCategory(): Null target.");
    }

    ($text, $editTime, $startTime, $token) = getPage($articleName);

    if ($cfdlisting eq "speedy")
    {
        $comment = "Recat per [[Wikipedia:Category_renaming#Speedy_renaming_procedure|CFD Speedy rename]] ${categoryFrom} to ${categoryTo}";
    }
    else
    {
        $comment = "Recat per [[Wikipedia:Categories_for_deletion/Log/${cfdlisting}|WP:CFD]] ${categoryFrom} to ${categoryTo}";
    }

    # --- Start the removing part ---

    # Convert underscore to spaces; this is human-readable.
    $categoryFrom =~ s/_/ /g;
    # Insert possible whitespace
    $categoryFrom =~ s/^Category://;
    $categoryFrom = "Category:\\s*".$categoryFrom;
    # Escape special characters
    $categoryFrom =~ s%\(%\\(%g;
    $categoryFrom =~ s%\)%\\)%g;
    $categoryFrom =~ s%\'%\\\'%g;
    $categoryFromUnd = $categoryFrom;
    $categoryFromUnd =~ s/ /_/g;

    unless (($text =~ m/\[\[\s*${categoryFrom}\s*\]\]/is)
            or ($text =~ m/\[\[\s*${categoryFrom}\s*\|.*?\]\]/is)
            or ($text =~ m/\[\[\s*${categoryFromUnd}\s*\]\]/is)
            or ($text =~ m/\[\[\s*${categoryFromUnd}\s*\|.*?\]\]/is))
    {
        myLog ("changeCategory.r(): $articleName is not in '$categoryFrom'.\n");
        my ($nullEditFlag);
        # Set this to "yes" if you want mass category change attempts
        # to trigger null edits automatically.  You should check the
        # category later to see if everything worked or not, to see if
        # any templates should be changed.  The below will add a small
        # amount of unnecessary server load to try the null edits if
        # template changes haven't already been made.
        $nullEditFlag = "yes";
        if ($nullEditFlag eq "yes")
        {
            myLog ("changeCategory(): Attempting null edit on $articleName.\n");
            print "changeCategory(): Attempting null edit on $articleName.\n";
            nullEdit($articleName);
            return();
        }
        else
        {
            print "###${text}###\n";
            die ("changeCategory.r(): $articleName is not in '$categoryFrom'.\n");
        }
    }
    if ($text =~ m/^\s*\#REDIRECT/is)
    {
        myLog ("changeCategory.r(): $articleName is a redirect!\n");
        die ("changeCategory.r(): $articleName is a redirect!\n");
    }

    # We're lazy and don't fully parse the document to properly check
    # for escaped category tags, so there may be some unnecssary
    # aborts from the following, but they are rare and easily
    # overridden by manually editing the page in question.
    if ($text =~ m/<nowiki>.*?category.*?<\/nowiki>/is)
    {
        myLog ("changeCategory.r(): $articleName has a dangerous nowiki tag!\n");
        die ("changeCategory.r(): $articleName has a dangerous nowiki tag!\n");
    }

    # Capture any sort key so it can be re-attached to the new category.
    $sortkey = "";
    if ($text =~ m/\[\[\s*${categoryFrom}\s*\|\s*(.*?)\]\]/is)
    {
        $sortkey = $1;
    }
    if ($sortkey eq "")
    {
        # BUGFIX: the underscore variant used to run the match but never
        # assign $1, so sort keys written with underscores in the
        # category name were silently dropped.
        if ($text =~ m/\[\[\s*${categoryFromUnd}\s*\|\s*(.*?)\]\]/is)
        {
            $sortkey = $1;
        }
    }

    # Remove the page from the category and any trailing newline.
    $text =~ s/\[\[\s*${categoryFrom}\s*\|?(.*?)\]\]\n?//isg;
    $text =~ s/\[\[\s*${categoryFromUnd}\s*\|?(.*?)\]\]\n?//isg;

    # --- Start the adding part ---

    # Remove any newlines at the end of the document.
    $text =~ s/\n*$//s;

    $catTmp = $categoryTo;
    # _ and spaces are equivalent and may be intermingled in wikicode.
    $catTmp =~ s/Category:\s*/Category:\\s*/g;
    $catTmp =~ s/_/[_ ]/g;
    $catTmp =~ s%\(%\\\(%g;
    $catTmp =~ s%\)%\\\)%g;
    $catTmp =~ s%\.%\\\.%g;

    if (($text =~ m/(\[\[\s*${catTmp}\s*\|.*?\]\])/is)
        or ($text =~ m/(\[\[\s*${catTmp}\s*\]\])/is))
    {
        myLog ("changeCategory.a(): $articleName is already in '$categoryTo'.\n");
        print "\n1: '${1}'\n";
        print "\ncattmp: '${catTmp}'\n";
        print "changeCategory.a(): $articleName is already in '$categoryTo'.\n";
        ## It's generally OK to merge it in, so don't do this:
        # die "changeCategory.a(): $articleName is already in '$categoryTo'.\n";
        # return();
    }
    elsif ($text =~ m/^\s*\#REDIRECT/is)
    {
        print "changeCategory.a(): $articleName is a redirect!\n";
        myLog ("changeCategory.a(): $articleName is a redirect!\n");
        return();
    }
    else
    {
        # Convert underscore to spaces; this is human-readable.
        $categoryTo =~ s/_/ /g;
        # Add the category on a new line.
        if ($sortkey eq "")
        {
            $text .= "\n[[${categoryTo}]]";
        }
        else
        {
            $text .= "\n[[${categoryTo}|${sortkey}]]";
        }
    }

    # --- Post-processing ---
    ($text, $junk) = fixCategoryInterwiki($text);
    postPage ($articleName, $editTime, $startTime, $token, $text, $comment, "yes");
}

# This function is not yet finished.  Right now it simply compares the
# membership of a given list and a given category.
# Eventually, it is intended to be used to convert lists into
# categories.  This is not yet authorized behavior.
sub listToCat
{
    my ($lists, $cats, $list, $cat, $listText, @junk, @articlesInList,
        @articlesInCat, %articlesInCat, $article, $implement);

    $lists = $_[0];     # semicolon-separated list-page names
    $cats = $_[1];      # semicolon-separated category names
    $implement = $_[2];

    if ($implement ne "yes")
    {
        print "Diffing membership of '$lists' and '$cats'\n";
    }

    # Collect every wikilink target from the list pages.
    foreach $list (split(";", $lists))
    {
        $list =~ s/^\[\[:?//;
        $list =~ s/\]\]$//;
        ($listText, @junk) = getPage($list);
        # BUGFIX: the closing </nowiki> and </pre> tags had been lost
        # (eaten by HTML rendering of this page), so only the opening
        # tags were stripped and the escaped content was still scanned
        # for links.  The full blocks are removed again now.
        $listText =~ s%<nowiki>.*?</nowiki>%%gis;
        $listText =~ s%<pre>.*?</pre>%%gis;
        @articlesInList = (@articlesInList, $listText =~ m%\[\[(.*?)\]\]%sg);
        sleep 1;
    }

    # Collect the members of the categories.
    foreach $cat (split(";", $cats))
    {
        $cat =~ s/^\[\[:?//;
        $cat =~ s/\]\]$//;
        $cat =~ s/^:Category/Category/;
        @articlesInCat = (@articlesInCat, getCategoryArticles($cat));
        sleep 1;
    }

    foreach $article (@articlesInCat)
    {
        $article = urlDecode ($article);
        $articlesInCat{$article} = 1;
        # print "In cat: $article\n";
    }

    # Report membership differences in both directions.
    foreach $article (@articlesInList)
    {
        $article =~ s/\s+/_/gs;
        $article =~ s/\|.*$//;   # drop any display text after the pipe
        if (exists $articlesInCat{$article})
        {
            # print "OK: $article\n";
            delete $articlesInCat{$article};
        }
        else
        {
            print "Only in list(s): $article\n";
        }
    }
    foreach $article (sort(keys(%articlesInCat)))
    {
        print "Only in cat(s): $article\n";
    }
}

# A little paranoia never hurt anyone.
#
# Backslash-escape shell metacharacters in $string, then die if anything
# outside the known-safe character set remains.
sub shellfix
{
    my ($string, $stringTmp);
    $string = $_[0];
    # BUGFIX: the old pattern was s/([\*\?\!\(\)\&\>\<])\"\'/\\$1/g,
    # which escaped a metacharacter only when it was immediately
    # followed by the two characters "' -- i.e. essentially never, and
    # quotes themselves were never escaped.
    $string =~ s/([\*\?\!\(\)\&\>\<\"\'])/\\$1/g;
    $stringTmp = $string;
    # BUGFIX: '\\' added to the allowed set so the backslashes inserted
    # above do not themselves trigger the "unsafe" die.
    $stringTmp =~ s/[\\Å\p{IsWord}[:alpha:][:digit:]\*,:_.\'\"\)\(\?\-\/\&\>\<\!]//g;
    if ($stringTmp ne "")
    {
        die ("\nUnsafe character(s) in '${string}': '$stringTmp'\n");
    }
    return $string;
}

# You will not be able to use this function; it requires a dataset
# processed by scripts which have not been included.  (It's not
# finished, anyway.)
# Scan a pre-processed database dump for category-redirect members and
# print suggested moves.  Read-only as written: $implementActually is
# accepted but never consulted.
sub enforceCategoryRedirects
{
    my ($implementActually, $line, $lineTmp, $articlesToMove, $article,
        $flatResults, $entry, $contents, $catTo, $lineTmp2);

    $implementActually = $_[0];

    $flatResults = `cat data/reverse-category-links-sorted.txt | grep ^Category:Wikipedia_category_redirects`;
    foreach $line (split("\n", $flatResults))
    {
        $line =~ s/^Category:Wikipedia_category_redirects <\- //;
        $lineTmp = shellfix($line);
        $lineTmp2 = $lineTmp;
        $lineTmp2 =~ s/^Category://;
        if ($line =~ m/^Category/)
        {
            # NOTE(review): mixes `cat` (Unix) above with `type`
            # (Windows) here -- one of the two presumably never worked
            # on the deployment host.
            $articlesToMove = `type data/reverse-category-links-sorted.txt | grep ^${lineTmp}`;
            if ($articlesToMove eq "")
            {
                next;
            }
            print "ATM: $articlesToMove\n";
            # Pull the redirect page's wikitext out of the dump to find
            # the {{categoryredirect|...}} target.
            $entry = `egrep \"^\\([0-9]+,14,'$lineTmp2'\" data/entries-categoryredirect.txt `;
            $entry =~ m/^\([0-9]+,14,'$lineTmp2','(.*?)',/;
            $contents = $1;
            $contents =~ m/\{\{categoryredirect\|(.*?)\}\}/;
            $catTo = $1;
            $catTo = ":Category:".$catTo;
            $catTo =~ s/_/ /g;
            $lineTmp = $line;
            $lineTmp =~ s/^Category/:Category/i;
            $lineTmp =~ s/_/ /g;
            foreach $article (split("\n", $articlesToMove))
            {
                print "ARTICLE: $article\n";
                print "LINE: $line\n";
                $article =~ s/^$line <\- //;
                print "* Move [[$article]] from [[$lineTmp]] to [[$catTo]]\n";
            }
        }
    }
}

# A call to this recursive function handles any retries necessary to
# wait out network or server problems.  It's a bit of a hack.
#
# Called either as retry("success", <opName>, <responseTime>) after a
# successful request (resets/adjusts the global pacing state), or as
# retry(<opName>, @args) after a failure (sleeps through an escalating
# backoff ladder tracked in $::retryDelay, then re-invokes <opName>).
sub retry
{
    my ($callType, @args, $i, $normalDelay, $firstRetry, $secondRetry,
        $thirdRetry);

    ($callType, @args) = @_;

    ### ATTENTION ###
    # Increasing the speed of the bot to faster than 1 edit every 10
    # seconds violates English Wikipedia rules as of April, 2005, and
    # will cause your bot to be banned.  So don't change $normalDelay
    # unless you know what you are doing.  Other sites may have
    # similar policies, and you are advised to check before using your
    # bot at the default speed.
    #################

    # HTTP failures are usually an indication of high server load.
    # The retry settings here are designed to give human editors
    # priority use of the server, by allowing it ample recovering time
    # when load is high.

    # Time to wait before retry on failure, in seconds
    $normalDelay = 10;       # Normal interval between edits is 10 seconds
    $firstRetry = 60;        # First delay on fail is 1 minute
    $secondRetry = 60 * 10;  # Second delay on fail is 10 minutes
    $thirdRetry = 60 * 60;   # Third delay on fail is 1 hour

    # SUCCESS CASE
    # e.g. retry ("success", "getPage", "0.23");
    if ($callType eq "success")
    {
        myLog("Response time for ".$args[0]." (sec): ".$args[1]."\n");
        $::retryDelay = $normalDelay;
        # Throttle edit pacing based on how slowly the server answered.
        if ($args[0] eq "postPage")
        {
            # If the response time is greater than 20 seconds...
            if ($args[1] > 20)
            {
                print "Wikipedia is very slow. Increasing minimum wait to 10 min...\n";
                myLog("Wikipedia is very slow. Increasing minimum wait to 10 min...\n");
                $::speedLimit = 60 * 10;
            }
            # If the response time is between 10 and 20 seconds...
            elsif ($args[1] > 10)
            {
                print "Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n";
                myLog("Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n");
                $::speedLimit = 60;
            }
            # If the response time is less than 10 seconds...
            else
            {
                if ($::speedLimit > 10)
                {
                    print "Returning to normal minimum wait time.\n";
                    myLog("Returning to normal minimum wait time.\n");
                    $::speedLimit = 10;
                }
            }
        }
        return();
    }
    # e.g. retry ("getPage", "George_Washington")
    # FAILURE CASES
    # Escalate $::retryDelay one rung per consecutive failure.
    elsif (($::retryDelay == $normalDelay) or ($::retryDelay == 0))
    {
        print "First retry for ".$args[0]."\n";
        myLog("First retry for ".$args[0]."\n");
        $::retryDelay = $firstRetry;
        $::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $firstRetry)
    {
        print "Second retry for ".$args[0]."\n";
        myLog("Second retry for ".$args[0]."\n");
        $::retryDelay = $secondRetry;
        $::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $secondRetry)
    {
        print "Third retry for ".$args[0]."\n";
        myLog("Third retry for ".$args[0]."\n");
        $::retryDelay = $thirdRetry;
        $::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $thirdRetry)
    {
        # Cap at one hour; keep retrying indefinitely at this interval.
        print "Nth retry for ".$args[0]."\n";
        myLog("Nth retry for ".$args[0]."\n");
        $::retryDelay = $thirdRetry;
        $::speedLimit = 60 * 10;
    }
    else
    {
        die ("retry(): Internal error - unknown delay factor '".$::retryDelay."'\n");
    }

    # DEFAULT TO FAILURE CASE HANDLING
    # Count down visibly on STDERR while we wait out the backoff.
    $i = $::retryDelay;
    while ($i >= 0)
    {
        sleep (1);
        print STDERR "Waiting $i seconds for retry...\r";
        $i--;
    }
    print "                                          \r";

    # DO THE ACTUAL RETRY
    if ($callType eq "getPage")
    {
        return(getPage(@args));
    }
    elsif ($callType eq "postPage")
    {
        return(postPage(@args));
    }
    elsif ($callType eq "getCategoryArticles")
    {
        return(getCategoryArticles(@args));
    }
    elsif ($callType eq "getSubcategories")
    {
        return(getSubcategories(@args));
    }
    elsif ($callType eq "getURL")
    {
        return(getURL(@args));
    }
    else
    {
        myLog ("retry(): Unknown callType: $callType\n");
        die ("retry(): Unknown callType: $callType\n");
    }
}

# perl pearle ENFORCE_CFD
## This just compares the contents of Category:Categories_for_deletion
## with WP:CFD and /resolved and /unresolved.  It is broken now due to
## recent changes which list all nominations on subpages.  It also
## does not check above the first 200 members of the category, due to
## recent changes which paginates in 200-page blocks.
# Cross-check the members of Category:Categories_for_deletion against the
# WP:CFD listing pages and report any subcategory not listed there.
sub enforceCFD
{
    my (@subcats, $subcat, $cfd, $editTime, $startTime, $token, $cfdU, $cfdR);

    @subcats = getSubcategories("Category:Categories_for_deletion");
    ($cfd, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion");
    ($cfdU, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion/unresolved");
    ($cfdR, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion/resolved");

    # Flatten newlines/underscores so names compare consistently.
    $cfd =~ s/[\r\n_]/ /g;
    $cfd =~ s/\s+/ /g;
    $cfdU =~ s/[\r\n_]/ /g;
    $cfdU =~ s/\s+/ /g;
    $cfdR =~ s/[\r\n_]/ /g;
    $cfdR =~ s/\s+/ /g;

    foreach $subcat (@subcats)
    {
        $subcat =~ s/[\r\n_]/ /g;
        $subcat =~ s/\s+/ /g;
        $subcat = urlDecode ($subcat);
        # NOTE(review): $subcat is interpolated into the patterns
        # unescaped; regex metacharacters in a category name would
        # misfire.  Consider \Q$subcat\E.
        unless ($cfd =~ m/$subcat/)
        {
            print "$subcat is not in WP:CFD";
            if ($cfdR =~ m/$subcat/)
            {
                print " (listed on /resolved)";
            }
            if ($cfdU =~ m/$subcat/)
            {
                print " (listed on /unresolved)";
            }
            print "\n";
        }
    }
}

# An internal function that handles the complexity of adding a
# category tag to the wikicode of a page.
#
# Returns ("success", $newText) or ("fail", $unchangedText); the caller
# is responsible for posting the result.
sub addCatToText
{
    my ($category, $text, $catTmp, $sortkey, $articleName, $junk);

    $category = $_[0];
    $text = $_[1];
    $sortkey = $_[2];
    $articleName = $_[3];   # used only for log messages

    unless ($category =~ m/^Category:\w+/)
    {
        myLog ("addCatToText(): Bad format on category.\n");
        die ("addCatToText(): Bad format on category.\n");
    }

    $catTmp = $category;
    # _ and spaces are equivalent and may be intermingled.
    $catTmp =~ s/Category:\s*/Category:\\s*/g;
    $catTmp =~ s/_/[_ ]/g;
    $catTmp =~ s%\(%\\\(%g;
    $catTmp =~ s%\)%\\\)%g;
    $catTmp =~ s%\.%\\\.%g;

    if (($text =~ m/(\[\[\s*${catTmp}\s*\|.*?\]\])/is)
        or ($text =~ m/(\[\[\s*${catTmp}\s*\]\])/is))
    {
        print "addCatToText(): $articleName is already in '$category'.\n";
        myLog ("addCatToText(): $articleName is already in '$category'.\n");
        print "\n1: '${1}'\n";
        print "\ncattmp: '${catTmp}'\n";
        return("fail", $text);
    }
    if ($text =~ m/^\s*\#REDIRECT/is)
    {
        print "addCatToText(): $articleName is a redirect!\n";
        myLog ("addCatToText(): $articleName is a redirect!\n");
        return("fail", $text);
    }

    # Convert underscore to spaces; this is human-readable.
    $category =~ s/_/ /g;
    # Add the category
    # NOTE(review): $sortkey is accepted but never used here -- the tag
    # is always appended without it.  Confirm whether that is intended.
    $text .= "\n[[$category]]";
    # Move the category to the right place
    ($text, $junk) = fixCategoryInterwiki($text);
    return ("success", $text);
}

### THIS ROUTINE IS CURRENTLY UNUSED ###
# It will probably not be useful to you, anyway, since it requires
# pre-processed database dumps which are not included in Whobot.
sub getPageOffline
{
    my ($target, $result, $targetTmp);

    $target = $_[0];

    # Must run the following before using this function, from 200YMMDD/data:
    # cat entries.txt | perl ../../scripts/rewrite-entries.pl > entries-simple.txt
    # Even after this pre-processing, this routine is incredibly slow.
    # Set up and use MySQL instead if you care about speed.
    $target =~ s/\s/_/g;
    # Double escape the tab, once for Perl, once for the shell
    # -P means "treat as Perl regexp" (yay!)
    # $result = `grep -P '^${target}\\t' /home/beland/wikipedia/20050107/data/entries-simple.txt`;
    $targetTmp = shellfix($target);
    $result = `grep -P '^${targetTmp}\\t' /home/beland/wikipedia/20050107/data/matches2.txt`;
    $result =~ s/^${target}\t//;
    $result =~ s/\\n/\n/g;
    # Second element mimics getPage()'s multi-value return shape.
    return ($result, "junk");
}

# --- CATEGORY AND INTERWIKI STYLE CLEANUP ROUTINES ---

# perl whobot.pl INTERWIKI_LOOP
#
## This command is for remedial cleanup only, and so is probably not
## useful anymore.
This loop takes input of the form: ## "ArticleName\tBodyText\n{repeat...}" on STDIN. # sub interwikiLoop { my ($article, $text, @junk, $enforceCategoryInterwikiCalls); while (<STDIN>) { if ($_ =~ m/^\s*$/) { next; } ($article, $text, @junk) = split ("\t", $_); $text =~ s/\\n/\n/g; enforceCategoryInterwiki($article, $text); $enforceCategoryInterwikiCalls++; print STDOUT "\r interwikiLoop iteration ".$enforceCategoryInterwikiCalls; } } # perl whobot.pl ENFORCE_CATEGORY_INTERWIKI Article_name # ## This function is for both external use. From the command line, use ## it to tidy up a live page's category and interwiki tags, specifying ## only the name of the page. It can also be used by interwikiLoop(), ## which supplies the full text on its own. It will post any changes ## to the live wiki that involve anything more than whitespace ## changes. ## ## This function also does {{msg:foo}} -> {{foo}} conversion, so that ## the article parsing algorithm can be recycled. # sub enforceCategoryInterwiki { my ($articleName, $text, $editTime, $startTime, $textOrig, @newLines, $line, $textCopy, $textOrigCopy, $message, @junk, $diff, $token, $online); $articleName = $_[0]; myLog("enforceCategoryInterwiki($articleName)\n"); $text = $_[1]; $online = 0; if ($text eq "") { $online = 1; ($text, $editTime, $startTime, $token) = getPage($articleName); } $textOrig = $text; ($text, $message) = fixCategoryInterwiki($text); if (substantiallyDifferent($text, $textOrig)) { @newLines = split ("\n", $text); $textCopy = $text; $textOrigCopy = $textOrig; open (ONE, ">/tmp/article1.$$"); print ONE $textOrig; close (ONE); open (TWO, ">/tmp/article2.$$"); print TWO $text; close (TWO); $diff = `diff /tmp/article1.$$ /tmp/article2.$$`; unlink("/tmp/article1.$$"); unlink("/tmp/article2.$$"); myLog("*** $articleName - $message\n"); myLog("*** DIFF FOR $articleName\n"); myLog($diff); if ($online == 0) { # Isolate changed files for later runs open (FIXME, ">>./fixme.interwiki.txt.$$"); $text =~ s/\t/\\t/g; 
$text =~ s/\n/\\n/g; print FIXME $articleName."\t".$text."\n"; close (FIXME); } myLog($articleName." changed by fixCategoryInterwiki(): $message\n"); print STDOUT $articleName." changed by fixCategoryInterwiki(): $message\n"; if ($online == 1) { postPage ($articleName, $editTime, $startTime, $token, $text, $message, "yes"); } } else { print STDOUT "--- No change for ${articleName}.\n"; myLog ("--- No change for ${articleName}.\n"); ### TEMPORARY ### ### Uncomment this line if you want category changes to ### trigger null edits. This is useful if you have have ### changed the category on a template, but due to a bug this ### does not actually move member articles until they are ### edited. postPage ($articleName, $editTime, $startTime, $token, $textOrig, "null edit", "yes"); ### TEMPORARY ### } } sub substantiallyDifferent { my($a, $b); $a = $_[0]; $b = $_[1]; $a =~ s/\s//g; $b =~ s/\s//g; return ($a ne $b); } # Given some wikicode as input, this function will tidy up the # category and interwiki links and return the result and a comment # suitable for edit summaries. sub fixCategoryInterwiki { my ($input, @segmentNames, @segmentContents, $langlist, $i, $message, $output, $flagForReview, $interwikiBlock, $categoryBlock, $flagError, $bodyBlock, $contents, $name, @interwikiNames, @interwikiContents, @categoryNames, @categoryContents, @bodyNames, @bodyContents, $bodyFlag, @bottomNames, @bottomContents, @segmentNamesNew, @segmentContentsNew, $lastContents, @stubContents, @stubNames, $stubBlock, $msgFlag); $input = $_[0]; # The algorithm here is complex. The general idea is to split the # page in to segments, each of which is assigned a type, and then # to rearrange, consolidate, and frob the segments as needed. # Start with one segment that includes the whole page. @::segmentNames = ("bodyText"); @::segmentContents = ($input); # Recognize and tag certain types of segments. The order of # processing is very important. 
metaTagInterwiki("nowiki", "^(.*?)(\s*.*?\s*)"); metaTagInterwiki("comment", "^(.*?)(<!.*?>\\n?)"); metaTagInterwiki("html", "^(.*?)(<.*?>\\n?)"); metaTagInterwiki("category", "^(.*?)(\\[\\[\\s*Category\\s*:\\s*.*?\\]\\]\\n?)"); $langlist = `type langlist`; $langlist =~ s/^\s*//s; $langlist =~ s/\s*$//s; $langlist =~ s/\n/\|/gs; $langlist .= "|minnan|zh\-cn|zh\-tw|nb"; metaTagInterwiki("interwiki", "^(.*?)(\\[\\[\\s*($langlist)\\s*:\\s*.*?\\]\\]\\n?)"); metaTagInterwiki("tag", "^(.*?)(\{\{.*?\}\})"); # Allow category and interwiki segments to be followed by HTML # comments only (plus any intervening whitespace). $i = 0; while ($i < @::segmentNames) { $name = $::segmentNames[$i]; $contents = $::segmentContents[$i]; # {{msg:foo}} -> {{foo}} conversion if (($name eq "tag") and ($contents =~ m/^{{msg:(.*?)}}/)) { $msgFlag = 1; $contents =~ s/^{{msg:(.*?)}}/{{$1}}/; } if (($name eq "category") or ($name eq "interwiki")) { if (!($contents =~ m/\n/) and ($::segmentNames[$i+1] eq "comment")) { push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents.$::segmentContents[$i+1]); $i += 2; # DEBUG print "AAA - ".$contents.$::segmentContents[$i+1]); next; } if (!($contents =~ m/\n/) and ($::segmentNames[$i+1] eq "bodyText") and ($::segmentContents[$i+1] =~ m/^\s*$/) and !($::segmentContents[$i+1] =~ m/^\n$/) and ($::segmentNames[$i+2] eq "comment") ) { push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents.$::segmentContents[$i+1].$::segmentContents[$i+2]); $i += 3; # DEBUG print "BBB".$contents.$::segmentContents[$i+1].$::segmentContents[$i+2]); next; } # Consolidate with any following whitespace if (($::segmentNames[$i+1] eq "bodyText") and ($::segmentContents[$i+1] =~ m/^\s*$/) ) { push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents.$::segmentContents[$i+1]); $i += 2; next; } } push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents); $i++; } # Clean up results @::segmentNames = @segmentNamesNew; @::segmentContents = 
@segmentContentsNew; @segmentContentsNew = (); @segmentNamesNew = (); # Move category and interwiki tags that precede the body text (at # the top of the page) to the bottom of the page. $bodyFlag = 0; foreach $i (0 ... @::segmentNames-1) { $name = $::segmentNames[$i]; $contents = $::segmentContents[$i]; if ($bodyFlag == 1) { push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents); } elsif (($name eq "category") or ($name eq "interwiki")) { push (@bottomNames, $name); push (@bottomContents, $contents); } else { push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents); $bodyFlag = 1; } } # Clean up results @::segmentNames = (@segmentNamesNew, @bottomNames); @::segmentContents = (@segmentContentsNew, @bottomContents); @segmentContentsNew = (); @segmentNamesNew = (); @bottomNames = (); @bottomContents = (); # Starting at the bottom of the page, isolate category, interwiki, # and body text. If categories or interwiki links are mixed with # body text, flag for human review. ### DEBUG ### # foreach $i (0 ... @::segmentNames-1) # { # print "---$i ".$::segmentNames[$i]."---\n"; # print "%%%".$::segmentContents[$i]."%%%\n"; # } ### DEBUG ### ### DEBUG ### #my ($first); #$first = 1; ### DEBUG ### $bodyFlag = 0; $flagForReview = 0; foreach $i (reverse(0 ... @::segmentNames-1)) { $name = $::segmentNames[$i]; $contents = $::segmentContents[$i]; if (($name eq "category") and ($bodyFlag == 0)) { # Push in reverse @categoryNames = ($name, @categoryNames); @categoryContents = ($contents, @categoryContents); next; } elsif (($name eq "interwiki") and ($bodyFlag == 0)) { # Push in reverse @interwikiNames = ($name, @interwikiNames); @interwikiContents = ($contents, @interwikiContents); next; } elsif (($bodyFlag == 0) and ($name eq "tag") and (($contents =~ m/\{\{[ \w\-]+[\- ]?stub\}\}/) or ($contents =~ m/\{\{[ \w\-]+[\- ]?stub\|.*?\}\}/))) { ### IF THIS IS A STUB TAG AND WE ARE STILL $bodyFlag == 0, ### THEN ADD THIS TO $stubBlock! 
# Canonicalize by making {{msg:Foo}} into {{Foo}} s/^\{\{\s*msg:(.*?)\}\}/\{\{$1\}\}/i; # Push in reverse @stubNames = ($name, @stubNames); @stubContents = ($contents, @stubContents); next; } elsif (($name eq "category") or ($name eq "interwiki")) # bodyFlag implicitly == 1 { if ($flagForReview == 0) { $flagForReview = 1; $lastContents =~ s/^\s*//s; $lastContents =~ s/\s*$//s; $flagError = substr ($lastContents, 0, 30); } # Drop down to push onto main body stack. } # Handle this below instead. ## Skip whitespace #if (($contents =~ m/^\s*$/s) and ($bodyFlag == 0)) #{ # next; #} # Delete these comments if (($bodyFlag == 0) and ($name == "comment")) { if ( ($contents =~ m/<!--\s*interwiki links\s*-->/i) or ($contents =~ m/<!--\s*interwiki\s*-->/i) or ($contents =~ m/<!--\s*interlanguage links\s*-->/i) or ($contents =~ m/<!--\s*categories\s*-->/i) or ($contents =~ m/<!--\s*other languages\s*-->/i) or ($contents =~ m/<!--\s*The below are interlanguage links.\s*-->/i) ) { ### DEBUG ### #print STDOUT ("YELP!\n"); # #foreach $i (0 ... @bodyNames-1) #{ # print "---$i ".$bodyNames[$i]."---\n"; # print "%%%".$bodyContents[$i]."%%%\n"; #} # #print STDOUT ("END-YELP!"); ### DEBUG ### next; } } # Push onto main body stack (in reverse). @bodyNames = ($name, @bodyNames); @bodyContents = ($contents, @bodyContents); ### DEBUG ### #if (($flagForReview == 1) and ($first == 1)) #{ # $first = 0; # print "\@\@\@${lastContents}\#\#\#\n"; #} ### DEBUG ### # This should let tags mixed in with the category and # interwiki links (not comingled with body text) bubble up. unless (($contents =~ m/^\s*$/s) or ($name eq "tag")) { $bodyFlag = 1; } $lastContents = $contents; } ### DEBUG ### # foreach $i (0 ... @bodyNames-1) # { # print "---$i ".$bodyNames[$i]."---\n"; # print "%%%".$bodyContents[$i]."%%%\n"; # } # foreach $i (0 ... @categoryNames-1) # { # print "---$i ".$categoryNames[$i]."---\n"; # print "^^^".$categoryContents[$i]."^^^\n"; # } # foreach $i (0 ... 
@interwikiNames-1) # { # print "---$i ".$interwikiNames[$i]."---\n"; # print "&&&".$interwikiContents[$i]."&&&\n"; # } ### DEBUG ### # Assemble body text, category, interwiki, and stub arrays into strings foreach $i (0 ... @bodyNames-1) { $name = $bodyNames[$i]; $contents = $bodyContents[$i]; $bodyBlock .= $contents; } foreach $i (0 ... @categoryNames-1) { $name = $categoryNames[$i]; $contents = $categoryContents[$i]; # Enforce style conventions $contents =~ s/\[\[category\s*:\s*/\[\[Category:/i; # Enforce a single newline at the end of each category line. $contents =~ s/\s*$//; $categoryBlock .= $contents."\n"; } foreach $i (0 ... @interwikiNames-1) { $name = $interwikiNames[$i]; $contents = $interwikiContents[$i]; # Canonicalize minnan to zh-min-nan, since that's what's in # the officially distributed langlist. $contents =~ s/^\[\[minnan:/\[\[zh-min-nan:/; # Canonicalize zh-ch, Chinese (simplified) and zn-tw, Chinese # (traditional) to "zh"; the distinction is being managed # implicitly by software now, not explicitly in wikicode. $contents =~ s/^\[\[zh-cn:/\[\[zh:/g; $contents =~ s/^\[\[zh-tw:/\[\[zh:/g; # Canonicalize nb to no $contents =~ s/^\[\[nb:/\[\[no:/g; # Canonicalize dk to da $contents =~ s/^\[\[dk:/\[\[da:/g; # Enforce a single newline at the end of each interwiki line. $contents =~ s/\s*$//; $interwikiBlock .= $contents."\n"; } foreach $i (0 ... @stubNames-1) { $name = $stubNames[$i]; $contents = $stubContents[$i]; # Enforce a single newline at the end of each stub line. $contents =~ s/\s*$//; $contents =~ s/^\s*//; $stubBlock .= $contents."\n"; } # Minimize interblock whitespace $bodyBlock =~ s/^\s*//s; $bodyBlock =~ s/\s*$//s; $categoryBlock =~ s/^\s*//s; $categoryBlock =~ s/\s*$//s; $interwikiBlock =~ s/^\s*//s; $interwikiBlock =~ s/\s*$//s; $stubBlock =~ s/^\s*//s; $stubBlock =~ s/\s*$//s; # Assemble the three blocks into a single string, flagging for # human review if necessary. 
# --- Tail of fixCategoryInterwiki(): assemble body/category/interwiki/stub
# blocks into the final page text and decide on an edit summary. ---
$output = "";

if ($bodyBlock ne "")
{
    $output .= $bodyBlock."\n\n";
}

# Insert a human-review flag unless the page is already flagged or queued
# for split/deletion handling.
if (($flagForReview == 1)
    and !($input =~ m/\{\{interwiki-category-check/)
    and !($input =~ m/\{\{split/)
    and !($input =~ m/\[\[Category:Pages for deletion\]\]/))
{
    $output .= "{{interwiki-category-check|${flagError}}}\n\n";
}

if ($categoryBlock ne "")
{
    $output .= $categoryBlock."\n";
}

if ($interwikiBlock ne "")
{
#    $output .= "<!-- The below are interlanguage links. -->\n".$interwikiBlock."\n";
    $output .= $interwikiBlock."\n";
}

if ($stubBlock ne "")
{
    $output .= $stubBlock."\n";
}

# Choose an edit summary: no-op pages report "No change" so callers can
# skip the write.
if ($input ne $output)
{
    $message = "Minor category, interwiki, or template style cleanup";
    if ($flagForReview == 1)
    {
        $message = "Flagged for manual review of category/interwiki style";
    }
    if ($msgFlag == 1)
    {
        $message .= "; {{msg:foo}} -> {{foo}} conversion for MediaWiki 1.5+ compatibility";
    }
}
else
{
    $message = "No change";
}

return($output, $message);
}

#sub displayInterwiki
#{
#    my ($i);
#
#    ## THIS FUNCTION CANNOT BE CALLED DUE TO SCOPING; YOU MUST MANUALLY
#    ## COPY THIS TEXT INTO fixCategoryInterwiki().  IT IS ONLY USEFUL
#    ## FOR DIAGNOSTIC PURPOSES.
#
#    foreach $i (0 ... @::segmentNames-1)
#    {
#        print "---$i ".$::segmentNames[$i]."---\n";
#        print "%%%".$::segmentContents[$i]."%%%\n";
#    }
#}

# A subroutine of fixCategoryInterwiki(), this function isolates
# certain parts of existing segments based on a regular expression
# pattern, and tags them with the supplied name (which indicates their
# type).  Sorry for the global variables.
#
# Arguments: $_[0] = tag name to assign to extracted pieces
#            $_[1] = pattern with two captures: (leading body text)(meta text)
# Reads and rewrites the globals @::segmentNames / @::segmentContents.
sub metaTagInterwiki
{
    my ($tag, $pattern, $i, $meta, $body, @segmentNamesNew,
        @segmentContentsNew, $name, $contents, $bodyText, );

    $tag = $_[0];
    $pattern = $_[1];

    foreach $i (0 ... @::segmentNames-1)
    {
        $name = $::segmentNames[$i];
        $contents = $::segmentContents[$i];

        # Only "bodyText" segments are candidates for splitting; pass
        # everything else through untouched.
        unless ($name eq "bodyText")
        {
            push (@segmentNamesNew, $name);
            push (@segmentContentsNew, $contents);
            next;
        }

        # Repeatedly peel (bodyText)(meta) pairs off the front of the
        # segment until the pattern no longer matches.
        while (1)
        {
            if ($contents =~ m%$pattern%is)
            {
                $bodyText = $1;
                $meta = $2;
                if ($bodyText ne "")
                {
                    push (@segmentNamesNew, "bodyText");
                    push (@segmentContentsNew, $bodyText);
                }
                push (@segmentNamesNew, $tag);
                push (@segmentContentsNew, $meta);
                # Remove the consumed text; \Q...\E quotes regex
                # metacharacters in the captured strings.
                $contents =~ s/\Q${bodyText}${meta}\E//s;
            }
            else
            {
                # Whatever is left stays tagged as body text.
                if ($contents ne "")
                {
                    push (@segmentNamesNew, $name);
                    push (@segmentContentsNew, $contents);
                }
                last;
            }
        }
    }

    @::segmentNames = @segmentNamesNew;
    @::segmentContents = @segmentContentsNew;
    @segmentContentsNew = ();
    @segmentNamesNew = ();
}

# Fetch a page and immediately re-save it unchanged ("null edit"),
# which forces MediaWiki to refresh category membership etc.
sub nullEdit
{
    my ($text, $articleName, $comment, $editTime, $startTime, $token);

    $articleName = $_[0];

    print "nullEdit($articleName)\n";
    myLog ("nullEdit($articleName)\n");

    ($text, $editTime, $startTime, $token) = getPage($articleName);
    postPage ($articleName, $editTime, $startTime, $token, $text, "null edit");
}

# Walk every article in Category:Wikipedia cleanup (skipping project,
# template, user, and talk pages) and date-stamp its cleanup tag.
sub cleanupDate
{
    my ($article, @articles);

    # Get all articles from Category:Wikipedia cleanup
    @articles = getCategoryArticles ("Category:Wikipedia cleanup");

#    @articles = reverse (sort(@articles));
    @articles = (sort(@articles));

    foreach $article (@articles)
    {
        if (($article =~ m/^Wikipedia:/)
            or ($article =~ m/^Template:/)
            or ($article =~ m/^User:/)
            or ($article =~ m/talk:/i)
           )
        {
            next;
        }
        cleanupDateArticle($article);
        limit();
    }
}

# Determine (from the page history) the month/year a cleanup tag was
# added to $target, then replace the bare tag with {{cleanup-date|...}}.
# Pages carrying section-specific cleanup tags get a null edit instead.
sub cleanupDateArticle #($target)
{
    my (@result, $link, $currentMonth, $currentYear, $junk, $line, $month,
        $year, $found, $lineCounter, $target);

    $target = $_[0];
    print "cleanupDateArticle($target)\n";

    @result = parseHistory($target);

    # The newest revision defines the "current" month/year.
    ($currentMonth, $currentYear, $junk) = split(" ", $result[0]);

    $found = "";
    $lineCounter = 0;    # FIX: was used without initialization

    foreach $line (@result)
    {
        $lineCounter++;
        ($month, $year, $link) = split(" ", $line);

        # Only inspect the last revision of each earlier month.
        if (($month eq $currentMonth) and ($year eq $currentYear))
        {
#            print "$month $year - SKIP\n";
            next;
        }

# Skip this, because it produces false positives on articles that were
# protected at the end of last month, but no longer are.  The correct
# thing to do is to check if an article is CURRENTLY protected by
# fetching the current version, but this seems like a waste of network
# resources.
#        if (checkForTag("protected", $link) eq "yes")
#        {
#            print "$target is {{protected}}; skipping\n";
#            myLog("$target is {{protected}}; skipping\n");
#            return();
#        }

        # Section-level cleanup templates are not dated; just null-edit
        # the page and move on.
        if (checkForTag("sectionclean", $link) eq "yes")
        {
            print "$target has {{sectionclean}}\n";
            myLog("$target has {{sectionclean}}\n");
            nullEdit($target);
            return();
        }
        if (checkForTag("Sect-Cleanup", $link) eq "yes")
        {
            print "$target has {{Sect-Cleanup}}\n";
            myLog("$target has {{Sect-Cleanup}}\n");
            nullEdit($target);
            return();
        }
        if (checkForTag("section cleanup", $link) eq "yes")
        {
            print "$target has {{section cleanup}}\n";
            myLog("$target has {{section cleanup}}\n");
            nullEdit($target);
            return();
        }
        if (checkForTag("sectcleanup", $link) eq "yes")
        {
            print "$target has {{sectcleanup}}\n";
            myLog("$target has {{sectcleanup}}\n");
            nullEdit($target);
            return();
        }
        if (checkForTag("cleanup-section", $link) eq "yes")
        {
            print "$target has {{cleanup-section}}\n";
            myLog("$target has {{cleanup-section}}\n");
            nullEdit($target);
            return();
        }
        if (checkForTag("cleanup-list", $link) eq "yes")
        {
            print "$target has {{cleanup-list}}\n";
            myLog("$target has {{cleanup-list}}\n");
            nullEdit($target);
            return();
        }
        if (checkForTag("cleanup-nonsense", $link) eq "yes")
        {
            print "$target has {{cleanup-nonsense}}\n";
            myLog("$target has {{cleanup-nonsense}}\n");
            nullEdit($target);
            return();
        }

        # If an older revision still carries a generic cleanup tag, the
        # tag predates that month; keep scanning backwards.
        if ((checkForTag("cleanup", $link) eq "yes")
            or (checkForTag("clean", $link) eq "yes")
            or (checkForTag("CU", $link) eq "yes")
            or (checkForTag("cu", $link) eq "yes")
            or (checkForTag("cleanup-quality", $link) eq "yes")
            or (checkForTag("tidy", $link) eq "yes"))
        {
            $currentMonth = $month;
            $currentYear = $year;
#            print "$month $year - YES\n";
            next;
        }
        else
        {
            # First revision WITHOUT the tag: the tag was added in the
            # most recent month that still had it.
#            print "$month $year - NO\n";
#            print "Tag added $currentMonth $currentYear\n";
            $found = "Tag added $currentMonth $currentYear\n";
            last;
        }
    }

    if ($found eq "")
    {
        # History exhausted without finding an untagged revision.  If we
        # saw the whole history (under the 500-revision fetch limit), the
        # tag has been there since the oldest revision we dated.
#        print "HISTORY EXHAUSTED\n";
        if ($lineCounter < 498)
        {
            $found = "Tag added $currentMonth $currentYear\n";
        }
        else
        {
#            print "Unable to determine when tag was added to $target.\n";
            myLog("Unable to determine when tag was added to $target.\n");
            die("Unable to determine when tag was added to $target.\n");
        }
    }

    if ($found ne "")
    {
        # Try each tag spelling in turn; changeTag() returns 0 when the
        # tag is absent, so the || chain falls through to a null edit.
        changeTag("cleanup", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("tidy", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("CU", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("cu", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("cleanup-quality", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("clean", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || nullEdit($target);
    }
}

# Replace every occurrence of {{$tagFrom}} (in its various spellings:
# ucfirst, msg:, Template:, and parameterized forms) in an article with
# {{$tagTo}}, then post the edited page.
# Returns 1 on success, 0 when the source tag is not present.
# Dies on null target, already-converted page, redirect, or when the
# tag appears inside <nowiki>/<pre>.
sub changeTag
{
    my ($tagFrom, $tagFromUpper, $tagTo, $tagToUpper, $articleName,
        $editTime, $startTime, $text, $token, $comment, $junk);

    $tagFrom = $_[0];        # "cleanup"
    $tagTo = $_[1];          # "cleanup-date|August 2005"
    $articleName = $_[2];    # Article name

    print "changeTag (${tagFrom}, ${tagTo}, ${articleName})\n";
    myLog("changeTag (${tagFrom}, ${tagTo}, ${articleName})\n");

    $tagFromUpper = ucfirst($tagFrom);
    $tagToUpper = ucfirst($tagTo);

    if ($articleName =~ m/^\s*$/)
    {
        myLog("changeTag(): Null target.");
        die("changeTag(): Null target.");
    }

    ($text, $editTime, $startTime, $token) = getPage($articleName);

    unless (($text =~ m/\{\{\s*\Q$tagFrom\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\Q$tagFromUpper\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\Qmsg:$tagFrom\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\Qmsg:$tagFromUpper\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\QTemplate:$tagFrom\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\QTemplate:$tagFromUpper\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\Q$tagFrom\E\|.*?\s*\}\}/)
            or ($text =~ m/\{\{\s*\Q$tagFromUpper\E\|.*?\s*\}\}/)
           )
    {
        myLog("changeTag(): {{$tagFrom}} is not in $articleName.\n");
        print "changeTag(): {{$tagFrom}} is not in $articleName.\n";
#        die("changeTag(): {{$tagFrom}} is not in $articleName.\n");
        ### TEMPORARY ###
        # Just skip articles with {{tidy}}, {{clean}} {{sectionclean}}, {{advert}}, etc.
        sleep(1); # READ THROTTLE!
        return(0);
    }

    if (($text =~ m/\{\{\s*\Q$tagTo\E\s*\}\}/)
        or ($text =~ m/\{\{\s*\Q$tagToUpper\E\s*\}\}/))
    {
        myLog("changeTag(): $articleName already contains {{$tagTo}}.");
        die("changeTag(): $articleName already contains {{$tagTo}}.");
    }

    if ($text =~ m/^\s*\#REDIRECT/is)
    {
        myLog ("changeTag.a(): $articleName is a redirect!\n");
        die ("changeTag.a(): $articleName is a redirect!\n");
        # FIX: removed unreachable "sleep(1); return(0);" that followed
        # the die() above - it could never execute.
    }

    # Escape special characters
    $tagFrom =~ s%\(%\\(%g;
    $tagFrom =~ s%\)%\\)%g;
    $tagFrom =~ s%\'%\\\'%g;

    # We're lazy and don't fully parse the document to properly check
    # for escaped tags, so there may be some unnecssary aborts from
    # the following, but they are rare and easily overridden by
    # manually editing the page in question.
    if (($text =~ m/<nowiki>.*?\Q$tagFrom\E.*?<\/nowiki>/is)
        or ($text =~ m/<pre>.*?\Q$tagFrom\E.*?<\/pre>/is)) # <pre>
    {
        myLog ("changeTag.r(): $articleName has a dangerous nowiki or pre tag!\n");
        die ("changeTag.r(): $articleName has a dangerous nowiki or pre tag!\n");
    }

    # Make the swap!
    $text =~ s/\{\{\s*\Q$tagFrom\E\s*\}\}/{{${tagTo}}}/g;
    $text =~ s/\{\{\s*\Q$tagFromUpper\E\s*\}\}/{{${tagTo}}}/g;
    $text =~ s/\{\{\s*\Qmsg:$tagFrom\E\s*\}\}/{{${tagTo}}}/g;
    $text =~ s/\{\{\s*\Qmsg:$tagFromUpper\E\s*\}\}/{{${tagTo}}}/g;
    $text =~ s/\{\{\s*\QTemplate:$tagFrom\E\s*\}\}/{{${tagTo}}}/g;
    $text =~ s/\{\{\s*\QTemplate:$tagFromUpper\E\s*\}\}/{{${tagTo}}}/g;
    $text =~ s/\{\{\s*\Q$tagFrom\E\|(.*?)\s*\}\}/{{${tagTo}}}/g;
    $text =~ s/\{\{\s*\Q$tagFromUpper\E\|(.*?)\s*\}\}/{{${tagTo}}}/g;

    # Tidy up the article in general
    ($text, $junk) = fixCategoryInterwiki($text);

    # Post the changes
    $comment = "Changing \{\{${tagFrom}\}\} to \{\{${tagTo}\}\}";
    postPage ($articleName, $editTime, $startTime, $token, $text, $comment, "yes");

    return (1);
}

# Screen-scrape the last 500 history entries of a page.
# Returns a list of "Month Year /w/...oldid=NNN" strings, newest first.
# Dies if a date cannot be parsed (history page format changed).
sub parseHistory
{
    my ($pageName, $html, @lines, $line, $date, $month, $year, $htmlCopy,
        $link, @result);

    $pageName = $_[0];

    $html = getURL("http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=history&limit=500");
    $htmlCopy = $html;

    # Cut the page down to the <ul id="pagehistory"> list.
    $html =~ s%^.*?<ul id="pagehistory">%%s;
    $html =~ s%(.*?)</ul>.*$%$1%s;
    $html =~ s%</li>\s*%%s;

    @lines = split ("</li>", $html);

    foreach $line (@lines)
    {
        $line =~ s/\n/ /g;
        if ($line =~ m/^\s*$/)
        {
            next;
        }

        # Strip chrome around the revision link so the first <a> left is
        # the dated oldid link.
        $line =~ s%<span class='user'>.*?$%%;
        $line =~ s%^.*?Select a newer version for comparison%%;
        $line =~ s%^.*?Select a older version for comparison%%;
        $line =~ s%^.*?name="diff" />%%;
#        print "LINE: ".$line."\n";
        $line =~ m%<a href="(.*?)" title="(.*?)">(.*?)</a>%;
        $link = $1;
        $date = $3;
#        print $link." / $date\n";

        # Expand the abbreviated month name in the link text.
        if ($date =~ m/Jan/) { $month = "January"; }
        elsif ($date =~ m/Feb/) { $month = "February"; }
        elsif ($date =~ m/Mar/) { $month = "March"; }
        elsif ($date =~ m/Apr/) { $month = "April"; }
        elsif ($date =~ m/May/) { $month = "May"; }
        elsif ($date =~ m/Jun/) { $month = "June"; }
        elsif ($date =~ m/Jul/) { $month = "July"; }
        elsif ($date =~ m/Aug/) { $month = "August"; }
        elsif ($date =~ m/Sep/) { $month = "September"; }
        elsif ($date =~ m/Oct/) { $month = "October"; }
        elsif ($date =~ m/Nov/) { $month = "November"; }
        elsif ($date =~ m/Dec/) { $month = "December"; }
        else
        {
            $month = "Unknown month";
            myLog ("Unknown month - parse failure! $line\nHTML:\n$html\n");
            die ("Unknown month - parse failure! (see log) LINE: $line\n");
        }

        $date =~ m/(\d\d\d\d)/;
        $year = $1;

        @result = (@result, "$month $year $link");
    }

    return (@result);
}

# Fetch the wikitext of a revision URL and report whether it contains
# {{$tag}} (also checked with the first letter uppercased).
# Returns "yes" or "no".  The last fetched revision is cached in the
# globals $::cachedTarget/$::cachedText since cleanupDateArticle() probes
# the same revision for many tags in a row.
sub checkForTag #($targetURLWithOldIDAttached)
{
    my ($tag, $target, $text);

    $tag = $_[0];
    $target = $_[1];

    # Must be absolute; assuming English Wikipedia here.
    $target =~ s%^/w/wiki.phtml%http://en.wikipedia.org/w/wiki.phtml%;

    # Decode HTML entities in links
    # FIX: restored "&amp;" as the search pattern; as previously written
    # the substitution replaced "&" with "&" and decoded nothing.
    $target =~ s/\&amp;/\&/g;

    if ($target eq $::cachedTarget)
    {
        $text = $::cachedText;
    }
    else
    {
        $text = getURL ($target."&action=edit");
        $::cachedTarget = $target;
        $::cachedText = $text;
    }

    if ($text =~ m/\{\{\s*\Q$tag\E\s*\}\}/)
    {
#        print $text; die "Cough!";
        return "yes";
    }

    $tag = ucfirst($tag);

    if ($text =~ m/\{\{\s*\Q$tag\E\s*\}\}/)
    {
        # NOTE(review): debug prints kept commented out, matching the
        # "Cough!" branch above - confirm against the original layout.
#        print "\n\nSneeze!\n\n"; print $text."\n\n";
        return "yes";
    }

    return "no";
}

# GET a URL through the shared $::ua agent, with a 1-second read
# throttle.  Returns the response body.  Retries (via retry()) on
# 500/502/503; dies on any other HTTP error.
sub getURL #($target)
{
    # Read throttle!
    sleep (1);

    my ($attemptStartTime, $attemptFinishTime, $request, $response,
        $reply, $url);

    $url = $_[0];

    # Monitor wiki server responsiveness
    $attemptStartTime = Time::HiRes::time();

    # Create a request-object
    print "GET ${url}\n";
    myLog("GET ${url}\n");
    $request = HTTP::Request->new(GET => "${url}");
    $response = $::ua->request($request);

    if ($response->is_success)
    {
        $reply = $response->content;

        # Monitor wiki server responsiveness
        $attemptFinishTime = Time::HiRes::time();
        retry ("success", "getURL",
               sprintf("%.3f", $attemptFinishTime-$attemptStartTime));

        # This may or may not actually work
        $::ua->cookie_jar->save();

        return ($reply);
    }
    else
    {
        myLog ("getURL(): HTTP ERR (".$response->status_line.") ${url}\n".$response->content."\n");
        print ("getURL(): HTTP ERR (".$response->status_line.") ${url}\n".$response->content."\n");

        # 50X HTTP errors mean there is a problem connecting to the wiki server
        if (($response->status_line =~ m/^500/)
            or ($response->status_line =~ m/^502/)
            or ($response->status_line =~ m/^503/))
        {
            return(retry("getURL", @_));
        }
        else
        {
            # Unhandled HTTP response
            die ("getURL(): HTTP ERR (".$response->status_line.") ${url}\n");
        }
    }
}

# Rotate the featured-task lists on the opentask workspace page, keeping
# a persistent per-article feature count in %::history on disk.
sub opentaskUpdate
{
    my ($target, $historyFile, $opentaskText, $editTime, $startTime,
        $token, $key, $historyDump);

    $target = "User:Beland/workspace";
    $historyFile = "/home/beland/wikipedia/pearle-wisebot/opentask-history.pl";

    ($opentaskText, $editTime, $startTime, $token) = getPage($target);

    # Load %::history from disk.  NOTE(review): "type" is the Windows
    # equivalent of cat; this string-eval of file contents only works on
    # the trusted local history file.
    eval(`type $historyFile`);

    $opentaskText = doOpentaskUpdate("NPOV", "Category:NPOV disputes", $opentaskText);
    $opentaskText = doOpentaskUpdate("COPYEDIT", "Category:Wikipedia articles needing copy edit", $opentaskText);
    $opentaskText = doOpentaskUpdate("WIKIFY", "Category:Articles that need to be wikified", $opentaskText);
    $opentaskText = doOpentaskUpdate("MERGE", "Category:Articles to be merged", $opentaskText);

    # Dump history
    $historyDump = "\%::history = (\n";
    foreach $key (sort(keys(%::history)))
    {
        $historyDump .= "\"${key}\" => \"".$::history{$key}."\",\n";
    }
    $historyDump =~ s/,\n$//s;
    $historyDump .= "\n)\n";

    # FIX: check for open failure instead of silently losing the history.
    open (HISTORY, ">".$historyFile)
        or die ("opentaskUpdate(): cannot write ${historyFile}: $!\n");
    print HISTORY $historyDump;
    close (HISTORY);

    postPage ($target, $editTime, $startTime, $token, $opentaskText,
              "Automatic rotation of NPOV, copyedit, wikify, and merge", "yes");
}

# Pick a random selection of articles from $sourceCategory (up to ~100
# characters of wikilinks) and splice them between the
# <!--START-WHOBOT-INSERT-$categoryID--> markers in $opentaskText.
# Returns the updated page text.
sub doOpentaskUpdate
{
    my ($categoryID, $sourceCategory, $opentaskText, @articles, $article,
        %rank, $featuredString, $characterLimit, $featuredStringTmp);

    $categoryID = $_[0];
    $sourceCategory = $_[1];
    $opentaskText = $_[2];

    $characterLimit = 100;
    $featuredString = "";    # FIX: was read via length() before first append

    @articles = getCategoryArticles ($sourceCategory);

    # Shuffle and clean up article names; and exclude unwanted entries
    foreach $article (@articles)
    {
        if (($article =~ m/^Wikipedia:/)
            or ($article =~ m/^Template:/)
            or ($article =~ m/^User:/)
            or ($article =~ m/talk:/i)
           )
        {
            next;
        }
        $article = urlDecode($article);
        $article =~ s/_/ /g;
        # Random key gives each article a shuffled sort position.
        $rank{$article} = rand();
    }

    # Pick as many articles as will fit in the space allowed
    foreach $article (sort {$rank{$a} <=> $rank {$b}} (keys(%rank)))
    {
        if (length($article)+1 < $characterLimit - length($featuredString))
        {
            $featuredString .= "[[${article}]],\n";
            # Record how many times each article is featured.
            $::history{"${article}-${categoryID}"}++;
        }
    }

    $featuredStringTmp = $featuredString;
    $featuredStringTmp =~ s/\n/ /g;
    print "Featuring: $featuredStringTmp\n";
    myLog("Featuring: $featuredStringTmp\n");

    # Insert into actual page text and finish
    $opentaskText =~ s/(<!--START-WHOBOT-INSERT-$categoryID-->).*?(<!--END-WHOBOT-INSERT-$categoryID-->)/${1}\n$featuredString${2}/gs;

    return ($opentaskText);
}