#!/usr/bin/perl
# STcatBot1.0.pl - Simplified and Traditional CATegorization roBOT
# By WikiPedia:User:下一次登录
# Portions largely taken or based on upload.pl by WikiPedia:User:Eloquence
# and mwpush.pl by WikiPedia:User:KeithTyler
# Tested on WindowsXP/Cygwin/ActivePerl
# Corresponding robot: User:STcatBot (application in progress)
# Disclaimer: No warranty guaranteed. Use at your own risk.
#
# What the script does, per loop iteration:
#   1. fetch Special:Random and read the page title from the permalink entry;
#   2. fetch that page and collect its "red" category links (links whose
#      href carries action=edit);
#   3. open the edit form of every red category; when the form is not marked
#      as a new page, the name shown in its <title> is taken as the existing
#      simplified/traditional counterpart;
#   4. rewrite [[Category:...]] tags in the page source to the counterpart
#      and save the page.
#
# All page data is handled as raw bytes.  'use utf8' is deliberately NOT
# enabled: the User-Agent literal below is meant to be sent exactly as the
# bytes stored in this file.

use strict;
use warnings;

# call requirements
use Getopt::Std;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTTP::Cookies;

my $username = "****";    # input your username here, only English names are tested.
my $password = "****";    # input your password here
my $WIKI_PATH = "zh.wikipedia.org";
my $WIKI_PAGE;            # title (URL form) of the page currently being processed

my $MAX_CHANGES = 1000;            # stop after this many saved edits
my $CAT_LOG     = 'cat_log.txt';   # running activity log (appended to)
my $LOG_UNEVENTFUL_PAGES = 0;      # set to 1 to also log pages without (red) categories

# Marker that identifies the edit form of a page that does not exist yet.
my $emptyprobe = 'class="selected new"';

# Offsets taken over unchanged from the original script.
# NOTE(review): both are tied to the exact HTML the wiki emitted when the bot
# was written; confirm against live output before relying on them.
my $PROBE_TITLE_SKIP = 28;    # bytes from the start of "<title>" to the category
                              # name (7 for the tag itself plus a 21-byte prefix)
my $TEXTAREA_SKIP    = 92;    # bytes from the start of the textarea marker to the
                              # first byte of the page source

### Login to wiki
# Set up connection data
my $browser = LWP::UserAgent->new();
my @ns_headers = (
    'User-Agent'      => 'STcatBot 1.0 by 下一次登录',    #Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20041107 Firefox/1.0',
    'Accept'          => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);

# Hold cookies
$browser->cookie_jar( {} );

# Most recent HTTP response; dumped to STcatBot.log when the script finishes.
my $response;

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# GET $url with the bot's headers, remember the response in $response and
# return it as one string (status line, headers and body).
sub fetch_page {
    my ($url) = @_;
    $response = $browser->get($url, @ns_headers);
    return $response->as_string;
}

# Append $text to the activity log.  Logging is best effort: a failure is
# reported on STDERR but never stops the bot.
sub append_log {
    my ($text) = @_;
    my $log;
    unless (open $log, '>>', $CAT_LOG) {
        warn "Could not append to $CAT_LOG: $!\n";
        return;
    }
    print {$log} $text;
    close $log or warn "Could not close $CAT_LOG: $!\n";
    return;
}

# Return the page title found in the "permanent link" entry of $html, or
# nothing when the entry is missing.  The separator before "oldid=" is
# accepted both raw and HTML-escaped.
sub extract_page_title {
    my ($html) = @_;
    if ($html =~ m{<li id="t-permalink"><a href="/w/index\.php\?title=(.+?)&(?:amp;)?oldid=}) {
        return $1;
    }
    return;
}

# Return the HTML that follows the "Special:Categories" link up to the next
# </div>.  Returns nothing when the page shows no category box, and an empty
# string when the box is never closed.
sub extract_category_block {
    my ($html) = @_;
    my $marker = '<a href="/wiki/Special:Categories" title="Special:Categories">';
    my $start = index($html, $marker);
    return if $start < 0;
    $start += length $marker;
    my $end = index($html, '</div>', $start);
    return '' if $end < 0;
    return substr($html, $start, $end - $start);
}

# Split a category block into its individual links (text from "<a href" up
# to, but excluding, "</a></span>") and return only the red ones.
sub extract_red_category_links {
    my ($block) = @_;
    my $terminator = '</a></span>';
    my @red_links;
    my $pos = 0;
    while (1) {
        my $start = index($block, '<a href', $pos);
        last if $start < 0;
        my $end = index($block, $terminator, $start);
        last if $end < 0;
        my $link = substr($block, $start, $end - $start);
        push @red_links, $link if index($link, 'action=edit') >= 0;
        $pos = $end + length $terminator;
    }
    return @red_links;
}

# Take one red category link apart.  Returns (URL-form title, displayed name)
# or an empty list when the link does not have the expected layout, i.e.
#   <a href="/w/index.php?title=TITLE&action=edit" ... title="Category:NAME">NAME
# The layout check replaces the original "half of the remaining length"
# computation, which relied on exactly this shape.
sub parse_red_category_link {
    my ($link) = @_;
    my ($url_title) = $link =~ m{\A<a href="/w/index\.php\?title=(.+?)&(?:amp;)?action=edit};
    my ($name)      = $link =~ m{title="Category:([^"]+)">\1\z};
    return unless defined $url_title and defined $name;
    return ($url_title, $name);
}

# Return the category name shown in the <title> of a category edit form, or
# nothing when the title cannot be located.
sub extract_probe_title {
    my ($html) = @_;
    my $start = index($html, '<title>');
    return if $start < 0;
    $start += $PROBE_TITLE_SKIP;
    my $end = index($html, ' - Wikipedia</title>', $start);
    return if $end <= $start;
    return substr($html, $start, $end - $start);
}

# Return the raw (still HTML-escaped) page source held in the edit form's
# textarea, or nothing when the textarea is not found.
sub extract_edit_text {
    my ($html) = @_;
    my $marker = q{<textarea tabindex='1' accesskey="," name="wpTextbox1" id="wpTextbox1" rows='25'};
    my $start = index($html, $marker);
    return if $start < 0;
    $start += $TEXTAREA_SKIP;
    my $end = index($html, '</textarea>', $start);
    return if $end < 0;
    return substr($html, $start, $end - $start);
}

# Replace every "[category:OLD" / "[Category:OLD" tag in $text by
# "[Category:NEW".  The name must be followed by "]" or "|" (optionally after
# blanks) so that a category merely starting with OLD is left alone.  The
# text is scanned once, so a NEW that contains OLD cannot cause an endless
# loop.  Returns (new text, number of replacements).
sub replace_category {
    my ($text, $old_name, $new_name) = @_;
    my $hits = ($text =~ s/\[[Cc]ategory:\Q$old_name\E(?=[ \t]*[\]|])/[Category:$new_name/g);
    return ($text, $hits || 0);
}

# ---------------------------------------------------------------------------
# Login
# ---------------------------------------------------------------------------

# Make login request
$response = $browser->post(
    "http://" . $WIKI_PATH . "/w/index.php?title=Special:Userlogin&action=submitlogin",
    @ns_headers,
    Content => [
        wpName         => $username,
        wpPassword     => $password,
        wpRemember     => "1",
        wpLoginAttempt => "Log in",
    ],
);

# After logging in, we should be redirected to another page.
# If we aren't, something is wrong.
if ($response->code != 302) {
    # cannot login
    print "We weren't able to login. This could have the following causes:

* The username ($username) or password may be incorrect.
  Solution: Re-run script with correct credentials.
* The MediaWiki software on the target host has been upgraded.
  Solution: Go to http://commons.wikimedia.org/wiki/Commons:File_upload_service
  and get a new version of the upload script.
* You are trying to hack this script for other wikis. The wiki you
  are uploading to has cookie check disabled.
  Solution: Try setting \$ignore_login_error to 1.

Regardless, we will now try to write the output from the server to
rfget.debug.out....\n\n";
    open my $debug, '>', 'rfget.debug.out' or die "Could not write file.\n";
    print {$debug} $response->as_string;
    print "This seems to have worked. Take a look at the file for further information or send it to moeller AT scireview DOT de if you need help debugging the script.\n";
    close $debug or warn "Could not close rfget.debug.out: $!\n";
    exit 1;
}

# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------

my $changemade = 0;    # number of edits saved so far

while ($changemade < $MAX_CHANGES) {
    sleep 1;

    # go to a random page and find out where we landed
    my $random_html = fetch_page("http://" . $WIKI_PATH . "/wiki/Special:Random");
    $WIKI_PAGE = extract_page_title($random_html);
    print "\nRandom... ";
    unless (defined $WIKI_PAGE) {
        print "No page title found.";
        next;
    }

    # go to the target page
    my $page_url = "http://" . $WIKI_PATH . "/wiki/" . $WIKI_PAGE;
    sleep 1;
    my $page_html = fetch_page($page_url);
    print "Connected... ";

    # is there a category box, and does it hold a red category?
    my $cat_block = extract_category_block($page_html);
    if (defined $cat_block) {
        print "Cat found... ";
    }
    elsif ($LOG_UNEVENTFUL_PAGES) {
        append_log("No cat at all.\n\n");
        print "No cat.";
    }

    my $has_red = defined $cat_block && index($cat_block, 'action=edit') >= 0;
    unless ($has_red) {
        append_log("No red cat.\n\n") if $LOG_UNEVENTFUL_PAGES;
        print "No redcat.";
        next;
    }

    # record the target URL
    append_log($page_url . "\n");
    print "Redcat found... ";

    # probe every red category for an existing simp/trad counterpart
    my @red_cats;          # one { name, target } record per red category
    my $stcatfound = 0;    # did any red category have a counterpart?
    for my $link (extract_red_category_links($cat_block)) {
        my ($url_title, $name) = parse_red_category_link($link);
        next unless defined $url_title;

        sleep 1;
        my $probe_html = fetch_page(
            "http://" . $WIKI_PATH . "/w/index.php?title=" . $url_title . "&action=edit");

        # undef means "no counterpart"; a name is never compared as a number
        my $target;
        if (index($probe_html, $emptyprobe) < 0) {
            $target = extract_probe_title($probe_html);
        }
        if (defined $target) {
            print "s/t ";
            $stcatfound = 1;
        }
        else {
            print "n/e ";
        }
        push @red_cats, { name => $name, target => $target };
    }
    next unless @red_cats;

    append_log("Found " . scalar(@red_cats) . " red cat(s).\n");

    # load the edit form of the page
    sleep 1;
    my $edit_html = fetch_page(
        "http://" . $WIKI_PATH . "/w/index.php?title=" . $WIKI_PAGE . "&action=edit");

    # Get EditToken
    my ($editToken) = ( $edit_html =~ m/value\=\"([0-9a-f]*)\" name\=\"wpEditToken\"/ );
    my ($editTime)  = ( $edit_html =~ m/value\=\"([0-9a-f]*)\" name\=\"wpEdittime\"/ );
    my $content2 = extract_edit_text($edit_html);

    unless (defined $content2 and defined $editToken and defined $editTime) {
        # Saving without source text or token could only damage the page.
        print "Edit form not understood.";
        append_log("Edit form not understood, no change made\n\n");
        next;
    }

    # substitute
    my $catfound = 0;    # was the page source actually changed?
    for my $cat (@red_cats) {
        next unless defined $cat->{target};
        next if $cat->{target} eq $cat->{name};    # nothing to rename
        my ($updated, $hits) = replace_category($content2, $cat->{name}, $cat->{target});
        if ($hits) {
            $content2 = $updated;
            $catfound = 1;
        }
    }

    # check for illegal characters
    # The textarea text is posted back verbatim, so any quote, angle bracket
    # or ampersand (and with it any HTML entity) blocks the save.
    # NOTE(review): the original tested four separate strings here, the first
    # of which no longer parsed; they look like entity-decoded "&quot;",
    # "&lt;", "&gt;" and "&amp;".  The class below covers both readings.
    my $has_illegal_char = ($content2 =~ /["<>&]/) ? 1 : 0;

    if ($catfound) {
        # there are changes to be made
        print "s/t cat found... ";
        if (!$has_illegal_char) {
            # no illegal character, upload the new content
            print "Updating... ";
            append_log("Change made\n\n");
            $response = $browser->post(
                "http://" . $WIKI_PATH . "/w/index.php?title=" . $WIKI_PAGE . "&action=submit",
                @ns_headers,
                Content_Type => 'form-data',
                Content      => [
                    wpTextbox1  => $content2,
                    wpSummary   => "[[User:STcatBot|STcatBot]]: simp/trad catnames",
                    wpSave      => "Save page",
                    wpSection   => "",
                    wpEdittime  => $editTime,
                    wpEditToken => $editToken,
                ],
            );
            $changemade += 1;
            print "Change made. Sleep 1 min.";
            sleep 60;
        }
        else {
            print "Illegal char found.";
            append_log("Illegal char found, no change made\n\n");
        }
    }
    elsif ($stcatfound) {
        # a counterpart exists but the page source did not contain the tag
        print "No substritute found.";
        append_log("No sub found.\n\n ");
    }
}

print "Done.\n";
open my $final_log, '>', 'STcatBot.log' or die "Could not write file.\n";
print {$final_log} $response->as_string;
close $final_log or die "Could not close STcatBot.log: $!\n";