STcatBot 2.0 源代码:

 #!/usr/bin/perl
 # STcatBot2.0.pl - Simplified and Traditional CATegorization roBOT
 # By WikiPedia:User:下一次登录
 # Portions largely taken or based on upload.pl by WikiPedia:User:Eloquence 
 #  and mwpush.pl by WikiPedia:User:KeithTyler
 
 # Tested on WindowsXP/Cygwin/ActivePerl
 # Corresponding robot: User:STcatBot (application in progress)
 
 # Disclaimer: No warranty is provided. Use at your own risk.
 
 # call requirements
 use Getopt::Std;
 use LWP::Simple;
 use LWP::UserAgent;
 use HTTP::Request;
 use HTTP::Response;
 use HTTP::Cookies;
 #use warnings;
 
 # Account and target wiki. Only ASCII (English) user names have been tested.
 my $username  = "STcatBot";	# bot account name
 my $password  = "****";	# bot account password
 my $WIKI_PATH = "zh.wikipedia.org";	# host name of the wiki to work on
 my $WIKI_PAGE;	# URL-form title of the page currently being handled
 
 ### Login to wiki
 
 # HTTP client; the empty hash gives it an in-memory cookie jar so cookies
 # received from the login request are sent with later requests.
 my $browser = LWP::UserAgent->new();
 $browser->cookie_jar( {} );
 
 # Headers sent with every request made by the script.
 # (An earlier User-Agent value was:
 #  Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20041107 Firefox/1.0)
 my @ns_headers = (
  'User-Agent' => 'STcatBot 2.0 by 下一次登录',
  'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
  'Accept-Charset' => 'iso-8859-1,*,utf-8',
  'Accept-Language' => 'en-US',
 );
 
 # Submit the login form.
 my $login_url = "http://".$WIKI_PATH."/w/index.php?title=Special:Userlogin&action=submitlogin";
 my @login_form = (
  wpName         => $username,
  wpPassword     => $password,
  wpRemember     => "1",
  wpLoginAttempt => "Log in",
 );
 $response = $browser->post($login_url, @ns_headers, Content => \@login_form);
 
 # A successful login answers with a redirect (302). Anything else is treated
 # as a failure: explain, dump the server reply to rfget.debug.out, and exit.
 unless ($response->code == 302) {
         print 
 "We weren't able to login. This could have the following causes:
 
 * The username ($username) or password may be incorrect.
   Solution: Re-run script with correct credentials.
 * The MediaWiki software on the target host has been upgraded.
   Solution: Go to http://commons.wikimedia.org/wiki/Commons:File_upload_service
   and get a new version of the upload script.
 * You are trying to hack this script for other wikis. The wiki you
   are uploading to has cookie check disabled.
   Solution: Try setting \$ignore_login_error to 1.
 
 Regardless, we will now try to write the output from the server to 
 rfget.debug.out....\n\n";
         open(my $debug_out, '>', 'rfget.debug.out') or die "Could not write file.\n";
         print {$debug_out} $response->as_string;
         print 
 "This seems to have worked. Take a look at the file for further information or
 send it to moeller AT scireview DOT de if you need help debugging the script.\n";
         close($debug_out);
         exit 1;
 }
 
 # Working variables shared by the whole script. Most of them are scratch
 # values that are overwritten before each use.
 
 # --- generic scraping scratch space ---
 my $URL;	# URL of the request currently being made
 my $filename1;	# text of the Special:Allpages reply; trimmed while it is parsed
 my $filestartstr;	# marker searched for as the start of a region
 my $filestart;	# position derived from the start marker
 my $fileendstr;	# marker searched for as the end of a region
 my $fileend;	# position of the end marker
 my $filename;	# not referenced elsewhere in the visible code
 
 # --- per-article category handling ---
 my $pagecontent;	# rendered article, later narrowed to its category links
 my $redcat;	# negative: no red category link left; otherwise its position
 my @unicat;	# red categories, URL-form titles (used to build request URLs)
 my @oricat;	# red categories, names as they appear in the link's title attribute
 my @tarcat;	# per red category: title of the existing counterpart, or -1 if none
 my $catlinecontent;	# HTML of one category link
 my $catcount;	# number of red categories found on the current article
 my $probecatcontent;	# reply received when probing a red category's edit page
 my $emptyprobe="class=\"selected new\"";	# present in the probe reply when the category does not exist
 my $oricattemp;	# temporary string used while extracting an @oricat entry
 
 # --- editing ---
 my $editToken;	# wpEditToken value scraped from the edit form
 my $editTime;	# wpEdittime value scraped from the edit form
 my $catfound;	# 1 once the article text has actually been changed
 my $stcatfound;	# 1 once any red category turned out to have a counterpart
 my $content2;	# article wikitext that is edited and uploaded
 my $content1;	# raw reply of the edit-form request
 my $special_char;	# HTML entity currently being decoded
 my $contain_char;	# reset per article; not read anywhere in the visible code
 
 my $changemade=0;	# number of uploads performed so far
 
 
 # --- Special:Allpages listing ---
 my $article_count=0;	# number of articles taken from the current listing page
 my @article_name;	# article names as shown in the title attribute (used for logs)
 my @article_unicode;	# article names in URL form (used to build request URLs)
 my $last_string;	# URL-form name of the last article of the previous run (init="%21")
 my $article_line;	# one table cell of the listing
 my $article_ID;	# index of the article currently being processed
 
 
 while(1) { #process
 
 	# Read the resume point: last_string.txt holds the URL-form name of the
 	# article the listing should start from. A missing or unreadable file is
 	# not fatal; the listing is then requested without a start article.
 	$last_string="";
 	if (open(my $resume_in, '<', 'last_string.txt')) {
 		local $/;	# slurp the whole file in one read
 		my $saved=<$resume_in>;
 		close($resume_in);
 		$last_string=$saved if defined $saved;
 		# A trailing newline (e.g. from editing the file by hand) must not
 		# end up inside the request URL.
 		$last_string=~s/\A\s+//;
 		$last_string=~s/\s+\z//;
 	}
 	else {
 		warn "Could not read last_string.txt ($!); listing from the start.\n";
 	}
 	
 	# Fetch the listing page that starts at that article.
 	$URL="http://".$WIKI_PATH."/wiki/Special:Allpages/".$last_string;
 	$response=$browser->get($URL, @ns_headers);
 	warn "Listing request failed: ".$response->status_line."\n" unless $response->is_success;
 	$filename1=$response->as_string;
 	$article_count=0;	#reset the article count
 	
 	# Keep only the part between the listing table's opening tag and the
 	# print footer. Each cut is made only when its marker is actually present;
 	# index() returns -1 otherwise and cutting anyway would slice at an
 	# arbitrary position.
 	$filestartstr="<table style=\"background: inherit;\" border=\"0\" width=\"100%\">";
 	$filestart=index($filename1, $filestartstr);
 	if($filestart>=0) {
 		$filename1=substr($filename1, $filestart+length($filestartstr));
 	}
 	else {
 		warn "Listing table not found in the reply from $URL\n";
 	}
 	
 	$fileendstr="<div class=\"printfooter\">";
 	$fileend=index($filename1, $fileendstr);
 	if($fileend>=0) {
 		$filename1=substr($filename1, 0, $fileend);
 	}
 	
 	# extract_allpages_entries($listing_html)
 	#   Scans the listing HTML for <td>...</td> cells and returns one
 	#   [ url_form_name, display_name ] pair per article link, in page order.
 	#   Cells marked with <div class="allpagesredirect"> are skipped, and so
 	#   are cells that contain no /wiki/ link at all (for example empty
 	#   cells), which would otherwise produce a bogus entry.
 	#   The display name is the link's title attribute; when that attribute
 	#   is missing the URL-form name is used instead.
 	#   (Named subs are compiled once, so defining it next to its only call
 	#   site inside the loop has no run-time cost.)
 	sub extract_allpages_entries {
 		my ($listing_html) = @_;
 		my @entries;
 		return @entries unless defined $listing_html;
 		
 		while ($listing_html =~ m{<td>(.*?)</td>}gs) {
 			my $cell = $1;
 			
 			# redirects are not processed
 			next if index($cell, "<div class=\"allpagesredirect\">") >= 0;
 			
 			# URL-form name: the text between <a href="/wiki/ and the closing quote
 			next unless $cell =~ m{<a href="/wiki/([^"]*)"(.*)}s;
 			my ($url_name, $after_href) = ($1, $2);
 			next unless length $url_name;
 			
 			# display name: the first title="..." that follows the href
 			my $display_name = $after_href =~ m{title="([^"]*)"} ? $1 : $url_name;
 			
 			push @entries, [ $url_name, $display_name ];
 		}
 		return @entries;
 	}
 	
 	#find all the article names without redirect
 	# Entries are stored by index and the arrays are deliberately not emptied
 	# first: only the first $article_count slots are meaningful afterwards.
 	$article_count=0;
 	foreach my $entry (extract_allpages_entries($filename1)) {
 		$article_unicode[$article_count]=$entry->[0];
 		$article_name[$article_count]=$entry->[1];
 		$article_count+=1;
 	}
 	
 	$article_ID=0;
 
 	while($article_ID<$article_count) { #go through all the pages and process
 		# Start the article with clean flags: nothing changed yet, no
 		# simplified/traditional counterpart seen yet.
 		($catfound, $stcatfound) = (0, 0);
 		
 		# Download the rendered article.
 		$WIKI_PAGE=$article_unicode[$article_ID];
 		$URL="http://".$WIKI_PATH."/wiki/".$WIKI_PAGE;
 		$response=$browser->get($URL, @ns_headers);
 		$pagecontent=$response->as_string;
 		print "\nConnected... ";
 		
 		$catcount=0;
 		
 		# The category links follow the "Special:Categories" link. When that
 		# link is absent $redcat stays negative and the article has no
 		# categories at all.
 		$filestartstr="<a href=\"/wiki/Special:Categories\" title=\"Special:Categories\">";
 		$redcat=index($pagecontent, $filestartstr);
 		
 		if($redcat>=0) {
 			# Narrow $pagecontent down to the category links: skip the marker
 			# itself (62 characters, the length of $filestartstr), look at no
 			# more than 10000 characters, and stop 4 characters before the
 			# first </div>.
 			$redcat+=62;
 			$pagecontent=substr($pagecontent, $redcat, 10000);
 			
 			$fileendstr="</div>";
 			$fileend=index($pagecontent, $fileendstr);
 			$pagecontent=substr($pagecontent, 0, $fileend-4);
 			
 			# A link containing "action=edit" is a red category link.
 			$filestartstr="action=edit";
 			$redcat=index($pagecontent, $filestartstr);
 			print "Cat found...  ";
 		}
 		
 		if($redcat>=0) {
 			# Remember in cat_log.txt which page had red categories.
 			open(my $cat_log, '>>', 'cat_log.txt');
 			print {$cat_log} $URL, "\n";
 			close($cat_log);
 			print "Redcat found...  ";
 		}
 		else {
 			print "No redcat.";
 		}
 		
 		# Walk through the category links left in $pagecontent, one link per
 		# iteration, for as long as an "action=edit" (red) link remains.
 		# For every red link three parallel array slots are filled at index
 		# $catcount: $unicat (name taken from the href), $oricat (name taken
 		# from the title attribute) and $tarcat (title of the page the wiki
 		# serves for that name, or -1 when the category does not exist).
 		while($redcat>=0) { #found red cat(s)
 			#extract one category link: from "<a href" up to "</a></span>"
 			$filestartstr="<a href";
 			$fileendstr="</a></span>";
 			$filestart=index($pagecontent, $filestartstr);
 			$fileend=index($pagecontent, $fileendstr);
 			$catlinecontent=substr($pagecontent, $filestart, $fileend-$filestart);
 			# Drop the link just taken. +14 is the 11-character "</a></span>"
 			# plus 3 more characters -- presumably a separator between links;
 			# TODO confirm against the page HTML.
 			$pagecontent=substr($pagecontent, $fileend+14, 10000);
 			
 			#is the cat red?
 			$filestartstr="action=edit";
 			if(index($catlinecontent, $filestartstr)>=0) { #if the cat is red...
 				#extract unicat: the href text up to "&amp;action=edit"
 				# NOTE(review): 28 is a fixed offset, presumably the length of
 				# <a href="/w/index.php?title= -- confirm.
 				$fileendstr="&amp;action=edit";
 				$filestart=28;
 				$fileend=index($catlinecontent, $fileendstr);
 				$unicat[$catcount]=substr($catlinecontent, $filestart, $fileend-$filestart);
 				
 				#extract oricat: the name following title="Category: (16 characters)
 				# Only the first half, minus one character, of the remaining
 				# text is kept. This appears to assume the remainder reads
 				# NAME">NAME (title attribute followed by identical link
 				# text) -- verify.
 				$filestartstr="title=\"Category:";
 				$filestart=index($catlinecontent, $filestartstr);
 				$oricattemp=substr($catlinecontent, $filestart+16, 1000);
 				$oricat[$catcount]=substr($oricattemp, 0, length($oricattemp)/2-1);
 				
 				#does it have a simp/trad corresponding cat?
 				# Request the category's edit page and inspect the reply.
 				$URL="http://".$WIKI_PATH."/w/index.php?title=".$unicat[$catcount]."&action=edit";
 				$response=$browser->get($URL, @ns_headers);
 				$probecatcontent=$response->as_string ;
 				
 				# $emptyprobe is defined outside this block; its absence from
 				# the reply is taken to mean that the category page exists.
 				if(index($probecatcontent, $emptyprobe)<0) { #if there is a corresponding cat...
 					#extract tarcat from the <title> element
 					# NOTE(review): +28 is a fixed offset from the start of
 					# "<title>", presumably skipping the tag and a fixed
 					# title prefix -- confirm against a real reply.
 					$filestartstr="<title>";
 					$filestart=index($probecatcontent, $filestartstr);
 					$filestart+=28;
 					$probecatcontent=substr($probecatcontent, $filestart, 1000);
 					$fileendstr=" - Wikipedia</title>";
 					$fileend=index($probecatcontent, $fileendstr);
 					$tarcat[$catcount]=substr($probecatcontent, 0, $fileend);
 					print "s/t  ";
 					$stcatfound=1;
 				}
 				else {
 					# -1 marks "no existing counterpart" for this red category
 					$tarcat[$catcount]=-1;
 					print "n/e  ";
 				}
 				
 				#count the red cats	
 				$catcount+=1;
 			}
 			# Look for the next red link in what is left of the category section.
 			$filestartstr="action=edit";
 			$redcat=index($pagecontent, $filestartstr);
 		}
 		
 		# replace_category_links($wikitext, $old_name, $new_name)
 		#   Rewrites every "[Category:OLD" / "[category:OLD" link prefix in
 		#   $wikitext to "[Category:NEW" and returns (new_text, replacements).
 		#   * The text is scanned once, so a new name that contains the old
 		#     one (Foo -> FooNew) cannot be matched again and loop forever.
 		#   * The old name must be followed (after optional whitespace) by
 		#     "]" or "|", so a longer category such as "Foobar" is left alone
 		#     when "Foo" is being replaced.
 		#   Undefined or empty names leave the text untouched.
 		#   (Named subs are compiled once, so defining it next to its only
 		#   call site inside the loop has no run-time cost.)
 		sub replace_category_links {
 			my ($wikitext, $old_name, $new_name) = @_;
 			return ($wikitext, 0)
 				unless defined $wikitext
 				and defined $old_name and length $old_name
 				and defined $new_name and length $new_name;
 			
 			my $replaced = ($wikitext =~ s/\[[Cc]ategory:\Q$old_name\E(?=\s*[\]|])/[Category:${new_name}/g);
 			return ($wikitext, $replaced ? $replaced : 0);
 		}
 		
 		if($catcount>0) { #red categories were found on this article
 			# note the number of red categories in cat_log.txt
 			if(open(my $cat_log, '>>', 'cat_log.txt')) {
 				print {$cat_log} "Found ", $catcount, " red cat(s).\n";
 				close($cat_log);
 			}
 			
 			# Only red categories with an existing counterpart can be fixed.
 			# The sentinel -1 is compared as a string: category titles are
 			# strings, and a numeric comparison would misjudge a title that
 			# starts with a negative number.
 			my @fixable=grep { defined $tarcat[$_] && $tarcat[$_] ne "-1" } 0 .. $catcount-1;
 			
 			if(@fixable) { #the edit form is needed only when something can change
 				$URL="http://".$WIKI_PATH."/w/index.php?title=".$WIKI_PAGE."&action=edit";
 				$response=$browser->get($URL, @ns_headers);
 				$content1=$response->as_string;
 				# Get EditToken and edit time; both are sent back when saving
 				($editToken) = ( $content1 =~ m/value\=\"([0-9a-f\\]*)\" name\=\"wpEditToken\"/ );
 				($editTime) = ( $content1 =~ m/value\=\"([0-9a-f]*)\" name\=\"wpEdittime\"/ );
 				
 				# Locate the wikitext inside the edit box.
 				# NOTE(review): the text is assumed to start 92 characters
 				# after the start of the marker below. The marker itself is 80
 				# characters long, so 12 more characters of the tag are
 				# skipped -- confirm against the real edit form.
 				$filestartstr="<textarea tabindex='1' accesskey=\",\" name=\"wpTextbox1\" id=\"wpTextbox1\" rows='25'";
 				$fileendstr="</textarea>";
 				$filestart=index($content1, $filestartstr);
 				# the closing tag is searched for after the opening one
 				$fileend=($filestart<0) ? -1 : index($content1, $fileendstr, $filestart);
 				
 				if($filestart<0 || $fileend<$filestart+92) {
 					# No usable edit box in the reply: leave $catfound at 0 so
 					# nothing gets uploaded for this article.
 					print "Edit box not found...  ";
 				}
 				else {
 					$filestart+=92;
 					$content2=substr($content1, $filestart, $fileend-$filestart);
 					
 					#substitute
 					foreach my $i (@fixable) {
 						my ($updated, $hits)=replace_category_links($content2, $oricat[$i], $tarcat[$i]);
 						if($hits>0) {
 							$content2=$updated;
 							$catfound=1;
 						}
 					}
 				}
 			}
 		}
 		
 		# decode_basic_entities($text)
 		#   Turns the HTML entities &quot; &lt; &gt; and &amp; back into the
 		#   characters " < > and &, and returns the result.
 		#   The text is scanned once, so every entity is decoded exactly one
 		#   level: "&amp;amp;" becomes "&amp;" and "&amp;lt;" becomes "&lt;".
 		#   Decoding in repeated passes would turn "&amp;amp;" into "&" and
 		#   silently alter the text.
 		#   An undefined argument is returned unchanged.
 		#   (Named subs are compiled once, so defining it next to its only
 		#   call site inside the loop has no run-time cost.)
 		sub decode_basic_entities {
 			my ($text) = @_;
 			return $text unless defined $text;
 			
 			my %char_for = (
 				'&quot;' => "\"",
 				'&lt;'   => "<",
 				'&gt;'   => ">",
 				'&amp;'  => "&",
 			);
 			$text =~ s/(&quot;|&lt;|&gt;|&amp;)/$char_for{$1}/g;
 			return $text;
 		}
 		
 		#check for illegal characters
 		$contain_char=-1;	# kept as before; nothing in the visible code reads it
 		
 		# The edit box content arrives HTML-escaped; undo that before upload.
 		$content2=decode_basic_entities($content2);
 		
 		if($catfound==1) { #the text was changed: save it
 			print "s/t cat found...  ";
 			print "Updating...  ";
 			
 			# Submit the edit form with the new text, as a minor edit.
 			$response=$browser->post(
 				"http://".$WIKI_PATH."/w/index.php?title=".$WIKI_PAGE."&action=submit",
 				@ns_headers,
 				Content_Type=>'form-data',
 				Content=>[
 					wpTextbox1 => $content2,
 					wpSummary => "[[User:STcatBot|STcatBot]]: simp/trad catnames",
 					wpSave => "Save page",
 					wpSection => "",
 					wpEdittime => $editTime,
 					wpEditToken => $editToken,
 					wpMinoredit => "1",
 				]);
 			
 			# Log and count the change only when the server did not answer
 			# with an error status (4xx/5xx); otherwise record the failure.
 			if($response->is_error) {
 				print "Update failed: ".$response->status_line;
 				if(open(my $cat_log, '>>', 'cat_log.txt')) {
 					print {$cat_log} "Update failed: ".$response->status_line."\n\n";
 					close($cat_log);
 				}
 			}
 			else {
 				if(open(my $cat_log, '>>', 'cat_log.txt')) {
 					print {$cat_log} "Change made\n\n";
 					close($cat_log);
 				}
 				$changemade+=1;
 				print "Change made. Sleep.";
 			}
 			sleep 1;	# pause after every upload attempt
 		}
 		else { #nothing was changed
 			if($stcatfound==1) {
 				# A counterpart category exists but no matching link was
 				# found in the text: list the article in ns_log.txt so that
 				# it can be checked by hand.
 				print "No substritute found.";
 				if(open(my $ns_log, '>>', 'ns_log.txt')) {
 					print {$ns_log} "#[[", $article_name[$article_ID], "]]\n";
 					close($ns_log);
 				}
 			}
 		}
 		
 		$article_ID+=1;
 	
 	} #while ID<count
 	
 	# Record where the next listing page has to start.
 	# Nothing parsed from the listing means there is no new resume point;
 	# indexing [$article_count-1] would then pick a stale or undefined entry
 	# and the outer loop would request the same page again without pause.
 	if($article_count<1) {
 		print "\nNo articles found in the listing; stopping.\n";
 		last;
 	}
 	
 	my $resume_name=$article_unicode[$article_count-1];
 	
 	# The listing starts at the recorded article, so ending on that same
 	# article means no progress is possible any more.
 	(my $previous_name = defined $last_string ? $last_string : "") =~ s/\A\s+|\s+\z//g;
 	if($resume_name eq $previous_name) {
 		print "\nNo further articles after the resume point; stopping.\n";
 		last;
 	}
 	
 	# Without a stored resume point the next pass would redo this page
 	# forever, so a write failure is fatal.
 	open(my $resume_out, '>', 'last_string.txt')
 		or die "Could not write last_string.txt: $!\n";
 	print {$resume_out} $resume_name;
 	close($resume_out)
 		or die "Could not finish writing last_string.txt: $!\n";
 	
 } #while whole