SpamFerret.php

from HTYP, the free directory anyone can edit if they can prove to me that they're not a spambot
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Navigation

{{#lst:SpamFerret|navbar}}: SpamFerret.php

Code

<?php /*

HISTORY:
 2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
 2007-09-30 (Wzl) fixing regex processing
 2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist)
 2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL
 2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong.
 2007-10-28 (Wzl) Events with THR and AMP codes *still* weren't being recorded because method call was improperly formatted.
 2007-12-23 (Wzl) Emails wikimaster if eregi() returns an error (due to improperly formatted regex)
 2007-12-26 (Wzl) Spam turd rejection / logging
 2007-12-27 (Wzl) Fixed spam turd detection to work for new pages too (will probably need refinement)
 2007-12-27 (Wzl) Ooops. Replaced missing if-block from regex results when inserted text is found.
 2008-08-29 (Wzl) Added permanent IP blocking
 2008-09-04 (Wzl) Added (optional) logging of successful edits
 2008-09-19 (Wzl) Actually *set* the "didEdit" flag for successful edits <facepalms>
 2008-10-21 (Wzl) Fixed minor syntax error in "defines"
 2009-02-25 (Wzl) $objArticleCurr->loadLastEdit() now causes error in MW 1.14 (was it necessary before?)
 2009-03-10 (Wzl) "require" -> "require_once" so other extensions can use data.php without conflict
   also optional $kfpWzlLibs so data.php can be somewhere not on the path
 2009-03-18 (Wzl) Got rid of shared.php requirement; now using kfpLib to locate data.php
 2009-03-26 (Wzl) Rewrote data library calls to use new classes (no longer using deprecated/removed classes)
   Also modified to use newer function hooks
 2009-04-24 (Wzl) fixed "strict" bug referencing unset $txtCurr when creating new page
 2009-07-05 (Wzl) Using LibMgr
 2009-07-14 (Wzl) Added attempts.Diff field, patterns.isDiff
 2009-07-15 (Wzl)
   On advice from FreeNode##php, changed from eregi() to preg_match()
   Added option to match diff results instead of submitted edit only
   Saves diff of each change, approved or not
   BUG: approved edits are not being logged properly; using "OK" code and logging as failed
 2009-07-26 (Wzl) fixed minor warning error on line 252
 2009-08-07 (Wzl) email notification working; removed TRD and AMP hard-coded offenses, to be redone as isDiff filters if needed
 2009-08-08 (Wzl) create client record immediately if client is not recognized; don't depend on spam filter being triggered
 2009-08-09 (Wzl) restructured "client" and "attempt" tables (was "clients", "attempts"); not backwards-compatible
 2010-02-24 (Wzl) some code-tidying; trying to restrict passing of data between methods to single array var in args/return
 2010-08-17 (Wzl) added some debug code to CheckRegex(); fixed problem with escaped chars in filter
 2011-04-26 (Wzl) minor bug fixes; hand-merge with version on htyp.org
TO DO:
 * Throttled save attempts should check for spam, just for data-gathering purposes.
   Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
 * Figure out how to display a different error message than "the following text is what triggered our filter:"
OPTIONAL SETTINGS:
 kfpLib - path to data.php folder (no final slash)
 kfsLib_Data - filespec of data.php
*/
# Loader for spam blacklist feature
# Include this from LocalSettings.php

if ( defined( 'MEDIAWIKI' ) ) {

// Register the extension so it is listed with the wiki's other extensions.
$wgExtensionCredits['other'][] = array(
    'name'        => 'SpamFerret',
    'author'      => 'Woozle Staddon',
    'url'         => 'http://htyp.org/SpamFerret',
    'version'     => '2011-04-26',
    'description' => 'database-driven wikispam blocker',
);

// Work out where data.php lives unless the site configuration already said so.
// kfpLib (optional) names the folder holding data.php; without it we assume
// data.php can be found on the include path.
if (!defined('kfsLib_Data')) {
    define('kfsLib_Data', defined('kfpLib') ? kfpLib.'/data.php' : 'data.php');
}

// Pull in the library manager if nothing has flagged it as loaded yet,
// then register and load the data library through it.
if (!defined('LIBMGR')) {
    require('libmgr.php');
}
clsLibMgr::Add('data', kfsLib_Data, __FILE__, __LINE__);
clsLibMgr::Load('data', __FILE__, __LINE__);

/* ==============

SET UP CALLBACKS
*/

global $wgFilterCallback, $wgPreSpamFilterCallback;

$wgPreSpamFilterCallback = false;
if ( !defined( 'MW_SUPPORTS_EDITFILTERMERGED' ) ) {
    // No EditFilterMerged hook available: take over $wgFilterCallback,
    // remembering whatever callback was installed before us.
    if ( $wgFilterCallback ) {
        $wgPreSpamFilterCallback = $wgFilterCallback;
    }
    $wgFilterCallback = 'wfSpamFerretFilter';
} else {
    // Hook-based registration.
    $wgHooks['EditFilterMerged'][] = 'wfSpamFerretMerged';
}

/* $wgHooks['EditFilter'][] = 'wfSpamFerretValidate'; $wgHooks['ArticleSaveComplete'][] = 'wfSpamFerretArticleSave'; $wgHooks['APIEditBeforeSave'][] = 'wfSpamFerretAPIEditBeforeSave';

*/

/* ================

SET GLOBAL OBJECTS
*/

/**
 * Returns the shared SpamFerret object, creating it on the first call.
 * Every later call hands back that same instance.
 */
function GetSpamFerret() {
    static $objFerret = NULL;

    if (is_null($objFerret)) {
        $objFerret = new SpamFerret();
    }
    return $objFerret;
}

/* ================
 CALLBACK FUNCTIONS
*/

/**
 * Hook function for $wgFilterCallback
 *
 * Writes a marker line to the page output, then passes the edit to the shared
 * SpamFerret object.
 *
 * @param $title    title of the page being edited (by reference)
 * @param $text     text of the submitted edit
 * @param $section  section being edited
 * @return whatever SpamFerret::filter() returns: FALSE when the edit is
 *   acceptable, otherwise TRUE or a string describing the match
 */
// alternative signature, not in use:
//function wfSpamFerretFilter( &$title, $text, $section, &$hookErr, $editSummary ) {
function wfSpamFerretFilter( &$title, $text, $section ) {
    global $wgOut;

    $spamObj = GetSpamFerret();
    $wgOut->addWikiText( "Intercepted by SpamFerretFilter" );
    return $spamObj->filter( $title, $text, $section );
}

/**
 * Hook function for EditFilterMerged, replaces wfSpamBlacklistFilter
 *
 * @param $editPage     edit page object; supplies the title and shows the spam page
 * @param $text         text of the submitted edit
 * @param $hookErr      (by reference) not used here
 * @param $editSummary  not used here
 * @return TRUE if the edit may proceed, FALSE if it was rejected
 */
function wfSpamFerretMerged( $editPage, $text, &$hookErr, $editSummary ) {
    global $wgTitle;

    if ( is_null( $wgTitle ) ) {
        # API mode
        # wfSpamBlacklistFilterAPIEditBeforeSave already checked the blacklist
        return true;
    }
    $spamObj = GetSpamFerret();
    $title = $editPage->mArticle->getTitle();

    // This hook is not given a section, so an empty string is passed in its place.
    $ret = $spamObj->filter( $title, $text, '', $editPage );
    if ( $ret !== false ) {
        $editPage->spamPage( $ret );
    }
    // additional text can be added here:
    // $wgOut->addWikiText( "Intercepted by SpamFerretMerged" );

    // Return convention for hooks is the inverse of $wgFilterCallback
    return ( $ret === false );
}

/**
 * Hook function for APIEditBeforeSave
 *
 * Currently an empty stub: it performs no checks and returns nothing.
 */
function wfSpamFerretAPIEditBeforeSave( &$editPage, $text, &$resultArr ) { }

/*=====
  CLASS: SpamFerret
  PURPOSE: checks a submitted edit against the filter patterns stored in the
    spam database, throttles clients that keep tripping the filter, and logs
    every attempt (rejected or accepted) to the "attempt" table.
  USAGE: obtain the shared instance and call filter().
*/
class SpamFerret {

  // Callback to run before our own checks.
  // NOTE(review): nothing in this file assigns it - confirm whether it should
  //   be set from $wgPreSpamFilterCallback.
  var $previousFilter = false;

  // internal data
  var $dbSpam;			// database object created by OpenDatabase()
  var $objDataClients;		// lookup result for the current client
  var $strIPAddr;		// client address as returned by wfGetIP()
  var $idPattern;		// ID of the pattern most recently checked (NULL = none)
  var $doClearThrottle;		// when true, RecordClientSpam() resets Retries to 0
  var $txtEditRaw;		// DEPRECATED: raw text of the submitted edit
  var $isMatch = FALSE;		// set by CheckFilters(): did any pattern match?
  var $strMatch;		// set by CheckFilters(): the text that matched
  var $objFilts;		// cached "patterns" table object; see FiltTbl()

  /*-----
    RETURNS: the named entry of $wgSpamFerretSettings, or NULL if it is not set
  */
  function Setting($iName) {
    global $wgSpamFerretSettings;
    return isset($wgSpamFerretSettings[$iName]) ? $wgSpamFerretSettings[$iName] : NULL;
  }
  /*-----
    ACTION: decides whether a submitted edit may be saved
      1. runs $this->previousFilter, if any; a true result rejects the edit
      2. looks up the client's address; a blocked or over-limit client is
	 rejected while its ThrottleTime is below the 'throttle_timeout' setting
      3. otherwise diffs the edit against the current page and runs the filters
      4. logs the outcome and reports any accumulated errors
    INPUT:
      $title - title of the page being edited
      $text - submitted text
      $section - passed to the previous filter only
      $editPage - edit page object, or FALSE when the caller has none
    RETURNS:
      FALSE if the edit is acceptable
      TRUE if the client is throttled or blocked, or the previous filter rejected it
      string describing the match if a filter pattern matched
    PROPERTIES USED: $this->idPattern (out)
  */
  function filter( &$title, $text, $section, $editPage = FALSE ) {
    global $gErrorText;

    $fname = 'wfSpamFerretFilter';
    wfProfileIn( $fname );

    # Call the rest of the hook chain first
    if ( $this->previousFilter ) {
      $f = $this->previousFilter;
      if ( $f( $title, $text, $section ) ) {
	wfProfileOut( $fname );
	return true;
      }
    }

    // initialize variables
    $retVal = FALSE;	// default = assume edit is ok
    $gErrorText = FALSE;
    $isClientKnown = FALSE;
    $this->txtEditRaw = $text;		// DEPRECATED
    $arArgs = array();
    $arArgs['edit-raw'] = $text;

    // get the IP address of the http client making the edit attempt:
    $this->strIPAddr = wfGetIP();

    // Open the database
    $this->OpenDatabase();

    // open clients table (extended Throttle version) for reference:
    $objTblClients = new clsTable($this->dbSpam);
    $objTblClients->Name('ClientThrottle2');
    $objTblClients->KeyName('Address');

    // Look up to see if this IP is known; it may already be throttled:
    $this->objDataClients = $objTblClients->GetData('Address="'.$this->strIPAddr.'"');
    if (is_object($this->objDataClients) && $this->objDataClients->hasRows()) {
      $isClientKnown = TRUE;
    }

    if ($isClientKnown) {
      $this->objDataClients->FirstRow();
      $doBlock = $this->objDataClients->doBlock;
      if ($doBlock) {
	$strThrType = 'BLK';
      } else {
	$intRetries = $this->objDataClients->Retries;
	$intThrottle = $this->Setting('throttle_retries');
	$doBlock = $intRetries > $intThrottle;
	$strThrType = 'THR-'.$intRetries;
      }
      if ($doBlock) {
	$arArgs['diff'] = NULL;	// not applicable
	// retry limit exceeded; check timeout limit
	// NOTE(review): the timeout test is applied to 'BLK' clients too - confirm that is intended
	if ($this->objDataClients->ThrottleTime < $this->Setting('throttle_timeout')) {
	  $txtMsg = 'Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.';
	  if (is_object($editPage)) {
	    $editPage->spamPage($txtMsg);
	  } else {
	    EditPage::spamPage($txtMsg);	// older MW code doesn't supply $editPage
	  }
	  $arArgs['code'] = $strThrType;
	  $this->RecordAttempt($arArgs);	// record post attempt by throttled client
	  $retVal = true;	// client has exceeded spam limit; impose throttle
	} else {
	  $this->doClearThrottle = true;
	}
      }
    } else {
      $this->CreateClient();
    }

    if (!$retVal) {
      $arRtn = $this->GetDiff($title);	// get the diff between edit and current contents
      $arArgs['diff'] = $arRtn['diff'];
      $arArgs['doAll'] = FALSE;
      $arRtn = $this->CheckFilters($arArgs);
      $arArgs['edit-to-check'] = $arRtn['edit-to-check'];
      if ( $this->isMatch ) {
	// spam cue found; display the matching text and don't allow the edit to be saved:
	wfDebug( "Match!\n" );
	// The string sent to spamPage() will be shown after
	// "The following text is what triggered our spam filter:"
	$retVal = '(pattern #'.$this->idPattern.') ['.htmlspecialchars($this->strMatch).']';
	// Log the spam attempt:
	$arArgs['code'] = '-';
	$this->RecordAttempt($arArgs);
      } else {
	// no spam cues found; allow the edit to be saved, if nothing else has tripped the filter
	// (the 'log_ok_edits' setting is not consulted; accepted edits are always logged)
	$this->idPattern = NULL;
	$this->RecordOkEdit($arArgs);
      }
    }

    wfProfileOut( $fname );
    $this->ReportErrors();
    return $retVal;
  }
  /*-----
    ACTION: creates and opens the spam database object from the 'dbspec' setting
  */
  public function OpenDatabase() {
    $this->dbSpam = new clsDatabase($this->Setting('dbspec'));
    $this->dbSpam->Open();
  }
  /*-----
    RETURNS: table object for the "patterns" table (key "ID"), created on
      first use and reused afterwards
  */
  public function FiltTbl() {
    if (empty($this->objFilts)) {
      $objTbl = new clsTable($this->dbSpam);
      $objTbl->Name('patterns');
      $objTbl->KeyName('ID');
      $this->objFilts = $objTbl;
    }
    return $this->objFilts;
  }
  /*-----
    INPUT:
      $iTitle - page for comparing proposed edit
      $this->txtEditRaw (DEPRECATED; use $iarArgs['text-to-check']
    OUTPUT:
      return ['diff'] - result of FigureDiff() between current page text and
	the submitted text; for a page that does not exist yet, the submitted
	text prefixed with '!!NEW: '
  */
  public function GetDiff($iTitle) {
    $objArticleCurr = new Article($iTitle);
    if ($objArticleCurr->exists()) {
      $txtCurr = $objArticleCurr->getContent();
      $txtDiff = FigureDiff($txtCurr,$this->txtEditRaw);
    } else {
      $txtDiff = '!!NEW: '.$this->txtEditRaw;
    }
    $arOut = array();
    $arOut['diff'] = $txtDiff;
    return $arOut;
  }
  /*-----
    ACTION: checks the submitted text (or its diff, for isDiff patterns)
      against the pattern rows; stops at the first match unless 'doAll' is set
    INPUT:
      $iarArgs['doAll'] - TRUE = check every pattern (not just active ones)
	and keep going after a match, collecting matches in $gFilterMatches
      $iarArgs['diff'] - diff text, used for patterns flagged isDiff
      $this->txtEditRaw - DEPRECATED; use $iarArgs
    OUTPUT:
      $this->idPattern - ID of the last pattern checked
      $this->isMatch - TRUE if at least one pattern matched
      $this->strMatch - matched text
      $gFilterCount, $gFilterRows - number of patterns checked / rows loaded
      return ['edit-to-check'] - lower-cased edit text as it was checked
  */
  public function CheckFilters(array $iarArgs) {
    global $gRegexMatches,$gFilterMatches,$gFilterRows,$gFilterCount;

    assert(is_object($this->dbSpam));
    $doCheckAll = $iarArgs['doAll'];
    $strChkDiff = strtolower(nz($iarArgs['diff']));
    $objFiltTbl = $this->FiltTbl();
    $sqlFilt = $doCheckAll ? NULL : 'isActive';
    $objRow = $objFiltTbl->GetData($sqlFilt);

    $strTextEdit = strtolower($this->txtEditRaw);
    $arOut = array();
    $arOut['edit-to-check'] = $strTextEdit;	// text after being massaged for checking

    $this->isMatch = FALSE;
    $gFilterCount = 0;
    $gFilterRows = $objRow->RowCount();
    while ($objRow->NextRow() && (!$this->isMatch || $doCheckAll)) {
      $isMatch = FALSE;
      $strTextCk = $objRow->isDiff ? $strChkDiff : $strTextEdit;
      if (!is_null($strTextCk)) {
	$gFilterCount++;
	$this->idPattern = $objRow->ID;
	if ($objRow->isRegex) {
	  // The pattern is passed unmodified and matched case-insensitively.
	  // Lower-casing a regex would corrupt escapes such as \S, \D or \W.
	  $isMatch = $this->CheckRegex($objRow->Pattern,$strTextCk,TRUE);
	  if ($isMatch === FALSE) {
	    // preg_match() failed (e.g. malformed pattern); report rather than ignore
	    $arErr = error_get_last();
	    $strErr = is_array($arErr) ? $arErr['message'] : 'no message available';
	    $this->AddErrorLine('Filter #'.$this->idPattern.' generated error "'.$strErr.'" (PCRE error code '.preg_last_error().')');
	  }
	  if ($isMatch) {
	    $this->strMatch = $gRegexMatches[0];
	  }
	} else {
	  $strPattern = strtolower($objRow->Pattern);
	  if (empty($strPattern)) {
	    $isMatch = FALSE;
	  } else {
	    // stristr() returns the text from the first occurrence onward, or FALSE
	    $this->strMatch = stristr($strTextCk,$strPattern);
	    $isMatch = ($this->strMatch != '');
	  }
	}
	if ($isMatch) {
	  $this->isMatch = TRUE;
	  if ($doCheckAll) {
	    $gFilterMatches[$this->idPattern] = $this->strMatch;
	  }
	}
      }
    }
    return $arOut;
  }
  /*----
    ACTION: runs one regex pattern against a text
    INPUT:
      $iPattern - regex without delimiters; any '/' in it is escaped here
      $iText - text to search
      $iCaseless - TRUE = match case-insensitively (default FALSE)
    OUTPUT:
      $gRegexMatches - match array filled in by preg_match()
      $strDbg - a description of the call is appended
    RETURNS: result of preg_match(): 1 = match, 0 = no match, FALSE = error
    TO DO:
      replace $gRegexMatches with return array
      make this function static
  */
  public function CheckRegex($iPattern,$iText,$iCaseless=FALSE) {
    global $gRegexMatches,$strDbg;

    $chDelim = '/';
    // (2010-08-17) doubling backslashes here caused incorrect handling of
    //   escaped characters in the filter, so only the delimiter is escaped
    $strPattCk = str_replace($chDelim,'\\'.$chDelim,$iPattern);
    $strFinal = $chDelim.$strPattCk.$chDelim.($iCaseless ? 'i' : '');
    if (!isset($strDbg)) {
      $strDbg = '';
    }
    $strDbg .= "@preg_match(\"$strFinal\",\"$iText\",...)";
    // warnings are suppressed; callers detect failure from the FALSE return value
    return @preg_match($strFinal,$iText,$gRegexMatches);
  }
  /*----
    ACTION: appends one line to the global error text sent by ReportErrors()
  */
  public function AddErrorLine($iText) {
    global $gErrorText;
    $gErrorText .= $iText."\n";
  }
  /*----
    ACTION: if any error text has accumulated, emails it to $wgEmergencyContact
  */
  public function ReportErrors() {
    global $wgUser;
    global $wgEmergencyContact;
    global $gErrorText;

    if ($gErrorText) {
      $msgEmail = 'Filter error report for user '.$wgUser->getName().":\n\n";
      $msgEmail .= $gErrorText;
      mail($wgEmergencyContact,'spamferret filter error',$msgEmail);
    }
  }
  /*----
    ACTION: Create a new record for the current client
  */
  public function CreateClient() {
    $sql = 'INSERT INTO client (Address,WhenFirst,Count,Retries) VALUES("'.$this->strIPAddr.'",NOW(),1,0)';
    $this->dbSpam->Exec($sql);
  }
  /*----
    ACTION: Update a client's record to reflect a new spam attempt.
      Retries is reset to 0 when $this->doClearThrottle is set, otherwise
      incremented. If no row was updated, the client record is created and
      an error line is added.
  */
  public function RecordClientSpam() {
    $strRetries = $this->doClearThrottle ? '0' : 'Retries+1';
    $sql = 'UPDATE client SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address="'.$this->strIPAddr.'"';
    $this->dbSpam->Exec($sql);
    if ($this->dbSpam->RowsAffected() < 1) {
      $this->CreateClient();
      $this->AddErrorLine('Record not found for client '.$this->strIPAddr);
    }
  }
  /*-----
    ACTION: logs a rejected edit to the "attempt" table, updates the client's
      record, and (when a pattern is involved) updates that pattern's counters
    INPUT:
      $this->idPattern - pattern that matched, or NULL
      $iarArgs['code'] - event code stored with the attempt
      $iarArgs['diff']
      $iarArgs['edit-raw']
  */
  public function RecordAttempt(array $iarArgs) {
    global $wgTitle, $wgServer;

    $iCode = $iarArgs['code'];
    $txtDiff = $iarArgs['diff'];		// was $this->txtDiff
    $txtEdit = $iarArgs['edit-raw'];	// was $this->txtEditChk
    $this->RecordClientSpam();

    $sqlCode = '"'.$iCode.'"';
    $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
    $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
    $sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
    $sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
    $sqlAddr = '"'.$this->strIPAddr.'"';
    // session id is client-supplied, so pass it through SafeParam() like the other strings
    $sqlSess = SQL_Value($this->dbSpam->SafeParam(session_id()));
    $isPattern = !is_null($this->idPattern);
    $sqlPattern = $isPattern ? $this->idPattern : 'NULL';

    $sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES'
      .'(NOW(),'
      .$sqlPattern.','
      .$sqlAddr.','
      .$sqlSess.','
      .$sqlSrvr.','
      .$sqlPage.','
      .$sqlCode.','
      .'FALSE,'
      .$sqlEdit.','
      .$sqlDiff.')';
    $ok = $this->dbSpam->Exec($sql);
    if ($ok !== TRUE) {
      $this->AddErrorLine('SQL ['.$sql.'] in RecordAttempt() generated this error: '.$ok);
    }
    if ($isPattern) {
      $sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$this->idPattern;
      $ok = $this->dbSpam->Exec($sql);
      if ($ok !== TRUE) {
	$this->AddErrorLine('SQL ['.$sql.'] generated this error: '.$ok);
      }
    }
  }
  /*-----
    ACTION: logs an accepted edit to the "attempt" table (Code "ok", didAllow TRUE)
    INPUT:
      $iarArgs['diff']
      $iarArgs['edit-to-check'] - edit text as checked by CheckFilters()
  */
  public function RecordOkEdit(array $iarArgs) {
    global $wgTitle, $wgServer;

    $txtDiff = $iarArgs['diff'];		// was $this->txtDiff
    $txtEdit = $iarArgs['edit-to-check'];	// was $this->txtEditChk
    $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
    $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
    $sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
    $sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
    // session id is client-supplied, so pass it through SafeParam() like the other strings
    $sqlSess = SQL_Value($this->dbSpam->SafeParam(session_id()));

    $sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES '
      .'(NOW(),NULL,'
      .'"'.$this->strIPAddr.'",'
      .$sqlSess.','
      .$sqlSrvr.','
      .$sqlPage.',"ok",TRUE,'
      .$sqlEdit.','
      .$sqlDiff.')';
    $ok = $this->dbSpam->Exec($sql);
    if ($ok !== TRUE) {
      $this->AddErrorLine('SQL ['.$sql.'] in RecordOkEdit() generated this error: '.$ok);
    }
  }

}

/*----
  ACTION: stores the given error number and message in the globals
    $errNum and $errStr, overwriting any earlier values
*/
function ErrorHandler ($errno ,$errstr) {
  $GLOBALS['errNum'] = $errno;
  $GLOBALS['errStr'] = $errstr;
}

/*----
  ACTION: formats a value for inclusion in an SQL statement
  RETURNS:
    'NULL' for a NULL value
    the value unchanged if is_numeric() accepts it
    otherwise the value wrapped in double quotes
  NOTE: no escaping is done here; callers must escape strings themselves.
*/
function SQL_Value($iVar) {
  if (is_null($iVar)) {
    return 'NULL';
  }
  if (is_numeric($iVar)) {
    return $iVar;
  }
  return '"'.$iVar.'"';
}

/*----
  ACTION: writes both texts to temporary files, runs the external "diff"
    command on them, and removes the files again
  INPUT:
    $iTextOld - current text
    $iTextNew - proposed text
  RETURNS: output of the diff command (empty string if it could not be started)
*/
function FigureDiff($iTextOld, $iTextNew) {
  # Make temporary files
  // $td = wfTempDir();
  $td = session_save_path();
  $fnOld = tempnam( $td, 'SpamFerret-old-' );
  $fnNew = tempnam( $td, 'SpamFerret-new-' );
  $fhOld = fopen( $fnOld, 'w' );
  $fhNew = fopen( $fnNew, 'w' );
  fwrite( $fhOld, $iTextOld );
  fclose( $fhOld );
  fwrite( $fhNew, $iTextNew );
  fclose( $fhNew );

  // this setting applies to all uses of a given filter DB
  // can change for different DBs
  $ksSpamFerretDiffOptions = '-i -E --suppress-common-lines ';
  $cmd = 'diff '.$ksSpamFerretDiffOptions.wfEscapeShellArg( $fnOld, $fnNew );

  $result = '';
  $handle = popen( $cmd, 'r' );
  if ( $handle !== FALSE ) {
    // read until the pipe delivers no more data
    while ( true ) {
      $data = fread( $handle, 8192 );
      if ( strlen( $data ) == 0 ) {
	break;
      }
      $result .= $data;
    }
    pclose( $handle );
  }
  // the temporary files are removed whether or not diff could be run
  unlink( $fnNew );
  unlink( $fnOld );
  return $result;
}

} // end of 'MEDIAWIKI' check