SpamFerret.php

Navigation

 * SpamFerret.php

Code
<?php /* HISTORY: 2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query (method deprecated and removed) 2007-09-30 (Wzl) fixing regex processing 2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist) 2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL 2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong. 2007-10-28 (Wzl) Events with THR and AMP codes *still* weren't being recorded because method call was improperly formatted. 2007-12-23 (Wzl) Emails wikimaster if eregi returns an error (due to improperly formatted regex) 2007-12-26 (Wzl) Spam turd rejection / logging 2007-12-27 (Wzl) Fixed spam turd detection to work for new pages too (will probably need refinement) 2007-12-27 (Wzl) Ooops. Replaced missing if-block from regex results when inserted text is found. 2008-08-29 (Wzl) Added permanent IP blocking 2008-09-04 (Wzl) Added (optional) logging of successful edits 2008-09-19 (Wzl) Actually *set* the "didEdit" flag for successful edits 2008-10-21 (Wzl) Fixed minor syntax error in "defines" 2009-02-25 (Wzl) $objArticleCurr->loadLastEdit now causes error in MW 1.14 (was it necessary before?) 
2009-03-10 (Wzl) "require" -> "require_once" so other extensions can use data.php without conflict also optional $kfpWzlLibs so data.php can be somewhere not on the path 2009-03-18 (Wzl) Got rid of shared.php requirement; now using kfpLib to locate data.php 2009-03-26 (Wzl) Rewrote data library calls to use new classes (no longer using deprecated/removed classes) Also modified to use newer function hooks 2009-04-24 (Wzl) fixed "strict" bug referencing unset $txtCurr when creating new page 2009-07-05 (Wzl) Using LibMgr 2009-07-14 (Wzl) Added attempts.Diff field, patterns.isDIff 2009-07-15 (Wzl) On advice from FreeNode##php, changed from eregi to preg_match Added option to match diff results instead of submitted edit only Saves diff of each change, approved or not BUG: approved edits are not being logged properly; using "OK" code and logging as failed 2009-07-26 (Wzl) fixed minor warning error on line 252 2009-08-07 (Wzl) email notification working; removed TRD and AMP hard-coded offenses, to be redone as isDiff filters if needed 2009-08-08 (Wzl) create client record immediately if client is not recognized; don't depend on spam filter being triggered 2009-08-09 (Wzl) restructured "client" and "attempt" tables (was "clients", "attempts"); not backwards-compatible 2010-02-24 (Wzl) some code-tidying; trying to restrict passing of data between methods to single array var in args/return 2010-08-17 (Wzl) added some debug code to CheckRegex; fixed problem with escaped chars in filter 2011-04-26 (Wzl) minor bug fixes; hand-merge with version on htyp.org TO DO: * Throttled save attempts should check for spam, just for data-gathering purposes. Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option? * Figure out how to display a different error message than "the following text is what triggered our filter:" OPTIONAL SETTINGS: kfpLib - path to data.php folder (no final slash) kfsLib_Data - filespec of data.php


 * 1) Loader for spam blacklist feature
 * 2) Include this from LocalSettings.php
 */

if ( defined( 'MEDIAWIKI' ) ) {

// Register the extension so it is listed on Special:Version.
$wgExtensionCredits['other'][] = array(
    'name'        => 'SpamFerret',
    'author'      => 'Woozle Staddon',
    'url'         => 'http://htyp.org/SpamFerret',
    'version'     => '2011-04-26',
    'description' => 'database-driven wikispam blocker',
);

// Work out where data.php lives. kfsLib_Data (full filespec) wins if the site
// defined it; otherwise build it from kfpLib (folder, no final slash); failing
// that, rely on the include path.
if (!defined('kfsLib_Data')) {
    if (defined('kfpLib')) {
        define('kfsLib_Data', kfpLib.'/data.php');
    } else {
        define('kfsLib_Data', 'data.php');	// assume it's on the path
    }
}

// Load the library manager only if nothing else has loaded it already.
if (!defined('LIBMGR')) {
    require('libmgr.php');
}

// Register data.php with the library manager, then load it.
clsLibMgr::Add('data', kfsLib_Data, __FILE__, __LINE__);
clsLibMgr::Load('data', __FILE__, __LINE__);

/* ==============
 SET UP CALLBACKS
*/
global $wgFilterCallback, $wgPreSpamFilterCallback;

$wgPreSpamFilterCallback = false;
if ( defined( 'MW_SUPPORTS_EDITFILTERMERGED' ) ) {
    // Newer MediaWiki: hook into the merged-edit filter.
    $wgHooks['EditFilterMerged'][] = 'wfSpamFerretMerged';
} else {
    // Older MediaWiki: take over $wgFilterCallback, remembering whatever
    // callback was installed before us. empty() is used so that an undefined
    // $wgFilterCallback does not raise a warning.
    if ( !empty( $wgFilterCallback ) ) {
        $wgPreSpamFilterCallback = $wgFilterCallback;
    }
    $wgFilterCallback = 'wfSpamFerretFilter';
}

/* Hooks below are intentionally left disabled:
$wgHooks['EditFilter'][] = 'wfSpamFerretValidate';
$wgHooks['ArticleSaveComplete'][] = 'wfSpamFerretArticleSave';
$wgHooks['APIEditBeforeSave'][] = 'wfSpamFerretAPIEditBeforeSave';
*/

/* ================
 SET GLOBAL OBJECTS
*/

/**
 * Returns the shared SpamFerret object, creating it on the first call.
 *
 * The instance is held in a function-static variable, so every caller in the
 * same request receives the same object.
 *
 * @return SpamFerret
 */
function GetSpamFerret() {
    static $objFerret;

    if (!isset($objFerret)) {
        $objFerret = new SpamFerret;
    }
    return $objFerret;
}

/* ================
 CALLBACK FUNCTIONS
*/

/**
 * Hook function for $wgFilterCallback
 *
 * Writes a marker message to the output page and passes the edit to
 * SpamFerret::filter().
 *
 * @param mixed  $title   page being edited (passed by reference)
 * @param string $text    submitted edit text
 * @param mixed  $section section being edited
 * @return mixed whatever SpamFerret::filter() returns: FALSE when the edit is
 *   acceptable, otherwise TRUE or a string describing the match
 */
function wfSpamFerretFilter( &$title, $text, $section ) {
    global $wgOut;

    $spamObj = GetSpamFerret();
    $wgOut->addWikiText( "Intercepted by SpamFerretFilter" );
    return $spamObj->filter( $title, $text, $section );
}

/**
 * Hook function for EditFilterMerged, replaces wfSpamBlacklistFilter
 *
 * @param object $editPage    edit page object; its mArticle supplies the title
 * @param string $text        submitted edit text
 * @param mixed  $hookErr     not used by this function
 * @param string $editSummary not used by this function
 * @return bool TRUE when the edit may proceed, FALSE when it was rejected
 */
function wfSpamFerretMerged( $editPage, $text, &$hookErr, $editSummary ) {
    global $wgTitle;

    if ( is_null( $wgTitle ) ) {
        # API mode: no title is available, so the edit is let through unchecked.
        # NOTE(review): the original comment said an APIEditBeforeSave hook had
        # already checked the blacklist, but in this file that hook is an empty
        # function and is not registered.
        return true;
    }

    $spamObj = GetSpamFerret();
    $title = $editPage->mArticle->getTitle();
    $ret = $spamObj->filter( $title, $text, '', $editPage );
    if ( $ret !== false ) {
        // $ret carries the text to show on the spam rejection page
        $editPage->spamPage( $ret );
    }

    // Return convention for hooks is the inverse of $wgFilterCallback
    return ( $ret === false );
}

/**
 * Hook function for APIEditBeforeSave
 *
 * Placeholder: performs no checking and returns nothing.
 */
function wfSpamFerretAPIEditBeforeSave( &$editPage, $text, &$resultArr ) {
}

/**
 * Database-driven spam filter for wiki edits.
 *
 * For each edit attempt, filter() looks up the client IP in the throttle
 * table, rejects the edit if the client is currently throttled/blocked,
 * otherwise checks the submitted text (and its diff against the current page)
 * against the stored patterns, and logs the outcome.
 *
 * NOTE(review): the source this was restored from had lost all empty "()".
 * Members of the data-library objects written below as method calls
 * (hasRows, RowCount, FirstRow, NextRow, Open, RowsAffected) are assumed to be
 * methods; row fields (doBlock, Retries, ThrottleTime, isDiff, Pattern,
 * isRegex, ID) are assumed to be properties. Confirm against data.php.
 */
class SpamFerret {
    public $previousFilter = false;
    // internal data
    public $dbSpam;           // clsDatabase connection, set by OpenDatabase()
    public $objDataClients;   // throttle-table rows for the current client
    public $strIPAddr;        // IP address of the client making the edit
    public $idPattern;        // ID of the last pattern checked / matched
    public $doClearThrottle;  // when true, the next recorded attempt resets Retries
    public $txtEditRaw;       // DEPRECATED: raw submitted text (use the args array)
    public $isMatch = FALSE;  // result of the last CheckFilters() run
    public $strMatch;         // text which matched the pattern
    public $objFilts;         // cached "patterns" table object
    public $strRegexErr;      // error text from the last CheckRegex() call, or NULL

    /**
     * Looks up one entry in $wgSpamFerretSettings.
     *
     * @param string $iName setting name
     * @return mixed the setting's value, or NULL if it is not set
     */
    public function Setting($iName) {
        global $wgSpamFerretSettings;

        return isset($wgSpamFerretSettings[$iName]) ? $wgSpamFerretSettings[$iName] : NULL;
    }

    /*-
      ACTION: Main entry point; decides whether an edit may be saved.
      PROPERTIES USED:
        $this->idPattern (out)
      RETURNS:
        FALSE if the edit is acceptable
        TRUE if rejected by the previous filter or by the throttle
        string describing the match if a pattern matched
    */
    public function filter( &$title, $text, $section, $editPage = FALSE ) {
        global $gErrorText;

        $fname = 'wfSpamFerretFilter';
        wfProfileIn( $fname );

        # Call the rest of the hook chain first
        if ( $this->previousFilter ) {
            $f = $this->previousFilter;
            if ( $f( $title, $text, $section ) ) {
                wfProfileOut( $fname );
                return true;
            }
        }

        // initialize variables
        $retVal = FALSE;	// default = assume edit is ok
        $gErrorText = FALSE;
        $isClientKnown = FALSE;

        $this->txtEditRaw = $text;		// DEPRECATED
        $arArgs = array('edit-raw' => $text);

        // get the IP address of the http client making the edit attempt:
        $this->strIPAddr = wfGetIP();
        // Open the database
        $this->OpenDatabase();
        // open clients table (extended Throttle version) for reference:
        $objTblClients = new clsTable($this->dbSpam);
        $objTblClients->Name('ClientThrottle2');
        $objTblClients->KeyName('Address');
        // Look up to see if this IP is known; it may already be throttled:
        $this->objDataClients = $objTblClients->GetData('Address='.$this->SqlAddr());
        if (is_object($this->objDataClients)) {
            if ($this->objDataClients->hasRows()) {
                $isClientKnown = TRUE;
            }
        }

        if ($isClientKnown) {
            $this->objDataClients->FirstRow();
            $doBlock = $this->objDataClients->doBlock;
            if ($doBlock) {
                $strThrType = 'BLK';
            } else {
                $intRetries = $this->objDataClients->Retries;
                $intThrottle = $this->Setting('throttle_retries');
                $doBlock = $intRetries > $intThrottle;
                $strThrType = 'THR-'.$intRetries;
            }
            if ($doBlock) {
                $arArgs['diff'] = NULL;	// not applicable
                // retry limit exceeded; check timeout limit
                // NOTE(review): this timeout test also applies to 'BLK' clients,
                // so a "permanent" block appears to lapse after the timeout --
                // confirm whether that is intended.
                if ($this->objDataClients->ThrottleTime < $this->Setting('throttle_timeout')) {
                    $txtMsg = 'Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.';
                    if (is_object($editPage)) {
                        $editPage->spamPage($txtMsg);
                    } else {
                        EditPage::spamPage($txtMsg);	// older MW code doesn't supply $editPage
                    }
                    $arArgs['code'] = $strThrType;
                    $this->RecordAttempt($arArgs);		// record post attempt by throttled client
                    $retVal = true;		// client has exceeded spam limit; impose throttle
                } else {
                    $this->doClearThrottle = true;
                }
            }
        } else {
            $this->CreateClient();
        }
        $arArgs['diff'] = 'N/A';

        if (!$retVal) {
            $arRtn = $this->GetDiff($title);	// get the diff between edit and current contents
            $arArgs['diff'] = $arRtn['diff'];
            $arArgs['doAll'] = FALSE;
            $arRtn = $this->CheckFilters($arArgs);
            $arArgs['edit-to-check'] = $arRtn['edit-to-check'];
            if ( $this->isMatch ) {
                // spam cue found; display the matching text and don't allow the edit to be saved:
                wfDebug( "Match!\n" );
                // The string sent to spamPage will be shown after
                // "The following text is what triggered our spam filter:"
                $retVal = '(pattern #'.$this->idPattern.') ['.htmlspecialchars($this->strMatch).']';
                // Log the spam attempt:
                $arArgs['code'] = '-';
                $this->RecordAttempt($arArgs);
            } else {
                // no spam cues found; allow the edit to be saved, if nothing else has tripped the filter.
                // OK edits are always logged; the 'log_ok_edits' setting is not consulted.
                $this->idPattern = NULL;
                $this->RecordOkEdit($arArgs);
            }
        }

        wfProfileOut( $fname );
        $this->ReportErrors();
        return $retVal;
    }

    /*-
      ACTION: Opens the spam database described by the 'dbspec' setting.
      OUTPUT: $this->dbSpam
    */
    public function OpenDatabase() {
        $this->dbSpam = new clsDatabase($this->Setting('dbspec'));
        $this->dbSpam->Open();
    }

    /*-
      RETURNS: table object for the "patterns" table, created on first use
        and cached in $this->objFilts
    */
    public function FiltTbl() {
        if (empty($this->objFilts)) {
            $objTbl = new clsTable($this->dbSpam);
            $objTbl->Name('patterns');
            $objTbl->KeyName('ID');
            $this->objFilts = $objTbl;
        }
        return $this->objFilts;
    }

    /*-
      INPUT:
        $iTitle - page for comparing proposed edit
        $this->txtEditRaw (DEPRECATED; use $iarArgs['text-to-check'])
      OUTPUT:
        return ['diff'] - diff against the current page text, or the whole
          edit prefixed with '!!NEW: ' when the page does not exist yet
    */
    public function GetDiff($iTitle) {
        $objArticleCurr = new Article($iTitle);
        if ($objArticleCurr->exists()) {
            $txtCurr = $objArticleCurr->getContent();
            $txtDiff = FigureDiff($txtCurr,$this->txtEditRaw);
        } else {
            $txtDiff = '!!NEW: '.$this->txtEditRaw;
        }
        return array('diff' => $txtDiff);
    }

    /*-
      ACTION: Checks the edit text / diff against the stored patterns.
        Stops at the first match unless 'doAll' is set.
      INPUT:
        $iarArgs['doAll'] - if TRUE, check every pattern (active or not) and
          collect all matches in $gFilterMatches
        $iarArgs['diff']
        $this->txtEditRaw - DEPRECATED; use $iarArgs
      OUTPUT:
        $this->idPattern, $this->isMatch, $this->strMatch
        $gFilterCount, $gFilterRows, $gFilterMatches (globals)
        return ['edit-to-check'] - lowercased edit text that was checked
    */
    public function CheckFilters(array $iarArgs) {
        global $gRegexMatches,$gFilterMatches,$gFilterRows,$gFilterCount;

        assert(is_object($this->dbSpam));

        $doCheckAll = $iarArgs['doAll'];
        $strChkDiff = strtolower(nz($iarArgs['diff']));
        $objFiltTbl = $this->FiltTbl();

        $sqlFilt = $doCheckAll ? NULL : 'isActive';
        $objRow = $objFiltTbl->GetData($sqlFilt);

        $strTextEdit = strtolower($this->txtEditRaw);
        $arOut = array('edit-to-check' => $strTextEdit);	// text after being massaged for checking
        $this->isMatch = FALSE;
        $gFilterCount = 0;
        $gFilterRows = $objRow->RowCount();

        while ($objRow->NextRow() && (!$this->isMatch || $doCheckAll)) {
            $isMatch = FALSE;

            // each pattern is applied either to the diff or to the whole edit
            $strTextCk = $objRow->isDiff ? $strChkDiff : $strTextEdit;

            if (!is_null($strTextCk)) {
                $gFilterCount++;
                // NOTE(review): lowercasing a regex also lowercases escapes such
                // as \S or \W, which changes their meaning -- confirm.
                $strPattern = strtolower($objRow->Pattern);
                $isRegex = $objRow->isRegex;
                $this->idPattern = $objRow->ID;
                if ($isRegex) {
                    $isMatch = $this->CheckRegex($strPattern,$strTextCk);

                    // CheckRegex records any regex failure in strRegexErr
                    if (!is_null($this->strRegexErr)) {
                        $this->AddErrorLine('Filter #'.$this->idPattern.' generated error "'.$this->strRegexErr.'"');
                    }

                    if ($isMatch) {
                        $this->strMatch = $gRegexMatches[0];
                    }
                } else {
                    if (empty($strPattern)) {
                        $isMatch = FALSE;
                    } else {
                        // stristr yields the text from the match onward, or FALSE
                        $this->strMatch = stristr($strTextCk,$strPattern);
                        $isMatch = ($this->strMatch != '');
                    }
                }
                if ($isMatch) {
                    $this->isMatch = TRUE;
                    if ($doCheckAll) {
                        $gFilterMatches[$this->idPattern] = $this->strMatch;
                    }
                }
            }
        }
        // filter() reads 'edit-to-check' from this return value
        return $arOut;
    }

    /*
      ACTION: Runs one regex pattern against a piece of text.
      INPUT:
        $iPattern - regex without delimiters; any "/" in it is escaped here
        $iText - text to search
      OUTPUT:
        $gRegexMatches (global) - matches array from preg_match
        $strDbg (global) - debugging description of the call is appended
        $this->strRegexErr - error description if preg_match failed, else NULL
      RETURNS: preg_match result: 1 on match, 0 on no match, FALSE on error
      TO DO:
        replace $gRegexMatches with return array
        make this function static
    */
    public function CheckRegex($iPattern,$iText) {
        global $gRegexMatches,$strDbg;

        $chDelim = '/';
        // (2010-08-17) backslashes are deliberately NOT doubled here; doing so
        // caused incorrect handling of escaped characters in the filter.
        // Only the delimiter is escaped.
        $strPattCk = str_replace($chDelim,'\\'.$chDelim,$iPattern);
        $strFinal = $chDelim.$strPattCk.$chDelim;
        if (!isset($strDbg)) {
            $strDbg = '';
        }
        $strDbg .= "@preg_match(\"$strFinal\",\"$iText\",...)";

        $this->strRegexErr = NULL;
        // warnings are suppressed so a malformed filter does not break the edit page;
        // the failure is captured below and reported by the caller instead
        $isMatch = @preg_match($strFinal,$iText,$gRegexMatches);
        if ($isMatch === FALSE) {
            $arErr = error_get_last();
            $strErr = is_array($arErr) ? $arErr['message'] : 'regex failure';
            $this->strRegexErr = $strErr.' (preg_last_error='.preg_last_error().')';
        }

        return $isMatch;
    }

    /*
      ACTION: Appends one line to the global error report text.
    */
    public function AddErrorLine($iText) {
        global $gErrorText;

        $gErrorText .= $iText."\n";
    }

    /*
      ACTION: If any errors were collected, emails them to $wgEmergencyContact.
    */
    public function ReportErrors() {
        global $wgUser;
        global $wgEmergencyContact;
        global $gErrorText;

        if ($gErrorText) {
            $msgEmail = 'Filter error report for user '.$wgUser->getName().":\n\n";
            $msgEmail .= $gErrorText;
            mail($wgEmergencyContact,'spamferret filter error',$msgEmail);
        }
    }

    /*
      ACTION: Create a new record for the current client
    */
    public function CreateClient() {
        $sql = 'INSERT INTO client (Address,WhenFirst,Count,Retries) VALUES('.$this->SqlAddr().',NOW(),1,0)';
        $this->dbSpam->Exec($sql);
    }

    /*
      ACTION: Update a client's record to reflect a new spam attempt
    */
    public function RecordClientSpam() {
        $strRetries = $this->doClearThrottle ? '0' : 'Retries+1';
        $sql = 'UPDATE client SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address='.$this->SqlAddr();
        $this->dbSpam->Exec($sql);
        if ($this->dbSpam->RowsAffected() < 1) {
            // no client row was updated; create one and note the anomaly
            $this->CreateClient();
            $this->AddErrorLine('Record not found for client '.$this->strIPAddr);
        }
    }

    /*-
      ACTION: Logs a rejected edit attempt and updates client/pattern counters.
      INPUT:
        $this->idPattern
        $iarArgs['code']
        $iarArgs['diff']
        $iarArgs['edit-raw']
    */
    public function RecordAttempt(array $iarArgs) {
        global $wgTitle, $wgServer;

        $iCode = $iarArgs['code'];
        $txtDiff = $iarArgs['diff'];		// was $this->txtDiff
        $txtEdit = $iarArgs['edit-raw'];	// was $this->txtEditChk
        $this->RecordClientSpam();

        $sqlCode = '"'.$iCode.'"';
        $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
        $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
        $sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
        $sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
        $isPattern = !is_null($this->idPattern);
        $sqlPattern = $isPattern ? $this->idPattern : 'NULL';
        $sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES'
            .'(NOW(),'
            .$sqlPattern.','
            .$this->SqlAddr().','
            .$this->SqlSession().','
            .$sqlSrvr.','
            .$sqlPage.','
            .$sqlCode.','
            .'FALSE,'
            .$sqlEdit.','
            .$sqlDiff.')';
        $ok = $this->dbSpam->Exec($sql);
        if ($ok !== TRUE) {
            $this->AddErrorLine('SQL ['.$sql.'] in RecordAttempt generated this error: '.$ok);
        }
        if ($isPattern) {
            $sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$this->idPattern;
            $ok = $this->dbSpam->Exec($sql);
            if ($ok !== TRUE) {
                $this->AddErrorLine('SQL ['.$sql.'] generated this error: '.$ok);
            }
        }
    }

    /*
      ACTION: Logs an edit which passed all filters.
      INPUT:
        $iarArgs['diff']
        $iarArgs['edit-to-check']
    */
    public function RecordOkEdit(array $iarArgs) {
        global $wgTitle, $wgServer;

        $txtDiff = $iarArgs['diff'];		// was $this->txtDiff
        $txtEdit = $iarArgs['edit-to-check'];	// was $this->txtEditChk

        $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
        $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
        $sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
        $sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
        $sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES '
            .'(NOW(),NULL,'
            .$this->SqlAddr().','
            .$this->SqlSession().','
            .$sqlSrvr.','
            .$sqlPage.',"ok",TRUE,'
            .$sqlEdit.','
            .$sqlDiff.')';
        $ok = $this->dbSpam->Exec($sql);
        if ($ok !== TRUE) {
            $this->AddErrorLine('SQL ['.$sql.'] in RecordOkEdit generated this error: '.$ok);
        }
    }

    /*
      RETURNS: the client IP address as a quoted, escaped SQL string literal.
        Requires the database to be open (uses $this->dbSpam->SafeParam).
    */
    private function SqlAddr() {
        return '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
    }

    /*
      RETURNS: the current session ID as an SQL value (escaped before being
        passed through SQL_Value, which does the quoting).
    */
    private function SqlSession() {
        return SQL_Value($this->dbSpam->SafeParam(session_id()));
    }
}

/**
 * Error-handler callback which just remembers the most recent error.
 *
 * Stores the error number and message in the globals $errNum and $errStr.
 *
 * @param int    $errno  error level/number
 * @param string $errstr error message
 */
function ErrorHandler($errno, $errstr) {
    $GLOBALS['errNum'] = $errno;
    $GLOBALS['errStr'] = $errstr;
}

/**
 * Formats a PHP value for direct inclusion in an SQL statement.
 *
 * NULL becomes the keyword NULL, numeric values are returned as-is, and
 * anything else is wrapped in double quotes. No escaping is done here, so
 * callers must escape string content themselves before passing it in.
 *
 * @param mixed $iVar value to format
 * @return mixed 'NULL', the unchanged numeric value, or a double-quoted string
 */
function SQL_Value($iVar) {
    if (is_null($iVar)) {
        return 'NULL';
    }
    if (is_numeric($iVar)) {
        return $iVar;
    }
    return '"'.$iVar.'"';
}

/**
 * Computes a textual diff between two pieces of text using the external
 * "diff" command.
 *
 * Both texts are written to temporary files, which are removed again before
 * returning, whether or not the diff could be run.
 *
 * @param string $iTextOld current text
 * @param string $iTextNew proposed text
 * @return string output of the diff command; empty if the texts do not differ
 *   or if the temporary files / diff process could not be created
 */
function FigureDiff($iTextOld, $iTextNew) {
    # Make temporary files
    //	$td = wfTempDir();
    $td = session_save_path();
    // session.save_path may use the "N;/path" or "N;MODE;/path" form; keep only the path
    if (is_string($td) && strpos($td, ';') !== FALSE) {
        $td = substr($td, strrpos($td, ';') + 1);
    }
    // fall back to the system temp folder if no usable session path is configured
    if (!is_string($td) || $td === '' || !is_dir($td)) {
        $td = sys_get_temp_dir();
    }

    $fnOld = tempnam( $td, 'SpamFerret-old-' );
    $fnNew = tempnam( $td, 'SpamFerret-new-' );
    $result = '';

    $isReady = ($fnOld !== FALSE) && ($fnNew !== FALSE)
        && (file_put_contents( $fnOld, (string)$iTextOld ) !== FALSE)
        && (file_put_contents( $fnNew, (string)$iTextNew ) !== FALSE);

    if ($isReady) {
        // this setting applies to all uses of a given filter DB
        // can change for different DBs
        $ksSpamFerretDiffOptions = '-i -E --suppress-common-lines ';
        $cmd = 'diff '.$ksSpamFerretDiffOptions.wfEscapeShellArg( $fnOld, $fnNew );
        $handle = popen( $cmd, 'r' );
        if (is_resource($handle)) {
            do {
                $data = fread( $handle, 8192 );
                if ( $data === FALSE || strlen( $data ) == 0 ) {
                    break;
                }
                $result .= $data;
            } while ( true );
            pclose( $handle );
        }
    }

    // always clean up whatever temp files were actually created
    if ($fnNew !== FALSE && file_exists($fnNew)) {
        unlink( $fnNew );
    }
    if ($fnOld !== FALSE && file_exists($fnOld)) {
        unlink( $fnOld );
    }
    return $result;
}

} // end of 'MEDIAWIKI' check