SpamFerret.php

From HTYP, the free directory anyone can edit

Jump to: navigation, search

[edit] Navigation

{{#lst:MediaWiki|navbar}}: extensions / fighting spam: SpamFerret: SpamFerret.php

[edit] Code

<?php
/*
 HISTORY:
  2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
  2007-09-30 (Wzl) fixing regex processing
  2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist)
  2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL
  2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong.
  2007-10-28 (Wzl) Events with THR and AMP codes *still* weren't being recorded because method call was improperly formatted.
  2007-12-23 (Wzl) Emails wikimaster if eregi() returns an error (due to improperly formatted regex)
  2007-12-26 (Wzl) Spam turd rejection / logging
  2007-12-27 (Wzl) Fixed spam turd detection to work for new pages too (will probably need refinement)
  2007-12-27 (Wzl) Ooops. Replaced missing if-block from regex results when inserted text is found.
  2008-08-29 (Wzl) Added permanent IP blocking
  2008-09-04 (Wzl) Added (optional) logging of successful edits
  2008-09-19 (Wzl) Actually *set* the "didEdit" flag for successful edits <facepalms>
  2008-10-21 (Wzl) Fixed minor syntax error in "defines"
  2009-02-25 (Wzl) $objArticleCurr->loadLastEdit() now causes error in MW 1.14 (was it necessary before?)
  2009-03-10 (Wzl) "require" -> "require_once" so other extensions can use data.php without conflict
    also optional $kfpWzlLibs so data.php can be somewhere not on the path
  2009-03-18 (Wzl) Got rid of shared.php requirement; now using kfpLib to locate data.php
  2009-03-26 (Wzl) Rewrote data library calls to use new classes (no longer using deprecated/removed classes)
    Also modified to use newer function hooks
  2009-04-24 (Wzl) fixed "strict" bug referencing unset $txtCurr when creating new page
  2009-07-05 (Wzl) Using LibMgr
  2009-07-14 (Wzl) Added attempts.Diff field, patterns.isDiff
  2009-07-15 (Wzl)
    On advice from FreeNode##php, changed from eregi() to preg_match()
    Added option to match diff results instead of submitted edit only
    Saves diff of each change, approved or not
    BUG: approved edits are not being logged properly; using "OK" code and logging as failed
  2009-07-26 (Wzl) fixed minor warning error on line 252
  2009-08-07 (Wzl) email notification working; removed TRD and AMP hard-coded offenses, to be redone as isDiff filters if needed
  2009-08-08 (Wzl) create client record immediately if client is not recognized; don't depend on spam filter being triggered
  2009-08-09 (Wzl) restructured "client" and "attempt" tables (was "clients", "attempts"); not backwards-compatible
  2010-02-24 (Wzl) some code-tidying; trying to restrict passing of data between methods to single array var in args/return
  2010-08-17 (Wzl) added some debug code to CheckRegex(); fixed problem with escaped chars in filter
 TO DO:
  * Throttled save attempts should check for spam, just for data-gathering purposes.
    Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
  * Figure out how to display a different error message than "the following text is what triggered our filter:"
 OPTIONAL SETTINGS:
  kfpLib - path to data.php folder (no final slash)
  kfsLib_Data - filespec of data.php
*/
 
# Loader for spam blacklist feature
# Include this from LocalSettings.php
 
if ( defined( 'MEDIAWIKI' ) ) {
 
// Describe this extension to MediaWiki's list of installed extensions.
$wgExtensionCredits['other'][] = array(
    'name'        => 'SpamFerret',
    'author'      => 'Woozle Staddon',
    'url'         => 'http://htyp.org/SpamFerret',
    'version'     => '2010-08-17',
    'description' => 'database-driven wikispam blocker',
);

// Decide where data.php lives, unless the site has already defined kfsLib_Data:
//   kfpLib defined -> data.php inside that folder (kfpLib has no final slash)
//   otherwise      -> bare filename; assume it's on the path
if (!defined('kfsLib_Data')) {
    define('kfsLib_Data', defined('kfpLib') ? kfpLib.'/data.php' : 'data.php');
}
// Pull in the library manager only if nothing else has loaded it yet.
if (!defined('LIBMGR')) {
    require('libmgr.php');
}
clsLibMgr::Add('data', kfsLib_Data);
clsLibMgr::Load('data');

/* ==============
 SET UP CALLBACKS
*/
global $wgFilterCallback, $wgPreSpamFilterCallback;

if ( defined( 'MW_SUPPORTS_EDITFILTERMERGED' ) ) {
    // Hook-based interface: nothing is displaced, so there is no earlier callback to remember.
    $wgPreSpamFilterCallback = false;
    $wgHooks['EditFilterMerged'][] = 'wfSpamFerretMerged';
} else {
    // Callback-based interface: remember whatever callback was installed
    // before us (FALSE if none), then take its place.
    $wgPreSpamFilterCallback = $wgFilterCallback ? $wgFilterCallback : false;
    $wgFilterCallback = 'wfSpamFerretFilter';
}
 
/*
$wgHooks['EditFilter'][] = 'wfSpamFerretValidate';
$wgHooks['ArticleSaveComplete'][] = 'wfSpamFerretArticleSave';
$wgHooks['APIEditBeforeSave'][] = 'wfSpamFerretAPIEditBeforeSave';
*/
 
/* ================
 SET GLOBAL OBJECTS
*/
/**
 * Returns the single SpamFerret object used for this request, creating it on
 * the first call.
 *
 * When the object is created it is handed the filter callback recorded in
 * $wgPreSpamFilterCallback (set up by the callback-registration code in this
 * file), so that a callback displaced by wfSpamFerretFilter still gets run by
 * SpamFerret::filter(). Previously that saved callback was never passed on and
 * the earlier filter was silently dropped.
 *
 * @return SpamFerret
 */
function GetSpamFerret() {
    global $wgPreSpamFilterCallback;
    static $objFerret;

    if (!isset($objFerret)) {
	$objFerret = new SpamFerret();
	// FALSE / unset means "no earlier filter"; SpamFerret::filter() tests this for truthiness
	$objFerret->previousFilter = $wgPreSpamFilterCallback;
    }
    return $objFerret;
}
/* ================
 CALLBACK FUNCTIONS
*/
/**
 * Callback installed as $wgFilterCallback (used when the EditFilterMerged
 * hook is not available).
 *
 * Adds an "Intercepted by SpamFerretFilter" note to the page output, then
 * passes the edit to the shared SpamFerret object.
 *
 * @param object $title   page being edited (passed through to SpamFerret::filter())
 * @param string $text    submitted text
 * @param string $section section being edited
 * @return mixed whatever SpamFerret::filter() returns
 */
function wfSpamFerretFilter( &$title, $text, $section ) {
    global $wgOut;

    $objFerret = GetSpamFerret();
    // NOTE(review): this note is emitted for every edit that reaches this callback;
    // the equivalent line in wfSpamFerretMerged is commented out -- confirm it is still wanted here.
    $wgOut->addWikiText( "Intercepted by SpamFerretFilter" );

    $result = $objFerret->filter( $title, $text, $section );
    return $result;
}
 
/**
 * Hook function for EditFilterMerged, replaces wfSpamBlacklistFilter
 *
 * Runs the submitted text through the shared SpamFerret object and, when a
 * spam pattern matched, shows the spam page with the matching text.
 *
 * SpamFerret::filter() has three possible results:
 *   FALSE  - edit is acceptable
 *   string - text describing the pattern that matched
 *   TRUE   - edit was rejected and a page has already been displayed
 *            (throttled/blocked client, or an earlier filter in the chain)
 * Only the string result is passed to spamPage(); passing TRUE along as well
 * showed a second spam page with a meaningless match.
 *
 * @param object $editPage    edit page object; provides ->mArticle and ->spamPage()
 * @param string $text        merged text about to be saved
 * @param string $hookErr     not used by this function
 * @param string $editSummary not used by this function
 * @return bool TRUE to let the edit proceed, FALSE to stop it
 */
function wfSpamFerretMerged( &$editPage, $text, &$hookErr, $editSummary ) {
    global $wgTitle;

    if( is_null( $wgTitle ) ) {
        # API mode
        # NOTE(review): the original comment says the APIEditBeforeSave handler has
        # already checked the edit, but that handler is an empty stub in this file.
        return true;
    }

    $spamObj = GetSpamFerret();
    $title = $editPage->mArticle->getTitle();
    $ret = $spamObj->filter( $title, $text, '', $editPage );
    if ( is_string( $ret ) ) {
        $editPage->spamPage( $ret );
    }

    // Return convention for hooks is the inverse of $wgFilterCallback
    return ( $ret === false );
}
/**
 * Hook function for APIEditBeforeSave
 *
 * Currently an empty stub: it performs no checks and returns nothing.
 * The line that would register it ($wgHooks['APIEditBeforeSave'][]) is
 * commented out in this file, so it is presumably never invoked -- confirm
 * before relying on it.
 * NOTE(review): wfSpamFerretMerged lets API-mode edits through on the
 * assumption that an APIEditBeforeSave handler has already checked them;
 * with this stub, no such check happens here.
 */
function wfSpamFerretAPIEditBeforeSave( &$editPage, $text, &$resultArr ) {
}
 
/**
 * Database-driven spam filter.
 *
 * filter() is the entry point: it looks up the editing client's IP address in
 * the spam database, refuses the edit if that client is blocked or throttled,
 * and otherwise checks the submitted text (and its diff against the current
 * page) against the rows of the "patterns" table. Every attempt, accepted or
 * not, is written to the "attempt" table. Problems found along the way are
 * collected in the global $gErrorText and mailed by ReportErrors().
 *
 * Relies on names defined outside this class: clsDatabase and clsTable (data
 * library), nz(), SQL_Value(), FigureDiff(), and MediaWiki's wfGetIP(),
 * wfProfileIn()/wfProfileOut(), wfDebug(), Article and EditPage.
 */
class SpamFerret {
    var $previousFilter = false;	// callback to run before our own checks; FALSE = none
// internal data
    var $dbSpam;		// clsDatabase connection to the spam database
    var $objDataClients;	// data for the current client's throttle record
    var $strIPAddr;		// IP address of the client making the edit
    var $idPattern;		// ID of the pattern most recently checked (the matching one when isMatch is set and not checking all)
    var $doClearThrottle;	// TRUE when the client's throttle period has expired
    var $txtEditRaw;		// submitted text, unmodified (DEPRECATED; being replaced by args arrays)
    var $isMatch = FALSE;	// set by CheckFilters(): did any pattern match?
    var $strMatch;		// set by CheckFilters(): text found by the most recent check
    var $objFilts;		// cached clsTable for the "patterns" table
    var $strRegexErr;		// set by CheckRegex(): error text from the last call, NULL if none

    /*-----
      RETURNS: value of entry $iName in $wgSpamFerretSettings, or NULL if there is no such entry
    */
    function Setting($iName) {
	global $wgSpamFerretSettings;

	return isset($wgSpamFerretSettings[$iName]) ? $wgSpamFerretSettings[$iName] : NULL;
    }
    /*-----
      RETURNS: the client's IP address as a quoted SQL string, escaped by the spam database object
      REQUIRES: OpenDatabase() has been called
    */
    private function SqlAddr() {
	return '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
    }
    /*-----
      ACTION: Decide whether an edit may be saved, and log the attempt
      INPUT:
	$title - page being edited
	$text - submitted text
	$section - section being edited (only passed on to $this->previousFilter)
	$editPage - edit page object if the caller has one; FALSE otherwise
      RETURNS:
	FALSE if the edit is acceptable
	TRUE if the edit was refused and a message has already been displayed
	  (refused by previousFilter, or client is blocked/throttled)
	string describing the match if a spam pattern was found
      PROPERTIES USED: $this->idPattern (out)
    */
    public function filter( &$title, $text, $section,  $editPage = FALSE ) {
	global $gErrorText;

	$fname = 'wfSpamFerretFilter';
	wfProfileIn( $fname );

	# Call the rest of the hook chain first
	if ( $this->previousFilter ) {
	    $f = $this->previousFilter;
	    if ( $f( $title, $text, $section ) ) {
		wfProfileOut( $fname );
		return true;
	    }
	}
// initialize variables
	$retVal = FALSE;	// default = assume edit is ok
	$gErrorText = FALSE;
	$isClientKnown = FALSE;
	$this->doClearThrottle = FALSE;
	$arArgs = array();

	$this->txtEditRaw = $text;		// DEPRECATED
	$arArgs['edit-raw'] = $text;

// get the IP address of the http client making the edit attempt:
	$this->strIPAddr = wfGetIP();
// Open the database
	$this->OpenDatabase();
// open clients table (extended Throttle version) for reference:
	$objTblClients = new clsTable($this->dbSpam);
	$objTblClients->Name('ClientThrottle2');
	$objTblClients->KeyName('Address');
// Look up to see if this IP is known; it may already be throttled:
	$this->objDataClients = $objTblClients->GetData('Address='.$this->SqlAddr());
	if (is_object($this->objDataClients)) {
	    if ($this->objDataClients->hasRows()) {
		$isClientKnown = TRUE;
	    }
	}
	if ($isClientKnown) {
	    $this->objDataClients->FirstRow();
	    $doBlock = $this->objDataClients->doBlock;
	    if ($doBlock) {
		$strThrType = 'BLK';
	    } else {
		$intRetries = $this->objDataClients->Retries;
		$intThrottle = $this->Setting('throttle_retries');
		$doBlock = $intRetries > $intThrottle;
		$strThrType = 'THR-'.$intRetries;
	    }
	    if ($doBlock) {
		$arArgs['diff'] = NULL;	// not applicable
// retry limit exceeded; check timeout limit
		if ($this->objDataClients->ThrottleTime < $this->Setting('throttle_timeout')) {
		    $txtMsg = 'Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.';
		    if ($editPage) {
			$editPage->spamPage($txtMsg);
		    } else {
			EditPage::spamPage($txtMsg);	// older MW code doesn't supply $editPage
		    }
		    $arArgs['code'] = $strThrType;
		    $this->RecordAttempt($arArgs);		// record post attempt by throttled client
		    $retVal = true;		// client has exceeded spam limit; impose throttle
		} else {
		    $this->doClearThrottle = true;
		}
	    }
	} else {
	    $this->CreateClient();
	}
	$arArgs['diff'] = 'N/A';
	if (!$retVal) {
	    $arRtn = $this->GetDiff($title);	// get the diff between edit and current contents
	    $arArgs['diff'] = $arRtn['diff'];
	    $arArgs['doAll'] = FALSE;
	    $arRtn = $this->CheckFilters($arArgs);
	    $arArgs['edit-to-check'] = $arRtn['edit-to-check'];
	    if ( $this->isMatch ) {
	// spam cue found; display the matching text and don't allow the edit to be saved:
		wfDebug( "Match!\n" );

	// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
		$retVal = '(pattern #'.$this->idPattern.') '.$this->strMatch;
	// Log the spam attempt:
		$arArgs['code'] = '-';
		$this->RecordAttempt($arArgs);
	    } else {
	// no spam cues found; allow the edit to be saved, if nothing else has tripped the filter
		//if ($this->Setting('log_ok_edits')) {
		    $this->idPattern = NULL;
		    $this->RecordOkEdit($arArgs);
		//}
	    }
	}
	wfProfileOut( $fname );
	$this->ReportErrors();
	return $retVal;
    }
    /*-----
      ACTION: Create and open the spam database connection described by the 'dbspec' setting
    */
    public function OpenDatabase() {
	$this->dbSpam = new clsDatabase($this->Setting('dbspec'));
	$this->dbSpam->Open();
    }
    /*-----
      RETURNS: table object for the "patterns" table (key "ID"); created on first use, then reused
    */
    public function FiltTbl() {
	if (empty($this->objFilts)) {
	    $objTbl = new clsTable($this->dbSpam);
	    $objTbl->Name('patterns');
	    $objTbl->KeyName('ID');
	    $this->objFilts = $objTbl;
	}
	return $this->objFilts;
    }
    /*-----
      INPUT:
	$iTitle - page for comparing proposed edit
	$this->txtEditRaw (DEPRECATED; should eventually come from an args array)
      OUTPUT:
	return ['diff'] - output of FigureDiff() between the page's current text and the
	  submitted text, or "!!NEW: " followed by the submitted text if the page doesn't exist yet
    */
    public function GetDiff($iTitle) {
	$objArticleCurr = new Article($iTitle);
	if ($objArticleCurr->exists()) {
	    $txtCurr = $objArticleCurr->getContent();
	    $txtDiff = FigureDiff($txtCurr,$this->txtEditRaw);
	} else {
	    $txtDiff = '!!NEW: '.$this->txtEditRaw;
	}
	$arOut = array();
	$arOut['diff'] = $txtDiff;
	return $arOut;
    }
    /*-----
      ACTION: Check the submitted text against the patterns table
      INPUT:
	$iarArgs['doAll'] - TRUE = check every pattern (including inactive ones) and collect all
	  matches in $gFilterMatches; FALSE = check active patterns only, stopping at the first match
	$iarArgs['diff'] - diff text, checked by patterns whose isDiff flag is set
	$this->txtEditRaw - DEPRECATED; use $iarArgs
      OUTPUT:
	$this->isMatch - TRUE if any pattern matched
	$this->idPattern - ID of the last pattern checked (the matching one, unless doAll is set)
	$this->strMatch - for a regex, the text it matched; for a plain pattern, the checked text
	  from the start of the match onward (result of stristr())
	$gFilterRows, $gFilterCount - number of patterns retrieved / checked
	return ['edit-to-check'] - lowercased copy of the submitted text
      NOTES:
	Matching is case-insensitive. Text is lowercased; plain patterns are lowercased too, but
	regex patterns are left as stored and run with the "i" modifier, because lowercasing a
	regex changes the meaning of escapes such as \S, \W and \D.
	Empty patterns never match (an empty regex would otherwise match every edit).
    */
    public function CheckFilters(array $iarArgs) {
	global $gRegexMatches,$gFilterMatches,$gFilterRows,$gFilterCount;

	assert(is_object($this->dbSpam));

	$doCheckAll = $iarArgs['doAll'];
	$strChkDiff = strtolower(nz($iarArgs['diff']));
	$objFiltTbl = $this->FiltTbl();

	if ($doCheckAll) {
	    $sqlFilt = NULL;
	} else {
	    $sqlFilt = 'isActive';
	}

	$objRow = $objFiltTbl->GetData($sqlFilt);

	$strTextEdit = strtolower($this->txtEditRaw);
	$arOut = array();
	$arOut['edit-to-check'] = $strTextEdit;	// text after being massaged for checking
	$this->isMatch = FALSE;
	$gFilterCount = 0;
	$gFilterRows = $objRow->RowCount();
	while($objRow->NextRow() && (!$this->isMatch || $doCheckAll)) {
	    $isMatch = FALSE;
	    if ($objRow->isDiff) {
		$strTextCk = $strChkDiff;
	    } else {
		$strTextCk = $strTextEdit;
	    }
	    if (!is_null($strTextCk)) {
		$gFilterCount++;
		$strPattern = $objRow->Pattern;
		$this->idPattern = $objRow->ID;
		if (empty($strPattern)) {
		    $isMatch = FALSE;	// nothing to look for
		} elseif ($objRow->isRegex) {
		    $isMatch = $this->CheckRegex($strPattern,$strTextCk,TRUE);
		    if (!is_null($this->strRegexErr)) {
			$this->AddErrorLine('Filter #'.$this->idPattern.' generated error "'.$this->strRegexErr.'"');
		    }

		    if ($isMatch) {
			$this->strMatch = $gRegexMatches[0];
		    }
		} else {
		    $this->strMatch = stristr($strTextCk,strtolower($strPattern));
		    $isMatch = ($this->strMatch != '');
		}
		if ($isMatch) {
		    $this->isMatch = TRUE;
		    if ($doCheckAll) {
			$gFilterMatches[$this->idPattern] = $this->strMatch;
		    }
		}
	    }
	}
	return $arOut;
    }
    /*-----
      ACTION: Run one regex pattern against a piece of text
      INPUT:
	$iPattern - regex without delimiters; "/" may appear either bare or already escaped as "\/"
	$iText - text to search
	$iNoCase - TRUE = match case-insensitively (adds the "i" modifier); default FALSE
      OUTPUT:
	$gRegexMatches - matches array filled in by preg_match()
	$this->strRegexErr - description of the error if the pattern could not be run, else NULL
	$strDbg - debugging text appended
      RETURNS: result of preg_match(): 1 = match, 0 = no match, FALSE = error
      TO DO:
	replace $gRegexMatches with return array
	make this function static
    */
    public function CheckRegex($iPattern,$iText,$iNoCase=FALSE) {
	global $gRegexMatches,$strDbg;

	$chDelim = '/';
	$strPattern = (string)$iPattern;
	// Escape the delimiter wherever it appears unescaped. A character that follows a
	// backslash is copied as-is, so "\/" stays "\/" instead of becoming "\\/" (which
	// would be a literal backslash followed by the end of the pattern).
	$strPattCk = '';
	$isEscaped = FALSE;
	$cntChars = strlen($strPattern);
	for ($idx = 0; $idx < $cntChars; $idx++) {
	    $ch = $strPattern[$idx];
	    if ($isEscaped) {
		$strPattCk .= $ch;
		$isEscaped = FALSE;
	    } elseif ($ch == '\\') {
		$strPattCk .= $ch;
		$isEscaped = TRUE;
	    } elseif ($ch == $chDelim) {
		$strPattCk .= '\\'.$chDelim;
	    } else {
		$strPattCk .= $ch;
	    }
	}
	$strFinal = $chDelim.$strPattCk.$chDelim.($iNoCase ? 'i' : '');
	$strDbg .= "'''@preg_match'''(\"$strFinal\",\"$iText\",...)";
	$this->strRegexErr = NULL;
	// warnings are suppressed here and picked up below, so a bad pattern gets reported
	// through the error email instead of being printed into the page
	$isMatch = @preg_match($strFinal,$iText,$gRegexMatches);
	if ($isMatch === FALSE) {
	    $intErr = preg_last_error();
	    $arErr = error_get_last();
	    if ($intErr != PREG_NO_ERROR) {
		$this->strRegexErr = 'PCRE error code '.$intErr.' for pattern '.$strFinal;
	    } elseif (is_array($arErr)) {
		$this->strRegexErr = $arErr['message'];
	    } else {
		$this->strRegexErr = 'unknown error for pattern '.$strFinal;
	    }
	}
	return $isMatch;
    }

    /*-----
      ACTION: Add a line to the error text which ReportErrors() will send
    */
    public function AddErrorLine($iText) {
	global $gErrorText;

	$gErrorText .= $iText."\n";
    }
    /*-----
      ACTION: If any error lines have been collected, email them to $wgEmergencyContact
    */
    public function ReportErrors() {
	global $wgUser;
	global $wgEmergencyContact;
	global $gErrorText;

	if ($gErrorText) {
	    $msgEmail = 'Filter error report for user '.$wgUser->getName().":\n\n";
	    $msgEmail .= $gErrorText;
	    mail($wgEmergencyContact,'spamferret filter error',$msgEmail);
	}
    }
    /*-----
      ACTION: Create a new record for the current client
    */
    public function CreateClient() {
	$sql = 'INSERT INTO client (Address,WhenFirst,Count,Retries) VALUES('.$this->SqlAddr().',NOW(),1,0)';
	$this->dbSpam->Exec($sql);
    }
    /*-----
      ACTION: Update a client's record to reflect a new spam attempt
	Retries is reset to 0 if the client's throttle period had expired, otherwise incremented.
	If no record was updated, one is created and the problem is reported.
    */
    public function RecordClientSpam() {
	if ($this->doClearThrottle) {
	    $strRetries = '0';
	} else {
	    $strRetries = 'Retries+1';
	}
	$sql = 'UPDATE client SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address='.$this->SqlAddr();
	$this->dbSpam->Exec($sql);
	if ($this->dbSpam->RowsAffected() < 1) {
	    $this->CreateClient();
	    $this->AddErrorLine('Record not found for client '.$this->strIPAddr);
	}
    }
    /*-----
      ACTION: Log a refused edit in the attempt table, and update the counters for the
	client and (if a pattern was involved) the pattern
      INPUT:
	$this->idPattern - pattern that matched, or NULL if none
	$iarArgs['code'] - short code for the reason the edit was refused
	$iarArgs['diff'] - diff text, or NULL
	$iarArgs['edit-raw'] - submitted text
    */
    public function RecordAttempt(array $iarArgs) {
	global $wgTitle, $wgServer;

	$iCode = $iarArgs['code'];
	$txtDiff = $iarArgs['diff'];		// was $this->txtDiff
	$txtEdit = $iarArgs['edit-raw'];	// was $this->txtEditChk
	$this->RecordClientSpam();

	$sqlCode = '"'.$this->dbSpam->SafeParam($iCode).'"';
	$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
	$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
	$sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
	$sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
	$isPattern = !is_null($this->idPattern);
	$sqlPattern = $isPattern ? $this->idPattern : 'NULL';
	$sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES'
	  .'(NOW(),'
	  .$sqlPattern.','
	  .$this->SqlAddr().','
	  .SQL_Value(session_id()).','
	  .$sqlSrvr.','
	  .$sqlPage.','
	  .$sqlCode.','
	  .'FALSE,'
	  .$sqlEdit.','
	  .$sqlDiff.')';
	$ok = $this->dbSpam->Exec($sql);
	if ($ok !== TRUE) {
	    $this->AddErrorLine('SQL ['.$sql.'] in RecordAttempt() generated this error: '.$ok);
	}
	if ($isPattern) {
	    $sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$this->idPattern;
	    $ok = $this->dbSpam->Exec($sql);
	    if ($ok !== TRUE) {
		$this->AddErrorLine('SQL ['.$sql.'] generated this error: '.$ok);
	    }
	}
    }
    /*-----
      ACTION: Log an accepted edit in the attempt table (code "ok", didAllow TRUE, no pattern)
      INPUT:
	$iarArgs['diff'] - diff text
	$iarArgs['edit-to-check'] - text as checked (lowercased), from CheckFilters()
    */
    public function RecordOkEdit(array $iarArgs) {
	global $wgTitle, $wgServer;

	$txtDiff = $iarArgs['diff'];		// was $this->txtDiff
	$txtEdit = $iarArgs['edit-to-check'];	// was $this->txtEditChk

	$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
	$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
	$sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
	$sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
	$sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES '
	  .'(NOW(),NULL,'
	  .$this->SqlAddr().','
	  .SQL_Value(session_id()).','
	  .$sqlSrvr.','
	  .$sqlPage.',"ok",TRUE,'
	  .$sqlEdit.','
	  .$sqlDiff.')';
	$ok = $this->dbSpam->Exec($sql);
	if ($ok !== TRUE) {
	    $this->AddErrorLine('SQL ['.$sql.'] in RecordOkEdit() generated this error: '.$ok);
	}
    }
}
 
/**
 * Error-handler callback: remembers the most recent error by copying its
 * number and message into the globals $errNum and $errStr.
 *
 * @param int    $errno  error number
 * @param string $errstr error message
 */
function ErrorHandler  ($errno  ,$errstr) {
	$GLOBALS['errNum'] = $errno;
	$GLOBALS['errStr'] = $errstr;
}
 
/**
 * Formats a PHP value for direct inclusion in an SQL statement.
 *
 *   NULL            -> the keyword NULL
 *   numeric values  -> returned unchanged (no quotes)
 *   anything else   -> double-quoted string
 *
 * Quote characters and backslashes inside string values are now
 * backslash-escaped; previously a value containing a double quote produced
 * broken SQL. Values without such characters produce the same output as
 * before.
 *
 * @param mixed $iVar value to format
 * @return mixed SQL text for the value (the original number for numeric input)
 */
function SQL_Value($iVar) {
	if (is_null($iVar)) {
		return 'NULL';
	}
	if (is_numeric($iVar)) {
		return $iVar;
	}
	return '"'.addslashes($iVar).'"';
}
/**
 * Returns the output of the external "diff" command comparing two texts.
 *
 * Both texts are written to temporary files, which are removed again before
 * returning, including when something fails part-way. If the temporary files
 * cannot be created or diff cannot be started, an empty string is returned.
 *
 * diff is run with:
 *   -i                       ignore case
 *   -E                       ignore changes due to tab expansion
 *   --suppress-common-lines  don't output lines common to both texts
 *
 * @param string $iTextOld current text
 * @param string $iTextNew proposed text
 * @return string diff output; '' if there is none
 */
function FigureDiff($iTextOld, $iTextNew) {
	# Make temporary files
	// session_save_path() can be empty or in "N;/path" form; fall back to the
	// system temp folder (which is where tempnam() would end up anyway)
	$td = session_save_path();
	if ( !is_string( $td ) || !is_dir( $td ) || !is_writable( $td ) ) {
		$td = sys_get_temp_dir();
	}
	$fnOld = tempnam( $td, 'SpamFerret-old-' );
	$fnNew = tempnam( $td, 'SpamFerret-new-' );

	$result = '';
	$isReady = ( $fnOld !== false ) && ( $fnNew !== false )
	  && ( file_put_contents( $fnOld, $iTextOld ) !== false )
	  && ( file_put_contents( $fnNew, $iTextNew ) !== false );
	if ( $isReady ) {
		$ksSpamFerretDiffOptions = '-i -E --suppress-common-lines ';	// this setting applies to all uses of a given filter DB
		// can change for different DBs
		if ( function_exists( 'wfEscapeShellArg' ) ) {
			$strFiles = wfEscapeShellArg( $fnOld, $fnNew );
		} else {
			// not running inside MediaWiki
			$strFiles = escapeshellarg( $fnOld ).' '.escapeshellarg( $fnNew );
		}
		$cmd = 'diff '.$ksSpamFerretDiffOptions.$strFiles;
		$handle = popen( $cmd, 'r' );
		if ( is_resource( $handle ) ) {
			$data = stream_get_contents( $handle );
			if ( $data !== false ) {
				$result = $data;
			}
			pclose( $handle );
		}
	}
	// clean up whatever was created, even if we never got as far as running diff
	if ( $fnNew !== false && file_exists( $fnNew ) ) {
		unlink( $fnNew );
	}
	if ( $fnOld !== false && file_exists( $fnOld ) ) {
		unlink( $fnOld );
	}
	return $result;
}
 
} // end of 'MEDIAWIKI' check
Personal tools