SpamFerret.php

From HTYP, the free directory anyone can edit

Jump to: navigation, search

[edit] Navigation

computing: software: content management: wiki: MediaWiki: extensions / fighting spam: SpamFerret: SpamFerret.php

[edit] Code

<?php
/*
 HISTORY:
  2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
  2007-09-30 (Wzl) fixing regex processing
  2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist)
  2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL
  2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong.
  2007-10-28 (Wzl) Events with THR and AMP codes *still* weren't being recorded because method call was improperly formatted.
  2007-12-23 (Wzl) Emails wikimaster if eregi() returns an error (due to improperly formatted regex)
  2007-12-26 (Wzl) Spam turd rejection / logging
  2007-12-27 (Wzl) Fixed spam turd detection to work for new pages too (will probably need refinement)
  2007-12-27 (Wzl) Ooops. Replaced missing if-block from regex results when inserted text is found.
 TO DO:
  * Log matching text for regex filters
  * Throttled save attempts should check for spam, just for data-gathering purposes.
    Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
  * Figure out how to display a different error message than "the following text is what triggered our filter:"
*/
 
// debugging activation
// (constant names are quoted: a bareword name is an undefined-constant error on PHP 8,
//  and on older versions only worked through the bareword-to-string fallback)
define('KDO_DEBUG',0);
define('KDO_DEBUG_STACK',0);
// debugging options
define('KDO_DEBUG_HTML',1);
define('KDO_DEBUG_IMMED',1);
define('KDO_DEBUG_DARK',0);
 
# Loader for the SpamFerret spam-filter extension
# Include this file from LocalSettings.php
 
if ( defined( 'MEDIAWIKI' ) ) {
require('shared.php');
 
// All three variables are declared global so that registration still reaches the
// real globals when this file is included from inside a function scope.
global $wgFilterCallback, $wgPreSpamFilterCallback, $wgExtensionCredits;

// Remember any filter callback installed before this one, so that taking over
// $wgFilterCallback below does not lose it.
if ( $wgFilterCallback ) {
	$wgPreSpamFilterCallback = $wgFilterCallback;
} else {
	$wgPreSpamFilterCallback = false;
}

// Install the SpamFerret loader as the edit filter and register the extension credits.
$wgFilterCallback = 'wfSpamFerretLoader';
$wgExtensionCredits['other'][] = array(
	'name' => 'SpamFerret',
	'author' => 'Woozle Staddon',
	'url' => 'http://htyp.org/SpamFerret',
	'version' => '2007-12-27a',
	'description' => 'database-driven wikispam content blocker',
);
 
/**
 * Filter callback installed in $wgFilterCallback.
 *
 * Creates a single SpamFerret instance on first use (kept in a static variable),
 * configures it from $wgSpamFerretSettings, and delegates to its filter() method.
 *
 * @param object $title   title object of the page being saved (passed through to filter())
 * @param string $text    proposed new page text
 * @param mixed  $section section argument supplied by the caller (passed through)
 * @return bool whatever SpamFerret::filter() returns: true = reject the edit, false = allow it
 */
function wfSpamFerretLoader( &$title, $text, $section ) {
	static $spamObj = false;
	global $wgSpamFerretSettings, $wgPreSpamFilterCallback;

	if ( $spamObj === false ) {
		// tolerate a missing settings array instead of iterating over NULL
		$settings = is_array( $wgSpamFerretSettings ) ? $wgSpamFerretSettings : array();
		$spamObj = new SpamFerret( $settings );
		// Chain the filter callback that was installed before this extension took over
		// $wgFilterCallback; an explicit 'previousFilter' setting takes precedence.
		if ( $wgPreSpamFilterCallback && !$spamObj->previousFilter ) {
			$spamObj->previousFilter = $wgPreSpamFilterCallback;
		}
	}

	return $spamObj->filter( $title, $text, $section );
}
/**
 * Database-driven wikispam filter; filter() is the entry point.
 *
 * filter() rejects an edit when any of these checks trips:
 *  - the client IP is over the retry limit and still inside the throttle window (code "THR-n")
 *  - a short lowercase-alphanumeric "spam turd" was inserted in front of the existing text,
 *    or makes up the whole of a new page (code "TRD")
 *  - the new text is the old text truncated near its first ampersand (code "AMP")
 *  - an active row of the `patterns` table matches the text (code "-")
 * Every rejection is logged through RecordAttempt().
 */
class SpamFerret {
// settings (assigned by name from the array passed to the constructor)
	public $dbspec;			// passed to the clsDatabase constructor
	public $throttle_retries;	// a known client whose Retries value exceeds this is throttled
	public $throttle_timeout;	// compared against the client's ThrottleTime value
	public $previousFilter = false;	// callback run before our own checks; false = none
// internal data
	public $dbSpam;
	public $objDataClients;
	public $strIPAddr;
	public $idPattern;
	public $idClient;
	public $isClientKnown = false;
	public $doClearThrottle = false;

	/**
	 * Copies each entry of $settings onto the property of the same name.
	 *
	 * Declared as __construct(): a method named after the class is no longer
	 * treated as a constructor on PHP 8, which would leave the settings unapplied.
	 *
	 * @param array $settings property name => value
	 */
	public function __construct( $settings = array() ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Decides whether an edit must be rejected as spam.
	 *
	 * @param object $title   title object of the page being saved
	 * @param string $text    proposed new page text
	 * @param mixed  $section only passed through to the previous filter
	 * @return bool true = edit rejected (a spam page has been displayed), false = edit is ok
	 */
	public function filter( &$title, $text, $section ) {
		global $wgEmergencyContact;
		global $debug;
		global $errNum, $errStr;

		$fname = 'wfSpamFerretFilter';
		wfProfileIn( $fname );

		# Call the rest of the hook chain first
		if ( $this->previousFilter ) {
			$f = $this->previousFilter;
			if ( $f( $title, $text, $section ) ) {
				wfProfileOut( $fname );
				return true;
			}
		}
		$retVal = false;	// default = assume edit is ok
		$msgEmail = '';		// collects regex error reports for the wikimaster
		$strMatch = '';		// text which triggered a pattern filter, if any

// The loader keeps one instance in a static variable, so clear per-call state:
		$this->idPattern = NULL;
		$this->idClient = NULL;
		$this->isClientKnown = false;
		$this->doClearThrottle = false;

// get the IP address of the http client making the edit attempt:
		$this->strIPAddr = wfGetIP();
// Open the database
		$this->dbSpam = new clsDatabase($this->dbspec);
// open clients table (extended Throttle version) for reference:
		$objTblClients = new clsDataTable($this->dbSpam,'ClientThrottle');
// Look up to see if this IP is known; it may already be throttled:
// NOTE(review): the address is not escaped here because it is unclear whether SafeParam()
//	is usable before the first query has run -- confirm, then escape as RecordAttempt() does.
		$this->objDataClients = $objTblClients->GetData('Address="'.$this->strIPAddr.'"');
		if (is_object($this->objDataClients) && ($this->objDataClients->RowCount() > 0)) {
			$this->isClientKnown = true;
		}
		if ($this->isClientKnown) {
			$this->idClient = $this->objDataClients->GetValue('ID');
			$intRetries = $this->objDataClients->GetValue('Retries');
			$intThrottle = $this->throttle_retries;
			if ($intRetries > $intThrottle) {
// retry limit exceeded; check timeout limit
				if ($this->objDataClients->GetValue('ThrottleTime') < $this->throttle_timeout) {
					EditPage::spamPage('Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.');
					$this->RecordAttempt('THR-'.$intRetries);	// record post attempt by throttled client
					$retVal = true;		// client has exceeded spam limit; impose throttle
				} else {
// throttle has expired; the next recorded attempt resets the retry counter
					$this->doClearThrottle = true;
				}
			}
		}

		if (!$retVal) {
/*
 At this point, there's apparently no reason to block the client just for being who they are,
	so now check for common non-listable offenses. These involve comparing the new contents
	with the original, so first we get the original (current) article contents plus some
	information about what has changed:
 * $strIns = whatever has been inserted at the start of the article (or contents of new article)
 * $txtCurr = current article text ('' for a new article)
*/
			$strIns = '';
			$txtCurr = '';
			$objArticleCurr = new Article($title);
			$objArticleCurr->loadLastEdit();
			if ($objArticleCurr->exists()) {
				$txtCurr = (string)$objArticleCurr->getContent();
				// an empty needle is an error for strpos() before PHP 8, so skip that case
				if ($txtCurr !== '') {
					$lenIns = strpos($text,$txtCurr);
					if ($lenIns !== false) {
						$strIns = substr($text,0,$lenIns);
					}
				}
			} else {
				$strIns = $text;
			}
// ** OFFENSE: Spam turds (short bits of nonsense inserted at the beginning of an article):
			if ($strIns != '') {
				// new page is old page with something inserted at the beginning
				$isMatch = preg_match('/^[a-z0-9]+ ?$/',$strIns);
				if ($isMatch) {
					$this->RecordAttempt('TRD',$strIns);	// record spam attempt (TRD = spam turd)
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
					EditPage::spamPage( '"'.$strIns.'" (spam turd).' );
					$retVal = true;
				}
			}
// ** OFFENSE: Ampersandbot:
			$lenNew = strlen($text);
			// is the new text a leading portion of the old text? (empty new text never counts)
			$isPrefix = ($lenNew > 0) && (strpos($txtCurr, $text) === 0);
			if ($isPrefix && ($lenNew < strlen($txtCurr))) {
// new string is a truncation of old string
// ideally, we would just check to see if the missing character is an ampersand -
// ...but unfortunately, something is quasi-randomly mutating the strings in a way which
// 	leaves the exact position of the "missing character" in some doubt. So what we do is this:
// 1. Find the position of the first ampersand in OLD TEXT:
				$posAmp = strpos($txtCurr,'&');
// (no ampersand at all means this cannot be an ampersandbot truncation; without this
//	test the FALSE result would be treated as position 0 and flag any very short text)
				if ($posAmp !== false) {
// 2. Compare this position with the length of NEW TEXT:
					$posDiff = abs($lenNew-$posAmp);
// 3. If the difference is less than some limit, then presume Ampersandbot activity:
					if ($posDiff < 3) {
// TO DO: log $posDiff for later analysis
// AMPERSANDBOT DETECTED; refuse to save the edit
						$this->RecordAttempt('AMP');	// record spam attempt (AMP = ampersandbot)
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
						EditPage::spamPage( 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.' );
// LATER: it would be nice to have a special MediaWiki:Ampersandbot page to display when this happens
						$retVal = true;
					}
				}
			}
		}
// any debugging output blocks the save so that the output can be displayed
		if ($debug) {
			EditPage::spamPage('DEBUG: '.$debug);
			$retVal = true;
		}

		if (!$retVal) {
// ** OFFENSE: text matches one of the active patterns
			// capture warnings (e.g. from a malformed regex) in $errNum/$errStr while matching
			set_error_handler ('ErrorHandler',E_WARNING);
			$errNum = 0;	// discard anything left over from earlier code
			$objTblPatterns = new clsDataTable($this->dbSpam,'patterns');
			$objDataPatterns = $objTblPatterns->GetData('isActive');
			$strTextCk = strtolower($text);
			while (is_object($objDataPatterns) && is_array($objDataPatterns->Row)) {
				$strPattern = $objDataPatterns->GetValue('Pattern');
				$isRegex = $objDataPatterns->GetValue('isRegex');
				$this->idPattern = $objDataPatterns->GetValue('ID');
				if ($isRegex) {
// NOTE(review): eregi() was removed in PHP 7.0. It is kept because the stored patterns are
//	POSIX-style and an earlier PCRE attempt reportedly did not work; porting the patterns
//	to preg_match() is needed before this can run on PHP 7+.
					$isMatch = eregi($strPattern, $strTextCk, $matches);
					if ($errNum) {
						$msgEmail .= 'Filter #'.$this->idPattern.' generated error #'.$errNum.': '.$errStr."\n";
						$errNum = 0;
					}

					if ($isMatch) {
						$strMatch = $matches[0];
					}
				} else {
					$strMatch = stristr ($strTextCk,$strPattern);
					$isMatch = ($strMatch != '');
				}

				if ($isMatch) {
					$objDataPatterns->Row = NULL;	// stop the search
				} else {
					$objDataPatterns->NextRow();	// keep looking
				}
			}
			// give warnings back to whichever handler was active before
			restore_error_handler();

			if ( $strMatch != '' ) {
	// spam cue found; display the matching text and don't allow the edit to be saved:
				wfDebug( "Match!\n" );

	// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
				EditPage::spamPage( $strMatch );
	// Log the spam attempt (updates or creates the client record):
				$this->RecordAttempt('-');
				$retVal = true;
			}
	// otherwise: no spam cues found; allow the edit to be saved
		}

		wfProfileOut( $fname );
		if ($msgEmail != '') {
			// tell the wikimaster about patterns which produced warnings
			mail  ($wgEmergencyContact,'spamferret filter error',$msgEmail);
		}
		return $retVal;
	}

	/**
	 * Logs a rejected edit attempt.
	 *
	 * Updates the `clients` row for the current IP address (or inserts one for a new
	 * client), inserts a row into `attempts`, and -- when a pattern triggered the
	 * rejection -- bumps that pattern's usage counters.
	 *
	 * @param string      $iCode  event code stored in attempts.Code (e.g. 'THR-n', 'TRD', 'AMP', '-')
	 * @param string|null $iMatch text stored in attempts.MatchText; NULL stores SQL NULL
	 */
	public function RecordAttempt($iCode,$iMatch=NULL) {
		global $wgTitle, $wgServer;

		// every string value goes through SafeParam() before being spliced into SQL
		$sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
		if ($this->idClient != 0) {
			// known client: count the attempt; restart the retry counter if the throttle expired
			$strRetries = $this->doClearThrottle ? '0' : 'Retries+1';
			$sql = 'UPDATE clients SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address='.$sqlAddr;
			$this->dbSpam->Exec($sql);
		} else {
			// first offense from this address: create the client record
			$sql = 'INSERT INTO clients (Address,WhenFirst,Count,Retries) VALUES('.$sqlAddr.',NOW(),1,0)';
			$this->dbSpam->Exec($sql);
			$this->idClient = $this->dbSpam->NewID();
		}

		$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
		$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
		$sqlCode = '"'.$this->dbSpam->SafeParam($iCode).'"';
		$sqlMatch = is_null($iMatch) ? 'NULL' : '"'.$this->dbSpam->SafeParam($iMatch).'"';
		// idPattern is NULL when the rejection did not come from a pattern (THR/TRD/AMP)
		$isPattern = !is_null($this->idPattern);
		$sqlPattern = $isPattern ? (int)$this->idPattern : 'NULL';
		$sqlClient = (int)$this->idClient;

		$sql = 'INSERT INTO attempts (`When`,ID_Pattern,ID_Client,PageServer,PageName,Code,MatchText) VALUES (NOW(),'.$sqlPattern.','.$sqlClient.','.$sqlSrvr.','.$sqlPage.','.$sqlCode.','.$sqlMatch.')';
		$this->dbSpam->Exec($sql);
		if ($isPattern) {
			$sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$sqlPattern;
			$this->dbSpam->Exec($sql);
		}
	}
}
 
/**
 * Error-handler callback: stores the most recent error in the globals
 * $errNum and $errStr so the calling code can inspect it afterwards.
 *
 * @param int    $errno  error level/number supplied by PHP
 * @param string $errstr error message supplied by PHP
 */
function ErrorHandler  ($errno  ,$errstr) {
	$GLOBALS['errNum'] = $errno;
	$GLOBALS['errStr'] = $errstr;
}
 
} // end of 'MEDIAWIKI' check
?>
Personal tools