Difference between revisions of "SpamFerret.php"

from HTYP, the free directory anyone can edit if they can prove to me that they're not a spambot
Jump to navigation Jump to search
m (→‎Navigation: using navbar transclusion)
(→‎Code: 12/23 code update)
Line 11: Line 11:
 
   2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL
 
   2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL
 
   2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong.
 
   2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong.
 +
  2007-10-28 (Wzl) Events with THR and AMP codes *still* weren't being recorded because method call was improperly formatted.
 +
  2007-12-23 (Wzl) Emails wikimaster if eregi() returns an error (due to improperly formatted regex)
 +
TO DO:
 +
  * Throttled save attempts should check for spam, just for data-gathering purposes.
 +
    Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
 +
  * Figure out how to display a different error message than "the following text is what triggered our filter:"
 
*/
 
*/
  
Line 40: Line 46:
 
         'author' => 'Woozle Staddon',  
 
         'author' => 'Woozle Staddon',  
 
         'url' => 'http://htyp.org/SpamFerret',  
 
         'url' => 'http://htyp.org/SpamFerret',  
'version' => '2007-10-13',
+
'version' => '2007-12-23',
 
         'description' => 'database-driven wikispam content blocker',
 
         'description' => 'database-driven wikispam content blocker',
 
);
 
);
Line 80: Line 86:
 
global $wgArticle, $wgDBname, $wgMemc, $messageMemc, $wgVersion, $wgOut;
 
global $wgArticle, $wgDBname, $wgMemc, $messageMemc, $wgVersion, $wgOut;
 
global $wgTitle, $wgServer;
 
global $wgTitle, $wgServer;
 +
global $wgEmergencyContact;
 
global $debug;
 
global $debug;
 +
global $errNum, $errStr;
  
 
$fname = 'wfSpamFerretFilter';
 
$fname = 'wfSpamFerretFilter';
Line 93: Line 101:
 
}
 
}
 
}
 
}
 
 
$retVal = false; // default = assume edit is ok
 
$retVal = false; // default = assume edit is ok
  
Line 117: Line 124:
 
// retry limit exceeded; check timeout limit
 
// retry limit exceeded; check timeout limit
 
if ($this->objDataClients->GetValue('ThrottleTime') < $this->throttle_timeout) {
 
if ($this->objDataClients->GetValue('ThrottleTime') < $this->throttle_timeout) {
$retval = true; // client has exceeded spam limit; impose throttle
+
EditPage::spamPage('Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.');
RecordAttempt('THR'); // record post attempt by throttled client
+
$this->RecordAttempt('THR-'.$intRetries); // record post attempt by throttled client
EditPage::spamPage('Too many spam attempts from this IP address. Please come back later.');
+
$retVal = true; // client has exceeded spam limit; impose throttle
 
} else {
 
} else {
 
$this->doClearThrottle = true;
 
$this->doClearThrottle = true;
Line 126: Line 133:
 
}
 
}
  
 +
if (!$retVal) {
 
// Check for ampersandbot
 
// Check for ampersandbot
$objArticleCurr = new Article($title);
+
$objArticleCurr = new Article($title);
$objArticleCurr->loadLastEdit();
+
$objArticleCurr->loadLastEdit();
$txtCurr = $objArticleCurr->getContent();
+
$txtCurr = $objArticleCurr->getContent();
$lenNew = strlen($text);
+
$lenNew = strlen($text);
$posMatch = strpos($txtCurr, $text);
+
$posMatch = strpos($txtCurr, $text);
 
//$debug = 'OLD=['.substr($txtCurr,0,5).'] NEW=['.substr($text,0,5).'] STRPOS='.$posMatch.' ===0?:'.($posMatch===0);
 
//$debug = 'OLD=['.substr($txtCurr,0,5).'] NEW=['.substr($text,0,5).'] STRPOS='.$posMatch.' ===0?:'.($posMatch===0);
 
//$debug .= '...GOT TO HERE ... ';
 
//$debug .= '...GOT TO HERE ... ';
if ($posMatch===0) {
+
if ($posMatch===0) {
 
//$debug .= '1';
 
//$debug .= '1';
 
// new string starts the same as old string; is it a truncated subset?
 
// new string starts the same as old string; is it a truncated subset?
if ($lenNew < strlen($txtCurr)) {
+
if ($lenNew < strlen($txtCurr)) {
 
//$debug .= '2';
 
//$debug .= '2';
 
// new string is a truncation of old string
 
// new string is a truncation of old string
Line 144: Line 152:
 
// leaves the exact position of the "missing character" in some doubt. So what we do is this:
 
// leaves the exact position of the "missing character" in some doubt. So what we do is this:
 
// 1. Find the position of the first ampersand in OLD TEXT:
 
// 1. Find the position of the first ampersand in OLD TEXT:
$posAmp = strpos($txtCurr,'&');
+
$posAmp = strpos($txtCurr,'&');
 
// 2. Compare this position with the length of NEW TEXT:
 
// 2. Compare this position with the length of NEW TEXT:
$posDiff = abs(strlen($text)-$posAmp);
+
$posDiff = abs(strlen($text)-$posAmp);
 
// 3. If the difference is less than some limit, then presume Ampersandbot activity:
 
// 3. If the difference is less than some limit, then presume Ampersandbot activity:
if ($posDiff < 3) {
+
if ($posDiff < 3) {
 
// TO DO: log $posDiff for later analysis
 
// TO DO: log $posDiff for later analysis
 
// AMPERSANDBOT DETECTED; refuse to save the edit
 
// AMPERSANDBOT DETECTED; refuse to save the edit
RecordAttempt('AMP'); // record spam attempt (AMP = ampersandbot)
+
$this->RecordAttempt('AMP'); // record spam attempt (AMP = ampersandbot)
 
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
 
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
EditPage::spamPage( 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.' );
+
EditPage::spamPage( 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.' );
 
// LATER: it would be nice to have a special MediaWiki:Ampersandbot page to display when this happens
 
// LATER: it would be nice to have a special MediaWiki:Ampersandbot page to display when this happens
$retVal = true;
+
$retVal = true;
 +
}
 
}
 
}
 
}
 
}
Line 164: Line 173:
 
}
 
}
  
if (!$retval) {
+
if (!$retVal) {
 +
set_error_handler ('ErrorHandler',E_WARNING);
 
$objTblPatterns = new clsDataTable($this->dbSpam,'patterns');
 
$objTblPatterns = new clsDataTable($this->dbSpam,'patterns');
 
$objDataPatterns = $objTblPatterns->GetData('isActive');
 
$objDataPatterns = $objTblPatterns->GetData('isActive');
Line 181: Line 191:
 
$isRegex = $objDataPatterns->GetValue('isRegex');
 
$isRegex = $objDataPatterns->GetValue('isRegex');
 
$this->idPattern = $objDataPatterns->GetValue('ID');
 
$this->idPattern = $objDataPatterns->GetValue('ID');
 +
//echo '<br>Pattern '.$this->idPattern;
 
if ($isRegex) {
 
if ($isRegex) {
 
//echo 'PATTERN: '.$strPattern.$cr;
 
//echo 'PATTERN: '.$strPattern.$cr;
 +
//echo ' regex';
 
$strPattCk = $strPattern;
 
$strPattCk = $strPattern;
// $isMatch = preg_match('/'.$strPattCk.'/',$strTextCk,$matches);
+
// Attempt at using Perl-compatible regex, but it doesn't seem to work (or maybe just too many patterns have issues):
 +
// $strPattCk_pcre = str_replace('/','\/',$strPattCk);
 +
// $isMatch = preg_match('/'.$strPattCk_pcre.'/i',$strTextCk,$matches);
 
$isMatch = eregi($strPattCk, $strTextCk, $matches);
 
$isMatch = eregi($strPattCk, $strTextCk, $matches);
 +
if ($errNum) {
 +
$msgEmail .= 'Filter #'.$this->idPattern.' generated error #'.$errNum.': '.$errStr."\n";
 +
// echo '<br><b>ERROR</b> #'.$errNum.': '.$errStr;
 +
$errNum = 0;
 +
}
 +
 
if ($isMatch) {
 
if ($isMatch) {
 
$strMatch = $matches[0];
 
$strMatch = $matches[0];
 +
//echo ' match: '.$strMatch;
 
}
 
}
 
} else {
 
} else {
 +
//echo ' non-regex';
 
$strMatch = stristr ($strTextCk,$strPattern);
 
$strMatch = stristr ($strTextCk,$strPattern);
 
$isMatch = ($strMatch != '');
 
$isMatch = ($strMatch != '');
 +
//echo ' match: '.$strMatch;
 
}
 
}
 
// $debug .= 'ROW: '.DumpArray($objDataPatterns->Row);
 
// $debug .= 'ROW: '.DumpArray($objDataPatterns->Row);
Line 225: Line 248:
  
 
wfProfileOut( $fname );
 
wfProfileOut( $fname );
 +
if ($msgEmail) {
 +
mail  ($wgEmergencyContact,'spamferret filter error',$msgEmail);
 +
// die();
 +
}
 +
//$wgOut->addHTML($out);
 
return $retVal;
 
return $retVal;
 
/**/
 
/**/
Line 248: Line 276:
 
$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
 
$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
 
$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
 
$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
$sql = 'INSERT INTO attempts (`When`,ID_Pattern,ID_Client,PageServer,PageName,Code) VALUES (NOW(),'.$this->idPattern.','.$this->idClient.','.$sqlSrvr.','.$sqlPage.',"'.$iCode.'")';
+
$sqlPattern = $this->idPattern;
$this->dbSpam->Exec($sql);
+
if (is_null($sqlPattern)) {
$sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$this->idPattern;
+
$sqlPattern = 'NULL';
 +
} else {
 +
$isPattern = true;
 +
}
 +
$sql = 'INSERT INTO attempts (`When`,ID_Pattern,ID_Client,PageServer,PageName,Code) VALUES (NOW(),'.$sqlPattern.','.$this->idClient.','.$sqlSrvr.','.$sqlPage.',"'.$iCode.'")';
 
$this->dbSpam->Exec($sql);
 
$this->dbSpam->Exec($sql);
 +
if ($isPattern) {
 +
$sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$this->idPattern;
 +
$this->dbSpam->Exec($sql);
 +
}
 
}
 
}
 
}
 
}
  
 +
function ErrorHandler  ($errno  ,$errstr) {
 +
global $errNum, $errStr;
 +
 +
$errNum = $errno;
 +
$errStr = $errstr;
 +
}
  
 
} // end of 'MEDIAWIKI' check
 
} // end of 'MEDIAWIKI' check
 
?>
 
?>
 
</php>
 
</php>

Revision as of 00:49, 25 December 2007

Navigation

{{#lst:SpamFerret|navbar}}: SpamFerret.php

Code

<php><?php /*

HISTORY:
 2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
 2007-09-30 (Wzl) fixing regex processing
 2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist)
 2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL
 2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong.
 2007-10-28 (Wzl) Events with THR and AMP codes *still* weren't being recorded because method call was improperly formatted.
 2007-12-23 (Wzl) Emails wikimaster if eregi() returns an error (due to improperly formatted regex)
TO DO:
 * Throttled save attempts should check for spam, just for data-gathering purposes.
   Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
 * Figure out how to display a different error message than "the following text is what triggered our filter:"
 */

// debugging activation
// (constant names are quoted: a bare, not-yet-defined name is a notice on
// PHP 5 and a fatal Error on PHP 8)
define('KDO_DEBUG', 0);
define('KDO_DEBUG_STACK', 0);
// debugging options
define('KDO_DEBUG_HTML', 1);
define('KDO_DEBUG_IMMED', 1);
define('KDO_DEBUG_DARK', 0);

# Loader for spam blacklist feature
# Include this from LocalSettings.php

// Everything below is skipped unless the MEDIAWIKI constant is defined.
// NOTE: this guard is closed at the end of the file ("end of 'MEDIAWIKI' check").
if ( defined( 'MEDIAWIKI' ) ) {
require('shared.php');

global $wgFilterCallback, $wgPreSpamFilterCallback;

// Remember any filter callback that was already installed before replacing it.
if ( $wgFilterCallback ) {
    $wgPreSpamFilterCallback = $wgFilterCallback;
} else {
    $wgPreSpamFilterCallback = false;
}

// Install our loader as the edit filter and register the extension credits.
$wgFilterCallback = 'wfSpamFerretLoader';
$wgExtensionCredits['other'][] = array(
    'name' => 'SpamFerret',
    'author' => 'Woozle Staddon',
    'url' => 'http://htyp.org/SpamFerret',
    'version' => '2007-12-23',
    'description' => 'database-driven wikispam content blocker',
);

/**
 * Filter callback registered in $wgFilterCallback.
 *
 * Lazily creates a single SpamFerret instance (configured from
 * $wgSpamFerretSettings) and delegates the edit check to it.
 *
 * @param object $title   title object of the page being edited
 * @param string $text    submitted text
 * @param mixed  $section section argument passed through by the caller
 * @return bool true if the edit was rejected, false if it may be saved
 */
function wfSpamFerretLoader( &$title, $text, $section ) {
    static $spamObj = false;
    global $wgSpamFerretSettings;

    if ( $spamObj === false ) {
        $spamObj = new SpamFerret( $wgSpamFerretSettings );
    }

    return $spamObj->filter( $title, $text, $section );
}

/**
 * Database-driven spam filter: throttles repeat offenders by IP address,
 * detects "ampersandbot" truncation edits, and matches the submitted text
 * against the active rows of the `patterns` table.
 */
class SpamFerret {
    // settings (filled in from the array passed to the constructor)
    var $dbspec;
    var $throttle_retries;
    var $throttle_timeout;
    var $previousFilter = false;
    // internal data
    var $dbSpam;
    var $objDataClients;
    var $strIPAddr;
    var $idPattern;
    var $idClient;
    var $isClientKnown = false;
    var $doClearThrottle = false;

    /**
     * Copies every key of $settings onto the same-named property.
     *
     * @param array $settings property name => value
     */
    function __construct( $settings = array() ) {
        // A non-array (e.g. an unset settings global) is simply ignored.
        if ( is_array( $settings ) ) {
            foreach ( $settings as $name => $value ) {
                $this->$name = $value;
            }
        }
    }

    /**
     * PHP 4-style constructor, kept so existing callers keep working.
     * Declared after __construct() and delegating to it.
     *
     * @param array $settings property name => value
     */
    function SpamFerret( $settings = array() ) {
        $this->__construct( $settings );
    }

    /**
     * Decides whether an edit must be rejected.
     *
     * Checks, in order: the previous filter in the chain, the per-IP
     * throttle, the ampersandbot heuristic, and finally the pattern table.
     * When a check trips, an error page is shown via EditPage::spamPage()
     * and the attempt is logged through RecordAttempt().
     *
     * @param object $title   title object of the page being edited
     * @param string $text    submitted text
     * @param mixed  $section passed through to the previous filter
     * @return bool true = reject the edit, false = edit is ok
     */
    function filter( &$title, $text, $section ) {
        global $wgEmergencyContact;
        global $debug;
        global $errNum, $errStr;

        $fname = 'wfSpamFerretFilter';
        wfProfileIn( $fname );

        # Call the rest of the hook chain first
        if ( $this->previousFilter ) {
            $f = $this->previousFilter;
            if ( $f( $title, $text, $section ) ) {
                wfProfileOut( $fname );
                return true;
            }
        }

        $retVal = false;   // default = assume edit is ok
        $msgEmail = '';    // collects regex errors to mail to the wikimaster
        $strMatch = '';    // text that matched a spam pattern, if any

        // get the IP address of the http client making the edit attempt:
        $this->strIPAddr = wfGetIP();
        // Open the database
        $this->dbSpam = new clsDatabase($this->dbspec);
        // open clients table (extended Throttle version) for reference:
        $objTblClients = new clsDataTable($this->dbSpam,'ClientThrottle');
        // Look up to see if this IP is known; it may already be throttled.
        // The address is escaped before being embedded in the filter clause.
        $sqlAddr = $this->dbSpam->SafeParam($this->strIPAddr);
        $this->objDataClients = $objTblClients->GetData('Address="'.$sqlAddr.'"');
        if (is_object($this->objDataClients)) {
            if ($this->objDataClients->RowCount() > 0) {
                $this->isClientKnown = true;
            }
        }
        if ($this->isClientKnown) {
            $this->idClient = $this->objDataClients->GetValue('ID');
            $intRetries = $this->objDataClients->GetValue('Retries');
            $intThrottle = $this->throttle_retries;
            if ($intRetries > $intThrottle) {
                // retry limit exceeded; check timeout limit
                if ($this->objDataClients->GetValue('ThrottleTime') < $this->throttle_timeout) {
                    EditPage::spamPage('Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.');
                    $this->RecordAttempt('THR-'.$intRetries); // record post attempt by throttled client
                    $retVal = true; // client has exceeded spam limit; impose throttle
                } else {
                    $this->doClearThrottle = true;
                }
            }
        }

        if (!$retVal) {
            // Check for ampersandbot
            $objArticleCurr = new Article($title);
            $objArticleCurr->loadLastEdit();
            $txtCurr = $objArticleCurr->getContent();
            $lenNew = strlen($text);
            // An empty needle is never a truncation match (and strpos() would
            // warn about it on older PHP versions), so skip the search.
            $posMatch = ($lenNew > 0) ? strpos($txtCurr, $text) : false;
            // new string starts the same as old string AND is shorter,
            // i.e. it is a truncation of the old string
            if ($posMatch === 0 && $lenNew < strlen($txtCurr)) {
                // ideally, we would just check to see if the missing character is an ampersand -
                // ...but unfortunately, something is quasi-randomly mutating the strings in a way which
                // leaves the exact position of the "missing character" in some doubt. So what we do is this:
                // 1. Find the position of the first ampersand in OLD TEXT:
                $posAmp = strpos($txtCurr,'&');
                // No ampersand at all means this cannot be an ampersandbot edit;
                // without this guard, false would be treated as position 0.
                if ($posAmp !== false) {
                    // 2. Compare this position with the length of NEW TEXT:
                    $posDiff = abs($lenNew - $posAmp);
                    // 3. If the difference is less than some limit, then presume Ampersandbot activity:
                    if ($posDiff < 3) {
                        // TO DO: log $posDiff for later analysis
                        // AMPERSANDBOT DETECTED; refuse to save the edit
                        $this->RecordAttempt('AMP'); // record spam attempt (AMP = ampersandbot)
                        // The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
                        EditPage::spamPage( 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.' );
                        // LATER: it would be nice to have a special MediaWiki:Ampersandbot page to display when this happens
                        $retVal = true;
                    }
                }
            }
        }
        if ($debug) {
            EditPage::spamPage('DEBUG: '.$debug);
            $retVal = true;
        }

        if (!$retVal) {
            // Capture warnings (e.g. malformed regex) instead of displaying them.
            set_error_handler('ErrorHandler', E_WARNING);
            $objTblPatterns = new clsDataTable($this->dbSpam,'patterns');
            $objDataPatterns = $objTblPatterns->GetData('isActive');

            $strTextCk = strtolower($text);
            while (is_array($objDataPatterns->Row)) {
                $strPattern = $objDataPatterns->GetValue('Pattern');
                $isRegex = $objDataPatterns->GetValue('isRegex');
                $this->idPattern = $objDataPatterns->GetValue('ID');
                if ($isRegex) {
                    $strPattCk = $strPattern;
                    // Attempt at using Perl-compatible regex, but it doesn't seem to work
                    // (or maybe just too many patterns have issues):
                    //   $strPattCk_pcre = str_replace('/','\/',$strPattCk);
                    //   $isMatch = preg_match('/'.$strPattCk_pcre.'/i',$strTextCk,$matches);
                    // NOTE(review): eregi() is kept because the stored patterns are POSIX
                    // regexes; it needs the ereg extension, which PHP 7+ no longer ships.
                    $errNum = 0; // make sure any error seen below belongs to this pattern
                    $isMatch = eregi($strPattCk, $strTextCk, $matches);
                    if ($errNum) {
                        $msgEmail .= 'Filter #'.$this->idPattern.' generated error #'.$errNum.': '.$errStr."\n";
                        $errNum = 0;
                    }

                    if ($isMatch) {
                        $strMatch = $matches[0];
                    }
                } else {
                    $strMatch = stristr($strTextCk,$strPattern);
                    $isMatch = ($strMatch != '');
                }

                if ($isMatch) {
                    $objDataPatterns->Row = NULL; // stop the search
                } else {
                    $objDataPatterns->NextRow(); // keep looking
                }
            }
            // Stop intercepting warnings for the rest of the request.
            restore_error_handler();

            if ( $strMatch != '' ) {
                // spam cue found; display the matching text and don't allow the edit to be saved:
                wfDebug( "Match!\n" );

                // The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
                EditPage::spamPage( $strMatch );
                // Log the spam attempt (update or create client record):
                $this->RecordAttempt('-');
                $retVal = true;
            }
            // else: no spam cues found; allow the edit to be saved
        }

        wfProfileOut( $fname );
        if ($msgEmail) {
            mail($wgEmergencyContact,'spamferret filter error',$msgEmail);
        }
        return $retVal;
    }

    /**
     * Logs one rejected edit attempt.
     *
     * Updates (or creates) the row for the current IP in `clients`, inserts a
     * row into `attempts`, and, when a pattern is responsible for the
     * rejection, bumps that pattern's counter in `patterns`.
     *
     * @param string $iCode event code stored in attempts.Code
     *                      (the filter uses '-', 'AMP' and 'THR-<retries>')
     */
    public function RecordAttempt($iCode) {
        global $wgTitle, $wgServer;

        $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';

        if ($this->idClient != 0) {
            // known client: bump counters, or reset the retry count when the
            // throttle has timed out
            if ($this->doClearThrottle) {
                $strRetries = '0';
            } else {
                $strRetries = 'Retries+1';
            }
            $sql = 'UPDATE clients SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address='.$sqlAddr;
            $this->dbSpam->Exec($sql);
        } else {
            // first offence from this address: create the client record
            $sql = 'INSERT INTO clients (Address,WhenFirst,Count,Retries) VALUES('.$sqlAddr.',NOW(),1,0)';
            $this->dbSpam->Exec($sql);
            $this->idClient = $this->dbSpam->NewID();
        }

        $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
        $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
        $sqlCode = '"'.$this->dbSpam->SafeParam($iCode).'"';

        // Throttle and ampersandbot events have no pattern; store NULL for those.
        $isPattern = !is_null($this->idPattern);
        $sqlPattern = $isPattern ? (int)$this->idPattern : 'NULL';

        $sql = 'INSERT INTO attempts (`When`,ID_Pattern,ID_Client,PageServer,PageName,Code) VALUES (NOW(),'
            .$sqlPattern.','.(int)$this->idClient.','.$sqlSrvr.','.$sqlPage.','.$sqlCode.')';
        $this->dbSpam->Exec($sql);
        if ($isPattern) {
            $sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.(int)$this->idPattern;
            $this->dbSpam->Exec($sql);
        }
    }
}

/**
 * Error-handler callback (registered with set_error_handler() by the spam
 * filter): stores the number and message of the most recent error in the
 * globals $errNum and $errStr so the caller can inspect them afterwards.
 *
 * @param int    $errno  error level/number reported by PHP
 * @param string $errstr error message reported by PHP
 */
function ErrorHandler ($errno, $errstr) {
    $GLOBALS['errNum'] = $errno;
    $GLOBALS['errStr'] = $errstr;
}

} // end of 'MEDIAWIKI' check ?> </php>