Difference between revisions of "SpamFerret.php"
(New page: ==Navigation== computing: software: MediaWiki: fighting spam: SpamFerret: SpamFerret.php ==Code== <php><?php /* HISTORY: 2007-08...) |
(→Code: 10/12 version supports throttling by IP) |
||
Line 6: | Line 6: | ||
HISTORY: | HISTORY: | ||
2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed) | 2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed) | ||
− | 2007- | + | 2007-09-30 (Wzl) fixing regex processing |
+ | 2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist) | ||
*/ | */ | ||
Line 21: | Line 22: | ||
if ( defined( 'MEDIAWIKI' ) ) { | if ( defined( 'MEDIAWIKI' ) ) { | ||
− | require(' | + | require('shared.php'); |
global $wgFilterCallback, $wgPreSpamFilterCallback; | global $wgFilterCallback, $wgPreSpamFilterCallback; | ||
Line 36: | Line 37: | ||
'author' => 'Woozle Staddon', | 'author' => 'Woozle Staddon', | ||
'url' => 'http://htyp.org/SpamFerret', | 'url' => 'http://htyp.org/SpamFerret', | ||
− | 'version' => '2007- | + | 'version' => '2007-10-11', |
'description' => 'database-driven wikispam content blocker', | 'description' => 'database-driven wikispam content blocker', | ||
); | ); | ||
Line 52: | Line 53: | ||
class SpamFerret { | class SpamFerret { | ||
var $dbspec; | var $dbspec; | ||
+ | var $throttle_retries; | ||
+ | var $throttle_timeout; | ||
var $previousFilter = false; | var $previousFilter = false; | ||
+ | // internal data | ||
+ | var $dbSpam; | ||
+ | var $objDataClients; | ||
+ | var $strIPAddr; | ||
+ | var $idPattern; | ||
+ | var $idClient; | ||
+ | var $isClientKnown; | ||
+ | var $doClearThrottle; | ||
function SpamFerret( $settings = array() ) { | function SpamFerret( $settings = array() ) { | ||
Line 63: | Line 74: | ||
function filter( &$title, $text, $section ) { | function filter( &$title, $text, $section ) { | ||
+ | global $wgSpamFerretSettings; | ||
global $wgArticle, $wgDBname, $wgMemc, $messageMemc, $wgVersion, $wgOut; | global $wgArticle, $wgDBname, $wgMemc, $messageMemc, $wgVersion, $wgOut; | ||
global $wgTitle, $wgServer; | global $wgTitle, $wgServer; | ||
Line 80: | Line 92: | ||
$retVal = false; // default = assume edit is ok | $retVal = false; // default = assume edit is ok | ||
+ | |||
+ | // get the IP address of the http client making the edit attempt: | ||
+ | $this->strIPAddr = wfGetIP(); | ||
+ | // Open the database | ||
+ | $this->dbSpam = new clsDatabase($this->dbspec); | ||
+ | // open clients table (extended Throttle version) for reference: | ||
+ | $objTblClients = new clsDataTable($this->dbSpam,'ClientThrottle'); | ||
+ | // Look up to see if this IP is known; it may already be throttled: | ||
+ | $this->objDataClients = $objTblClients->GetData('Address="'.$this->strIPAddr.'"'); | ||
+ | if (is_object($this->objDataClients)) { | ||
+ | if ($this->objDataClients->RowCount() > 0) { | ||
+ | $this->isClientKnown = true; | ||
+ | } | ||
+ | } | ||
+ | if ($this->isClientKnown) { | ||
+ | $this->idClient = $this->objDataClients->GetValue('ID'); | ||
+ | if ($this->objDataClients->GetValue('Retries') > $this->throttle_retries) { | ||
+ | // retry limit exceeded; check timeout limit | ||
+ | if ($this->objDataClients->GetValue('ThrottleTime') < $this->throttle_timeout) { | ||
+ | $retval = true; // client has exceeded spam limit; impose throttle | ||
+ | RecordAttempt('THR'); // record post attempt by throttled client | ||
+ | EditPage::spamPage('Too many spam attempts from this IP address. Please come back later.'); | ||
+ | } else { | ||
+ | $this->doClearThrottle = true; | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | |||
// Check for ampersandbot | // Check for ampersandbot | ||
$objArticleCurr = new Article($title); | $objArticleCurr = new Article($title); | ||
Line 105: | Line 145: | ||
// TO DO: log $posDiff for later analysis | // TO DO: log $posDiff for later analysis | ||
// AMPERSANDBOT DETECTED; refuse to save the edit | // AMPERSANDBOT DETECTED; refuse to save the edit | ||
+ | RecordAttempt('AMP'); // record spam attempt (AMP = ampersandbot) | ||
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:" | // The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:" | ||
EditPage::spamPage( 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.' ); | EditPage::spamPage( 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.' ); | ||
Line 118: | Line 159: | ||
if (!$retval) { | if (!$retval) { | ||
− | + | $objTblPatterns = new clsDataTable($this->dbSpam,'patterns'); | |
− | $ | ||
− | |||
$objDataPatterns = $objTblPatterns->GetData('isActive'); | $objDataPatterns = $objTblPatterns->GetData('isActive'); | ||
/* | /* | ||
Line 129: | Line 168: | ||
$debug .= ' objDataPatterns.Res has '.$objDataPatterns->Res->num_rows.' row(s)'; | $debug .= ' objDataPatterns.Res has '.$objDataPatterns->Res->num_rows.' row(s)'; | ||
*/ | */ | ||
+ | $strTextCk = strtolower($text); | ||
+ | //$cr = "\n"; | ||
+ | //echo 'DEBUGGING spam filter - please excuse the mess!'.$cr; | ||
while(is_array($objDataPatterns->Row)) { | while(is_array($objDataPatterns->Row)) { | ||
$strPattern = $objDataPatterns->GetValue('Pattern'); | $strPattern = $objDataPatterns->GetValue('Pattern'); | ||
$isRegex = $objDataPatterns->GetValue('isRegex'); | $isRegex = $objDataPatterns->GetValue('isRegex'); | ||
− | $idPattern = $objDataPatterns->GetValue('ID'); | + | $this->idPattern = $objDataPatterns->GetValue('ID'); |
if ($isRegex) { | if ($isRegex) { | ||
− | + | //echo 'PATTERN: '.$strPattern.$cr; | |
− | $ | + | $strPattCk = $strPattern; |
− | + | // $isMatch = preg_match('/'.$strPattCk.'/',$strTextCk,$matches); | |
+ | $isMatch = eregi($strPattCk, $strTextCk, $matches); | ||
if ($isMatch) { | if ($isMatch) { | ||
$strMatch = $matches[0]; | $strMatch = $matches[0]; | ||
} | } | ||
} else { | } else { | ||
− | $strMatch = stristr ($ | + | $strMatch = stristr ($strTextCk,$strPattern); |
$isMatch = ($strMatch != ''); | $isMatch = ($strMatch != ''); | ||
} | } | ||
Line 160: | Line 203: | ||
EditPage::spamPage( $strMatch ); | EditPage::spamPage( $strMatch ); | ||
// Log the spam attempt: | // Log the spam attempt: | ||
− | + | // $sql = 'SELECT * FROM clients WHERE Address="'.$this->strIPAddr.'"'; | |
− | + | // $this->objDataClients = $this->dbSpam->Query($sql); | |
− | // $sql = 'SELECT * FROM clients WHERE Address="'.$strIPAddr.'"'; | ||
− | // $objDataClients = $dbSpam->Query($sql | ||
− | |||
− | |||
// update or create client record: | // update or create client record: | ||
− | + | $this->RecordAttempt(NULL); | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
$retVal = true; | $retVal = true; | ||
} else { | } else { | ||
Line 202: | Line 221: | ||
return $retVal; | return $retVal; | ||
/**/ | /**/ | ||
+ | } | ||
+ | public function RecordAttempt($iCode) { | ||
+ | global $wgTitle, $wgServer; | ||
+ | |||
+ | if ($this->idClient != 0) { | ||
+ | if ($this->doClearThrottle) { | ||
+ | $strRetries = '0'; | ||
+ | } else { | ||
+ | $strRetries = 'Retries+1'; | ||
+ | } | ||
+ | $sql = 'UPDATE clients SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address="'.$this->strIPAddr.'"'; | ||
+ | $this->dbSpam->Exec($sql); | ||
+ | } else { | ||
+ | $sql = 'INSERT INTO clients (Address,WhenFirst,Count,Retries) VALUES("'.$this->strIPAddr.'",NOW(),1,0)'; | ||
+ | $this->dbSpam->Exec($sql); | ||
+ | $this->idClient = $this->dbSpam->NewID(); | ||
+ | } | ||
+ | |||
+ | $sqlURL = '"'.$this->dbSpam->SafeParam($wgTitle->getFullURL()).'"'; | ||
+ | $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"'; | ||
+ | $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"'; | ||
+ | $sql = 'INSERT INTO attempts (`When`,ID_Pattern,ID_Client,PageServer,PageName) VALUES (NOW(),'.$this->idPattern.','.$this->idClient.','.$sqlSrvr.','.$sqlPage.')'; | ||
+ | $this->dbSpam->Exec($sql); | ||
+ | $sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$this->idPattern; | ||
+ | $this->dbSpam->Exec($sql); | ||
} | } | ||
} | } | ||
+ | |||
} // end of 'MEDIAWIKI' check | } // end of 'MEDIAWIKI' check | ||
− | ?></php> | + | ?> |
+ | </php> |
Revision as of 20:01, 12 October 2007
computing: software: MediaWiki: fighting spam: SpamFerret: SpamFerret.php
Code
<php><?php /*
 HISTORY:
  2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
  2007-09-30 (Wzl) fixing regex processing
  2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist)
*/

// Constant names are quoted: a bare word passed to define() is an undefined-constant
// lookup (a notice on old PHP, an Error on PHP 8).

// debugging activation
define('KDO_DEBUG',0);
define('KDO_DEBUG_STACK',0);
// debugging options
define('KDO_DEBUG_HTML',1);
define('KDO_DEBUG_IMMED',1);
define('KDO_DEBUG_DARK',0);

# Loader for spam blacklist feature
# Include this from LocalSettings.php
if ( defined( 'MEDIAWIKI' ) ) {
    // NOTE(review): shared.php presumably supplies clsDatabase and clsDataTable, which are
    // used below but not defined in this file - confirm.
    require('shared.php');

    global $wgFilterCallback, $wgPreSpamFilterCallback;

    // Remember whatever filter callback was installed before this extension, so it can
    // still be run as part of the chain (see wfSpamFerretLoader).
    if ( $wgFilterCallback ) {
        $wgPreSpamFilterCallback = $wgFilterCallback;
    } else {
        $wgPreSpamFilterCallback = false;
    }

    $wgFilterCallback = 'wfSpamFerretLoader';
    $wgExtensionCredits['other'][] = array(
        'name' => 'SpamFerret',
        'author' => 'Woozle Staddon',
        'url' => 'http://htyp.org/SpamFerret',
        'version' => '2007-10-11',
        'description' => 'database-driven wikispam content blocker',
    );

    /**
     * Filter callback installed in $wgFilterCallback.
     *
     * Creates a single SpamFerret object on first use (configured from
     * $wgSpamFerretSettings) and delegates every call to SpamFerret::filter().
     *
     * @param object $title   title object of the page being edited (passed by reference)
     * @param string $text    the submitted text
     * @param mixed  $section section being edited; passed through unchanged
     * @return bool true if the edit was rejected, false if it may be saved
     */
    function wfSpamFerretLoader( &$title, $text, $section ) {
        static $spamObj = false;
        global $wgSpamFerretSettings, $wgPreSpamFilterCallback;

        if ( $spamObj === false ) {
            $spamObj = new SpamFerret( is_array( $wgSpamFerretSettings ) ? $wgSpamFerretSettings : array() );
            // Chain to the filter that was installed before us. Without this the saved
            // callback was never used and the earlier filter was silently disabled.
            // A previousFilter supplied through the settings array takes precedence.
            if ( !$spamObj->previousFilter ) {
                $spamObj->previousFilter = $wgPreSpamFilterCallback;
            }
        }

        return $spamObj->filter( $title, $text, $section );
    }

    /**
     * Database-driven spam filter.
     *
     * An edit is rejected when (a) the client IP is currently throttled, (b) the edit
     * looks like "ampersandbot" truncation, or (c) the text matches an active row of
     * the `patterns` table. Rejected attempts are logged through RecordAttempt().
     */
    class SpamFerret {
        // settings; any of these may be supplied through the constructor's $settings array
        var $dbspec;            // passed as-is to the clsDatabase constructor
        var $throttle_retries;  // a client whose Retries value exceeds this is a throttle candidate
        var $throttle_timeout;  // candidate is blocked while its ThrottleTime value is below this
        var $previousFilter = false;    // callable run before our own checks; false = none
        // internal data
        var $dbSpam;            // clsDatabase connection opened by filter()
        var $objDataClients;    // result of the ClientThrottle lookup for the current IP
        var $strIPAddr;         // client address as returned by wfGetIP()
        var $idPattern;         // ID of the pattern that matched; NULL when no pattern is involved
        var $idClient;          // ID of the client's row; 0 when the IP has no row yet
        var $isClientKnown;     // true if the ClientThrottle lookup returned a row
        var $doClearThrottle;   // true if the throttle has expired and Retries should be reset

        /**
         * @param array $settings property-name => value pairs copied onto the object
         */
        function __construct( $settings = array() ) {
            if ( is_array( $settings ) ) {
                foreach ( $settings as $name => $value ) {
                    $this->$name = $value;
                }
            }
        }

        /**
         * Legacy PHP4-style constructor name, kept so existing callers keep working.
         * PHP 8 no longer treats a class-named method as the constructor, which is why
         * the real work lives in __construct().
         */
        function SpamFerret( $settings = array() ) {
            $this->__construct( $settings );
        }

        /**
         * Decide whether an edit must be rejected as spam.
         *
         * @param object $title   title object of the page being edited (passed by reference)
         * @param string $text    the submitted text
         * @param mixed  $section passed through to the previous filter; not otherwise used
         * @return bool true if the edit was rejected (a spam page has been output),
         *              false if the edit may be saved
         */
        function filter( &$title, $text, $section ) {
            global $debug;

            $fname = 'wfSpamFerretFilter';
            wfProfileIn( $fname );

            # Call the rest of the hook chain first
            if ( $this->previousFilter ) {
                $f = $this->previousFilter;
                if ( $f( $title, $text, $section ) ) {
                    wfProfileOut( $fname );
                    return true;
                }
            }

            // The loader keeps this object in a static, so clear per-call state first.
            $this->isClientKnown = false;
            $this->doClearThrottle = false;
            $this->idClient = 0;
            $this->idPattern = NULL;

            // get the IP address of the http client making the edit attempt:
            $this->strIPAddr = wfGetIP();
            // Open the database
            $this->dbSpam = new clsDatabase($this->dbspec);

            // One result variable throughout. The original mixed $retval and $retVal, so a
            // throttled client was still allowed to save and an ampersandbot hit did not
            // suppress the pattern scan.
            $retVal = $this->isThrottled();

            if ( !$retVal ) {
                $retVal = $this->isAmpersandBot( $title, $text );
            }

            if ( $debug ) {
                EditPage::spamPage( 'DEBUG: '.$debug );
                $retVal = true;
            }

            if ( !$retVal ) {
                $strMatch = $this->findSpamMatch( $text );
                if ( $strMatch != '' ) {
                    // spam cue found; display the matching text and don't allow the edit to be saved:
                    wfDebug( "Match!\n" );
                    // The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
                    EditPage::spamPage( $strMatch );
                    // Log the spam attempt (updates or creates the client record):
                    $this->RecordAttempt( NULL );
                    $retVal = true;
                }
                // otherwise: no spam cues found; allow the edit to be saved
            }

            wfProfileOut( $fname );
            return $retVal;
        }

        /**
         * Look the client IP up in ClientThrottle and impose the throttle if it applies.
         *
         * Sets isClientKnown, idClient and doClearThrottle as side effects.
         *
         * @return bool true if the client is throttled (attempt logged, spam page shown)
         */
        private function isThrottled() {
            // open clients table (extended Throttle version) for reference:
            $objTblClients = new clsDataTable($this->dbSpam,'ClientThrottle');
            // Escape the address the same way RecordAttempt() escapes its other values.
            $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
            $this->objDataClients = $objTblClients->GetData('Address='.$sqlAddr);
            if ( is_object($this->objDataClients) && $this->objDataClients->RowCount() > 0 ) {
                $this->isClientKnown = true;
            }
            if ( !$this->isClientKnown ) {
                return false;
            }

            $this->idClient = $this->objDataClients->GetValue('ID');
            if ( $this->objDataClients->GetValue('Retries') > $this->throttle_retries ) {
                // retry limit exceeded; check timeout limit
                if ( $this->objDataClients->GetValue('ThrottleTime') < $this->throttle_timeout ) {
                    // client has exceeded spam limit; impose throttle
                    $this->RecordAttempt('THR');    // record post attempt by throttled client
                    EditPage::spamPage('Too many spam attempts from this IP address. Please come back later.');
                    return true;
                }
                // throttle has expired; the next recorded attempt resets Retries
                $this->doClearThrottle = true;
            }
            return false;
        }

        /**
         * Detect "ampersandbot" edits: the new text is the current text cut off at
         * (roughly) its first ampersand.
         *
         * @param object $title title object of the page being edited
         * @param string $text  the submitted text
         * @return bool true if the edit was identified as ampersandbot activity
         *              (attempt logged, spam page shown)
         */
        private function isAmpersandBot( $title, $text ) {
            $objArticleCurr = new Article($title);
            $objArticleCurr->loadLastEdit();
            $txtCurr = $objArticleCurr->getContent();
            $lenNew = strlen($text);

            // An empty submission is not a truncation "at the ampersand"; it also avoids
            // calling strpos() with an empty needle.
            if ( $lenNew == 0 ) {
                return false;
            }
            // The new text must be a strict prefix of the current text.
            if ( strpos($txtCurr, $text) !== 0 || $lenNew >= strlen($txtCurr) ) {
                return false;
            }
            // ideally, we would just check to see if the missing character is an ampersand -
            // ...but unfortunately, something is quasi-randomly mutating the strings in a way which
            // leaves the exact position of the "missing character" in some doubt. So what we do is this:
            // 1. Find the position of the first ampersand in OLD TEXT:
            $posAmp = strpos($txtCurr,'&');
            if ( $posAmp === false ) {
                // No ampersand in the old text. Previously false was treated as position 0,
                // so any truncation to fewer than 3 characters was misreported.
                return false;
            }
            // 2. Compare this position with the length of NEW TEXT:
            $posDiff = abs($lenNew - $posAmp);
            // 3. If the difference is less than some limit, then presume Ampersandbot activity:
            if ( $posDiff >= 3 ) {
                return false;
            }

            // TO DO: log $posDiff for later analysis
            // AMPERSANDBOT DETECTED; refuse to save the edit
            $this->RecordAttempt('AMP');    // record spam attempt (AMP = ampersandbot)
            // The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
            EditPage::spamPage( 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.' );
            // LATER: it would be nice to have a special MediaWiki:Ampersandbot page to display when this happens
            return true;
        }

        /**
         * Scan the active rows of the `patterns` table for the first one matching $text.
         *
         * On a match, idPattern holds the matching row's ID; otherwise it is reset to NULL.
         *
         * @param string $text the submitted text
         * @return string the matched text (taken from the lowercased copy of $text),
         *                or '' if nothing matched
         */
        private function findSpamMatch( $text ) {
            $objTblPatterns = new clsDataTable($this->dbSpam,'patterns');
            $objDataPatterns = $objTblPatterns->GetData('isActive');
            if ( !is_object($objDataPatterns) ) {
                return '';
            }

            $strTextCk = strtolower($text);
            while ( is_array($objDataPatterns->Row) ) {
                $strPattern = $objDataPatterns->GetValue('Pattern');
                $isRegex = $objDataPatterns->GetValue('isRegex');
                $this->idPattern = $objDataPatterns->GetValue('ID');

                $strMatch = '';
                // An empty pattern is skipped: it would match every edit.
                if ( $strPattern != '' ) {
                    if ( $isRegex ) {
                        $strMatch = $this->matchRegex( $strPattern, $strTextCk );
                    } else {
                        $strMatch = stristr($strTextCk, $strPattern);
                    }
                }

                if ( is_string($strMatch) && $strMatch != '' ) {
                    return $strMatch;   // stop the search
                }
                $objDataPatterns->NextRow();    // keep looking
            }

            $this->idPattern = NULL;
            return '';
        }

        /**
         * Case-insensitive regex match of one stored pattern against the text.
         *
         * Uses eregi() where it exists, so behavior on old PHP is unchanged. eregi() was
         * removed in PHP 7; there the pattern is run through preg_match() with the "i"
         * modifier. NOTE(review): stored patterns are presumably POSIX ERE; PCRE may
         * return a different matched substring for alternations - confirm against the
         * patterns table. A pattern that already contains "\~" will not survive the
         * delimiter escaping.
         *
         * @param string $strPattern regex body without delimiters
         * @param string $strText    text to search
         * @return string the matched substring, or '' if there is no match
         */
        private function matchRegex( $strPattern, $strText ) {
            $matches = array();
            if ( function_exists( 'eregi' ) ) {
                $isMatch = eregi( $strPattern, $strText, $matches );
            } else {
                // "~" as delimiter so that "/" in URL patterns needs no escaping
                $strPcre = '~' . str_replace( '~', '\~', $strPattern ) . '~i';
                $isMatch = ( preg_match( $strPcre, $strText, $matches ) === 1 );
            }
            return ( $isMatch && isset( $matches[0] ) ) ? $matches[0] : '';
        }

        /**
         * Log a rejected edit attempt.
         *
         * Updates the client's row in `clients` (or inserts one if idClient is 0), adds a
         * row to `attempts`, and bumps the counter of the matching pattern if there is one.
         *
         * @param string|null $iCode reason code ('THR' = throttled client, 'AMP' =
         *        ampersandbot, NULL = pattern match). It is accepted for the caller's
         *        benefit but not stored: none of the columns written here holds it.
         */
        public function RecordAttempt($iCode) {
            global $wgTitle, $wgServer;

            $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';

            if ($this->idClient != 0) {
                $strRetries = $this->doClearThrottle ? '0' : 'Retries+1';
                $sql = 'UPDATE clients SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address='.$sqlAddr;
                $this->dbSpam->Exec($sql);
            } else {
                $sql = 'INSERT INTO clients (Address,WhenFirst,Count,Retries) VALUES('.$sqlAddr.',NOW(),1,0)';
                $this->dbSpam->Exec($sql);
                $this->idClient = $this->dbSpam->NewID();
            }

            // Throttle and ampersandbot rejections have no pattern. Emit SQL NULL rather
            // than an empty string, which produced a syntax error ("VALUES (NOW(),,...").
            // NOTE(review): assumes attempts.ID_Pattern is nullable - confirm against schema.
            $hasPattern = !is_null($this->idPattern) && $this->idPattern !== '';
            $sqlPattern = $hasPattern ? (string)(int)$this->idPattern : 'NULL';
            $sqlClient = (string)(int)$this->idClient;

            $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
            $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
            $sql = 'INSERT INTO attempts (`When`,ID_Pattern,ID_Client,PageServer,PageName) VALUES (NOW(),'.$sqlPattern.','.$sqlClient.','.$sqlSrvr.','.$sqlPage.')';
            $this->dbSpam->Exec($sql);

            if ($hasPattern) {
                $sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$sqlPattern;
                $this->dbSpam->Exec($sql);
            }
        }
    }

} // end of 'MEDIAWIKI' check
?>
</php>