Difference between revisions of "SpamFerret.php"

from HTYP, the free directory anyone can edit if they can prove to me that they're not a spambot
Jump to navigation Jump to search
(→‎Code: 2/25/09 version - includes fix for MW 1.14)
(2009-03-26 version)
Line 3: Line 3:
  
 
==Code==
 
==Code==
<php><?php/*
+
<php><?php
 +
/*
 
  HISTORY:
 
  HISTORY:
 
   2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
 
   2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
Line 20: Line 21:
 
   2008-10-21 (Wzl) Fixed minor syntax error in "defines"
 
   2008-10-21 (Wzl) Fixed minor syntax error in "defines"
 
   2009-02-25 (Wzl) $objArticleCurr->loadLastEdit() now causes error in MW 1.14 (was it necessary before?)
 
   2009-02-25 (Wzl) $objArticleCurr->loadLastEdit() now causes error in MW 1.14 (was it necessary before?)
 +
  2009-03-10 (Wzl) "require" -> "require_once" so other extensions can use data.php without conflict
 +
    also optional $kfpWzlLibs so data.php can be somewhere not on the path
 +
  2009-03-18 (Wzl) Got rid of shared.php requirement; now using kfpLib to locate data.php
 +
  2009-03-26 (Wzl) Rewrote data library calls to use new classes (no longer using deprecated/removed classes)
 +
    Also modified to use newer function hooks
 
  TO DO:
 
  TO DO:
 
   * Log matching text for regex filters
 
   * Log matching text for regex filters
Line 25: Line 31:
 
     Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
 
     Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
 
   * Figure out how to display a different error message than "the following text is what triggered our filter:"
 
   * Figure out how to display a different error message than "the following text is what triggered our filter:"
 +
OPTIONAL SETTINGS:
 +
  kfpLib - path to data.php folder (no final slash)
 +
  kfsLib_Data - filespec of data.php
 
*/
 
*/
 
// debugging activation
 
define('KDO_DEBUG',0);
 
define('KDO_DEBUG_STACK',0);
 
// debugging options
 
define('KDO_DEBUG_HTML',0);
 
define('KDO_DEBUG_IMMED',1);
 
define('KDO_DEBUG_DARK',0);
 
  
 
# Loader for spam blacklist feature
 
# Loader for spam blacklist feature
Line 39: Line 40:
  
 
if ( defined( 'MEDIAWIKI' ) ) {
 
if ( defined( 'MEDIAWIKI' ) ) {
//require('shared.php');
 
require('data.php');
 
  
 +
$wgExtensionCredits['other'][] = array(
 +
'name' => 'SpamFerret',
 +
'author' => 'Woozle Staddon',
 +
'url' => 'http://htyp.org/SpamFerret',
 +
'version' => '2009-03-26',
 +
'description' => 'database-driven wikispam blocker',
 +
);
 +
 +
if (!defined('kfsLib_Data')) {
 +
    if (defined('kfpLib')) {
 +
define('kfsLib_Data', kfpLib.'/data.php');
 +
    } else {
 +
define('kfsLib_Data','data.php'); // assume it's on the path
 +
    }
 +
}
 +
require_once kfsLib_Data;
 +
 +
/* ==============
 +
SET UP CALLBACKS
 +
*/
 
global $wgFilterCallback, $wgPreSpamFilterCallback;
 
global $wgFilterCallback, $wgPreSpamFilterCallback;
  
if ( $wgFilterCallback ) {
+
$wgPreSpamFilterCallback = false;
$wgPreSpamFilterCallback = $wgFilterCallback;
+
if ( defined( 'MW_SUPPORTS_EDITFILTERMERGED' ) ) {
 +
    $wgHooks['EditFilterMerged'][] = 'wfSpamFerretMerged';
 
} else {
 
} else {
$wgPreSpamFilterCallback = false;
+
    if ( $wgFilterCallback ) {
 +
        $wgPreSpamFilterCallback = $wgFilterCallback;
 +
    }
 +
    $wgFilterCallback = 'wfSpamFerretFilter';
 
}
 
}
  
$wgFilterCallback = 'wfSpamFerretLoader';
+
/*
$wgExtensionCredits['other'][] = array(
+
$wgHooks['EditFilter'][] = 'wfSpamFerretValidate';
'name' => 'SpamFerret',
+
$wgHooks['ArticleSaveComplete'][] = 'wfSpamFerretArticleSave';
'author' => 'Woozle Staddon',
+
$wgHooks['APIEditBeforeSave'][] = 'wfSpamFerretAPIEditBeforeSave';
'url' => 'http://htyp.org/SpamFerret',  
+
*/
'version' => '2009-02-25',
+
 
'description' => 'database-driven wikispam content blocker',
+
/* ================
);
+
SET GLOBAL OBJECTS
 +
*/
 +
function GetSpamFerret() {
 +
    static $objFerret;
 +
 
 +
    if (!isset($objFerret)) {
 +
$objFerret = new SpamFerret();
 +
    }
 +
    return $objFerret;
 +
}
 +
/* ================
 +
CALLBACK FUNCTIONS
 +
*/
 +
/**
 +
* Hook function for $wgFilterCallback
 +
*/
 +
function wfSpamFerretFilter( &$title, $text, $section, &$hookErr, $editSummary ) {
 +
    global $wgOut;
 +
 
 +
    $spamObj = GetSpamFerret();
 +
    $wgOut->addWikiText( "Intercepted by SpamFerretFilter" );
 +
    return $spamObj->filter( $title, $text, $section );
 +
}
 +
 
 +
/**
 +
* Hook function for EditFilterMerged, replaces wfSpamBlacklistFilter
 +
*/
 +
function wfSpamFerretMerged( &$editPage, $text, &$hookErr, $editSummary ) {
 +
    global $wgTitle,$wgOut;
 +
 
 +
    if( is_null( $wgTitle ) ) {
 +
        # API mode
 +
        # wfSpamBlacklistFilterAPIEditBeforeSave already checked the blacklist
 +
        return true;
 +
    }
  
function wfSpamFerretLoader( &$title, $text, $section ) {
+
    $spamObj = GetSpamFerret();
static $spamObj = false;
+
    $title = $editPage->mArticle->getTitle();
global $wgSpamFerretSettings, $wgPreSpamFilterCallback;
+
    $ret = $spamObj->filter( $title, $text, '', $editSummary, $editPage );
 +
    if ( $ret !== false ) $editPage->spamPage( $ret );
  
if ( $spamObj === false ) {
+
// additional text can be added here:
$spamObj = new SpamFerret( $wgSpamFerretSettings );
+
//    $wgOut->addWikiText( "Intercepted by SpamFerretMerged" );
}
 
  
return $spamObj->filter( $title, $text, $section );
+
    // Return convention for hooks is the inverse of $wgFilterCallback
 +
    return ( $ret === false );
 
}
 
}
 +
/**
 +
* Hook function for APIEditBeforeSave
 +
*/
 +
function wfSpamFerretAPIEditBeforeSave( &$editPage, $text, &$resultArr ) {
 +
}
 +
 
class SpamFerret {
 
class SpamFerret {
var $dbspec;
 
var $throttle_retries;
 
var $throttle_timeout;
 
 
var $previousFilter = false;
 
var $previousFilter = false;
 
// internal data
 
// internal data
Line 82: Line 143:
 
var $doClearThrottle;
 
var $doClearThrottle;
 
 
function SpamFerret( $settings = array() ) {
+
// function SpamFerret() {
global $IP;
+
// }
 +
 
 +
function Setting($iName) {
 +
    global $wgSpamFerretSettings;
  
foreach ( $settings as $name => $value ) {
+
    return $wgSpamFerretSettings[$iName];
$this->$name = $value;
 
}
 
 
}
 
}
  
function filter( &$title, $text, $section ) {
+
function filter( &$title, $text, $section, $editSummary, $editPage ) {
global $wgSpamFerretSettings;
 
 
global $wgArticle, $wgDBname, $wgMemc, $messageMemc, $wgVersion, $wgOut;
 
global $wgArticle, $wgDBname, $wgMemc, $messageMemc, $wgVersion, $wgOut;
 
global $wgTitle, $wgServer;
 
global $wgTitle, $wgServer;
Line 97: Line 158:
 
global $debug;
 
global $debug;
 
global $errNum, $errStr;
 
global $errNum, $errStr;
 +
// debugging:
 +
global $sql;
  
 
$fname = 'wfSpamFerretFilter';
 
$fname = 'wfSpamFerretFilter';
Line 109: Line 172:
 
}
 
}
 
}
 
}
$retVal = false; // default = assume edit is ok
+
// initialize variables
 +
$retVal = FALSE; // default = assume edit is ok
 +
$msgEmail = FALSE;
  
 
// get the IP address of the http client making the edit attempt:
 
// get the IP address of the http client making the edit attempt:
 
$this->strIPAddr = wfGetIP();
 
$this->strIPAddr = wfGetIP();
 
// Open the database
 
// Open the database
$this->dbSpam = new clsDatabase($this->dbspec);
+
$this->dbSpam = new clsDatabase($this->Setting('dbspec'));
 +
$this->dbSpam->Open();
 
// open clients table (extended Throttle version) for reference:
 
// open clients table (extended Throttle version) for reference:
 
//return TRUE;
 
//return TRUE;
$objTblClients = new clsDataTable($this->dbSpam,'ClientThrottle');
+
$objTblClients = new clsTable($this->dbSpam,'ClientThrottle','ID');
 
// Look up to see if this IP is known; it may already be throttled:
 
// Look up to see if this IP is known; it may already be throttled:
 
$this->objDataClients = $objTblClients->GetData('Address="'.$this->strIPAddr.'"');
 
$this->objDataClients = $objTblClients->GetData('Address="'.$this->strIPAddr.'"');
 
if (is_object($this->objDataClients)) {
 
if (is_object($this->objDataClients)) {
if ($this->objDataClients->RowCount() > 0) {
+
if ($this->objDataClients->hasRows()) {
 
$isClientKnown = true;
 
$isClientKnown = true;
 
}
 
}
 
}
 
}
 
if ($isClientKnown) {
 
if ($isClientKnown) {
$this->idClient = $this->objDataClients->GetValue('ID');
+
$this->idClient = $this->objDataClients->ID;
$doBlock = $this->objDataClients->GetValue('doBlock');
+
$doBlock = $this->objDataClients->doBlock;
 
if ($doBlock) {
 
if ($doBlock) {
 
$strThrType = 'BLK';
 
$strThrType = 'BLK';
 
} else {
 
} else {
$intRetries = $this->objDataClients->GetValue('Retries');
+
$intRetries = $this->objDataClients->Retries;
$intThrottle = $this->throttle_retries;
+
$intThrottle = $this->Setting('throttle_retries');
 
$doBlock = $intRetries > $intThrottle;
 
$doBlock = $intRetries > $intThrottle;
 
$strThrType = 'THR-'.$intRetries;
 
$strThrType = 'THR-'.$intRetries;
Line 138: Line 204:
 
if ($doBlock) {
 
if ($doBlock) {
 
// retry limit exceeded; check timeout limit
 
// retry limit exceeded; check timeout limit
if ($this->objDataClients->GetValue('ThrottleTime') < $this->throttle_timeout) {
+
if ($this->objDataClients->ThrottleTime < $this->Setting('throttle_timeout')) {
 
EditPage::spamPage('Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.');
 
EditPage::spamPage('Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.');
 
$this->RecordAttempt($strThrType); // record post attempt by throttled client
 
$this->RecordAttempt($strThrType); // record post attempt by throttled client
Line 207: Line 273:
 
// AMPERSANDBOT DETECTED; refuse to save the edit
 
// AMPERSANDBOT DETECTED; refuse to save the edit
 
$this->RecordAttempt('AMP'); // record spam attempt (AMP = ampersandbot)
 
$this->RecordAttempt('AMP'); // record spam attempt (AMP = ampersandbot)
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
+
// The string returned will be shown after "The following text is what triggered our spam filter:"
EditPage::spamPage( 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.' );
+
$retVal = 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.';
// LATER: it would be nice to have a special MediaWiki:Ampersandbot page to display when this happens
+
// LATER: it would be nice to display the error in a different format from the usual spam page
$retVal = true;
 
 
}
 
}
 
}
 
}
Line 221: Line 286:
  
 
if (!$retVal) {
 
if (!$retVal) {
set_error_handler ('ErrorHandler',E_WARNING);
+
// set_error_handler ('ErrorHandler',E_WARNING);
$objTblPatterns = new clsDataTable($this->dbSpam,'patterns');
+
$objTblPatterns = new clsTable($this->dbSpam,'patterns','ID');
 
$objDataPatterns = $objTblPatterns->GetData('isActive');
 
$objDataPatterns = $objTblPatterns->GetData('isActive');
/*
+
 
$debug .= ' objDataPatterns is object:'.is_object($objDataPatterns);
 
$debug .= ' objDataPatterns.Res is object:'.is_object($objDataPatterns->Res);
 
$debug .= ' objDataPatterns.Row is array:'.is_array($objDataPatterns->Row);
 
$debug .= ' objDataPatterns.Res is class '.get_class($objDataPatterns->Res);
 
$debug .= ' objDataPatterns.Res has '.$objDataPatterns->Res->num_rows.' row(s)';
 
*/
 
 
$strTextCk = strtolower($text);
 
$strTextCk = strtolower($text);
//$cr = "\n";
+
$isMatch = FALSE;
//echo 'DEBUGGING spam filter - please excuse the mess!'.$cr;
+
while($objDataPatterns->NextRow() && !$isMatch) {
while(is_array($objDataPatterns->Row)) {
+
$strPattern = $objDataPatterns->Pattern;
$strPattern = $objDataPatterns->GetValue('Pattern');
+
$isRegex = $objDataPatterns->isRegex;
$isRegex = $objDataPatterns->GetValue('isRegex');
+
$this->idPattern = $objDataPatterns->ID;
$this->idPattern = $objDataPatterns->GetValue('ID');
 
//echo '<br>Pattern '.$this->idPattern;
 
 
if ($isRegex) {
 
if ($isRegex) {
//echo 'PATTERN: '.$strPattern.$cr;
 
//echo ' regex';
 
 
$strPattCk = $strPattern;
 
$strPattCk = $strPattern;
 
// Attempt at using Perl-compatible regex, but it doesn't seem to work (or maybe just too many patterns have issues):
 
// Attempt at using Perl-compatible regex, but it doesn't seem to work (or maybe just too many patterns have issues):
Line 264: Line 319:
 
}
 
}
 
// $debug .= 'ROW: '.DumpArray($objDataPatterns->Row);
 
// $debug .= 'ROW: '.DumpArray($objDataPatterns->Row);
 
if ($isMatch) {
 
$objDataPatterns->Row = NULL; // stop the search
 
} else {
 
$objDataPatterns->NextRow(); // keep looking
 
}
 
 
}
 
}
 
 
Line 277: Line 326:
 
 
 
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
 
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
EditPage::spamPage( '(pattern #'.$this->idPattern.') '.$strMatch );
+
$retVal = '(pattern #'.$this->idPattern.') '.$strMatch;
 
// Log the spam attempt:
 
// Log the spam attempt:
 
// $sql = 'SELECT * FROM clients WHERE Address="'.$this->strIPAddr.'"';
 
// $sql = 'SELECT * FROM clients WHERE Address="'.$this->strIPAddr.'"';
Line 283: Line 332:
 
// update or create client record:
 
// update or create client record:
 
$this->RecordAttempt('-');
 
$this->RecordAttempt('-');
$retVal = true;
 
 
} else {
 
} else {
 
// no spam cues found; allow the edit to be saved, if nothing else has tripped the filter
 
// no spam cues found; allow the edit to be saved, if nothing else has tripped the filter
if ($wgSpamFerretSettings['log_ok_edits']) {
+
if ($this->Setting('log_ok_edits')) {
 
$this->RecordOkEdit();
 
$this->RecordOkEdit();
 
}
 
}
Line 295: Line 343:
 
/**/
 
/**/
 
}
 
}
 +
$objDataPatterns->NextRow();
 
}
 
}
  

Revision as of 00:28, 5 April 2009

Navigation

{{#lst:SpamFerret|navbar}}: SpamFerret.php

Code

<php><?php /*

HISTORY:
 2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
 2007-09-30 (Wzl) fixing regex processing
 2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist)
 2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL
 2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong.
 2007-10-28 (Wzl) Events with THR and AMP codes *still* weren't being recorded because method call was improperly formatted.
 2007-12-23 (Wzl) Emails wikimaster if eregi() returns an error (due to improperly formatted regex)
 2007-12-26 (Wzl) Spam turd rejection / logging
 2007-12-27 (Wzl) Fixed spam turd detection to work for new pages too (will probably need refinement)
 2007-12-27 (Wzl) Ooops. Replaced missing if-block from regex results when inserted text is found.
 2008-08-29 (Wzl) Added permanent IP blocking
 2008-09-04 (Wzl) Added (optional) logging of successful edits
 2008-09-19 (Wzl) Actually *set* the "didEdit" flag for successful edits <facepalms>
 2008-10-21 (Wzl) Fixed minor syntax error in "defines"
 2009-02-25 (Wzl) $objArticleCurr->loadLastEdit() now causes error in MW 1.14 (was it necessary before?)
 2009-03-10 (Wzl) "require" -> "require_once" so other extensions can use data.php without conflict
   also optional $kfpWzlLibs so data.php can be somewhere not on the path
 2009-03-18 (Wzl) Got rid of shared.php requirement; now using kfpLib to locate data.php
 2009-03-26 (Wzl) Rewrote data library calls to use new classes (no longer using deprecated/removed classes)
   Also modified to use newer function hooks
TO DO:
 * Log matching text for regex filters
 * Throttled save attempts should check for spam, just for data-gathering purposes.
   Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
 * Figure out how to display a different error message than "the following text is what triggered our filter:"
OPTIONAL SETTINGS:
 kfpLib - path to data.php folder (no final slash)
 kfsLib_Data - filespec of data.php
*/
# Loader for spam blacklist feature
# Include this from LocalSettings.php

if ( defined( 'MEDIAWIKI' ) ) {

// Describe this extension in the 'other' group of the extension credits.
$wgExtensionCredits['other'][] = array(
    'name'        => 'SpamFerret',
    'author'      => 'Woozle Staddon',
    'url'         => 'http://htyp.org/SpamFerret',
    'version'     => '2009-03-26',
    'description' => 'database-driven wikispam blocker',
);

// Work out where data.php lives unless the site has already said so:
//   kfsLib_Data (full filespec) wins; otherwise kfpLib (folder, no final slash)
//   is used; otherwise data.php is assumed to be on the include path.
if (!defined('kfsLib_Data')) {
    define('kfsLib_Data', defined('kfpLib') ? kfpLib.'/data.php' : 'data.php');
}
require_once kfsLib_Data;

/* ==============

SET UP CALLBACKS
*/

global $wgFilterCallback, $wgPreSpamFilterCallback;

// No earlier filter callback is remembered unless the legacy branch below finds one.
$wgPreSpamFilterCallback = false;

if ( !defined( 'MW_SUPPORTS_EDITFILTERMERGED' ) ) {
    // Legacy path: take over $wgFilterCallback, keeping a note of whatever
    // callback was installed before us.
    if ( $wgFilterCallback ) {
        $wgPreSpamFilterCallback = $wgFilterCallback;
    }
    $wgFilterCallback = 'wfSpamFerretFilter';
} else {
    // Hook path (presumably the constant is defined by MediaWiki versions
    // that provide the EditFilterMerged hook -- TODO confirm).
    $wgHooks['EditFilterMerged'][] = 'wfSpamFerretMerged';
}

/* $wgHooks['EditFilter'][] = 'wfSpamFerretValidate'; $wgHooks['ArticleSaveComplete'][] = 'wfSpamFerretArticleSave'; $wgHooks['APIEditBeforeSave'][] = 'wfSpamFerretAPIEditBeforeSave';

*/

/* ================

SET GLOBAL OBJECTS
*/

/**
 * Return the SpamFerret object shared by the callbacks in this file,
 * creating it the first time it is asked for.
 *
 * @return SpamFerret the same instance on every call within a request
 */
function GetSpamFerret() {
    static $objShared = NULL;

    if ($objShared === NULL) {
        $objShared = new SpamFerret();
    }
    return $objShared;
} /* ================

CALLBACK FUNCTIONS
*/

/**
 * Hook function for $wgFilterCallback
 *
 * Hands the edit to the shared SpamFerret object. Under the $wgFilterCallback
 * convention a true-ish return value means "reject the edit".
 *
 * @param $title       title of the page being edited (passed through by reference)
 * @param $text        new page text
 * @param $section     section being edited
 * @param $hookErr     not used here
 * @param $editSummary edit summary, forwarded to SpamFerret::filter()
 * @return mixed whatever SpamFerret::filter() returns (FALSE = edit is ok)
 */
function wfSpamFerretFilter( &$title, $text, $section, &$hookErr, $editSummary ) {
    global $wgOut;

    $spamObj = GetSpamFerret();
    // NOTE(review): this line is added to the output on every edit that takes
    // this path; the equivalent line in wfSpamFerretMerged is commented out,
    // so it may be leftover debugging -- confirm before removing.
    $wgOut->addWikiText( "Intercepted by SpamFerretFilter" );
    // SpamFerret::filter() declares five parameters, so pass all of them;
    // there is no EditPage object available on this code path.
    return $spamObj->filter( $title, $text, $section, $editSummary, NULL );
}

/**
 * Hook function for EditFilterMerged, replaces wfSpamBlacklistFilter
 *
 * @param $editPage    edit page object; its article supplies the title and its
 *                     spamPage() method displays the rejection message
 * @param $text        merged page text to be checked
 * @param $hookErr     not used here
 * @param $editSummary edit summary, forwarded to SpamFerret::filter()
 * @return bool TRUE to let the edit continue, FALSE when it was rejected
 */
function wfSpamFerretMerged( &$editPage, $text, &$hookErr, $editSummary ) {
    global $wgTitle;

    if( is_null( $wgTitle ) ) {
        # API mode
        # NOTE(review): the original comment says the APIEditBeforeSave handler
        # has already done the check, but in this file that handler is an empty
        # stub whose registration is commented out -- API edits look unfiltered.
        return true;
    }
    $spamObj = GetSpamFerret();
    $title = $editPage->mArticle->getTitle();
    // This hook supplies no section, so an empty string is passed for it.
    $ret = $spamObj->filter( $title, $text, '', $editSummary, $editPage );
    // A string result is the message to display. A TRUE result means filter()
    // has already called EditPage::spamPage() itself, so don't show a second page.
    if ( is_string( $ret ) ) {
        $editPage->spamPage( $ret );
    }

    // additional text can be added here:
    //    $wgOut->addWikiText( "Intercepted by SpamFerretMerged" );

    // Return convention for hooks is the inverse of $wgFilterCallback
    return ( $ret === false );
} /**

* Hook function for APIEditBeforeSave
*/

function wfSpamFerretAPIEditBeforeSave( &$editPage, $text, &$resultArr ) {
    // Placeholder: no filtering is done here yet, and the line that would
    // register this handler for APIEditBeforeSave is commented out above.
    // Hook handlers are expected to return a value; TRUE lets processing
    // continue, so the stub is safe to register as-is.
    return true;
}

/**
 * Database-driven spam filter.
 *
 * filter() decides whether an edit should be rejected; RecordAttempt() and
 * RecordOkEdit() log the outcome to the spam database. Configuration is read
 * from the global $wgSpamFerretSettings array via Setting(); the keys used in
 * this class are 'dbspec', 'throttle_retries', 'throttle_timeout' and
 * 'log_ok_edits'.
 */
class SpamFerret {
    // Earlier filter callback to run before this one (FALSE = none).
    // NOTE(review): nothing visible in this file ever assigns this, so the
    // callback saved in $wgPreSpamFilterCallback is currently never chained.
    var $previousFilter = false;
    // internal data
    var $dbSpam;            // spam database connection, opened by filter()
    var $objDataClients;    // ClientThrottle data for the current IP address
    var $strIPAddr;         // IP address of the client attempting the edit
    var $idPattern;         // ID of the pattern most recently tested
    var $idClient;          // ID of the current client's record, once known
    var $doClearThrottle;   // TRUE when the throttle timeout has passed

    /**
     * Fetch one entry from the global $wgSpamFerretSettings array.
     *
     * @param $iName name of the setting
     * @return mixed the setting's value, or NULL if it has not been configured
     */
    function Setting($iName) {
        global $wgSpamFerretSettings;

        return isset($wgSpamFerretSettings[$iName]) ? $wgSpamFerretSettings[$iName] : NULL;
    }

    /**
     * Check an edit for spam.
     *
     * @param $title       title of the page being edited
     * @param $text        proposed new page text
     * @param $section     section being edited (only passed to the previous filter)
     * @param $editSummary not used by the checks below
     * @param $editPage    not used by the checks below
     * @return mixed FALSE if the edit is ok;
     *   TRUE if it is rejected and the spam page has already been displayed
     *   (previous filter, throttle/block, spam turd, or $debug set);
     *   a string if it is rejected and the caller should display that text
     *   (ampersandbot or pattern match).
     */
    function filter( &$title, $text, $section, $editSummary = '', $editPage = NULL ) {
        global $wgEmergencyContact;
        global $debug;
        global $errNum, $errStr;

        $fname = 'wfSpamFerretFilter';
        wfProfileIn( $fname );

        # Call the rest of the hook chain first
        if ( $this->previousFilter ) {
            $f = $this->previousFilter;
            if ( $f( $title, $text, $section ) ) {
                wfProfileOut( $fname );
                return true;
            }
        }
        // initialize variables
        $retVal = FALSE;    // default = assume edit is ok
        $msgEmail = FALSE;
        $isClientKnown = FALSE;

        // get the IP address of the http client making the edit attempt:
        $this->strIPAddr = wfGetIP();
        // Open the database
        $this->dbSpam = new clsDatabase($this->Setting('dbspec'));
        $this->dbSpam->Open();
        // open clients table (extended Throttle version) for reference:
        $objTblClients = new clsTable($this->dbSpam,'ClientThrottle','ID');
        // Look up to see if this IP is known; it may already be throttled:
        $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
        $this->objDataClients = $objTblClients->GetData('Address='.$sqlAddr);
        if (is_object($this->objDataClients) && $this->objDataClients->hasRows()) {
            $isClientKnown = TRUE;
        }
        if ($isClientKnown) {
            // NOTE(review): the pattern loop below calls NextRow() before reading
            // fields, but no NextRow() is called here -- confirm that GetData()
            // leaves the first row loaded.
            $this->idClient = $this->objDataClients->ID;
            $doBlock = $this->objDataClients->doBlock;
            if ($doBlock) {
                $strThrType = 'BLK';
            } else {
                $intRetries = $this->objDataClients->Retries;
                $intThrottle = $this->Setting('throttle_retries');
                $doBlock = $intRetries > $intThrottle;
                $strThrType = 'THR-'.$intRetries;
            }
            if ($doBlock) {
                // retry limit exceeded; check timeout limit
                if ($this->objDataClients->ThrottleTime < $this->Setting('throttle_timeout')) {
                    EditPage::spamPage('Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.');
                    $this->RecordAttempt($strThrType);  // record post attempt by throttled client
                    $retVal = true; // client has exceeded spam limit; impose throttle
                } else {
                    $this->doClearThrottle = true;
                }
            }
        }

        if (!$retVal) {
            /*
            At this point, there's apparently no reason to block the client just for being who they are,
            so now check for common non-listable offenses. These involve comparing the new contents with
            the original, so first we get the original (current) article contents plus some information
            about what has changed:
            * $strIns = whatever has been inserted at the start of the article (or contents of new article)
            */
            $txtCurr = '';  // stays empty when the page does not exist yet
            $strIns = '';   // stays empty when the old text is not found inside the new text
            $objArticleCurr = new Article($title);
            //$objArticleCurr->loadLastEdit();  // function is now protected
            if ($objArticleCurr->exists()) {
                $txtCurr = $objArticleCurr->getContent();
                $lenIns = strpos($text,$txtCurr);
                if ($lenIns !== false) {
                    $strIns = substr($text,0,$lenIns);
                }
            } else {
                $strIns = $text;
            }
            // ** OFFENSE: Spam turds (short bits of nonsense inserted at the beginning of an article):
            if ($strIns != '') {
                // new page is old page with something inserted at the beginning
                $isMatch = preg_match('/^[a-z0-9]+ ?$/',$strIns);
                if ($isMatch) {
                    $this->RecordAttempt('TRD',$strIns);    // record spam attempt (TRD = spam turd)
                    // The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
                    EditPage::spamPage( '"'.$strIns.'" (spam turd).' );
                    $retVal = true;
                }
            }
            // ** OFFENSE: Ampersandbot:
            $lenNew = strlen($text);
            // Only meaningful when there is both old and new text to compare.
            $posMatch = ($txtCurr != '' && $lenNew > 0) ? strpos($txtCurr, $text) : FALSE;
            if ($posMatch===0) {
                // new string starts the same as old string; is it a truncated subset?
                if ($lenNew < strlen($txtCurr)) {
                    // new string is a truncation of old string
                    // ideally, we would just check to see if the missing character is an ampersand -
                    // ...but unfortunately, something is quasi-randomly mutating the strings in a way which
                    // leaves the exact position of the "missing character" in some doubt. So what we do is this:
                    //    1. Find the position of the first ampersand in OLD TEXT:
                    $posAmp = strpos($txtCurr,'&');
                    // No ampersand at all means this cannot be an ampersand truncation
                    // (FALSE must not be mistaken for position 0).
                    if ($posAmp !== FALSE) {
                        //    2. Compare this position with the length of NEW TEXT:
                        $posDiff = abs($lenNew-$posAmp);
                        //    3. If the difference is less than some limit, then presume Ampersandbot activity:
                        if ($posDiff < 3) {
                            // TO DO: log $posDiff for later analysis
                            // AMPERSANDBOT DETECTED; refuse to save the edit
                            $this->RecordAttempt('AMP');    // record spam attempt (AMP = ampersandbot)
                            // The string returned will be shown after "The following text is what triggered our spam filter:"
                            $retVal = 'The removal of everything after the first ampersand. You are an ampersandbot; please go away.';
                            // LATER: it would be nice to display the error in a different format from the usual spam page
                        }
                    }
                }
            }
        }
        if ($debug) {
            EditPage::spamPage('DEBUG: '.$debug);
            $retVal = true;
        }

        if (!$retVal) {
            // NOTE(review): with the handler below disabled nothing sets $errNum,
            // so regex errors are not reported by email at present.
            // set_error_handler ('ErrorHandler',E_WARNING);
            $objTblPatterns = new clsTable($this->dbSpam,'patterns','ID');
            $objDataPatterns = $objTblPatterns->GetData('isActive');

            $strTextCk = strtolower($text);
            $isMatch = FALSE;
            $strMatch = '';
            // Test the match flag first so the cursor is not advanced once a match is found.
            while (!$isMatch && $objDataPatterns->NextRow()) {
                $strPattern = $objDataPatterns->Pattern;
                $isRegex = $objDataPatterns->isRegex;
                $this->idPattern = $objDataPatterns->ID;
                if ($isRegex) {
                    $matches = array();
                    // Attempt at using Perl-compatible regex, but it doesn't seem to work (or maybe just too many patterns have issues):
                    //    $strPattCk_pcre = str_replace('/','\/',$strPattern);
                    //    $isMatch = preg_match('/'.$strPattCk_pcre.'/i',$strTextCk,$matches);
                    // NOTE(review): eregi() is deprecated in newer PHP versions; the stored
                    // patterns would need checking before moving to another regex engine.
                    $isMatch = eregi($strPattern, $strTextCk, $matches);
                    if ($errNum) {
                        $msgEmail .= 'Filter #'.$this->idPattern.' generated error #'.$errNum.': '.$errStr."\n";
                        $errNum = 0;
                    }
                    if ($isMatch) {
                        $strMatch = $matches[0];
                    }
                } else {
                    $strMatch = stristr ($strTextCk,$strPattern);
                    $isMatch = ($strMatch != '');
                }
            }

            if ( $strMatch != '' ) {
                // spam cue found; report the matching text and don't allow the edit to be saved:
                wfDebug( "Match!\n" );

                // The string returned will be shown after "The following text is what triggered our spam filter:"
                $retVal = '(pattern #'.$this->idPattern.') '.$strMatch;
                // Log the spam attempt (update or create client record):
                $this->RecordAttempt('-');
            } else {
                // no spam cues found; allow the edit to be saved, if nothing else has tripped the filter
                if ($this->Setting('log_ok_edits')) {
                    $this->RecordOkEdit();
                }
            }
        }

        wfProfileOut( $fname );
        if ($msgEmail) {
            mail ($wgEmergencyContact,'spamferret filter error',$msgEmail);
        }
        return $retVal;
    }

    /**
     * Log a rejected edit: update (or create) the client's record, add a row
     * to the attempts table and, when a pattern was involved, bump its counter.
     *
     * @param $iCode  short code stored in attempts.Code; the values passed in
     *                this class are 'BLK', 'THR-<n>', 'TRD', 'AMP' and '-'
     * @param $iMatch matching text to store, or NULL for none
     */
    public function RecordAttempt($iCode,$iMatch=NULL) {
        global $wgTitle, $wgServer;

        $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
        if ($this->idClient != 0) {
            // known client: reset the retry count if the throttle has timed out, else increment it
            $strRetries = $this->doClearThrottle ? '0' : 'Retries+1';
            $sql = 'UPDATE clients SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address='.$sqlAddr;
            $this->dbSpam->Exec($sql);
        } else {
            $sql = 'INSERT INTO clients (Address,WhenFirst,Count,Retries) VALUES('.$sqlAddr.',NOW(),1,0)';
            $this->dbSpam->Exec($sql);
            $this->idClient = $this->dbSpam->NewID();
        }

        $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
        $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
        $isPattern = !is_null($this->idPattern);
        $sqlPattern = $isPattern ? $this->idPattern : 'NULL';
        // the matched text comes straight from the submitted edit, so it must be escaped
        $sqlMatch = is_null($iMatch) ? 'NULL' : '"'.$this->dbSpam->SafeParam($iMatch).'"';
        $sql = 'INSERT INTO attempts (`When`,ID_Pattern,ID_Client,IDS_Session,PageServer,PageName,Code,MatchText) VALUES (NOW(),'.$sqlPattern.','.$this->idClient.','.SQL_Value(session_id()).','.$sqlSrvr.','.$sqlPage.',"'.$iCode.'",'.$sqlMatch.')';
        $this->dbSpam->Exec($sql);
        if ($isPattern) {
            $sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$this->idPattern;
            $this->dbSpam->Exec($sql);
        }
    }

    /**
     * Log an edit that passed every check (attempts row with didAllow set).
     */
    public function RecordOkEdit() {
        global $wgTitle, $wgServer;

        $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
        $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
        // idClient is unset for an address with no client record yet; SQL_Value()
        // turns that into NULL instead of leaving a hole in the VALUES list.
        $sql = 'INSERT INTO attempts (`When`,ID_Pattern,ID_Client,IDS_Session,PageServer,PageName,Code,MatchText,didAllow) VALUES (NOW(),NULL,'.SQL_Value($this->idClient).','.SQL_Value(session_id()).','.$sqlSrvr.','.$sqlPage.',NULL,NULL,TRUE)';
        $this->dbSpam->Exec($sql);
    }
}

/**
 * Error-handler callback: remembers the latest error number and message in
 * the globals $errNum and $errStr, where SpamFerret::filter() looks for them.
 * (The set_error_handler() call that would install it is commented out there.)
 *
 * @param $errno  error number
 * @param $errstr error message
 */
function ErrorHandler ($errno ,$errstr) {
    $GLOBALS['errNum'] = $errno;
    $GLOBALS['errStr'] = $errstr;
}

/**
 * Format a PHP value for direct inclusion in an SQL statement.
 *
 * @param $iVar value to format
 * @return mixed the string 'NULL' for NULL; the value itself, unquoted, if it
 *   is numeric; otherwise the value in double quotes with quotes, backslashes
 *   and NUL bytes backslash-escaped
 */
function SQL_Value($iVar) {
    if (is_null($iVar)) {
        return 'NULL';
    }
    if (is_numeric($iVar)) {
        return $iVar;
    }
    // Callers pass session_id() here, which the client can influence, so the
    // string must not be able to close its own quotes.
    // NOTE(review): addslashes() assumes the server honours backslash escapes;
    // prefer the database library's own escaping if a handle is available.
    return '"'.addslashes($iVar).'"';
}

} // end of 'MEDIAWIKI' check</php>