Difference between revisions of "SpamFerret.php"

from HTYP, the free directory anyone can edit if they can prove to me that they're not a spambot
Jump to navigation Jump to search
(revisions thru 8/9 - new table structure)
(2010-08-17 version)
Line 38: Line 38:
 
   2009-08-08 (Wzl) create client record immediately if client is not recognized; don't depend on spam filter being triggered
 
   2009-08-08 (Wzl) create client record immediately if client is not recognized; don't depend on spam filter being triggered
 
   2009-08-09 (Wzl) restructured "client" and "attempt" tables (was "clients", "attempts"); not backwards-compatible
 
   2009-08-09 (Wzl) restructured "client" and "attempt" tables (was "clients", "attempts"); not backwards-compatible
 +
  2010-02-24 (Wzl) some code-tidying; trying to restrict passing of data between methods to single array var in args/return
 +
  2010-08-17 (Wzl) added some debug code to CheckRegex(); fixed problem with escaped chars in filter
 
  TO DO:
 
  TO DO:
 
   * Throttled save attempts should check for spam, just for data-gathering purposes.
 
   * Throttled save attempts should check for spam, just for data-gathering purposes.
Line 56: Line 58:
 
'author' => 'Woozle Staddon',  
 
'author' => 'Woozle Staddon',  
 
'url' => 'http://htyp.org/SpamFerret',  
 
'url' => 'http://htyp.org/SpamFerret',  
'version' => '2009-08-09',
+
'version' => '2010-08-17',
 
'description' => 'database-driven wikispam blocker',
 
'description' => 'database-driven wikispam blocker',
 
);
 
);
Line 135: Line 137:
 
     $title = $editPage->mArticle->getTitle();
 
     $title = $editPage->mArticle->getTitle();
 
//    $ret = $spamObj->filter( $title, $text, '', $editSummary, $editPage );
 
//    $ret = $spamObj->filter( $title, $text, '', $editSummary, $editPage );
     $ret = $spamObj->filter( $title, $text, '' );
+
     $ret = $spamObj->filter( $title, $text, '',$editPage );
 
     if ( $ret !== false ) $editPage->spamPage( $ret );
 
     if ( $ret !== false ) $editPage->spamPage( $ret );
  
Line 151: Line 153:
  
 
class SpamFerret {
 
class SpamFerret {
var $previousFilter = false;
+
    var $previousFilter = false;
 
// internal data
 
// internal data
var $dbSpam;
+
    var $dbSpam;
var $objDataClients;
+
    var $objDataClients;
var $strIPAddr;
+
    var $strIPAddr;
var $idPattern;
+
    var $idPattern;
var $doClearThrottle;
+
    var $doClearThrottle;
 
 
// function SpamFerret() {
+
    function Setting($iName) {
// }
+
global $wgSpamFerretSettings;
  
function Setting($iName) {
+
return $wgSpamFerretSettings[$iName];
    global $wgSpamFerretSettings;
+
    }
 +
    /*-----
 +
      PROPERTIES USED: $this->idPattern (out)
 +
    */
 +
    function filter( &$title, $text, $section,  $editPage = FALSE ) {
 +
global $wgArticle, $wgDBname, $wgMemc, $messageMemc, $wgVersion, $wgOut;
 +
global $wgTitle, $wgServer;
 +
global $debug;
 +
global $errNum, $errStr;
 +
global $gErrorText;
 +
// debugging:
 +
global $sql;
  
    return $wgSpamFerretSettings[$iName];
+
$fname = 'wfSpamFerretFilter';
}
+
wfProfileIn( $fname );
  
// function filter( &$title, $text, $section, $editSummary, $editPage ) {
+
ini_set('track_errors', 1);
function filter( &$title, $text, $section,  $editPage = FALSE ) {
 
global $wgArticle, $wgDBname, $wgMemc, $messageMemc, $wgVersion, $wgOut;
 
global $wgTitle, $wgServer;
 
global $debug;
 
global $errNum, $errStr;
 
global $gErrorText;
 
// debugging:
 
global $sql;
 
  
$fname = 'wfSpamFerretFilter';
+
# Call the rest of the hook chain first
wfProfileIn( $fname );
+
if ( $this->previousFilter ) {
 
+
$f = $this->previousFilter;
ini_set('track_errors', 1);
+
if ( $f( $title, $text, $section ) ) {
 
+
wfProfileOut( $fname );
# Call the rest of the hook chain first
+
return true;
if ( $this->previousFilter ) {
 
$f = $this->previousFilter;
 
if ( $f( $title, $text, $section ) ) {
 
wfProfileOut( $fname );
 
return true;
 
}
 
 
}
 
}
 +
}
 
// initialize variables
 
// initialize variables
$retVal = FALSE; // default = assume edit is ok
+
$retVal = FALSE; // default = assume edit is ok
$gErrorText = FALSE;
+
$gErrorText = FALSE;
$isClientKnown = FALSE;
+
$isClientKnown = FALSE;
  
$this->txtEditRaw = $text;
+
$this->txtEditRaw = $text; // DEPRECATED
 +
$arArgs['edit-raw'] = $text;
  
 
// get the IP address of the http client making the edit attempt:
 
// get the IP address of the http client making the edit attempt:
$this->strIPAddr = wfGetIP();
+
$this->strIPAddr = wfGetIP();
 
// Open the database
 
// Open the database
$this->OpenDatabase();
+
$this->OpenDatabase();
 
// open clients table (extended Throttle version) for reference:
 
// open clients table (extended Throttle version) for reference:
 
//return TRUE;
 
//return TRUE;
$objTblClients = new clsTable($this->dbSpam);
+
$objTblClients = new clsTable($this->dbSpam);
  $objTblClients->Name('ClientThrottle2');
+
  $objTblClients->Name('ClientThrottle2');
  $objTblClients->KeyName('Address');
+
  $objTblClients->KeyName('Address');
 
// Look up to see if this IP is known; it may already be throttled:
 
// Look up to see if this IP is known; it may already be throttled:
$this->objDataClients = $objTblClients->GetData('Address="'.$this->strIPAddr.'"');
+
$this->objDataClients = $objTblClients->GetData('Address="'.$this->strIPAddr.'"');
if (is_object($this->objDataClients)) {
+
if (is_object($this->objDataClients)) {
if ($this->objDataClients->hasRows()) {
+
    if ($this->objDataClients->hasRows()) {
$isClientKnown = TRUE;
+
$isClientKnown = TRUE;
}
+
    }
}
+
}
if ($isClientKnown) {
+
if ($isClientKnown) {
$this->objDataClients->FirstRow();
+
    $this->objDataClients->FirstRow();
// $this->idClient = $this->objDataClients->ID;
+
    $doBlock = $this->objDataClients->doBlock;
$doBlock = $this->objDataClients->doBlock;
+
    if ($doBlock) {
if ($doBlock) {
+
$strThrType = 'BLK';
$strThrType = 'BLK';
+
    } else {
 +
$intRetries = $this->objDataClients->Retries;
 +
$intThrottle = $this->Setting('throttle_retries');
 +
$doBlock = $intRetries > $intThrottle;
 +
$strThrType = 'THR-'.$intRetries;
 +
    }
 +
    if ($doBlock) {
 +
$arArgs['diff'] = NULL; // not applicable
 +
// retry limit exceeded; check timeout limit
 +
if ($this->objDataClients->ThrottleTime < $this->Setting('throttle_timeout')) {
 +
$txtMsg = 'Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.';
 +
if ($editPage) {
 +
    $editPage->spamPage($txtMsg);
 
} else {
 
} else {
$intRetries = $this->objDataClients->Retries;
+
    EditPage::spamPage($txtMsg); // older MW code doesn't supply $editPage
$intThrottle = $this->Setting('throttle_retries');
 
$doBlock = $intRetries > $intThrottle;
 
$strThrType = 'THR-'.$intRetries;
 
}
 
if ($doBlock) {
 
// retry limit exceeded; check timeout limit
 
if ($this->objDataClients->ThrottleTime < $this->Setting('throttle_timeout')) {
 
$txtMsg = 'Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.';
 
if ($editPage) {
 
    $editPage->spamPage($txtMsg);
 
} else {
 
    EditPage::spamPage($txtMsg); // older MW code doesn't supply $editPage
 
}
 
$this->RecordAttempt($strThrType); // record post attempt by throttled client
 
$retVal = true; // client has exceeded spam limit; impose throttle
 
} else {
 
$this->doClearThrottle = true;
 
}
 
 
}
 
}
 +
$arArgs['code'] = $strThrType;
 +
$this->RecordAttempt($arArgs); // record post attempt by throttled client
 +
$retVal = true; // client has exceeded spam limit; impose throttle
 
} else {
 
} else {
$this->CreateClient();
+
$this->doClearThrottle = true;
 
}
 
}
 
+
    }
$this->txtDiff = 'N/A';
+
} else {
/* if ($debug) {
+
$this->CreateClient();
EditPage::spamPage('DEBUG: '.$debug);
+
}
$retVal = true;
+
//$this->txtDiff = 'N/A';
}
+
$arArgs['diff'] = 'N/A';
*/
+
if (!$retVal) {
if (!$retVal) {
+
    $arRtn = $this->GetDiff($title); // get the diff between edit and current contents
$this->GetDiff($title); // get the diff between edit and current contents
+
    $arArgs['diff'] = $arRtn['diff'];
$this->CheckFilters(FALSE);
+
    $arArgs['doAll'] = FALSE;
if ( $this->isMatch ) {
+
    $arRtn = $this->CheckFilters($arArgs);
 +
    $arArgs['edit-to-check'] = $arRtn['edit-to-check'];
 +
    if ( $this->isMatch ) {
 
// spam cue found; display the matching text and don't allow the edit to be saved:
 
// spam cue found; display the matching text and don't allow the edit to be saved:
wfDebug( "Match!\n" );
+
wfDebug( "Match!\n" );
 
 
 
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
 
// The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
$retVal = '(pattern #'.$this->idPattern.') '.$this->strMatch;
+
$retVal = '(pattern #'.$this->idPattern.') '.$this->strMatch;
 
// Log the spam attempt:
 
// Log the spam attempt:
$this->RecordAttempt('-');
+
$arArgs['code'] = '-';
} else {
+
$this->RecordAttempt($arArgs);
 +
    } else {
 
// no spam cues found; allow the edit to be saved, if nothing else has tripped the filter
 
// no spam cues found; allow the edit to be saved, if nothing else has tripped the filter
//if ($this->Setting('log_ok_edits')) {
+
//if ($this->Setting('log_ok_edits')) {
$this->idPattern = NULL;
+
    $this->idPattern = NULL;
$this->RecordOkEdit();
+
    $this->RecordOkEdit($arArgs);
//$this->RecordAttempt('OK');
+
    //$this->RecordAttempt('OK');
//}
+
//}
}
+
    }
}
+
}
 
+
wfProfileOut( $fname );
wfProfileOut( $fname );
+
$this->ReportErrors();
$this->ReportErrors();
 
 
//$wgOut->addHTML($out);
 
//$wgOut->addHTML($out);
return $retVal;
+
return $retVal;
 
/**/
 
/**/
 +
    }
 +
    public function OpenDatabase() {
 +
$this->dbSpam = new clsDatabase($this->Setting('dbspec'));
 +
$this->dbSpam->Open();
 +
    }
 +
    public function FiltTbl() {
 +
$doLoad = TRUE;
 +
$doLoad = empty($this->objFilts);
 +
if ($doLoad) {
 +
    $objTbl = new clsTable($this->dbSpam);
 +
      $objTbl->Name('patterns');
 +
      $objTbl->KeyName('ID');
 +
    $this->objFilts = $objTbl;
 
}
 
}
public function OpenDatabase() {
+
return $this->objFilts;
$this->dbSpam = new clsDatabase($this->Setting('dbspec'));
+
    }
$this->dbSpam->Open();
+
    /*-----
 +
      INPUT:
 +
$iTitle - page for comparing proposed edit
 +
$this->txtEditRaw (DEPRECATED; use $iarArgs['text-to-check']
 +
      OUTPUT:
 +
return ['diff']
 +
    */
 +
    public function GetDiff($iTitle) {
 +
$objArticleCurr = new Article($iTitle);
 +
if ($objArticleCurr->exists()) {
 +
$txtCurr = $objArticleCurr->getContent();
 +
$txtDiff = FigureDiff($txtCurr,$this->txtEditRaw);
 +
} else {
 +
$txtDiff = '!!NEW: '.$this->txtEditRaw;
 
}
 
}
public function GetDiff($iTitle) {
+
//$this->txtDiff = $txtDiff;
$objArticleCurr = new Article($iTitle);
+
$arOut['diff'] = $txtDiff;
if ($objArticleCurr->exists()) {
+
return $arOut;
$txtCurr = $objArticleCurr->getContent();
+
    }
$this->txtDiff = FigureDiff($txtCurr,$this->txtEditRaw);
+
    /*-----
/*
+
      INPUT:
$lenIns = strpos($txtEditRaw,$txtCurr);
+
$iarArgs['doAll']
if ($lenIns !== false) {
+
$iarArgs['diff']
    $strIns = substr($this->txtEditRaw,0,$lenIns);
+
$this->txtEditRaw - DEPRECATED; use $iarArgs
} else {
+
      OUTPUT:
    $strIns = '';
+
$this->idPattern
}
+
    */
*/
+
    public function CheckFilters(array $iarArgs) {
} else {
+
global $gRegexMatches,$gFilterMatches,$gFilterRows,$gFilterCount;
/*
+
global $debug;
$lenIns = strlen($text);
+
 
$strIns = $text;
+
assert('is_object($this->dbSpam)');
$txtCurr = NULL; // this isn't actually used anywhere
+
 
// $this->txtDiff = FigureDiff('',$text);
+
$doCheckAll = $iarArgs['doAll'];
*/
+
$strChkDiff = strtolower(nz($iarArgs['diff']));
$this->txtDiff = '!!NEW: '.$this->txtEditRaw;
+
$objFiltTbl = $this->FiltTbl();
}
+
 
 +
if ($doCheckAll) {
 +
    $sqlFilt = NULL;
 +
} else {
 +
    $sqlFilt = 'isActive';
 
}
 
}
public function CheckFilters($iCheckAll) {
 
    global $gRegexMatches,$gFilterMatches,$gFilterRows,$gFilterCount;
 
    global $debug;
 
  
assert('is_object($this->dbSpam)');
+
$objFiltRows = $objFiltTbl->GetData($sqlFilt);
 +
$objRow = $objFiltRows; // for shorthand
  
$this->PatternTbl = new clsTable($this->dbSpam);
+
$strTextEdit = strtolower($this->txtEditRaw);
  $this->PatternTbl->Name('patterns');
+
//$this->txtEditChk = $strTextEdit; // text after being massaged for checking
  $this->PatternTbl->KeyName('ID');
+
$arOut['edit-to-check'] = $strTextEdit; // text after being massaged for checking
 +
$this->isMatch = FALSE;
 +
$gFilterCount = 0;
 +
$gFilterRows = $objRow->RowCount();
 +
while($objRow->NextRow() && (!$this->isMatch || $doCheckAll)) {
 +
    $isMatch = FALSE;
 +
    if ($objRow->isDiff) {
 +
    $strTextCk = $strChkDiff;
 +
    } else {
 +
    $strTextCk = $strTextEdit;
 +
    }
 +
    if (!is_null($strTextCk)) {
 +
$gFilterCount++;
 +
$strPattern = strtolower($objRow->Pattern);
 +
$isRegex = $objRow->isRegex;
 +
$this->idPattern = $objRow->ID;
 +
if ($isRegex) {
 +
    $isMatch = $this->CheckRegex($strPattern,$strTextCk);
 +
    if (isset($php_errormsg)) {
 +
$this->AddErrorLine('Filter #'.$this->idPattern.' generated error "'.$php_errormsg);
 +
    }
  
if ($iCheckAll) {
+
    if ($isMatch) {
    $sqlFilt = NULL;
+
$this->strMatch = $gRegexMatches[0];
 +
    }
 
} else {
 
} else {
    $sqlFilt = 'isActive';
+
    if (empty($strPattern)) {
 +
$isMatch = FALSE;
 +
    } else {
 +
$this->strMatch = stristr($strTextCk,$strPattern);
 +
$isMatch = ($this->strMatch != '');
 +
    }
 
}
 
}
 
+
if ($isMatch) {
$this->PatternRows = $this->PatternTbl->GetData($sqlFilt);
+
    $this->isMatch = TRUE;
$objRow = $this->PatternRows;
+
    if ($doCheckAll) {
 
+
$gFilterMatches[$this->idPattern] = $this->strMatch;
$strTextEdit = strtolower($this->txtEditRaw);
+
    }
$this->txtEditChk = $strTextEdit; // text after being massaged for checking
 
$this->isMatch = FALSE;
 
$gFilterCount = 0;
 
$gFilterRows = $this->PatternRows->RowCount();
 
//$objDataPatterns->StartRows();
 
while($objRow->NextRow() && (!$this->isMatch || $iCheckAll)) {
 
$isMatch = FALSE;
 
if ($objRow->isDiff) {
 
if (isset($this->txtDiff)) {
 
    $strTextCk = $this->txtDiff;
 
} else {
 
    $strTextCk = NULL;
 
}
 
} else {
 
$strTextCk = $strTextEdit;
 
}
 
if (!is_null($strTextCk)) {
 
    $gFilterCount++;
 
    $strPattern = $objRow->Pattern;
 
    $isRegex = $objRow->isRegex;
 
    $this->idPattern = $objRow->ID;
 
    if ($isRegex) {
 
    $isMatch = $this->CheckRegex($strPattern,$strTextCk);
 
 
 
    if (isset($php_errormsg)) {
 
    $this->AddErrorLine('Filter #'.$this->idPattern.' generated error "'.$php_errormsg);
 
    }
 
 
 
    if ($isMatch) {
 
    $this->strMatch = $gRegexMatches[0];
 
    }
 
    } else {
 
    if (empty($strPattern)) {
 
$isMatch = FALSE;
 
    } else {
 
$this->strMatch = stristr($strTextCk,$strPattern);
 
$isMatch = ($this->strMatch != '');
 
    }
 
    }
 
    if ($isMatch) {
 
$this->isMatch = TRUE;
 
if ($iCheckAll) {
 
    $gFilterMatches[$this->idPattern] = $this->strMatch;
 
}
 
    }
 
}
 
 
}
 
}
 +
    }
 
}
 
}
public function CheckRegex($iPattern,$iText) {
+
    }
    global $gRegexMatches;
+
    /*
 +
      TO DO:
 +
replace $gRegexMatches with return array
 +
make this function static
 +
    */
 +
    public function CheckRegex($iPattern,$iText) {
 +
global $gRegexMatches,$strDbg;
  
    $chDelim = '/';
+
$chDelim = '/';
    $strPattCk = str_replace($chDelim,'\\'.$chDelim,$iPattern);
+
$strPattCk = $iPattern;
    unset($php_errormsg);
+
// (2010-08-17) this next line causes incorrect handling of escaped characters in the filter
    $isMatch = @preg_match($chDelim.$strPattCk.$chDelim,$iText,$gRegexMatches);
+
//$strPattCk = str_replace('\\','\\\\',$strPattCk); // make sure filter backslashes are prefixed to be literal
    return $isMatch;
+
$strPattCk = str_replace($chDelim,'\\'.$chDelim,$strPattCk);
}
+
unset($php_errormsg); // TO DO: explain this
 +
$strFinal = $chDelim.$strPattCk.$chDelim;
 +
$strDbg .= "'''@preg_match'''(\"$strFinal\",\"$iText\",...)";
 +
$isMatch = @preg_match($strFinal,$iText,$gRegexMatches);
 +
return $isMatch;
 +
    }
  
public function AddErrorLine($iText) {
+
    public function AddErrorLine($iText) {
    global $gErrorText;
+
global $gErrorText;
  
    $gErrorText .= $iText."\n";
+
$gErrorText .= $iText."\n";
}
+
    }
public function ReportErrors() {
+
    public function ReportErrors() {
    global $wgUser;
+
global $wgUser;
    global $wgEmergencyContact;
+
global $wgEmergencyContact;
    global $gErrorText;
+
global $gErrorText;
  
    if ($gErrorText) {
+
if ($gErrorText) {
$msgEmail = 'Filter error report for user '.$wgUser->getName().":\n\n";
+
    $msgEmail = 'Filter error report for user '.$wgUser->getName().":\n\n";
$msgEmail .= $gErrorText;
+
    $msgEmail .= $gErrorText;
mail($wgEmergencyContact,'spamferret filter error',$msgEmail);
+
    mail($wgEmergencyContact,'spamferret filter error',$msgEmail);
    }
 
 
}
 
}
 +
    }
 
/*
 
/*
 
  ACTION: Create a new record for the current client
 
  ACTION: Create a new record for the current client
Line 422: Line 442:
 
}
 
}
 
}
 
}
public function RecordAttempt($iCode) {
+
/*-----
 +
  INPUT:
 +
    $this->idPattern
 +
    $iarArgs['edit-raw']
 +
*/
 +
public function RecordAttempt(array $iarArgs) {
 
global $wgTitle, $wgServer;
 
global $wgTitle, $wgServer;
  
 +
$iCode = $iarArgs['code'];
 +
$txtDiff = $iarArgs['diff']; // was $this->txtDiff
 +
$txtEdit = $iarArgs['edit-raw']; // was $this->txtEditChk
 
$this->RecordClientSpam();
 
$this->RecordClientSpam();
  
Line 431: Line 459:
 
$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
 
$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
 
$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
 
$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
$sqlEdit = '"'.$this->dbSpam->SafeParam($this->txtEditChk).'"';
+
$sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
$sqlDiff = '"'.$this->dbSpam->SafeParam($this->txtDiff).'"';
+
$sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
 
$sqlPattern = $this->idPattern;
 
$sqlPattern = $this->idPattern;
 
if (is_null($sqlPattern)) {
 
if (is_null($sqlPattern)) {
Line 441: Line 469:
 
}
 
}
 
$sqlAddr = '"'.$this->strIPAddr.'"';
 
$sqlAddr = '"'.$this->strIPAddr.'"';
$sqlMatch = is_null($iMatch)?'NULL':'"'.$iMatch.'"';
+
//$sqlMatch = is_null($iMatch)?'NULL':'"'.$iMatch.'"';
 
$sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES'
 
$sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES'
 
  .'(NOW(),'
 
  .'(NOW(),'
Line 465: Line 493:
 
}
 
}
 
}
 
}
public function RecordOkEdit() {
+
/*
 +
  INPUT:
 +
    $this->txtEditChk
 +
*/
 +
public function RecordOkEdit(array $iarArgs) {
 
global $wgTitle, $wgServer;
 
global $wgTitle, $wgServer;
 +
 +
$txtDiff = $iarArgs['diff']; // was $this->txtDiff
 +
$txtEdit = $iarArgs['edit-to-check']; // was $this->txtEditChk
  
 
$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
 
$sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
 
$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
 
$sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
$sqlEdit = '"'.$this->dbSpam->SafeParam($this->txtEditChk).'"';
+
$sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
$sqlDiff = '"'.$this->dbSpam->SafeParam($this->txtDiff).'"';
+
$sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
 
$sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES '
 
$sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES '
 
  .'(NOW(),NULL,'
 
  .'(NOW(),NULL,'

Revision as of 18:39, 18 August 2010

Navigation

{{#lst:SpamFerret|navbar}}: SpamFerret.php

Code

<php><?php /*

HISTORY:
 2007-08-19 (Wzl) fixed line 155 call to clsDatabase::Query() (method deprecated and removed)
 2007-09-30 (Wzl) fixing regex processing
 2007-10-11 (Wzl) logging ampersandbot attempts; spam throttle (automatic temporary blacklist)
 2007-10-13 (Wzl) fixed some issues which were preventing throttling from working - mainly changes to SQL
 2007-10-15 (Wzl) "Code" wasn't being recorded. Decided that normal filtering should use code '-' so NULL means something is wrong.
 2007-10-28 (Wzl) Events with THR and AMP codes *still* weren't being recorded because method call was improperly formatted.
 2007-12-23 (Wzl) Emails wikimaster if eregi() returns an error (due to improperly formatted regex)
 2007-12-26 (Wzl) Spam turd rejection / logging
 2007-12-27 (Wzl) Fixed spam turd detection to work for new pages too (will probably need refinement)
 2007-12-27 (Wzl) Ooops. Replaced missing if-block from regex results when inserted text is found.
 2008-08-29 (Wzl) Added permanent IP blocking
 2008-09-04 (Wzl) Added (optional) logging of successful edits
 2008-09-19 (Wzl) Actually *set* the "didEdit" flag for successful edits <facepalms>
 2008-10-21 (Wzl) Fixed minor syntax error in "defines"
 2009-02-25 (Wzl) $objArticleCurr->loadLastEdit() now causes error in MW 1.14 (was it necessary before?)
 2009-03-10 (Wzl) "require" -> "require_once" so other extensions can use data.php without conflict
   also optional $kfpWzlLibs so data.php can be somewhere not on the path
 2009-03-18 (Wzl) Got rid of shared.php requirement; now using kfpLib to locate data.php
 2009-03-26 (Wzl) Rewrote data library calls to use new classes (no longer using deprecated/removed classes)
   Also modified to use newer function hooks
 2009-04-24 (Wzl) fixed "strict" bug referencing unset $txtCurr when creating new page
 2009-07-05 (Wzl) Using LibMgr
 2009-07-14 (Wzl) Added attempts.Diff field, patterns.isDIff
 2009-07-15 (Wzl)
   On advice from FreeNode##php, changed from eregi() to preg_match()
   Added option to match diff results instead of submitted edit only
   Saves diff of each change, approved or not
   BUG: approved edits are not being logged properly; using "OK" code and logging as failed
 2009-07-26 (Wzl) fixed minor warning error on line 252
 2009-08-07 (Wzl) email notification working; removed TRD and AMP hard-coded offenses, to be redone as isDiff filters if needed
 2009-08-08 (Wzl) create client record immediately if client is not recognized; don't depend on spam filter being triggered
 2009-08-09 (Wzl) restructured "client" and "attempt" tables (was "clients", "attempts"); not backwards-compatible
 2010-02-24 (Wzl) some code-tidying; trying to restrict passing of data between methods to single array var in args/return
 2010-08-17 (Wzl) added some debug code to CheckRegex(); fixed problem with escaped chars in filter
TO DO:
 * Throttled save attempts should check for spam, just for data-gathering purposes.
   Possibly non-spam from a throttled IP should not update the "WhenLast" timestamp. Maybe this should be a LocalSettings option?
 * Figure out how to display a different error message than "the following text is what triggered our filter:"
OPTIONAL SETTINGS:
 kfpLib - path to data.php folder (no final slash)
 kfsLib_Data - filespec of data.php
*/
# Loader for spam blacklist feature
# Include this from LocalSettings.php

if ( defined( 'MEDIAWIKI' ) ) {

// Credits entry describing this extension.
$wgExtensionCredits['other'][] = array(
	'name' => 'SpamFerret',
	'author' => 'Woozle Staddon',
	'url' => 'http://htyp.org/SpamFerret',
	'version' => '2010-08-17',
	'description' => 'database-driven wikispam blocker',
);

// Decide where data.php lives, unless kfsLib_Data was already defined elsewhere.
// With kfpLib defined, data.php is taken from that folder; otherwise the bare
// filename is used (assume it's on the path).
if (!defined('kfsLib_Data')) {
	define('kfsLib_Data', defined('kfpLib') ? (kfpLib.'/data.php') : 'data.php');
}

// Pull in the library manager only if nothing has loaded it yet.
if (!defined('LIBMGR')) {
	require('libmgr.php');
}

// Register the data library with the manager, then load it.
clsLibMgr::Add('data', kfsLib_Data);
clsLibMgr::Load('data');

/* ==============

SET UP CALLBACKS
*/

global $wgFilterCallback, $wgPreSpamFilterCallback;

// No previously installed filter callback is remembered by default.
$wgPreSpamFilterCallback = false;

if ( !defined( 'MW_SUPPORTS_EDITFILTERMERGED' ) ) {
	// No EditFilterMerged support: take over $wgFilterCallback, keeping a
	// record of whatever callback was installed before us.
	if ( $wgFilterCallback ) {
		$wgPreSpamFilterCallback = $wgFilterCallback;
	}
	$wgFilterCallback = 'wfSpamFerretFilter';
} else {
	// EditFilterMerged is available: register as a hook handler instead.
	$wgHooks['EditFilterMerged'][] = 'wfSpamFerretMerged';
}

/* $wgHooks['EditFilter'][] = 'wfSpamFerretValidate'; $wgHooks['ArticleSaveComplete'][] = 'wfSpamFerretArticleSave'; $wgHooks['APIEditBeforeSave'][] = 'wfSpamFerretAPIEditBeforeSave';

*/

/* ================

SET GLOBAL OBJECTS
*/

/**
 * Returns the SpamFerret instance shared by all callbacks.
 *
 * The object is created on the first call and kept in a function-static
 * variable, so every later call gets the same instance back.
 */
function GetSpamFerret() {
	static $objFerret = NULL;

	if (is_null($objFerret)) {
		$objFerret = new SpamFerret();
	}
	return $objFerret;
}

/* ================

CALLBACK FUNCTIONS
*/

/**

* Hook function for $wgFilterCallback
*/

// Former signature, kept for reference:
//function wfSpamFerretFilter( &$title, $text, $section, &$hookErr, $editSummary ) {
//
// The live declaration must sit on its own line: when it shares a line with the
// commented-out signature above, the "//" comment swallows it and the function
// is never declared.
//
// Parameters: $title (passed by reference), $text and $section are handed to
//   SpamFerret::filter() unchanged.
// Returns: whatever SpamFerret::filter() returns (FALSE when the edit is
//   acceptable, otherwise TRUE or a string describing the match).
function wfSpamFerretFilter( &$title, $text, $section ) {
	global $wgOut;

	$spamObj = GetSpamFerret();
	$wgOut->addWikiText( "Intercepted by SpamFerretFilter" );

	// No EditPage object is available on this code path, so filter() is
	// called without its optional fourth argument.
	return $spamObj->filter( $title, $text, $section );
}

/**

* Hook function for EditFilterMerged, replaces wfSpamBlacklistFilter
*/

// Parameters:
//   $editPage    - edit page object; supplies the article title and receives
//                  the spam notice via spamPage()
//   $text        - the text being saved
//   $hookErr     - not used here
//   $editSummary - not used here
// Returns: TRUE when the edit may proceed, FALSE when it was rejected.
function wfSpamFerretMerged( &$editPage, $text, &$hookErr, $editSummary ) {
	global $wgTitle,$wgOut;

	if( is_null( $wgTitle ) ) {
		# API mode
		# wfSpamBlacklistFilterAPIEditBeforeSave already checked the blacklist
		return true;
	}
	$spamObj = GetSpamFerret();
	$title = $editPage->mArticle->getTitle();

	// This hook is not given a section, so an empty string is passed in its
	// place. The explicit '' matters: without it the call has an empty
	// argument slot, which is a syntax error.
	$ret = $spamObj->filter( $title, $text, '', $editPage );

	// NOTE(review): for throttled clients filter() already calls spamPage()
	// itself and then returns TRUE, so spamPage() is invoked a second time
	// here with TRUE as its argument - confirm that this is intended.
	if ( $ret !== false ) $editPage->spamPage( $ret );

	// additional text can be added here:
	// $wgOut->addWikiText( "Intercepted by SpamFerretMerged" );

	// Return convention for hooks is the inverse of $wgFilterCallback
	return ( $ret === false );
}

/**
 * Hook function for APIEditBeforeSave
 */

// Placeholder handler: accepts the hook arguments and does nothing with them.
function wfSpamFerretAPIEditBeforeSave( &$editPage, $text, &$resultArr )
{
}

class SpamFerret {

   // Optional callable run by filter() before its own checks; when it returns
   // a truthy value, filter() returns TRUE immediately.
   var $previousFilter = false;

// internal data (the methods also use $txtEditRaw, $isMatch, $strMatch and
// $objFilts, which are created on assignment rather than declared here)

   // clsDatabase object created and opened by OpenDatabase()
   var $dbSpam;
   // result of the client-table lookup for the current IP address (set in filter())
   var $objDataClients;
   // address of the client making the edit, as returned by wfGetIP()
   var $strIPAddr;
   // ID of the pattern most recently checked by CheckFilters(); filter() sets it to NULL for accepted edits
   var $idPattern;
   // set TRUE by filter() when a blocked/throttled client fails the throttle_timeout comparison;
   // RecordClientSpam() then resets Retries to 0 instead of incrementing it
   var $doClearThrottle;
   /*-----
     ACTION: looks up one entry of the global $wgSpamFerretSettings array
     INPUT: $iName - key of the setting to read
     RETURNS: the value stored under that key
   */
   function Setting($iName) {
       global $wgSpamFerretSettings;

       $arSettings = $wgSpamFerretSettings;
       return $arSettings[$iName];
   }
   /*-----
     ACTION: decides whether an edit may be saved.
       1. Runs $this->previousFilter first, if one is set.
       2. Looks up the client's IP address; known clients that are blocked or
          over the retry limit are rejected while the throttle is in force,
          unknown clients get a new client record.
       3. Otherwise diffs the edit against the current page and runs it
          through the pattern filters, logging the outcome either way.
     INPUT:
       $title    - title of the page being edited (passed on to the previous
                   filter and to GetDiff())
       $text     - text being saved
       $section  - only passed on to the previous filter
       $editPage - edit page object used to show the throttle message; when
                   FALSE, EditPage::spamPage() is called statically instead
     RETURNS:
       FALSE  - edit is acceptable
       TRUE   - rejected by the previous filter, or client is blocked/throttled
       string - '(pattern #N) matched-text' when a filter pattern matched
     PROPERTIES USED: $this->idPattern (out)
     NOTE: every statement is kept on its own line on purpose. When this
       body is collapsed onto long lines, the "#" and "//" comments swallow
       the code that follows them.
   */
   function filter( &$title, $text, $section,  $editPage = FALSE ) {
       global $gErrorText;

       $fname = 'wfSpamFerretFilter';
       wfProfileIn( $fname );

       ini_set('track_errors', 1);

       # Call the rest of the hook chain first
       if ( $this->previousFilter ) {
           $f = $this->previousFilter;
           if ( $f( $title, $text, $section ) ) {
               wfProfileOut( $fname );
               return true;
           }
       }

       // initialize variables
       $retVal = FALSE;        // default = assume edit is ok
       $gErrorText = FALSE;
       $isClientKnown = FALSE;

       $this->txtEditRaw = $text;      // DEPRECATED
       $arArgs['edit-raw'] = $text;

       // get the IP address of the http client making the edit attempt:
       $this->strIPAddr = wfGetIP();
       // Open the database
       $this->OpenDatabase();
       // open clients table (extended Throttle version) for reference:
       $objTblClients = new clsTable($this->dbSpam);
         $objTblClients->Name('ClientThrottle2');
         $objTblClients->KeyName('Address');
       // Look up to see if this IP is known; it may already be throttled:
       $this->objDataClients = $objTblClients->GetData('Address="'.$this->strIPAddr.'"');
       if (is_object($this->objDataClients)) {
           if ($this->objDataClients->hasRows()) {
               $isClientKnown = TRUE;
           }
       }

       if ($isClientKnown) {
           $this->objDataClients->FirstRow();
           $doBlock = $this->objDataClients->doBlock;
           if ($doBlock) {
               $strThrType = 'BLK';
           } else {
               $intRetries = $this->objDataClients->Retries;
               $intThrottle = $this->Setting('throttle_retries');
               $doBlock = $intRetries > $intThrottle;
               $strThrType = 'THR-'.$intRetries;
           }
           if ($doBlock) {
               $arArgs['diff'] = NULL; // not applicable
               // retry limit exceeded; check timeout limit
               if ($this->objDataClients->ThrottleTime < $this->Setting('throttle_timeout')) {
                   $txtMsg = 'Too many spam attempts from your IP address ('.$this->strIPAddr.'). Please come back later.';
                   if ($editPage) {
                       $editPage->spamPage($txtMsg);
                   } else {
                       EditPage::spamPage($txtMsg);    // older MW code doesn't supply $editPage
                   }
                   $arArgs['code'] = $strThrType;
                   $this->RecordAttempt($arArgs);      // record post attempt by throttled client
                   $retVal = true;     // client has exceeded spam limit; impose throttle
               } else {
                   // RecordClientSpam() resets the retry count when this flag is set
                   $this->doClearThrottle = true;
               }
           }
       } else {
           $this->CreateClient();
       }

       $arArgs['diff'] = 'N/A';
       if (!$retVal) {
           $arRtn = $this->GetDiff($title);    // get the diff between edit and current contents
           $arArgs['diff'] = $arRtn['diff'];
           $arArgs['doAll'] = FALSE;
           $arRtn = $this->CheckFilters($arArgs);
           $arArgs['edit-to-check'] = $arRtn['edit-to-check'];
           if ( $this->isMatch ) {
               // spam cue found; display the matching text and don't allow the edit to be saved:
               wfDebug( "Match!\n" );

               // The string sent to spamPage() will be shown after "The following text is what triggered our spam filter:"
               $retVal = '(pattern #'.$this->idPattern.') '.$this->strMatch;
               // Log the spam attempt:
               $arArgs['code'] = '-';
               $this->RecordAttempt($arArgs);
           } else {
               // no spam cues found; allow the edit to be saved, if nothing else has tripped the filter
               $this->idPattern = NULL;
               $this->RecordOkEdit($arArgs);
           }
       }

       wfProfileOut( $fname );
       $this->ReportErrors();
       return $retVal;
   }
   /*-----
     ACTION: creates the clsDatabase object described by the 'dbspec' setting,
       stores it in $this->dbSpam and opens it
   */
   public function OpenDatabase() {
       $strSpec = $this->Setting('dbspec');
       $this->dbSpam = new clsDatabase($strSpec);
       $this->dbSpam->Open();
   }
   /*-----
     ACTION: returns the clsTable object for the 'patterns' table (key 'ID'),
       building it on first use and caching it in $this->objFilts
   */
   public function FiltTbl() {
       if (empty($this->objFilts)) {
           $objPatterns = new clsTable($this->dbSpam);
             $objPatterns->Name('patterns');
             $objPatterns->KeyName('ID');
           $this->objFilts = $objPatterns;
       }
       return $this->objFilts;
   }
   /*-----
     ACTION: works out how the submitted edit differs from the stored page
     INPUT:
       $iTitle - page to compare the proposed edit against
       $this->txtEditRaw - text of the proposed edit (DEPRECATED as an input
         channel; intended to be replaced by an array argument)
     OUTPUT:
       returns array with one element:
         ['diff'] - result of FigureDiff(current text, edit) when the page
           exists; otherwise the edit text prefixed with '!!NEW: '
     NOTE: the assignment to $arOut and the return must not share a line
       with a "//" comment, or the method silently returns nothing.
   */
   public function GetDiff($iTitle) {
       $objArticleCurr = new Article($iTitle);
       if ($objArticleCurr->exists()) {
           $txtCurr = $objArticleCurr->getContent();
           $txtDiff = FigureDiff($txtCurr,$this->txtEditRaw);
       } else {
           // nothing to compare against: the whole edit is new
           $txtDiff = '!!NEW: '.$this->txtEditRaw;
       }
       $arOut['diff'] = $txtDiff;
       return $arOut;
   }
   /*-----
     ACTION: runs the edit (or its diff) past the rows of the patterns table
     INPUT:
       $iarArgs['doAll'] - TRUE: check every pattern and keep going after a
         match; FALSE: check only rows selected by the 'isActive' filter and
         stop at the first match
       $iarArgs['diff'] - diff text; used for patterns flagged isDiff
       $this->txtEditRaw - DEPRECATED; use $iarArgs
     OUTPUT:
       $this->isMatch - TRUE if any pattern matched
       $this->strMatch - text of the most recent match
       $this->idPattern - ID of the last pattern checked (the matching one
         when doAll is FALSE, because the loop stops there)
       globals $gFilterRows, $gFilterCount; $gFilterMatches[id] when doAll
       returns array: ['edit-to-check'] - lower-cased edit text that was checked
   */
   public function CheckFilters(array $iarArgs) {
       global $gRegexMatches,$gFilterMatches,$gFilterRows,$gFilterCount;

       // Expression form rather than a string: string assertions are no
       // longer evaluated by current PHP versions.
       assert(is_object($this->dbSpam));

       $doCheckAll = $iarArgs['doAll'];
       $strChkDiff = strtolower(nz($iarArgs['diff']));
       $objFiltTbl = $this->FiltTbl();

       $sqlFilt = $doCheckAll ? NULL : 'isActive';
       $objRow = $objFiltTbl->GetData($sqlFilt);

       $strTextEdit = strtolower($this->txtEditRaw);
       $arOut['edit-to-check'] = $strTextEdit; // text after being massaged for checking
       $this->isMatch = FALSE;
       $gFilterCount = 0;
       $gFilterRows = $objRow->RowCount();
       while ($objRow->NextRow() && (!$this->isMatch || $doCheckAll)) {
           $isMatch = FALSE;
           $strTextCk = $objRow->isDiff ? $strChkDiff : $strTextEdit;
           if (!is_null($strTextCk)) {
               $gFilterCount++;
               $this->idPattern = $objRow->ID;
               if ($objRow->isRegex) {
                   // Do not lower-case the regex itself: that turns \S, \D,
                   // \W, \B into their opposites. The text being checked is
                   // already lower-case, so ask PCRE for case-insensitive
                   // matching with an inline option instead.
                   $strPattern = '(?i)'.$objRow->Pattern;
                   $isMatch = $this->CheckRegex($strPattern,$strTextCk);
                   // preg_match() yields FALSE (as opposed to 0) when it
                   // fails. $php_errormsg cannot be used to detect this here
                   // because it only exists in the scope where the error
                   // occurred, i.e. inside CheckRegex().
                   if ($isMatch === FALSE) {
                       $arErr = error_get_last();
                       $strErr = is_array($arErr) ? $arErr['message'] : ('PCRE error code '.preg_last_error());
                       $this->AddErrorLine('Filter #'.$this->idPattern.' generated error "'.$strErr);
                   }
                   if ($isMatch) {
                       $this->strMatch = $gRegexMatches[0];
                   }
               } else {
                   $strPattern = strtolower($objRow->Pattern);
                   if (empty($strPattern)) {
                       $isMatch = FALSE;
                   } else {
                       $this->strMatch = stristr($strTextCk,$strPattern);
                       $isMatch = ($this->strMatch != '');
                   }
               }
               if ($isMatch) {
                   $this->isMatch = TRUE;
                   if ($doCheckAll) {
                       $gFilterMatches[$this->idPattern] = $this->strMatch;
                   }
               }
           }
       }
       // filter() reads ['edit-to-check'] from this return value
       return $arOut;
   }
   /*
     ACTION: wraps a filter pattern in '/' delimiters and matches it against
       the given text with preg_match()
     INPUT:
       $iPattern - regular expression without delimiters
       $iText - text to search
     OUTPUT:
       returns preg_match()'s result: 1 = match, 0 = no match, FALSE = error
       global $gRegexMatches - the matches array filled in by preg_match()
       global $strDbg - a description of the call is appended for debugging
     TO DO:
       replace $gRegexMatches with return array
       make this function static
   */
   public function CheckRegex($iPattern,$iText) {
       global $gRegexMatches,$strDbg;

       $chDelim = '/';
       // (2010-08-17) backslashes in the filter are deliberately left alone;
       // doubling them caused incorrect handling of escaped characters.
       // Escape only those delimiters which are not already escaped: a blind
       // replace turns "\/" into "\\/", which ends the pattern early.
       $strPattCk = '';
       $isEscaped = FALSE;
       $intLen = strlen($iPattern);
       for ($idx = 0; $idx < $intLen; $idx++) {
           $ch = $iPattern[$idx];
           if ($isEscaped) {
               $isEscaped = FALSE;     // this char is covered by the preceding backslash
           } elseif ($ch === '\\') {
               $isEscaped = TRUE;
           } elseif ($ch === $chDelim) {
               $strPattCk .= '\\';
           }
           $strPattCk .= $ch;
       }
       // NOTE(review): presumably meant to clear a stale track_errors message;
       // the variable is local to this call - confirm whether still needed
       unset($php_errormsg);
       $strFinal = $chDelim.$strPattCk.$chDelim;
       $strDbg .= "'''@preg_match'''(\"$strFinal\",\"$iText\",...)";
       $isMatch = @preg_match($strFinal,$iText,$gRegexMatches);
       return $isMatch;
   }
   /*-----
     ACTION: appends one line (the text plus a newline) to the global
       $gErrorText buffer, which ReportErrors() later mails out
   */
   public function AddErrorLine($iText) {
       global $gErrorText;

       $strLine = $iText."\n";
       $gErrorText = $gErrorText.$strLine;
   }
   /*-----
     ACTION: if any error text has been collected in the global $gErrorText,
       mails it to $wgEmergencyContact; otherwise does nothing
   */
   public function ReportErrors() {
       global $wgUser, $wgEmergencyContact, $gErrorText;

       if (!$gErrorText) {
           return;     // nothing to report
       }
       $msgEmail = 'Filter error report for user '.$wgUser->getName().":\n\n".$gErrorText;
       mail($wgEmergencyContact,'spamferret filter error',$msgEmail);
   }

/*-----
  ACTION: Create a new record for the current client
  INPUT: $this->strIPAddr
  NOTE: the address is passed through SafeParam() like every other string written to the DB,
    rather than being concatenated into the SQL as-is.
*/
public function CreateClient() {
    $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
    $sql = 'INSERT INTO client (Address,WhenFirst,Count,Retries) VALUES('.$sqlAddr.',NOW(),1,0)';
    $this->dbSpam->Exec($sql);
}
/*-----
  ACTION: Update a client's record to reflect a new spam attempt
  INPUT:
    $this->strIPAddr
    $this->doClearThrottle: if set, Retries is reset to 0; otherwise it is incremented
  NOTE: if the UPDATE touches no rows, a client record is created and the anomaly is logged.
*/
public function RecordClientSpam() {
    $strRetries = $this->doClearThrottle ? '0' : 'Retries+1';
    $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
    $sql = 'UPDATE client SET WhenLast=NOW(),Count=Count+1, Retries='.$strRetries.' WHERE Address='.$sqlAddr;
    $this->dbSpam->Exec($sql);
    if ($this->dbSpam->RowsAffected() < 1) {
        $this->CreateClient();
        $this->AddErrorLine('Record not found for client '.$this->strIPAddr);
    }
}
/*-----
  ACTION: Logs a blocked edit in the "attempt" table, updates the client's record,
    and (if a filter pattern was involved) bumps that pattern's usage stats
  INPUT:
    $this->idPattern: ID of the pattern that matched, or NULL if none
    $this->strIPAddr
    $iarArgs['code']: value for the Code column
    $iarArgs['diff']: value for the Diff column
    $iarArgs['edit-raw']: value for the Edit column
  NOTE: code, address and session id are escaped with SafeParam() like the other string values.
    Assumes SafeParam() returns the escaped string without surrounding quotes, as its other uses
    here imply -- TODO confirm.
*/
public function RecordAttempt(array $iarArgs) {
    global $wgTitle, $wgServer;

    $iCode = $iarArgs['code'];
    $txtDiff = $iarArgs['diff'];      // was $this->txtDiff
    $txtEdit = $iarArgs['edit-raw'];  // was $this->txtEditChk
    $this->RecordClientSpam();

    $sqlCode = '"'.$this->dbSpam->SafeParam($iCode).'"';
    $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
    $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
    $sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
    $sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
    $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
    $sqlSess = SQL_Value($this->dbSpam->SafeParam(session_id()));

    $isPattern = !is_null($this->idPattern);
    $sqlPattern = $isPattern ? $this->idPattern : 'NULL';

    $sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES'
      .'(NOW(),'
      .$sqlPattern.','
      .$sqlAddr.','
      .$sqlSess.','
      .$sqlSrvr.','
      .$sqlPage.','
      .$sqlCode.','
      .'FALSE,'
      .$sqlEdit.','
      .$sqlDiff.')';
    $ok = $this->dbSpam->Exec($sql);
    if ($ok !== TRUE) {
        $this->AddErrorLine('SQL ['.$sql.'] in RecordAttempt() generated this error: '.$ok);
    }

    if ($isPattern) {
        $sql = 'UPDATE patterns SET WhenTried=NOW(), Count=Count+1 WHERE ID='.$this->idPattern;
        $ok = $this->dbSpam->Exec($sql);
        if ($ok !== TRUE) {
            $this->AddErrorLine('SQL ['.$sql.'] generated this error: '.$ok);
        }
    }
}
/*-----
  ACTION: Logs an allowed edit in the "attempt" table (Code "ok", didAllow TRUE, no pattern)
  INPUT:
    $this->strIPAddr
    $iarArgs['diff']: value for the Diff column
    $iarArgs['edit-to-check']: value for the Edit column
  NOTE: address and session id are escaped with SafeParam() like the other string values.
*/
public function RecordOkEdit(array $iarArgs) {
    global $wgTitle, $wgServer;

    $txtDiff = $iarArgs['diff'];           // was $this->txtDiff
    $txtEdit = $iarArgs['edit-to-check'];  // was $this->txtEditChk

    $sqlPage = '"'.$this->dbSpam->SafeParam($wgTitle->getPrefixedText()).'"';
    $sqlSrvr = '"'.$this->dbSpam->SafeParam($wgServer).'"';
    $sqlEdit = '"'.$this->dbSpam->SafeParam($txtEdit).'"';
    $sqlDiff = '"'.$this->dbSpam->SafeParam($txtDiff).'"';
    $sqlAddr = '"'.$this->dbSpam->SafeParam($this->strIPAddr).'"';
    $sqlSess = SQL_Value($this->dbSpam->SafeParam(session_id()));

    $sql = 'INSERT INTO attempt (`When`,ID_Pattern,Addr_Client,IDS_Session,PageServer,PageName,Code,didAllow,Edit,Diff) VALUES '
      .'(NOW(),NULL,'
      .$sqlAddr.','
      .$sqlSess.','
      .$sqlSrvr.','
      .$sqlPage.',"ok",TRUE,'
      .$sqlEdit.','
      .$sqlDiff.')';
    $ok = $this->dbSpam->Exec($sql);
    if ($ok !== TRUE) {
        $this->AddErrorLine('SQL ['.$sql.'] in RecordOkEdit() generated this error: '.$ok);
    }
}
} // closes the enclosing class (its header is above this section)

/*-----
  ACTION: Stores the given error number and message in the globals $errNum and $errStr
  INPUT: $errno = error number, $errstr = error message
  NOTE(review): signature matches a set_error_handler() callback; presumably installed that way
    elsewhere in the file -- confirm. Returns nothing.
*/
function ErrorHandler ($errno ,$errstr) {
    $GLOBALS['errNum'] = $errno;
    $GLOBALS['errStr'] = $errstr;
}

/*-----
  ACTION: Formats a PHP value for direct inclusion in an SQL statement
  RETURNS:
    'NULL' if $iVar is NULL
    $iVar itself, unquoted, if is_numeric($iVar)
    otherwise $iVar wrapped in double quotes
  NOTE: no escaping is done here -- strings must already be safe for SQL when passed in.
*/
function SQL_Value($iVar) {
    if (is_null($iVar)) {
        return 'NULL';
    }
    if (is_numeric($iVar)) {
        return $iVar;
    }
    return '"'.$iVar.'"';
}
/*-----
  ACTION: Runs the external "diff" program on two texts and returns its output
  INPUT: $iTextOld, $iTextNew = the texts to compare
  RETURNS: whatever diff wrote to stdout; empty string if the texts could not be written
    to temporary files or diff could not be started
  NOTES:
    * Temporary files are created in session_save_path() and are always removed again,
      including when something fails partway through.
    * wfEscapeShellArg() is not defined in this section; presumably MediaWiki's shell-escaping helper.
*/
function FigureDiff($iTextOld, $iTextNew) {
    // this setting applies to all uses of a given filter DB
    // can change for different DBs
    $ksSpamFerretDiffOptions = '-i -E --suppress-common-lines ';

    # Make temporary files
//  $td = wfTempDir();
    $td = session_save_path();
    $fnOld = tempnam( $td, 'SpamFerret-old-' );
    $fnNew = tempnam( $td, 'SpamFerret-new-' );

    $result = '';
    $isReady = ($fnOld !== FALSE) && ($fnNew !== FALSE)
      && (file_put_contents( $fnOld, $iTextOld ) !== FALSE)
      && (file_put_contents( $fnNew, $iTextNew ) !== FALSE);

    if ($isReady) {
        $cmd = 'diff '.$ksSpamFerretDiffOptions.wfEscapeShellArg( $fnOld, $fnNew );
        $handle = popen( $cmd, 'r' );
        if ($handle !== FALSE) {
            // read until the pipe yields nothing more
            while ((($data = fread( $handle, 8192 )) !== FALSE) && ($data !== '')) {
                $result .= $data;
            }
            pclose( $handle );
        }
    }

    // clean up whichever temp files were actually created
    if ($fnNew !== FALSE) {
        unlink( $fnNew );
    }
    if ($fnOld !== FALSE) {
        unlink( $fnOld );
    }
    //return 'cmd=['.$cmd.'] diff=['.$result.']';
    return $result;
}

} // end of 'MEDIAWIKI' check