#!/usr/bin/perl
# Author: Alan M. Makoev
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script creates rules for Spamassassin (http://spamassassin.apache.org)
# that should bounce out russian spam. The rules are generated for the set
# of tokens read from the $TOKENS_FILE. The tokens are expected to be in utf-8
# encoding.
# The is intended to be "byte-oriented" instead of "char-oriented".
no utf8;
use bytes;
use POSIX qw(setlocale LC_ALL LC_CTYPE);
# The perllocale.html states:
# "eq and ne are unaffected by locale: they always perform a char-by-char
# comparison of their scalar operands"
# "Regular expressions and case-modification functions (uc(), lc(), ucfirst(),
# and lcfirst()) use LC_CTYPE"
setlocale( LC_ALL, "C" );
setlocale( LC_TYPE, "C" );
# The following procedure is based on Convert::Cyrillic package by John
# Neystadt . Unfortunately, the procedure
# Convert::Cyrillic:cstocs did not work on my perl 5.8, because it uses
# UTF-8 for character representation and does not recognise cyrillic chars
# in single-byte encodings. Thus the "tr///" operator leaves original string
# intact.
@{char_tab{'UTF8'}} = ["А","Б","В","Г","Д","Е","Ё","Ж","З","И","Й","К","Л","М","Н","О","П","Р","С","Т","У","Ф","Х","Ц","Ч","Ш","Щ","Ъ","Ы","Ь","Э","Ю","Я","а","б","в","г","д","е","ё","ж","з","и","й","к","л","м","н","о","п","р","с","т","у","ф","х","ц","ч","ш","щ","ъ","ы","ь","э","ю","я"];
@{char_tab{'WIN'}} = ["�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�"];
@{char_tab{'DOS'}} = ["�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�"];
@{char_tab{'ISO'}} = ["�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�"];
@{char_tab{'KOI8'}} = ["�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�"];
@{char_tab{'MAC'}} = ["�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�","�"];
sub cyr_cstocs {
my ($SOURCECHARSET, $DESTINATIONCHARSET, $INBUFFER) = @_;
my $OUTBUFFER="";
my @READCHAR=();
$SOURCECHARSET= uc($SOURCECHARSET);
$SOURCECHARSET='KOI8' if ($SOURCECHARSET eq 'KOI') or ($SOURCECHARSET eq 'KOI8-R');
$SOURCECHARSET='UTF8' if ($SOURCECHARSET eq 'UTF') or ($SOURCECHARSET eq 'UTF-8');
$DESTINATIONCHARSET=uc($DESTINATIONCHARSET);
$DESTINATIONCHARSET='KOI8' if ($DESTINATIONCHARSET eq 'KOI') or ($DESTINATIONCHARSET eq 'KOI8-R');
$DESTINATIONCHARSET='UTF8' if ($DESTINATIONCHARSET eq 'UTF') or ($DESTINATIONCHARSET eq 'UTF-8');
if ($SOURCECHARSET eq 'UTF8')
{
my $CHARCOUNT=0;
$READCHAR[0]=bytes::substr($INBUFFER,0,1);
while ( defined ($READCHAR[0]) )
{
# Cyrillic letters in UTF take two bytes.
# Read two consequent bytes and check if they make a valid
# cyrillic letter
$CHARCOUNT++;
$READCHAR[1]=bytes::substr($INBUFFER,$CHARCOUNT,1);
if ((defined ($READCHAR[1])
&& ((bytes::ord($READCHAR[0])==208 && ((bytes::ord($READCHAR[1])>=144 && bytes::ord($READCHAR[1])<=191) || bytes::ord($READCHAR[1])==129))
||(bytes::ord($READCHAR[0])==209 && ((bytes::ord($READCHAR[1])>=128 && bytes::ord($READCHAR[1])<=143) || bytes::ord($READCHAR[1])==145)) )))
{
# The pair is a valid cyrillic UTF-8 letter - add corresponding
# letter in the destination charset to the output buffer;
$READCHAR[0].=$READCHAR[1];
for (my $CHARINDEX=0;$CHARINDEX<66;$CHARINDEX++)
{
if (${char_tab{'UTF8'}}[$CHARINDEX] eq $READCHAR[0])
{
$OUTBUFFER.=${char_tab{$DESTINATIONCHARSET}}[$CHARINDEX];
last;
}
}
$CHARCOUNT++;
$READCHAR[0]=bytes::substr($INBUFFER,$CHARCOUNT,1);
}
else
{
# The pair is not a valid cyrillic UTF-8 letter - copy the first
# byte to the output buffer "as is" and make shift in the bytes;
$OUTBUFFER.=$READCHAR[0];
$READCHAR[0]=$READCHAR[1];
}
}
}
else
{
# Other encodings (KOI8-R,WIN,DOS,MAC,ISO) all use one byte for
# each letter.
my $CHARCOUNT=0;
$READCHAR[0]=bytes::substr($INBUFFER,0,1);
while ( defined ($READCHAR[0]) )
{
for (my $CHARINDEX=0;$CHARINDEX<66;$CHARINDEX++)
{
if ($READCHAR[0] eq ${char_tab{$SOURCECHARSET}}[$CHARINDEX])
{
$OUTBUFFER.=${char_tab{$DESTINATIONCHARSET}}[$CHARINDEX];
last;
}
}
$CHARCOUNT++;
$READCHAR[0]=bytes::substr($INBUFFER,$CHARCOUNT,1);
}
}
$OUTBUFFER;
}
my $WORK_DIR="/root/sa_rus";
my $TOKENS_FILE=$WORK_DIR."/tokens.utf-8.txt";
my $SPAMASSASSIN_RULES_DIR="/root/sa_rus";
my $RULES_FILE_KOI8=$SPAMASSASSIN_RULES_DIR."/99_russian_koi8_re.cf";
my $RULES_FILE_WIN1251=$SPAMASSASSIN_RULES_DIR."/99_russian_win1251_re.cf";
my $RULES_FILE_UTF8=$SPAMASSASSIN_RULES_DIR."/99_russian_utf8_re.cf";
my $RULES_FILE_COMMON=$SPAMASSASSIN_RULES_DIR."/99_russian_common_re.cf";
# Define regexp patterns that will be used instead of ordinary letters to make
# the rules hit words with cyrillic and latin letters and digits mixed, and
# also to make sure that pattern will cover both lower and upper cases of
# russian letters with any locale set.
# The cyrillic letters used as hash indices are in UTF-8 charset
my %SUBST_SET;
$SUBST_SET{'а'}=["а","А","a","A","\x40"];
$SUBST_SET{'б'}=["б","Б","6"];
$SUBST_SET{'в'}=["в","В","B","8"];
$SUBST_SET{'г'}=["г","Г"];
$SUBST_SET{'д'}=["д","Д"];
$SUBST_SET{'е'}=["е","Е","e","E"];
$SUBST_SET{'ё'}=["ё","Ё","е","Е","e","E"];
$SUBST_SET{'ж'}=["ж","Ж"];
$SUBST_SET{'з'}=["з","З","3"];
$SUBST_SET{'и'}=["и","И","u","U"];
$SUBST_SET{'й'}=["й","Й","и","И","u","U"];
$SUBST_SET{'к'}=["к","К","k","K"];
$SUBST_SET{'л'}=["л","Л"];
$SUBST_SET{'м'}=["м","М","M"];
$SUBST_SET{'н'}=["н","Н","H"];
$SUBST_SET{'о'}=["о","О","o","O","0"];
$SUBST_SET{'п'}=["п","П","n"];
$SUBST_SET{'р'}=["р","Р","p","P"];
$SUBST_SET{'с'}=["с","С","c","C"];
$SUBST_SET{'т'}=["т","Т","T"];
$SUBST_SET{'у'}=["у","У","y","Y"];
$SUBST_SET{'ф'}=["ф","Ф"];
$SUBST_SET{'х'}=["х","Х","x","X"];
$SUBST_SET{'ц'}=["ц","Ц"];
$SUBST_SET{'ч'}=["ч","Ч","4"];
$SUBST_SET{'ш'}=["ш","Ш","щ","Щ","w","W"];
$SUBST_SET{'щ'}=["ш","Ш","щ","Щ","w","W"];
$SUBST_SET{'ъ'}=["ъ","Ъ"];
$SUBST_SET{'ы'}=["ы","Ы"]; # ,"b\|","Ь\|"
$SUBST_SET{'ь'}=["ь","Ь","b"];
$SUBST_SET{'э'}=["э","Э"];
$SUBST_SET{'ю'}=["ю","Ю"];
$SUBST_SET{'я'}=["я","Я"];
$SUBST_SET_SPECIAL{'<#DIGIT#>'}=["[[:digit:]]","о","О","o","O","I","l","з","З","ч","Ч","б","в","В"];
$SUBST_SET_SPECIAL{'<#0#>'}=["0","о","О","o","O"];
$SUBST_SET_SPECIAL{'<#1#>'}=["1","I","l"];
$SUBST_SET_SPECIAL{'<#2#>'}=["2"];
$SUBST_SET_SPECIAL{'<#3#>'}=["3","з","З"];
$SUBST_SET_SPECIAL{'<#4#>'}=["4","ч","Ч"];
$SUBST_SET_SPECIAL{'<#5#>'}=["5"];
$SUBST_SET_SPECIAL{'<#6#>'}=["6","б"];
$SUBST_SET_SPECIAL{'<#7#>'}=["7"];
$SUBST_SET_SPECIAL{'<#8#>'}=["8","в","В","B"];
$SUBST_SET_SPECIAL{'<#9#>'}=["9"];
#********************************
# Now create hash of strings to replace single letters with regexp
# patterns
my %CHAR_SUBST;
foreach my $SUBST_EXPR ( keys(%SUBST_SET) )
{
$CHAR_SUBST_UTF8{$SUBST_EXPR}="(";
$CHAR_SUBST_KOI8{$SUBST_EXPR}="(";
$CHAR_SUBST_WIN1251{$SUBST_EXPR}="(";
foreach my $SUBST_EXPR1 (@{$SUBST_SET{$SUBST_EXPR}})
{
if ( ord(bytes::substr($SUBST_EXPR1,0,1))<128 )
{
$CHAR_SUBST_UTF8{$SUBST_EXPR}.=($SUBST_EXPR1."|");
$CHAR_SUBST_KOI8{$SUBST_EXPR}.=($SUBST_EXPR1."|");
$CHAR_SUBST_WIN1251{$SUBST_EXPR}.=($SUBST_EXPR1."|");
}
else
{
$CHAR_SUBST_UTF8{$SUBST_EXPR}.="(\\x".unpack('H2',bytes::substr($SUBST_EXPR1,0,1))."\\x".unpack('H2',bytes::substr($SUBST_EXPR1,1,1)).")";
$CHAR_SUBST_UTF8{$SUBST_EXPR}.="|";
$SUBST_EXPR2=cyr_cstocs ("UTF-8", "KOI8", $SUBST_EXPR1);
$CHAR_SUBST_KOI8{$SUBST_EXPR}.="\\x".unpack('H2',$SUBST_EXPR2)."|";
$SUBST_EXPR2=cyr_cstocs ("UTF-8", "WIN", $SUBST_EXPR1);
$CHAR_SUBST_WIN1251{$SUBST_EXPR}.="\\x".unpack('H2',$SUBST_EXPR2)."|";
}
}
$CHAR_SUBST_UTF8{$SUBST_EXPR}=~s/\|$/\)\(\[\[\:blank\:\]\[\:punct\:\]\]\?\)/;
$CHAR_SUBST_KOI8{$SUBST_EXPR}=~s/\|$/\)\(\[\[\:blank\:\]\[\:punct\:\]\]\?\)/;
$CHAR_SUBST_WIN1251{$SUBST_EXPR}=~s/\|$/\)\(\[\[\:blank\:\]\[\:punct\:\]\]\?\)/;
}
# Now create hash of strings to replace spectal pseudo-patterns with regexp
# patterns
foreach my $SUBST_EXPR ( keys(%SUBST_SET_SPECIAL) )
{
$CHAR_SUBST_SPECIAL_UTF8{$SUBST_EXPR}="((";
$CHAR_SUBST_SPECIAL_KOI8{$SUBST_EXPR}="((";
$CHAR_SUBST_SPECIAL_WIN1251{$SUBST_EXPR}="((";
foreach my $SUBST_EXPR1 (@{$SUBST_SET_SPECIAL{$SUBST_EXPR}})
{
if ( ord(bytes::substr($SUBST_EXPR1,0,1))<128 )
{
$CHAR_SUBST_SPECIAL_UTF8{$SUBST_EXPR}.=($SUBST_EXPR1."|");
$CHAR_SUBST_SPECIAL_KOI8{$SUBST_EXPR}.=($SUBST_EXPR1."|");
$CHAR_SUBST_SPECIAL_WIN1251{$SUBST_EXPR}.=($SUBST_EXPR1."|");
}
else
{
$CHAR_SUBST_SPECIAL_UTF8{$SUBST_EXPR}.="(\\x".unpack('H2',bytes::substr($SUBST_EXPR1,0,1))."\\x".unpack('H2',bytes::substr($SUBST_EXPR1,1,1)).")";
$CHAR_SUBST_SPECIAL_UTF8{$SUBST_EXPR}.="|";
$SUBST_EXPR2=cyr_cstocs ("UTF-8", "KOI8", $SUBST_EXPR1);
$CHAR_SUBST_SPECIAL_KOI8{$SUBST_EXPR}.="\\x".unpack('H2',$SUBST_EXPR2)."|";
$SUBST_EXPR2=cyr_cstocs ("UTF-8", "WIN", $SUBST_EXPR1);
$CHAR_SUBST_SPECIAL_WIN1251{$SUBST_EXPR}.="\\x".unpack('H2',$SUBST_EXPR2)."|";
}
}
$CHAR_SUBST_SPECIAL_UTF8{$SUBST_EXPR}=~s/\|$/\)\(\[\[\:blank\:\]\[\:punct\:\]\]\*\)\)/;
$CHAR_SUBST_SPECIAL_KOI8{$SUBST_EXPR}=~s/\|$/\)\(\[\[\:blank\:\]\[\:punct\:\]\]\*\)\)/;
$CHAR_SUBST_SPECIAL_WIN1251{$SUBST_EXPR}=~s/\|$/\)\(\[\[\:blank\:\]\[\:punct\:\]\]\*\)\)/;
# print $SUBST_EXPR.":\n" ;
# print "UTF8: ".$CHAR_SUBST_SPECIAL_UTF8{$SUBST_EXPR}."\n";
# print "KOI8: ".$CHAR_SUBST_SPECIAL_KOI8{$SUBST_EXPR}."\n";
# print "WIN1251: ".$CHAR_SUBST_SPECIAL_WIN1251{$SUBST_EXPR}."\n";
}
# Now open the tokens list and build rules based on them
open(TOKENS,'<',$TOKENS_FILE) || die "Could not open file ".$TOKENS_FILE."\n";
open(RULES_KOI8,'>',$RULES_FILE_KOI8) || die "Could not open file ".$RULES_FILE_KOI8."\n";
open(RULES_WIN1251,'>',$RULES_FILE_WIN1251) || die "Could not open file ".$RULES_FILE_WIN1251."\n";
open(RULES_UTF8,'>',$RULES_FILE_UTF8) || die "Could not open file ".$RULES_FILE_UTF8."\n";
open(RULES_COMMON,'>',$RULES_FILE_COMMON) || die "Could not open file ".$RULES_FILE_COMMON."\n";
while ( my $TOKENS_LINE = )
{
if (( $TOKENS_LINE !~ /^#.*/ ) && ( $TOKENS_LINE !~ /^(\s)*(\n)?$/ ))
{
# The line is neither a comment nor empty - process it
chomp $TOKENS_LINE;
@PARTS=split(/\s+/,$TOKENS_LINE); # Split the line into fields and check
# if it is a tokens line (which consists
# of the rule name starting with "RU_"
# and the tokens string itself), a meta-rule
# definition (which starts with "meta"
# keyword), or a score definition (which
# starts with "score" keyword)
if ( scalar(@PARTS) != 2 )
{
# The tokens line must consist of rule name, folowed by whitespace, and
# the tokens string itself. There must be no whitespaces within the
# tokens string (only "\s" or [[:blank:]] regexp constructs)
# If a line is not a valid tokens line - it can be a score definition,
# a meta-rule definition, or a non-russian rule definition. In these
# cases it will be copied to the rules with charset name added where
# necessary. Otherwise it will be skipped with warning.
#
if ( ( $PARTS[0] eq "score" ) && ( $PARTS[2]=~/(\-)?[[:digit:]]+(\.)?[[:digit:]]*/ ) )
{
if ($PARTS[1]=~/^RU_.*/)
{
print RULES_KOI8 "score ".$PARTS[1]."_KOI8 ".$PARTS[2]."\n";
print RULES_WIN1251 "score ".$PARTS[1]."_WIN1251 ".$PARTS[2]."\n";
print RULES_UTF8 "score ".$PARTS[1]."_UTF8 ".$PARTS[2]."\n";
}
else
{
print RULES_COMMON "score ".$PARTS[1]." ".$PARTS[2]."\n";
}
}
elsif ( $PARTS[0] eq "meta" )
{
if ( $PARTS[1]=~/^(__)?RU_.*/ )
{
$RULES_LINE_KOI8="meta ".$PARTS[1]."_KOI8 ";
$RULES_LINE_WIN1251="meta ".$PARTS[1]."_WIN1251 ";
$RULES_LINE_UTF8="meta ".$PARTS[1]."_UTF8 ";
for (my $CHARINDEX=2;$CHARINDEX") and replace
# them with appropriate regexps
# Walk through the tokens line, find cyrillic letters and replace them
# with appropriate regexp
my $CHARCOUNT=0;
my $RULES_LINE_UTF8="";
my $RULES_LINE_KOI8="";
my $RULES_LINE_WIN1251="";
my @READCHAR=();
$READCHAR[0]=bytes::substr($PARTS[1],0,1);
while ( defined ($READCHAR[0]) )
{
# Cyrillic letters in UTF take two bytes.
# Read two consequent bytes and check if they make a valid
# cyrillic letter
$CHARCOUNT++;
$READCHAR[1]=bytes::substr($PARTS[1],$CHARCOUNT,1);
if ((defined ($READCHAR[1])
&&((bytes::ord($READCHAR[0])==208 && ((bytes::ord($READCHAR[1])>=144 && bytes::ord($READCHAR[1])<=191) || bytes::ord($READCHAR[1])==129))
||(bytes::ord($READCHAR[0])==209 && ((bytes::ord($READCHAR[1])>=128 && bytes::ord($READCHAR[1])<=143) || bytes::ord($READCHAR[1])==145)) )))
{
# The pair is a valid cyrillic UTF-8 letter - replace it with proper
# regexp. Only run over lower-case part of charset table
$READCHAR[0].=$READCHAR[1];
for (my $CHARINDEX=33;$CHARINDEX<66;$CHARINDEX++)
{
if ($READCHAR[0] eq ${char_tab{'UTF8'}}[$CHARINDEX])
{
$RULES_LINE_UTF8.=$CHAR_SUBST_UTF8{$READCHAR[0]};
$RULES_LINE_KOI8.=$CHAR_SUBST_KOI8{$READCHAR[0]};
$RULES_LINE_WIN1251.=$CHAR_SUBST_WIN1251{$READCHAR[0]};
last;
}
}
$CHARCOUNT++;
$READCHAR[0]=bytes::substr($PARTS[1],$CHARCOUNT,1);
}
else
{
# The pair is not a cyrillic letter - copy the first byte "as is"
# and shift second byte to first
$RULES_LINE_UTF8.=$READCHAR[0];
$RULES_LINE_KOI8.=$READCHAR[0];
$RULES_LINE_WIN1251.=$READCHAR[0];
$READCHAR[0]=$READCHAR[1];
}
} # while ( defined ($READCHAR[0]) )
# Now scan RULES_LINEs to find encoding-independent pseudo-patterns
# (such as <#DIGIT#>) to replace them with regexps
foreach my $SUBST_EXPR ( keys(%SUBST_SET_SPECIAL) )
{
$RULES_LINE_UTF8=~s/$SUBST_EXPR/$CHAR_SUBST_SPECIAL_UTF8{$SUBST_EXPR}/g;
$RULES_LINE_KOI8=~s/$SUBST_EXPR/$CHAR_SUBST_SPECIAL_KOI8{$SUBST_EXPR}/g;
$RULES_LINE_WIN1251=~s/$SUBST_EXPR/$CHAR_SUBST_SPECIAL_WIN1251{$SUBST_EXPR}/g;
}
# Print tokens in various encodings, so that one can read them and decide
# if he/she needs this rule
print RULES_KOI8 "# -------------------\n";
print RULES_KOI8 "# KOI8-R: \"".cyr_cstocs ("UTF8", "KOI8", $PARTS[1])."\" \n";
print RULES_KOI8 "# Win1251: \"".cyr_cstocs ("UTF8", "WIN", $PARTS[1])."\" \n";
print RULES_KOI8 "# UTF8: \"".$PARTS[1]."\" \n";
print RULES_WIN1251 "# -------------------\n";
print RULES_WIN1251 "# KOI8-R: \"".cyr_cstocs ("UTF8", "KOI8", $PARTS[1])."\" \n";
print RULES_WIN1251 "# Win1251: \"".cyr_cstocs ("UTF8", "WIN", $PARTS[1])."\" \n";
print RULES_WIN1251 "# UTF8: \"".$PARTS[1]."\" \n";
print RULES_UTF8 "# -------------------\n";
print RULES_UTF8 "# KOI8-R: \"".cyr_cstocs ("UTF8", "KOI8", $PARTS[1])."\" \n";
print RULES_UTF8 "# Win1251: \"".cyr_cstocs ("UTF8", "WIN", $PARTS[1])."\" \n";
print RULES_UTF8 "# UTF8: \"".$PARTS[1]."\" \n";
print RULES_KOI8 "body ".$PARTS[0]."_KOI8 /".$RULES_LINE_KOI8."/s\n";
print RULES_WIN1251 "body ".$PARTS[0]."_WIN1251 /".$RULES_LINE_WIN1251."/s\n";
print RULES_UTF8 "body ".$PARTS[0]."_UTF8 /".$RULES_LINE_UTF8."/s\n";
} # if ( scalar(@PARTS) != 2 )
} # if this is not a comment
} # loop over lines in tokens list
close TOKENS;
close RULES_KOI8;
close RULES_WIN1251;
close RULES_UTF8;
close RULES_COMMON;