# --------------------------------------------------------
# Stemmer for French by Benoit Favre
# --------------------------------------------------------
#
# Copyright (C) 2003 BENOIT FAVRE
#
# ..................................................................
#
# STEMMER FOR FRENCH is free software; you can redistribute it and/or modify
# it for EDUCATIONAL PURPOSES ONLY and under the terms of the
# GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# ..................................................................
#
# Contact :
# BENOIT FAVRE - LIA - UNIVERSITE D'AVIGNON
# AGROPARC BP1228 84911 AVIGNON CEDEX 09 FRANCE
# benoit.favre@univ-avignon.fr
# ..................................................................
#
# Version : 2003-07-02

#!/usr/bin/perl -w

# The pattern tables and stem() below contain accented literals. `use utf8`
# makes Perl read them as characters rather than as raw UTF-8 bytes; it is
# switched off again after stem() so the rest of the file is unaffected.
use utf8;

#$accent="[âàçëéêèïîôûù]";

# Character classes. The upper-case markers U, I and Y (see stem()) are
# deliberately absent from $vowel, so marked letters behave as consonants.
my $vowel="[aeiouyâàëéêèïîôûù]";
my $not_vowel="[^aeiouyâàëéêèïîôûù]";
my $separator="[ ]+";    # not used by stem(); kept for other code in this file

# Step 1 suffix groups, one variable per rule of the algorithm.
my $suffix1="(ances?|iqUes?|ismes?|ables?|istes?|eux)";
my $suffix2="(atrices?|ateurs?|ations?)";
my $suffix3="(logies?)";
my $suffix4="(u[st]ions?)";
my $suffix5="(ences?)";
my $suffix6="(ements?)";
my $suffix7="(ités?)";
my $suffix8="(ifs?|ives?)";
my $suffix9="(eaux)";
my $suffix10="(aux)";
my $suffix11="(euses?)";
my $suffix12="(issements?)";
my $suffix13="(amment)";
my $suffix14="(emment)";
my $suffix15="(ments?)";

# Step 2a: verb endings in i-, removed only after a non-vowel.
my $step_2a="(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)";

# Step 2b: other verb endings ("erons" and "ait" were missing originally).
my $step_2b_suffix1="(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)";
my $step_2b_suffix2="(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)";

# stem($word) - reduce one French word to its stem.
#
# Parameter:
#   $word  a single word without surrounding whitespace. It may be a decoded
#          character string, a UTF-8 octet string or a Latin-1 byte string.
#          undef is treated as the empty string.
#
# Returns:
#   the stem, lower-cased and with the accents handled at the end of the
#   routine replaced by their base letter. If the argument was a UTF-8 octet
#   string, the result is UTF-8 encoded as well.
#
# The three regions of the word (RV, R1, R2) are kept as start offsets and
# re-sliced from the current word on every test, so a region can never be out
# of date after a suffix has been removed or replaced.
#
# The ($) prototype is kept only so existing call sites parse as before.
sub stem($)
{
    # Scoped to this sub so that code elsewhere in the file is not affected.
    use strict;
    use warnings;

    my $word = shift;
    $word = '' unless defined $word;

    # Accept UTF-8 octets: decode them now and re-encode before returning.
    # utf8::decode() leaves the string untouched when it is not valid UTF-8,
    # which is how Latin-1 byte strings pass through.
    my $was_octets = !utf8::is_utf8($word) && utf8::decode($word);
    # Force character semantics so lc() also lower-cases accented capitals.
    utf8::upgrade($word);
    $word = lc $word;

    # mark special vowels: every u/i between two vowels, every y next to a
    # vowel and every u after q becomes upper case (i.e. a consonant).
    $word =~ s/(?<=${vowel})u(?=${vowel})/U/g;
    $word =~ s/(?<=${vowel})i(?=${vowel})/I/g;
    $word =~ s/(?<=${vowel})y|y(?=${vowel})/Y/g;
    $word =~ s/qu/qU/g;

    # find regions (start offsets; an offset equal to the length means the
    # region is empty)
    my $length   = length $word;
    my $rv_start = $length;
    my $r1_start = $length;
    my $r2_start = $length;

    # RV: after the third letter if the word starts with two vowels,
    # otherwise after the first vowel that is not the first letter.
    if ($word =~ /^${vowel}${vowel}./) {
        $rv_start = $+[0];
    }
    elsif ($word =~ /^${vowel}?${not_vowel}+${vowel}/) {
        $rv_start = $+[0];
    }
    # R1: after the first non-vowel that follows a vowel; R2: same inside R1.
    if ($word =~ /${vowel}${not_vowel}/) {
        $r1_start = $+[0];
        if (substr($word, $r1_start) =~ /${vowel}${not_vowel}/) {
            $r2_start = $r1_start + $+[0];
        }
    }

    # $find->($start, $pattern): the ending of the current word that matches
    # $pattern and lies entirely inside the region beginning at $start, or ''
    # when there is none.
    my $find = sub {
        my ($start, $pattern) = @_;
        return '' if $start >= length $word;
        return substr($word, $start) =~ /($pattern)\z/ ? $1 : '';
    };
    # $replace->($suffix, $with): swap the last length($suffix) characters of
    # the word for $with.
    my $replace = sub {
        my ($suffix, $with) = @_;
        substr($word, length($word) - length($suffix)) = $with;
        return;
    };

    my $altered   = 1;    # did the last step that ran change the word?
    my $run_step2 = 0;
    my $s;

    # step 1 : check suffixes
    if ($s = $find->($r2_start, $suffix1)) {
        $replace->($s, '');
    }
    elsif ($s = $find->($r2_start, $suffix2)) {
        $replace->($s, '');
        if ($word =~ /ic\z/) {
            $replace->('ic', $find->($r2_start, 'ic') ? '' : 'iqU');
        }
    }
    elsif ($s = $find->($r2_start, $suffix3)) {
        $replace->($s, 'log');
    }
    elsif ($s = $find->($r2_start, $suffix4)) {
        $replace->($s, 'u');
    }
    elsif ($s = $find->($r2_start, $suffix5)) {
        $replace->($s, 'ent');
    }
    # "issement" has to be tried before "ement", which is its own ending.
    # Only the suffix goes; the non-vowel in front of it is kept.
    elsif (($s = $find->($r1_start, $suffix12))
        && $word =~ /${not_vowel}\Q$s\E\z/)
    {
        $replace->($s, '');
    }
    elsif ($s = $find->($rv_start, $suffix6)) {
        $replace->($s, '');
        if ($find->($r2_start, 'iv')) {
            $replace->('iv', '');
            $replace->('at', '') if $find->($r2_start, 'at');
        }
        elsif ($word =~ /eus\z/) {
            if ($find->($r2_start, 'eus')) {
                $replace->('eus', '');
            }
            elsif ($find->($r1_start, 'eus')) {
                $replace->('eus', 'eux');
            }
        }
        elsif ($s = $find->($r2_start, 'abl|iqU')) {
            $replace->($s, '');
        }
        elsif ($s = $find->($rv_start, '[iI]èr')) {
            $replace->($s, 'i');
        }
    }
    elsif ($s = $find->($r2_start, $suffix7)) {
        $replace->($s, '');
        if ($word =~ /abil\z/) {
            $replace->('abil', $find->($r2_start, 'abil') ? '' : 'abl');
        }
        elsif ($word =~ /ic\z/) {
            $replace->('ic', $find->($r2_start, 'ic') ? '' : 'iqU');
        }
        elsif ($find->($r2_start, 'iv')) {
            $replace->('iv', '');
        }
    }
    elsif ($s = $find->($r2_start, $suffix8)) {
        $replace->($s, '');
        if ($find->($r2_start, 'at')) {
            $replace->('at', '');
            if ($word =~ /ic\z/) {
                $replace->('ic', $find->($r2_start, 'ic') ? '' : 'iqU');
            }
        }
    }
    elsif ($s = $find->(0, $suffix9)) {
        $replace->($s, 'eau');
    }
    elsif ($s = $find->($r1_start, $suffix10)) {
        $replace->($s, 'al');
    }
    elsif ($s = $find->($r2_start, $suffix11)) {
        $replace->($s, '');
    }
    elsif ($s = $find->($r1_start, $suffix11)) {
        $replace->($s, 'eux');
    }
    elsif ($s = $find->($rv_start, $suffix13)) {
        $replace->($s, 'ant');
        $run_step2 = 1;
    }
    elsif ($s = $find->($rv_start, $suffix14)) {
        $replace->($s, 'ent');
        $run_step2 = 1;
    }
    elsif ($s = $find->($rv_start, "${vowel}${suffix15}")) {
        # the match includes the vowel in front of "ment(s)"; keep that vowel
        $replace->(substr($s, 1), '');
        $run_step2 = 1;
    }
    else {
        $run_step2 = 1;
    }

    if ($run_step2) {
        # From here on only step 2 decides whether step 3 or step 4 follows.
        $altered = 1;
        # step 2a
        if ($s = $find->($rv_start, "${not_vowel}${step_2a}")) {
            # the match includes the non-vowel in front of the ending
            $replace->(substr($s, 1), '');
        }
        # step 2b : the longer endings first, the bare "ions" last
        elsif ($s = $find->($rv_start, $step_2b_suffix1)) {
            $replace->($s, '');
        }
        elsif ($s = $find->($rv_start, "e?${step_2b_suffix2}")) {
            $replace->($s, '');
        }
        elsif ($find->($r2_start, 'ions')) {
            $replace->('ions', '');
        }
        else {
            $altered = 0;
        }
    }

    if ($altered) {
        # step 3
        $word =~ s/Y\z/i/;
        $word =~ s/ç\z/c/;
    }
    else {
        # step 4
        $word =~ s/([^aiouès])s\z/$1/;
        if ($find->($rv_start, '[st]ion') && $find->($r2_start, 'ion')) {
            $replace->('ion', '');
        }
        elsif ($s = $find->($rv_start, 'ier|ière|Ier|Ière')) {
            $replace->($s, 'i');
        }
        elsif ($find->($rv_start, 'e')) {
            $replace->('e', '');
        }
        elsif ($find->($rv_start, 'guë')) {
            $replace->('ë', '');
        }
    }

    # step 5 : undouble the final consonant
    if ($word =~ /(?:enn|onn|ett|ell|eill)\z/) {
        chop($word);
    }

    # step 6
    $word =~ s/[éè](${not_vowel}+)\z/e$1/;

    # turn the U, I and Y markers back into ordinary letters
    $word = lc($word);

    # remove accents
    $word =~ s/[éèêë]/e/g;
    $word =~ s/[àâä]/a/g;
    $word =~ s/[ç]/c/g;
    $word =~ s/[ùûü]/u/g;
    $word =~ s/[ïî]/i/g;
    $word =~ s/[ôö]/o/g;
    $word =~ s/ÿ/y/g;

    utf8::encode($word) if $was_octets;
    return $word;
}

# End of the UTF-8 literal section (see `use utf8` above).
no utf8;