#     --------------------------------------------------------
#     Stemmer for french by Benoit Favre
#     --------------------------------------------------------
# 
#     Copyright (C) 2003 BENOIT FAVRE
# 
#     ..................................................................
# 
#     STEMMER FOR FRENCH is free software; you can redistribute it and/or modify
#     it for EDUCATIONAL PURPOSES ONLY and under the terms of the 
#     GNU General Public License as published by
#     the Free Software Foundation; either version 2 of the License, or
#     (at your option) any later version.
# 
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
# 
#     You should have received a copy of the GNU General Public License
#     along with this program; if not, write to the Free Software
#     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#     ..................................................................
# 
#     Contact : 
#               BENOIT FAVRE - LIA - UNIVERSITE D'AVIGNON
#               AGROPARC BP1228 84911  AVIGNON  CEDEX 09  FRANCE
#               benoit.favre@univ-avignon.fr 
#     ..................................................................
#
#     Version : 2003-07-02


#!/usr/bin/perl -w
# NOTE(review): the "#!" line above is not the first line of the file, so perl
# does not read the -w switch from it when the file is run directly - confirm
# whether warnings were meant to be enabled.

#$accent="[âàçëéêèïîôûù]";

# Character classes used by stem(): letters counted as vowels, and everything
# else.  The second class is the first one with a leading "^" inserted.
my $vowel     = '[aeiouyâàëéêèïîôûù]';
my $not_vowel = '[^' . substr($vowel, 1);

# Pattern for runs of spaces (not used by stem() itself).
my $separator = '[ ]+';

# Step 1 suffix groups.  Each one is a parenthesised alternation that stem()
# anchors at the end of the word or of one of its regions.
my $suffix1  = '(' . join('|', qw(ances? iqUes? ismes? ables? istes? eux)) . ')';
my $suffix2  = '(atrices?|ateurs?|ations?)';
my $suffix3  = '(logies?)';
my $suffix4  = '(u[st]ions?)';
my $suffix5  = '(ences?)';
my $suffix6  = '(ements?)';
my $suffix7  = '(ités?)';
my $suffix8  = '(ifs?|ives?)';
my $suffix9  = '(eaux)';
my $suffix10 = '(aux)';
my $suffix11 = '(euses?)';
my $suffix12 = '(issements?)';
my $suffix13 = '(amment)';
my $suffix14 = '(emment)';
my $suffix15 = '(ments?)';

# Step 2a : verb endings that begin with "i".
my $step_2a = '(' . join('|', qw(
	îmes ît îtes i ie ies ir ira irai iraIent irais irait iras irent irez
	iriez irions irons iront is issaIent issais issait issant issante
	issantes issants isse issent isses issez issiez issions issons it
)) . ')';

# Step 2b : remaining verb endings, in the two groups tested by stem().
my $step_2b_suffix1 = '(' . join('|', qw(
	é ée ées és èrent er era erai eraIent erais erait eras erez eriez
	erions eront ez iez
)) . ')';
my $step_2b_suffix2 = '(' . join('|', qw(
	âmes ât âtes a ai aIent ais ant ante antes ants as asse assent asses
	assiez assions
)) . ')';

# stem($word)
#
# Reduce a single French word to its stem.  The procedure follows the outline
# of the Snowball French stemming algorithm:
#   prelude  - lowercase the word, then upper-case the u/i/y that stand next
#              to vowels (and the u of "qu") so that the vowel classes no
#              longer match them
#   regions  - compute RV, R1 and R2, trailing parts of the word against which
#              the suffix rules are tested
#   step 1   - standard suffixes ($suffix1 .. $suffix15)
#   step 2a  - verb suffixes beginning with i ($step_2a)
#   step 2b  - other verb suffixes ($step_2b_suffix1, $step_2b_suffix2)
#   step 3   - final Y -> i and c-cedilla -> c, when the last step altered the word
#   step 4   - residual suffixes, when it did not
#   step 5   - drop the last letter of a final enn/onn/ett/ell/eill
#   step 6   - replace e-acute/e-grave before final non-vowels by a plain e
# The result is lowercased again and the listed accented letters are replaced
# by their unaccented equivalents.
#
# Parameter : the word to stem; an undefined value is handled like "".
# Returns   : the stem, as a lowercase string.
#
# The suffix tables and the $vowel / $not_vowel classes are file-level
# variables declared before this sub.
#
# NOTE(review): the accented literals in the patterns below are matched exactly
# as they are stored in this file; this presumably expects the source file and
# the input to share a single-byte encoding - confirm.
sub stem($)
{
	my $word=shift;
	# lc(undef) yields "", so make that explicit instead of relying on it
	$word="" unless defined $word;
	my $RV="";
	my $R1="";
	my $R2="";
	my $do_step_2a=0;
	my $word_not_altered=0;
	
	$word=lc($word);
	# mark special vowels
	# Every occurrence is marked (/g).  Lookarounds are used so that the
	# neighbouring vowels are not consumed and can still serve as context for
	# the next occurrence.
	$word=~s/(?<=${vowel})u(?=${vowel})/U/g;
	$word=~s/(?<=${vowel})i(?=${vowel})/I/g;
	$word=~s/(?<=${vowel})y/Y/g;
	$word=~s/y(?=${vowel})/Y/g;
	$word=~s/qu/qU/g;
	# find regions
	# RV : after the third letter when the word starts with two vowels,
	#      otherwise after the first vowel that follows a non-vowel
	if($word=~/^${vowel}${vowel}.(.*)/)
	{
		$RV=$1;
	}
	elsif($word=~/^${vowel}?${not_vowel}+${vowel}(.*)/)
	{
		$RV=$1;
	}
	# R1 : after the first non-vowel that follows a vowel
	if($word=~/${vowel}${not_vowel}(.*)/)
	{
		$R1=$1;
	}
	# R2 : same rule applied inside R1
	if($R1=~/${vowel}${not_vowel}(.*)/)
	{
		$R2=$1;
	}
	# step 1 : check suffixes
	if($R2=~/${suffix1}$/)
	{
		$word=~s/${suffix1}$//;
	}
	elsif($R2=~s/${suffix2}$//)
	{
		$word=~s/${suffix2}$//;
		# a preceding "ic" is deleted when in R2, otherwise rewritten to "iqU"
		if($word=~/ic$/)
		{
			if($R2=~/ic$/)
			{
				$word=~s/ic$//;
			}
			else
			{
				$word=~s/ic$/iqU/;
			}
		}
	}
	elsif($R2=~s/${suffix3}$/log/)
	{
		$word=~s/${suffix3}$/log/;
	}
	elsif($R2=~s/${suffix4}$/u/)
	{
		$word=~s/${suffix4}$/u/;
	}
	elsif($R2=~s/${suffix5}$/ent/)
	{
		$word=~s/${suffix5}$/ent/;
	}
	# "issement(s)" has to be tested before "ement(s)": every word ending in
	# the former also ends in the latter, so the rule could never fire when it
	# came later in the chain.  Only the suffix is removed; the non-vowel in
	# front of it is kept.
	elsif($R1=~/${suffix12}$/ && $word=~/${not_vowel}${suffix12}$/)
	{
		$word=~s/${suffix12}$//;
	}
	elsif($RV=~s/${suffix6}$//)
	{
		$word=~s/${suffix6}$//;
		# keep R1 and R2 in step with the shortened word so that the tests
		# below look at what now precedes the removed suffix
		$R1=~s/${suffix6}$//;
		$R2=~s/${suffix6}$//;
		if($R2=~s/iv$//)
		{
			$word=~s/iv$//;
			if($R2=~/at$/)
			{
				$word=~s/at$//;
			}
		}
		elsif($word=~/eus$/)
		{
			if($R2=~/eus$/)
			{
				$word=~s/eus$//;
			}
			elsif($R1=~/eus$/)
			{
				$word=~s/eus$/eux/;
			}
		}
		elsif($R2=~/(abl|iqU)$/)
		{
			$word=~s/(abl|iqU)$//;
		}
		elsif($RV=~/(ièr|Ièr)$/)
		{
			$word=~s/(ièr|Ièr)$//;
		}
	}
	elsif($R2=~s/${suffix7}$//)
	{
		$word=~s/${suffix7}$//;
		if($word=~/abil$/)
		{
			if($R2=~/abil$/)
			{
				$word=~s/abil$//;
			}
			else
			{
				$word=~s/abil$/abl/;
			}
		}
		elsif($word=~/ic$/)
		{
			if($R2=~/ic$/)
			{
				$word=~s/ic$//;
			}
			else
			{
				$word=~s/ic$/iqU/;
			}
		}
		elsif($R2=~s/iv$//)
		{
			$word=~s/iv$//;
		}
	}
	elsif($R2=~s/${suffix8}$//)
	{
		$word=~s/${suffix8}$//;
		if($R2=~s/at$//)
		{
			$word=~s/at$//;
			if($word=~/ic$/)
			{
				if($R2=~/ic$/)
				{
					$word=~s/ic$//;
				}
				else
				{
					$word=~s/ic$/iqU/;
				}
			}
		}
	}
	elsif($word=~s/${suffix9}$/eau/)
	{
	}
	elsif($R1=~/${suffix10}$/)
	{
		$word=~s/${suffix10}$/al/;
	}
	elsif($R2=~/${suffix11}$/)
	{
		$word=~s/${suffix11}$//;
	}
	elsif($R1=~/${suffix11}$/)
	{
		$word=~s/${suffix11}$/eux/;
	}
	# the three adverb rules below rewrite the word and still hand it over to
	# the verb-suffix steps
	elsif($RV=~/${suffix13}$/)
	{
		$word=~s/${suffix13}$/ant/;
		$do_step_2a=1;
	}
	elsif($RV=~/${suffix14}$/)
	{
		$word=~s/${suffix14}$/ent/;
		$do_step_2a=1;
	}
	elsif($RV=~/${vowel}${suffix15}$/)
	{
		$word=~s/${suffix15}$//;
		$do_step_2a=1;
	}
	else
	{
		# no step 1 suffix applied
		$do_step_2a=1;
		$word_not_altered=1;
	}
	# step 2a
	if($do_step_2a==1)
	{
		if($RV=~/${not_vowel}${step_2a}$/)
		{
			$word=~s/${step_2a}$//;
			# the word changed, so step 3 (not step 4) must follow
			$word_not_altered=0;
		}
		else
		{
			# step 2b
			if($R2=~/ions$/)
			{
				$word=~s/ions$//;
				$word_not_altered=0;
			}
			# NOTE(review): this group is tested against the whole word, not
			# against RV - confirm that this is intended
			elsif($word=~s/${step_2b_suffix1}$//)
			{
				$word_not_altered=0;
			}
			elsif($RV=~/e${step_2b_suffix2}$/ || $word=~/${step_2b_suffix2}$/)
			{
				# an "e" directly in front of the suffix goes with it
				$word=~s/e?${step_2b_suffix2}$//;
				$word_not_altered=0;
			}
			else
			{
				$word_not_altered=1;
			}
		}
	}
	if($word_not_altered==0)
	{
		# step 3
		$word=~s/Y$/i/;
		$word=~s/ç$/c/;
	}
	else
	{
		# step 4
		# drop a final s unless it follows one of a, i, o, u, è, s
		$word=~s/([^aiouès])s$/$1/;
		if($RV=~/[st]ion$/ && $R2=~/ion$/)
		{
			$word=~s/ion$//;
		}
		elsif($RV=~/(ier|ière|Ier|Ière)$/)
		{
			$word=~s/(ier|ière|Ier|Ière)$/i/;
		}
		elsif($RV=~/e$/)
		{
			$word=~s/e$//;
		}
		elsif($RV=~/guë$/)
		{
			$word=~s/ë$//;
		}
	}
	# step 5
	if($word=~/(enn|onn|ett|ell|eill)$/)
	{
		chop($word);
	}
	# step 6
	$word=~s/[éè](${not_vowel}+)$/e$1/;
	# turn the U/I/Y marks back into ordinary letters
	$word=lc($word);
	# remove accents
	$word=~s/[éèêë]/e/g;
	$word=~s/[àâä]/a/g;
	$word=~s/[ç]/c/g;
	$word=~s/[ùûü]/u/g;
	$word=~s/[ïî]/i/g;
	$word=~s/[ôö]/o/g;
	$word=~s/ÿ/y/g;

	return $word;
}