2026-1-6
This commit is contained in:
242
venv/Lib/site-packages/whoosh/lang/paicehusk.py
Normal file
242
venv/Lib/site-packages/whoosh/lang/paicehusk.py
Normal file
@@ -0,0 +1,242 @@
|
||||
"""This module contains an object that implements the Paice-Husk stemming
|
||||
algorithm.
|
||||
|
||||
If you just want to use the standard Paice-Husk stemming rules, use the
|
||||
module's ``stem()`` function::
|
||||
|
||||
stemmed_word = stem(word)
|
||||
|
||||
If you want to use a custom rule set, read the rules into a string where the
|
||||
rules are separated by newlines, and instantiate the object with the string,
|
||||
then use the object's stem method to stem words::
|
||||
|
||||
stemmer = PaiceHuskStemmer(my_rules_string)
|
||||
stemmed_word = stemmer.stem(word)
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class PaiceHuskStemmer(object):
    """Implements the Paice-Husk stemming algorithm.

    The stemmer is driven by a rule table. Each rule is one line of text of
    the form ``<ending><*?><num><append><.|>>`` where:

    * ``ending`` is the word ending the rule applies to, written reversed
      (``gni`` matches words ending in ``ing``);
    * an optional ``*`` restricts the rule to words that have not yet been
      changed by an earlier rule ("intact");
    * ``num`` is the number of trailing characters to remove (``0`` leaves
      the word as it is, i.e. the ending is protected);
    * ``append`` is an optional string added after the removal;
    * the final ``.`` stops stemming after the rule is applied, while ``>``
      lets stemming continue with the new stem.

    Anything following the rule on the same line is ignored.
    """

    rule_expr = re.compile(r"""
        ^(?P<ending>\w+)
        (?P<intact>[*]?)
        (?P<num>\d+)
        (?P<append>\w*)
        (?P<cont>[.>])
        """, re.UNICODE | re.VERBOSE)

    stem_expr = re.compile(r"^\w+", re.UNICODE)

    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        """
        # Maps the last character of a word to the rules that may apply to
        # it, in the order the rules appear in the table.
        self.rules = defaultdict(list)
        self.read_rules(ruletable)

    def read_rules(self, ruletable):
        """Parses ``ruletable`` and adds its rules to ``self.rules``.

        Blank lines are skipped.

        :param ruletable: a string containing the rule data, separated
            by newlines.
        :raises ValueError: if a non-blank line is not a valid rule.
        """
        rule_expr = self.rule_expr
        rules = self.rules

        for line in ruletable.split("\n"):
            line = line.strip()
            if not line:
                continue

            match = rule_expr.match(line)
            if not match:
                raise ValueError("Bad rule: %r" % line)

            # Endings are written reversed in the table; store them in
            # reading order so they can be used with str.endswith().
            ending = match.group("ending")[::-1]
            lastchar = ending[-1]
            intact = match.group("intact") == "*"
            num = int(match.group("num"))
            append = match.group("append")
            cont = match.group("cont") == ">"

            rules[lastchar].append((ending, intact, num, append, cont))

    def first_vowel(self, word):
        """Returns the index of the first vowel in ``word``.

        ``a``, ``e``, ``i``, ``o`` and ``u`` always count as vowels; ``y``
        counts as one only when it is not the first letter. Returns ``-1``
        if the word contains no vowel at all.
        """
        positions = [p for p in (word.find(v) for v in "aeiou") if p > -1]
        vp = min(positions) if positions else -1
        yp = word.find("y")
        if yp > 0 and (vp < 0 or yp < vp):
            return yp
        return vp

    def strip_prefix(self, word):
        """Returns ``word`` with the first matching metric/Latin prefix
        (``kilo``, ``micro``, ``milli``, ...) removed, or ``word`` unchanged
        if it starts with none of them.
        """
        for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
                       "nano", "pico", "pseudo"):
            if word.startswith(prefix):
                return word[len(prefix):]
        return word

    def stem(self, word):
        """Returns a stemmed version of the argument string.

        Only the leading run of word characters is stemmed. If ``word`` does
        not start with a word character it is returned unchanged.
        """
        rules = self.rules
        match = self.stem_expr.match(word)
        if not match:
            return word

        whole = match.group(0)
        # A word that consists of nothing but a prefix (e.g. "kilo") must not
        # be reduced to the empty string, so fall back to the whole word.
        stem = self.strip_prefix(whole) or whole

        is_intact = True
        continuing = True
        while continuing and stem:
            pfv = self.first_vowel(stem)
            if pfv < 0:
                # No vowel at all: no acceptable stem can be formed.
                break

            rulelist = rules.get(stem[-1])
            if not rulelist:
                break

            continuing = False
            for ending, intact, num, append, cont in rulelist:
                if not stem.endswith(ending):
                    continue
                if intact and not is_intact:
                    continue

                newlen = len(stem) - num + len(append)
                if ((pfv == 0 and newlen < 2)
                        or (pfv > 0 and newlen < 3)):
                    # If word starts with vowel, minimum stem length is 2.
                    # If word starts with consonant, minimum stem length is
                    # 3.
                    continue

                is_intact = False
                # Slice from the front: ``stem[:-num]`` would yield an empty
                # string for the "protect" rules, where num is 0.
                stem = stem[:len(stem) - num] + append
                continuing = cont
                break

        return stem
|
||||
|
||||
# The default rules for the Paice-Husk stemming algorithm.
#
# One rule per line, in the format parsed by PaiceHuskStemmer.rule_expr:
# the reversed ending, an optional "*" (apply only to an intact word), the
# number of characters to remove, an optional string to append, and finally
# "." (stop) or ">" (continue stemming). The text in braces is a
# human-readable gloss; the parser only matches the start of each line, so
# it is ignored. Rule order matters: for a given final letter the first
# applicable rule wins.

defaultrules = """
ai*2. { -ia > - if intact }
a*1. { -a > - if intact }
bb1. { -bb > -b }
city3s. { -ytic > -ys }
ci2> { -ic > - }
cn1t> { -nc > -nt }
dd1. { -dd > -d }
dei3y> { -ied > -y }
deec2ss. { -ceed > -cess }
dee1. { -eed > -ee }
de2> { -ed > - }
dooh4> { -hood > - }
e1> { -e > - }
feil1v. { -lief > -liev }
fi2> { -if > - }
gni3> { -ing > - }
gai3y. { -iag > -y }
ga2> { -ag > - }
gg1. { -gg > -g }
ht*2. { -th > - if intact }
hsiug5ct. { -guish > -ct }
hsi3> { -ish > - }
i*1. { -i > - if intact }
i1y> { -i > -y }
ji1d. { -ij > -id -- see nois4j> & vis3j> }
juf1s. { -fuj > -fus }
ju1d. { -uj > -ud }
jo1d. { -oj > -od }
jeh1r. { -hej > -her }
jrev1t. { -verj > -vert }
jsim2t. { -misj > -mit }
jn1d. { -nj > -nd }
j1s. { -j > -s }
lbaifi6. { -ifiabl > - }
lbai4y. { -iabl > -y }
lba3> { -abl > - }
lbi3. { -ibl > - }
lib2l> { -bil > -bl }
lc1. { -cl > c }
lufi4y. { -iful > -y }
luf3> { -ful > - }
lu2. { -ul > - }
lai3> { -ial > - }
lau3> { -ual > - }
la2> { -al > - }
ll1. { -ll > -l }
mui3. { -ium > - }
mu*2. { -um > - if intact }
msi3> { -ism > - }
mm1. { -mm > -m }
nois4j> { -sion > -j }
noix4ct. { -xion > -ct }
noi3> { -ion > - }
nai3> { -ian > - }
na2> { -an > - }
nee0. { protect -een }
ne2> { -en > - }
nn1. { -nn > -n }
pihs4> { -ship > - }
pp1. { -pp > -p }
re2> { -er > - }
rae0. { protect -ear }
ra2. { -ar > - }
ro2> { -or > - }
ru2> { -ur > - }
rr1. { -rr > -r }
rt1> { -tr > -t }
rei3y> { -ier > -y }
sei3y> { -ies > -y }
sis2. { -sis > -s }
si2> { -is > - }
ssen4> { -ness > - }
ss0. { protect -ss }
suo3> { -ous > - }
su*2. { -us > - if intact }
s*1> { -s > - if intact }
s0. { -s > -s }
tacilp4y. { -plicat > -ply }
ta2> { -at > - }
tnem4> { -ment > - }
tne3> { -ent > - }
tna3> { -ant > - }
tpir2b. { -ript > -rib }
tpro2b. { -orpt > -orb }
tcud1. { -duct > -duc }
tpmus2. { -sumpt > -sum }
tpec2iv. { -cept > -ceiv }
tulo2v. { -olut > -olv }
tsis0. { protect -sist }
tsi3> { -ist > - }
tt1. { -tt > -t }
uqi3. { -iqu > - }
ugo1. { -ogu > -og }
vis3j> { -siv > -j }
vie0. { protect -eiv }
vi2> { -iv > - }
ylb1> { -bly > -bl }
yli3y> { -ily > -y }
ylp0. { protect -ply }
yl2> { -ly > - }
ygo1. { -ogy > -og }
yhp1. { -phy > -ph }
ymo1. { -omy > -om }
ypo1. { -opy > -op }
yti3> { -ity > - }
yte3> { -ety > - }
ytl2. { -lty > -l }
yrtsi5. { -istry > - }
yra3> { -ary > - }
yro3> { -ory > - }
yfi3. { -ify > - }
ycn2t> { -ncy > -nt }
yca3> { -acy > - }
zi2> { -iz > - }
zy1s. { -yz > -ys }
"""
|
||||
|
||||
# Make the standard rules available as a module-level function
|
||||
|
||||
# ``stem`` is the bound ``stem`` method of one PaiceHuskStemmer instance built
# from ``defaultrules``, so ``stem(word)`` applies the default rule table.
stem = PaiceHuskStemmer(defaultrules).stem
|
||||
Reference in New Issue
Block a user