2026-1-6
This commit is contained in:
175
venv/Lib/site-packages/whoosh/lang/porter.py
Normal file
175
venv/Lib/site-packages/whoosh/lang/porter.py
Normal file
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Reimplementation of the
|
||||
`Porter stemming algorithm <http://tartarus.org/~martin/PorterStemmer/>`_
|
||||
in Python.
|
||||
|
||||
In my quick tests, this implementation about 3.5 times faster than the
|
||||
seriously weird Python linked from the official page.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Suffix replacement lists
|
||||
|
||||
_step2list = {
|
||||
"ational": "ate",
|
||||
"tional": "tion",
|
||||
"enci": "ence",
|
||||
"anci": "ance",
|
||||
"izer": "ize",
|
||||
"bli": "ble",
|
||||
"alli": "al",
|
||||
"entli": "ent",
|
||||
"eli": "e",
|
||||
"ousli": "ous",
|
||||
"ization": "ize",
|
||||
"ation": "ate",
|
||||
"ator": "ate",
|
||||
"alism": "al",
|
||||
"iveness": "ive",
|
||||
"fulness": "ful",
|
||||
"ousness": "ous",
|
||||
"aliti": "al",
|
||||
"iviti": "ive",
|
||||
"biliti": "ble",
|
||||
"logi": "log",
|
||||
}
|
||||
|
||||
_step3list = {
|
||||
"icate": "ic",
|
||||
"ative": "",
|
||||
"alize": "al",
|
||||
"iciti": "ic",
|
||||
"ical": "ic",
|
||||
"ful": "",
|
||||
"ness": "",
|
||||
}
|
||||
|
||||
|
||||
_cons = "[^aeiou]"
|
||||
_vowel = "[aeiouy]"
|
||||
_cons_seq = "[^aeiouy]+"
|
||||
_vowel_seq = "[aeiou]+"
|
||||
|
||||
# m > 0
|
||||
_mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq)
|
||||
# m == 0
|
||||
_meq1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$")
|
||||
# m > 1
|
||||
_mgr1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq)
|
||||
# vowel in stem
|
||||
_s_v = re.compile("^(" + _cons_seq + ")?" + _vowel)
|
||||
# ???
|
||||
_c_v = re.compile("^" + _cons_seq + _vowel + "[^aeiouwxy]$")
|
||||
|
||||
# Patterns used in the rules
|
||||
|
||||
_ed_ing = re.compile("^(.*)(ed|ing)$")
|
||||
_at_bl_iz = re.compile("(at|bl|iz)$")
|
||||
_step1b = re.compile("([^aeiouylsz])\\1$")
|
||||
_step2 = re.compile("^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$")
|
||||
_step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$")
|
||||
_step4_1 = re.compile("^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$")
|
||||
_step4_2 = re.compile("^(.+?)(s|t)(ion)$")
|
||||
_step5 = re.compile("^(.+?)e$")
|
||||
|
||||
|
||||
# Stemming function
|
||||
|
||||
def stem(w):
|
||||
"""Uses the Porter stemming algorithm to remove suffixes from English
|
||||
words.
|
||||
|
||||
>>> stem("fundamentally")
|
||||
"fundament"
|
||||
"""
|
||||
|
||||
if len(w) < 3:
|
||||
return w
|
||||
|
||||
first_is_y = w[0] == "y"
|
||||
if first_is_y:
|
||||
w = "Y" + w[1:]
|
||||
|
||||
# Step 1a
|
||||
if w.endswith("s"):
|
||||
if w.endswith("sses"):
|
||||
w = w[:-2]
|
||||
elif w.endswith("ies"):
|
||||
w = w[:-2]
|
||||
elif w[-2] != "s":
|
||||
w = w[:-1]
|
||||
|
||||
# Step 1b
|
||||
|
||||
if w.endswith("eed"):
|
||||
s = w[:-3]
|
||||
if _mgr0.match(s):
|
||||
w = w[:-1]
|
||||
else:
|
||||
m = _ed_ing.match(w)
|
||||
if m:
|
||||
stem = m.group(1)
|
||||
if _s_v.match(stem):
|
||||
w = stem
|
||||
if _at_bl_iz.match(w):
|
||||
w += "e"
|
||||
elif _step1b.match(w):
|
||||
w = w[:-1]
|
||||
elif _c_v.match(w):
|
||||
w += "e"
|
||||
|
||||
# Step 1c
|
||||
|
||||
if w.endswith("y"):
|
||||
stem = w[:-1]
|
||||
if _s_v.match(stem):
|
||||
w = stem + "i"
|
||||
|
||||
# Step 2
|
||||
|
||||
m = _step2.match(w)
|
||||
if m:
|
||||
stem = m.group(1)
|
||||
suffix = m.group(2)
|
||||
if _mgr0.match(stem):
|
||||
w = stem + _step2list[suffix]
|
||||
|
||||
# Step 3
|
||||
|
||||
m = _step3.match(w)
|
||||
if m:
|
||||
stem = m.group(1)
|
||||
suffix = m.group(2)
|
||||
if _mgr0.match(stem):
|
||||
w = stem + _step3list[suffix]
|
||||
|
||||
# Step 4
|
||||
|
||||
m = _step4_1.match(w)
|
||||
if m:
|
||||
stem = m.group(1)
|
||||
if _mgr1.match(stem):
|
||||
w = stem
|
||||
else:
|
||||
m = _step4_2.match(w)
|
||||
if m:
|
||||
stem = m.group(1) + m.group(2)
|
||||
if _mgr1.match(stem):
|
||||
w = stem
|
||||
|
||||
# Step 5
|
||||
|
||||
m = _step5.match(w)
|
||||
if m:
|
||||
stem = m.group(1)
|
||||
if _mgr1.match(stem) or (_meq1.match(stem) and not _c_v.match(stem)):
|
||||
w = stem
|
||||
|
||||
if w.endswith("ll") and _mgr1.match(w):
|
||||
w = w[:-1]
|
||||
|
||||
if first_is_y:
|
||||
w = "y" + w[1:]
|
||||
|
||||
return w
|
||||
Reference in New Issue
Block a user