2026-1-6
This commit is contained in:
343
venv/Lib/site-packages/whoosh/spelling.py
Normal file
343
venv/Lib/site-packages/whoosh/spelling.py
Normal file
@@ -0,0 +1,343 @@
|
||||
# Copyright 2007 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
"""
|
||||
This module contains helper functions for correcting typos in user queries.
|
||||
"""
|
||||
|
||||
from bisect import bisect_left
|
||||
from heapq import heappush, heapreplace
|
||||
|
||||
from whoosh import highlight
|
||||
from whoosh.compat import iteritems, xrange
|
||||
|
||||
|
||||
# Corrector objects
|
||||
|
||||
class Corrector(object):
|
||||
"""
|
||||
Base class for spelling correction objects. Concrete sub-classes should
|
||||
implement the ``_suggestions`` method.
|
||||
"""
|
||||
|
||||
def suggest(self, text, limit=5, maxdist=2, prefix=0):
|
||||
"""
|
||||
:param text: the text to check. This word will **not** be added to the
|
||||
suggestions, even if it appears in the word graph.
|
||||
:param limit: only return up to this many suggestions. If there are not
|
||||
enough terms in the field within ``maxdist`` of the given word, the
|
||||
returned list will be shorter than this number.
|
||||
:param maxdist: the largest edit distance from the given word to look
|
||||
at. Values higher than 2 are not very effective or efficient.
|
||||
:param prefix: require suggestions to share a prefix of this length
|
||||
with the given word. This is often justifiable since most
|
||||
misspellings do not involve the first letter of the word. Using a
|
||||
prefix dramatically decreases the time it takes to generate the
|
||||
list of words.
|
||||
"""
|
||||
|
||||
_suggestions = self._suggestions
|
||||
|
||||
heap = []
|
||||
for item in _suggestions(text, maxdist, prefix):
|
||||
# Note that the *higher* scores (item[0]) are better!
|
||||
if len(heap) < limit:
|
||||
heappush(heap, item)
|
||||
elif item > heap[0]:
|
||||
heapreplace(heap, item)
|
||||
|
||||
sugs = sorted(heap, key=lambda x: (0 - x[0], x[1]))
|
||||
return [sug for _, sug in sugs]
|
||||
|
||||
def _suggestions(self, text, maxdist, prefix):
|
||||
"""
|
||||
Low-level method that yields a series of (score, "suggestion")
|
||||
tuples.
|
||||
|
||||
:param text: the text to check.
|
||||
:param maxdist: the maximum edit distance.
|
||||
:param prefix: require suggestions to share a prefix of this length
|
||||
with the given word.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class ReaderCorrector(Corrector):
|
||||
"""
|
||||
Suggests corrections based on the content of a field in a reader.
|
||||
|
||||
Ranks suggestions by the edit distance, then by highest to lowest
|
||||
frequency.
|
||||
"""
|
||||
|
||||
def __init__(self, reader, fieldname, fieldobj):
|
||||
self.reader = reader
|
||||
self.fieldname = fieldname
|
||||
self.fieldobj = fieldobj
|
||||
|
||||
def _suggestions(self, text, maxdist, prefix):
|
||||
reader = self.reader
|
||||
freq = reader.frequency
|
||||
|
||||
fieldname = self.fieldname
|
||||
fieldobj = reader.schema[fieldname]
|
||||
sugfield = fieldobj.spelling_fieldname(fieldname)
|
||||
|
||||
for sug in reader.terms_within(sugfield, text, maxdist, prefix=prefix):
|
||||
# Higher scores are better, so negate the distance and frequency
|
||||
f = freq(fieldname, sug) or 1
|
||||
score = 0 - (maxdist + (1.0 / f * 0.5))
|
||||
yield (score, sug)
|
||||
|
||||
|
||||
class ListCorrector(Corrector):
|
||||
"""
|
||||
Suggests corrections based on the content of a sorted list of strings.
|
||||
"""
|
||||
|
||||
def __init__(self, wordlist):
|
||||
self.wordlist = wordlist
|
||||
|
||||
def _suggestions(self, text, maxdist, prefix):
|
||||
from whoosh.automata.lev import levenshtein_automaton
|
||||
from whoosh.automata.fsa import find_all_matches
|
||||
|
||||
seen = set()
|
||||
for i in xrange(1, maxdist + 1):
|
||||
dfa = levenshtein_automaton(text, maxdist, prefix).to_dfa()
|
||||
sk = self.Skipper(self.wordlist)
|
||||
for sug in find_all_matches(dfa, sk):
|
||||
if sug not in seen:
|
||||
seen.add(sug)
|
||||
yield (0 - maxdist), sug
|
||||
|
||||
class Skipper(object):
|
||||
def __init__(self, data):
|
||||
self.data = data
|
||||
self.i = 0
|
||||
|
||||
def __call__(self, w):
|
||||
if self.data[self.i] == w:
|
||||
return w
|
||||
self.i += 1
|
||||
pos = bisect_left(self.data, w, self.i)
|
||||
if pos < len(self.data):
|
||||
return self.data[pos]
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
class MultiCorrector(Corrector):
|
||||
"""
|
||||
Merges suggestions from a list of sub-correctors.
|
||||
"""
|
||||
|
||||
def __init__(self, correctors, op):
|
||||
self.correctors = correctors
|
||||
self.op = op
|
||||
|
||||
def _suggestions(self, text, maxdist, prefix):
|
||||
op = self.op
|
||||
seen = {}
|
||||
for corr in self.correctors:
|
||||
for score, sug in corr._suggestions(text, maxdist, prefix):
|
||||
if sug in seen:
|
||||
seen[sug] = op(seen[sug], score)
|
||||
else:
|
||||
seen[sug] = score
|
||||
return iteritems(seen)
|
||||
|
||||
|
||||
# Query correction
|
||||
|
||||
class Correction(object):
|
||||
"""
|
||||
Represents the corrected version of a user query string. Has the
|
||||
following attributes:
|
||||
|
||||
``query``
|
||||
The corrected :class:`whoosh.query.Query` object.
|
||||
``string``
|
||||
The corrected user query string.
|
||||
``original_query``
|
||||
The original :class:`whoosh.query.Query` object that was corrected.
|
||||
``original_string``
|
||||
The original user query string.
|
||||
``tokens``
|
||||
A list of token objects representing the corrected words.
|
||||
|
||||
You can also use the :meth:`Correction.format_string` method to reformat the
|
||||
corrected query string using a :class:`whoosh.highlight.Formatter` class.
|
||||
For example, to display the corrected query string as HTML with the
|
||||
changed words emphasized::
|
||||
|
||||
from whoosh import highlight
|
||||
|
||||
correction = mysearcher.correct_query(q, qstring)
|
||||
|
||||
hf = highlight.HtmlFormatter(classname="change")
|
||||
html = correction.format_string(hf)
|
||||
"""
|
||||
|
||||
def __init__(self, q, qstring, corr_q, tokens):
|
||||
self.original_query = q
|
||||
self.query = corr_q
|
||||
self.original_string = qstring
|
||||
self.tokens = tokens
|
||||
|
||||
if self.original_string:
|
||||
self.string = self.format_string(highlight.NullFormatter())
|
||||
else:
|
||||
self.string = ''
|
||||
|
||||
def __repr__(self):
|
||||
return "%s(%r, %r)" % (self.__class__.__name__, self.query,
|
||||
self.string)
|
||||
|
||||
def format_string(self, formatter):
|
||||
"""
|
||||
Highlights the corrected words in the original query string using the
|
||||
given :class:`~whoosh.highlight.Formatter`.
|
||||
|
||||
:param formatter: A :class:`whoosh.highlight.Formatter` instance.
|
||||
:return: the output of the formatter (usually a string).
|
||||
"""
|
||||
|
||||
if not self.original_string:
|
||||
return ''
|
||||
if isinstance(formatter, type):
|
||||
formatter = formatter()
|
||||
|
||||
fragment = highlight.Fragment(self.original_string, self.tokens)
|
||||
return formatter.format_fragment(fragment, replace=True)
|
||||
|
||||
|
||||
# QueryCorrector objects
|
||||
|
||||
class QueryCorrector(object):
|
||||
"""
|
||||
Base class for objects that correct words in a user query.
|
||||
"""
|
||||
|
||||
def __init__(self, fieldname):
|
||||
self.fieldname = fieldname
|
||||
|
||||
def correct_query(self, q, qstring):
|
||||
"""
|
||||
Returns a :class:`Correction` object representing the corrected
|
||||
form of the given query.
|
||||
|
||||
:param q: the original :class:`whoosh.query.Query` tree to be
|
||||
corrected.
|
||||
:param qstring: the original user query. This may be None if the
|
||||
original query string is not available, in which case the
|
||||
``Correction.string`` attribute will also be None.
|
||||
:rtype: :class:`Correction`
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def field(self):
|
||||
return self.fieldname
|
||||
|
||||
|
||||
class SimpleQueryCorrector(QueryCorrector):
|
||||
"""
|
||||
A simple query corrector based on a mapping of field names to
|
||||
:class:`Corrector` objects, and a list of ``("fieldname", "text")`` tuples
|
||||
to correct. And terms in the query that appear in list of term tuples are
|
||||
corrected using the appropriate corrector.
|
||||
"""
|
||||
|
||||
def __init__(self, correctors, terms, aliases=None, prefix=0, maxdist=2):
|
||||
"""
|
||||
:param correctors: a dictionary mapping field names to
|
||||
:class:`Corrector` objects.
|
||||
:param terms: a sequence of ``("fieldname", "text")`` tuples
|
||||
representing terms to be corrected.
|
||||
:param aliases: a dictionary mapping field names in the query to
|
||||
field names for spelling suggestions.
|
||||
:param prefix: suggested replacement words must share this number of
|
||||
initial characters with the original word. Increasing this even to
|
||||
just ``1`` can dramatically speed up suggestions, and may be
|
||||
justifiable since spellling mistakes rarely involve the first
|
||||
letter of a word.
|
||||
:param maxdist: the maximum number of "edits" (insertions, deletions,
|
||||
subsitutions, or transpositions of letters) allowed between the
|
||||
original word and any suggestion. Values higher than ``2`` may be
|
||||
slow.
|
||||
"""
|
||||
|
||||
self.correctors = correctors
|
||||
self.aliases = aliases or {}
|
||||
self.termset = frozenset(terms)
|
||||
self.prefix = prefix
|
||||
self.maxdist = maxdist
|
||||
|
||||
def correct_query(self, q, qstring):
|
||||
correctors = self.correctors
|
||||
aliases = self.aliases
|
||||
termset = self.termset
|
||||
prefix = self.prefix
|
||||
maxdist = self.maxdist
|
||||
|
||||
# A list of tokens that were changed by a corrector
|
||||
corrected_tokens = []
|
||||
|
||||
# The corrected query tree. We don't need to deepcopy the original
|
||||
# because we use Query.replace() to find-and-replace the corrected
|
||||
# words and it returns a copy of the query tree.
|
||||
corrected_q = q
|
||||
|
||||
# For every word in the original query...
|
||||
# Note we can't put these in a set, because we must preserve WHERE
|
||||
# in the query each token occured so we can format them later
|
||||
for token in q.all_tokens():
|
||||
fname = token.fieldname
|
||||
aname = aliases.get(fname, fname)
|
||||
|
||||
# If this is one of the words we're supposed to correct...
|
||||
if (fname, token.text) in termset:
|
||||
c = correctors[aname]
|
||||
sugs = c.suggest(token.text, prefix=prefix, maxdist=maxdist)
|
||||
if sugs:
|
||||
# This is a "simple" corrector, so we just pick the first
|
||||
# suggestion :/
|
||||
sug = sugs[0]
|
||||
|
||||
# Return a new copy of the original query with this word
|
||||
# replaced by the correction
|
||||
corrected_q = corrected_q.replace(token.fieldname,
|
||||
token.text, sug)
|
||||
# Add the token to the list of corrected tokens (for the
|
||||
# formatter to use later)
|
||||
token.original = token.text
|
||||
token.text = sug
|
||||
corrected_tokens.append(token)
|
||||
|
||||
return Correction(q, qstring, corrected_q, corrected_tokens)
|
||||
Reference in New Issue
Block a user