2026-01-06
69
venv/Lib/site-packages/whoosh/analysis/__init__.py
Normal file
@@ -0,0 +1,69 @@
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""Classes and functions for turning a piece of text into an indexable stream
of "tokens" (usually equivalent to words). There are three general classes
involved in analysis:

* Tokenizers are always at the start of the text processing pipeline. They
  take a string and yield Token objects (actually, the same token object over
  and over, for performance reasons) corresponding to the tokens (words) in
  the text.

  Every tokenizer is a callable that takes a string and returns an iterator
  of tokens.

* Filters take the tokens from the tokenizer and perform various
  transformations on them. For example, the LowercaseFilter converts all
  tokens to lowercase, which is usually necessary when indexing regular
  English text.

  Every filter is a callable that takes a token generator and returns a token
  generator.

* Analyzers are convenience functions/classes that "package up" a tokenizer
  and zero or more filters into a single unit. For example, the
  StandardAnalyzer combines a RegexTokenizer, LowercaseFilter, and StopFilter.

  Every analyzer is a callable that takes a string and returns a token
  iterator. (So Tokenizers can be used as Analyzers if you don't need any
  filtering.)

You can compose tokenizers and filters together using the ``|`` character::

    my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()

The first item must be a tokenizer and the rest must be filters (you can't put
a filter first or a tokenizer after the first item).
"""

from whoosh.analysis.acore import *
from whoosh.analysis.tokenizers import *
from whoosh.analysis.filters import *
from whoosh.analysis.morph import *
from whoosh.analysis.intraword import *
from whoosh.analysis.ngrams import *
from whoosh.analysis.analyzers import *
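
A quick sanity check of the composition rule described in the docstring above;
a minimal sketch, assuming the default English stop list (the sample sentence
is made up):

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

# Tokenizer first, then filters; any other order raises CompositionError
# (see CompositeAnalyzer in analyzers.py below).
my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
print([t.text for t in my_analyzer(u"The DATE of the meeting")])
# expected: ['date', 'meeting'] ("the" and "of" are stop words)
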
Binary file not shown.
156
venv/Lib/site-packages/whoosh/analysis/acore.py
Normal file
@@ -0,0 +1,156 @@
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.compat import iteritems


# Exceptions

class CompositionError(Exception):
    pass


# Utility functions

def unstopped(tokenstream):
    """Removes tokens from a token stream where token.stopped = True.
    """
    return (t for t in tokenstream if not t.stopped)


def entoken(textstream, positions=False, chars=False, start_pos=0,
            start_char=0, **kwargs):
    """Takes a sequence of unicode strings and yields a series of Token objects
    (actually the same Token object over and over, for performance reasons),
    with the attributes filled in with reasonable values (for example, if
    ``positions`` or ``chars`` is True, the function assumes each token was
    separated by one space).
    """

    pos = start_pos
    char = start_char
    t = Token(positions=positions, chars=chars, **kwargs)

    for text in textstream:
        t.text = text

        if positions:
            t.pos = pos
            pos += 1

        if chars:
            t.startchar = char
            char = char + len(text)
            t.endchar = char

        yield t


# Token object

class Token(object):
    """
    Represents a "token" (usually a word) extracted from the source text being
    indexed.

    See "Advanced analysis" in the user guide for more information.

    Because object instantiation in Python is slow, tokenizers should create
    ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
    each time.

    This trick means that consumers of tokens (i.e. filters) must never try to
    hold onto the token object between loop iterations, or convert the token
    generator into a list. Instead, save the attributes between iterations,
    not the object::

        def RemoveDuplicatesFilter(self, stream):
            # Removes duplicate words.
            lasttext = None
            for token in stream:
                # Only yield the token if its text doesn't
                # match the previous token.
                if lasttext != token.text:
                    yield token
                lasttext = token.text

    ...or, call token.copy() to get a copy of the token object.
    """

    def __init__(self, positions=False, chars=False, removestops=True, mode='',
                 **kwargs):
        """
        :param positions: Whether tokens should have the token position in the
            'pos' attribute.
        :param chars: Whether tokens should have character offsets in the
            'startchar' and 'endchar' attributes.
        :param removestops: whether to remove stop words from the stream (if
            the tokens pass through a stop filter).
        :param mode: contains a string describing the purpose for which the
            analyzer is being called, i.e. 'index' or 'query'.
        """

        self.positions = positions
        self.chars = chars
        self.stopped = False
        self.boost = 1.0
        self.removestops = removestops
        self.mode = mode
        self.__dict__.update(kwargs)

    def __repr__(self):
        parms = ", ".join("%s=%r" % (name, value)
                          for name, value in iteritems(self.__dict__))
        return "%s(%s)" % (self.__class__.__name__, parms)

    def copy(self):
        # This is faster than using the copy module
        return Token(**self.__dict__)


# Composition support

class Composable(object):
    is_morph = False

    def __or__(self, other):
        from whoosh.analysis.analyzers import CompositeAnalyzer

        if not isinstance(other, Composable):
            raise TypeError("%r is not composable with %r" % (self, other))
        return CompositeAnalyzer(self, other)

    def __repr__(self):
        attrs = ""
        if self.__dict__:
            attrs = ", ".join("%s=%r" % (key, value)
                              for key, value
                              in iteritems(self.__dict__))
        return self.__class__.__name__ + "(%s)" % attrs

    def has_morph(self):
        return self.is_morph
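
A minimal sketch of the token-reuse behavior documented in the Token docstring
above, driven through entoken() from this file (the word list is made up):

from whoosh.analysis.acore import entoken

# entoken yields ONE Token object over and over, so read the attributes
# inside the loop rather than holding on to the token itself. Note the
# character offsets are contiguous, exactly as computed in the code above.
for t in entoken([u"alfa", u"bravo"], positions=True, chars=True):
    print(t.text, t.pos, t.startchar, t.endchar)
# alfa 0 0 4
# bravo 1 4 9
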
296
venv/Lib/site-packages/whoosh/analysis/analyzers.py
Normal file
@@ -0,0 +1,296 @@
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.acore import Composable, CompositionError
from whoosh.analysis.tokenizers import Tokenizer
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.intraword import IntraWordFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.analysis.tokenizers import CommaSeparatedTokenizer
from whoosh.analysis.tokenizers import IDTokenizer
from whoosh.analysis.tokenizers import RegexTokenizer
from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer
from whoosh.lang.porter import stem


# Analyzers

class Analyzer(Composable):
    """Abstract base class for analyzers.
    """

    def __repr__(self):
        return "%s()" % self.__class__.__name__

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.__dict__ == other.__dict__)

    def __call__(self, value, **kwargs):
        raise NotImplementedError

    def clean(self):
        pass


class CompositeAnalyzer(Analyzer):
    def __init__(self, *composables):
        self.items = []

        for comp in composables:
            if isinstance(comp, CompositeAnalyzer):
                self.items.extend(comp.items)
            else:
                self.items.append(comp)

        # Tokenizers must start a chain, and then only filters after that
        # (because analyzers take a string and return a generator of tokens,
        # and filters take and return generators of tokens)
        for item in self.items[1:]:
            if isinstance(item, Tokenizer):
                raise CompositionError("Only one tokenizer allowed at the start"
                                       " of the analyzer: %r" % self.items)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__,
                           ", ".join(repr(item) for item in self.items))

    def __call__(self, value, no_morph=False, **kwargs):
        items = self.items
        # Start with tokenizer
        gen = items[0](value, **kwargs)
        # Run filters
        for item in items[1:]:
            if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
                gen = item(gen)
        return gen

    def __getitem__(self, item):
        return self.items.__getitem__(item)

    def __len__(self):
        return len(self.items)

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.items == other.items)

    def clean(self):
        for item in self.items:
            if hasattr(item, "clean"):
                item.clean()

    def has_morph(self):
        return any(item.is_morph for item in self.items)


# Functions that return composed analyzers

def IDAnalyzer(lowercase=False):
    """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    """

    tokenizer = IDTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


def KeywordAnalyzer(lowercase=False, commas=False):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: whether to lowercase the tokens.
    :param commas: if True, items are separated by commas rather than
        whitespace.
    """

    if commas:
        tokenizer = CommaSeparatedTokenizer()
    else:
        tokenizer = SpaceSeparatedTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
    """Deprecated, just use a RegexTokenizer directly.
    """

    return RegexTokenizer(expression=expression, gaps=gaps)


def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()


def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain


def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False, stemfn=stem,
                     ignore=None, cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)


def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
                  maxsize=None, gaps=True, splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return (RegexTokenizer(expression=expression, gaps=gaps)
            | IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
                              mergewords=mergewords, mergenums=mergenums)
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            )


def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords

    # Make the start of the chain
    chain = (RegexTokenizer(expression=expression, gaps=gaps)
             | LowercaseFilter())

    # Add a stop word filter
    try:
        chain = chain | StopFilter(lang=lang)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
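
Pulling the doctest examples above together into one runnable sketch:

from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer

ana = StandardAnalyzer()
print([t.text for t in ana(u"Testing is testing and testing")])
# ['testing', 'testing', 'testing']  ("is" and "and" are stopped)

stem_ana = StemmingAnalyzer()
print([t.text for t in stem_ana(u"Testing is testing and testing")])
# ['test', 'test', 'test']  (Porter stemming reduces "testing" to "test")
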
479
venv/Lib/site-packages/whoosh/analysis/filters.py
Normal file
@@ -0,0 +1,479 @@
# coding=utf-8

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from itertools import chain

from whoosh.compat import next, xrange
from whoosh.analysis.acore import Composable
from whoosh.util.text import rcompile


# Default list of stop words (words so common it's usually wasteful to index
# them). This list is used by the StopFilter class, which allows you to supply
# an optional list to override this one.

STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
                        'you', 'your'))


# Simple pattern for filtering URLs, may be useful

url_pattern = rcompile("""
(
    [A-Za-z+]+://          # URL protocol
    \\S+?                  # URL body
    (?=\\s|[.]\\s|$|[.]$)  # Stop at space/end, or a dot followed by space/end
) | (                      # or...
    \w+([:.]?\w+)*         # word characters, with opt. internal colons/dots
)
""", verbose=True)


# Filters

class Filter(Composable):
    """Base class for Filter objects. A Filter subclass must implement a
    filter() method that takes a single argument, which is an iterator of Token
    objects, and yields a series of Token objects in return.

    Filters that do morphological transformation of tokens (e.g. stemming)
    should set their ``is_morph`` attribute to True.
    """

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.__dict__ == other.__dict__)

    def __ne__(self, other):
        return not self == other

    def __call__(self, tokens):
        raise NotImplementedError


class PassFilter(Filter):
    """An identity filter: passes the tokens through untouched.
    """

    def __call__(self, tokens):
        return tokens


class LoggingFilter(Filter):
    """Prints the contents of every token that passes through as a debug
    log entry.
    """

    def __init__(self, logger=None):
        """
        :param logger: the logger to use. If omitted, the "whoosh.analysis"
            logger is used.
        """

        if logger is None:
            import logging
            logger = logging.getLogger("whoosh.analysis")
        self.logger = logger

    def __call__(self, tokens):
        logger = self.logger
        for t in tokens:
            logger.debug(repr(t))
            yield t


class MultiFilter(Filter):
    """Chooses one of two or more sub-filters based on the 'mode' attribute
    of the token stream.
    """

    default_filter = PassFilter()

    def __init__(self, **kwargs):
        """Use keyword arguments to associate mode attribute values with
        instantiated filters.

        >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False)
        >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False)
        >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query)

        This class expects that the value of the mode attribute is consistent
        among all tokens in a token stream.
        """
        self.filters = kwargs

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.filters == other.filters)

    def __call__(self, tokens):
        # Only selects on the first token
        t = next(tokens)
        filter = self.filters.get(t.mode, self.default_filter)
        return filter(chain([t], tokens))


class TeeFilter(Filter):
    """Interleaves the results of two or more filters (or filter chains).

    NOTE: because it needs to create copies of each token for each sub-filter,
    this filter is quite slow.

    >>> target = "ALFA BRAVO CHARLIE"
    >>> # In one branch, we'll lower-case the tokens
    >>> f1 = LowercaseFilter()
    >>> # In the other branch, we'll reverse the tokens
    >>> f2 = ReverseTextFilter()
    >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2)
    >>> [token.text for token in ana(target)]
    ["alfa", "AFLA", "bravo", "OVARB", "charlie", "EILRAHC"]

    To combine the incoming token stream with the output of a filter chain, use
    ``TeeFilter`` and make one of the filters a :class:`PassFilter`.

    >>> f1 = PassFilter()
    >>> f2 = BiWordFilter()
    >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) | LowercaseFilter()
    >>> [token.text for token in ana(target)]
    ["alfa", "alfa-bravo", "bravo", "bravo-charlie", "charlie"]
    """

    def __init__(self, *filters):
        if len(filters) < 2:
            raise Exception("TeeFilter requires two or more filters")
        self.filters = filters

    def __eq__(self, other):
        return (self.__class__ is other.__class__
                and self.filters == other.filters)

    def __call__(self, tokens):
        from itertools import tee

        count = len(self.filters)
        # Tee the token iterator and wrap each teed iterator with the
        # corresponding filter
        gens = [filter(t.copy() for t in gen) for filter, gen
                in zip(self.filters, tee(tokens, count))]
        # Keep a count of the number of running iterators
        running = count
        while running:
            for i, gen in enumerate(gens):
                if gen is not None:
                    try:
                        yield next(gen)
                    except StopIteration:
                        gens[i] = None
                        running -= 1


class ReverseTextFilter(Filter):
    """Reverses the text of each token.

    >>> ana = RegexTokenizer() | ReverseTextFilter()
    >>> [token.text for token in ana("hello there")]
    ["olleh", "ereht"]
    """

    def __call__(self, tokens):
        for t in tokens:
            t.text = t.text[::-1]
            yield t


class LowercaseFilter(Filter):
    """Uses unicode.lower() to lowercase token text.

    >>> rext = RegexTokenizer()
    >>> stream = rext("This is a TEST")
    >>> [token.text for token in LowercaseFilter()(stream)]
    ["this", "is", "a", "test"]
    """

    def __call__(self, tokens):
        for t in tokens:
            t.text = t.text.lower()
            yield t


class StripFilter(Filter):
    """Calls unicode.strip() on the token text.
    """

    def __call__(self, tokens):
        for t in tokens:
            t.text = t.text.strip()
            yield t


class StopFilter(Filter):
    """Marks "stop" words (words too common to index) in the stream (and by
    default removes them).

    Make sure you precede this filter with a :class:`LowercaseFilter`.

    >>> stopper = RegexTokenizer() | StopFilter()
    >>> [token.text for token in stopper(u"this is a test")]
    ["test"]
    >>> es_stopper = RegexTokenizer() | StopFilter(lang="es")
    >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")]
    ["lapiz", "mesa"]

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stopwords` to check if a given language
    has a stop word list available.
    """

    def __init__(self, stoplist=STOP_WORDS, minsize=2, maxsize=None,
                 renumber=True, lang=None):
        """
        :param stoplist: A collection of words to remove from the stream.
            This is converted to a frozenset. The default is a list of
            common English stop words.
        :param minsize: The minimum length of token texts. Tokens with
            text smaller than this will be stopped. The default is 2.
        :param maxsize: The maximum length of token texts. Tokens with text
            larger than this will be stopped. Use None to allow any length.
        :param renumber: Change the 'pos' attribute of unstopped tokens
            to reflect their position with the stopped words removed.
        :param lang: Automatically get a list of stop words for the given
            language
        """

        stops = set()
        if stoplist:
            stops.update(stoplist)
        if lang:
            from whoosh.lang import stopwords_for_language

            stops.update(stopwords_for_language(lang))

        self.stops = frozenset(stops)
        self.min = minsize
        self.max = maxsize
        self.renumber = renumber

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.stops == other.stops
                and self.min == other.min
                and self.renumber == other.renumber)

    def __call__(self, tokens):
        stoplist = self.stops
        minsize = self.min
        maxsize = self.max
        renumber = self.renumber

        pos = None
        for t in tokens:
            text = t.text
            if (len(text) >= minsize
                    and (maxsize is None or len(text) <= maxsize)
                    and text not in stoplist):
                # This is not a stop word
                if renumber and t.positions:
                    if pos is None:
                        pos = t.pos
                    else:
                        pos += 1
                        t.pos = pos
                t.stopped = False
                yield t
            else:
                # This is a stop word
                if not t.removestops:
                    # This IS a stop word, but we're not removing them
                    t.stopped = True
                    yield t


class CharsetFilter(Filter):
    """Translates the text of tokens by calling unicode.translate() using the
    supplied character mapping object. This is useful for case and accent
    folding.

    The ``whoosh.support.charset`` module has a useful map for accent folding.

    >>> from whoosh.support.charset import accent_map
    >>> retokenizer = RegexTokenizer()
    >>> chfilter = CharsetFilter(accent_map)
    >>> [t.text for t in chfilter(retokenizer(u'café'))]
    [u'cafe']

    Another way to get a character mapping object is to convert a Sphinx
    charset table file using
    :func:`whoosh.support.charset.charset_table_to_dict`.

    >>> from whoosh.support.charset import charset_table_to_dict
    >>> from whoosh.support.charset import default_charset
    >>> retokenizer = RegexTokenizer()
    >>> charmap = charset_table_to_dict(default_charset)
    >>> chfilter = CharsetFilter(charmap)
    >>> [t.text for t in chfilter(retokenizer(u'Stra\\xdfe'))]
    [u'strase']

    The Sphinx charset table format is described at
    http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
    """

    __inittypes__ = dict(charmap=dict)

    def __init__(self, charmap):
        """
        :param charmap: a dictionary mapping from integer character numbers to
            unicode characters, as required by the unicode.translate() method.
        """

        self.charmap = charmap

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.charmap == other.charmap)

    def __call__(self, tokens):
        assert hasattr(tokens, "__iter__")
        charmap = self.charmap
        for t in tokens:
            t.text = t.text.translate(charmap)
            yield t


class DelimitedAttributeFilter(Filter):
    """Looks for delimiter characters in the text of each token and stores the
    data after the delimiter in a named attribute on the token.

    The defaults are set up to use the ``^`` character as a delimiter and store
    the value after the ``^`` as the boost for the token.

    >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost")
    >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter()
    >>> for t in ana(u("image render^2 file^0.5")):
    ...     print("%r %f" % (t.text, t.boost))
    'image' 1.0
    'render' 2.0
    'file' 0.5

    Note that you need to make sure your tokenizer includes the delimiter and
    data as part of the token!
    """

    def __init__(self, delimiter="^", attribute="boost", default=1.0,
                 type=float):
        """
        :param delimiter: a string that, when present in a token's text,
            separates the actual text from the "data" payload.
        :param attribute: the name of the attribute in which to store the
            data on the token.
        :param default: the value to use for the attribute for tokens that
            don't have delimited data.
        :param type: the type of the data, for example ``str`` or ``float``.
            This is used to convert the string value of the data before
            storing it in the attribute.
        """

        self.delim = delimiter
        self.attr = attribute
        self.default = default
        self.type = type

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.delim == other.delim
                and self.attr == other.attr
                and self.default == other.default)

    def __call__(self, tokens):
        delim = self.delim
        attr = self.attr
        default = self.default
        type_ = self.type

        for t in tokens:
            text = t.text
            pos = text.find(delim)
            if pos > -1:
                setattr(t, attr, type_(text[pos + 1:]))
                if t.chars:
                    t.endchar -= len(t.text) - pos
                t.text = text[:pos]
            else:
                setattr(t, attr, default)

            yield t


class SubstitutionFilter(Filter):
    """Performs a regular expression substitution on the token text.

    This is especially useful for removing text from tokens, for example
    hyphens::

        ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "")

    Because it has the full power of the re.sub() method behind it, this filter
    can perform some fairly complex transformations. For example, to take
    tokens like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c',
    'f=e'``::

        # Analyzer that swaps the text on either side of an equal sign
        rt = RegexTokenizer(r"\\S+")
        sf = SubstitutionFilter("([^=]*)=(.*)", r"\\2=\\1")
        ana = rt | sf
    """

    def __init__(self, pattern, replacement):
        """
        :param pattern: a pattern string or compiled regular expression object
            describing the text to replace.
        :param replacement: the substitution text.
        """

        self.pattern = rcompile(pattern)
        self.replacement = replacement

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.pattern == other.pattern
                and self.replacement == other.replacement)

    def __call__(self, tokens):
        pattern = self.pattern
        replacement = self.replacement

        for t in tokens:
            t.text = pattern.sub(replacement, t.text)
            yield t
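
A short sketch combining two of the filters above: DelimitedAttributeFilter to
read per-token boosts, then SubstitutionFilter to strip hyphens (the input
string is made up; the behavior follows the docstrings above):

from whoosh.analysis import RegexTokenizer
from whoosh.analysis.filters import (DelimitedAttributeFilter,
                                     SubstitutionFilter)

# The tokenizer must keep the "^payload" text in the token, hence r"\S+"
ana = (RegexTokenizer(r"\S+")
       | DelimitedAttributeFilter(delimiter="^", attribute="boost")
       | SubstitutionFilter("-", ""))
for t in ana(u"wi-fi^2 router"):
    print(t.text, t.boost)
# "wi-fi^2" becomes text "wifi" with boost 2.0; "router" keeps the default 1.0
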
494
venv/Lib/site-packages/whoosh/analysis/intraword.py
Normal file
@@ -0,0 +1,494 @@
|
||||
# Copyright 2007 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
import re
|
||||
from collections import deque
|
||||
|
||||
from whoosh.compat import u, text_type
|
||||
from whoosh.compat import xrange
|
||||
from whoosh.analysis.filters import Filter
|
||||
|
||||
|
||||
class CompoundWordFilter(Filter):
|
||||
"""Given a set of words (or any object with a ``__contains__`` method),
|
||||
break any tokens in the stream that are composites of words in the word set
|
||||
into their individual parts.
|
||||
|
||||
Given the correct set of words, this filter can break apart run-together
|
||||
words and trademarks (e.g. "turbosquid", "applescript"). It can also be
|
||||
useful for agglutinative languages such as German.
|
||||
|
||||
The ``keep_compound`` argument lets you decide whether to keep the
|
||||
compound word in the token stream along with the word segments.
|
||||
|
||||
>>> cwf = CompoundWordFilter(wordset, keep_compound=True)
|
||||
>>> analyzer = RegexTokenizer(r"\S+") | cwf
|
||||
>>> [t.text for t in analyzer("I do not like greeneggs and ham")
|
||||
["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"]
|
||||
>>> cwf.keep_compound = False
|
||||
>>> [t.text for t in analyzer("I do not like greeneggs and ham")
|
||||
["I", "do", "not", "like", "green", "eggs", "and", "ham"]
|
||||
"""
|
||||
|
||||
def __init__(self, wordset, keep_compound=True):
|
||||
"""
|
||||
:param wordset: an object with a ``__contains__`` method, such as a
|
||||
set, containing strings to look for inside the tokens.
|
||||
:param keep_compound: if True (the default), the original compound
|
||||
token will be retained in the stream before the subwords.
|
||||
"""
|
||||
|
||||
self.wordset = wordset
|
||||
self.keep_compound = keep_compound
|
||||
|
||||
def subwords(self, s, memo):
|
||||
if s in self.wordset:
|
||||
return [s]
|
||||
if s in memo:
|
||||
return memo[s]
|
||||
|
||||
for i in xrange(1, len(s)):
|
||||
prefix = s[:i]
|
||||
if prefix in self.wordset:
|
||||
suffix = s[i:]
|
||||
suffix_subs = self.subwords(suffix, memo)
|
||||
if suffix_subs:
|
||||
result = [prefix] + suffix_subs
|
||||
memo[s] = result
|
||||
return result
|
||||
|
||||
return None
|
||||
|
||||
def __call__(self, tokens):
|
||||
keep_compound = self.keep_compound
|
||||
memo = {}
|
||||
subwords = self.subwords
|
||||
for t in tokens:
|
||||
subs = subwords(t.text, memo)
|
||||
if subs:
|
||||
if len(subs) > 1 and keep_compound:
|
||||
yield t
|
||||
for subword in subs:
|
||||
t.text = subword
|
||||
yield t
|
||||
else:
|
||||
yield t
|
||||
|
||||
|
||||
class BiWordFilter(Filter):
|
||||
"""Merges adjacent tokens into "bi-word" tokens, so that for example::
|
||||
|
||||
"the", "sign", "of", "four"
|
||||
|
||||
becomes::
|
||||
|
||||
"the-sign", "sign-of", "of-four"
|
||||
|
||||
This can be used to create fields for pseudo-phrase searching, where if
|
||||
all the terms match the document probably contains the phrase, but the
|
||||
searching is faster than actually doing a phrase search on individual word
|
||||
terms.
|
||||
|
||||
The ``BiWordFilter`` is much faster than using the otherwise equivalent
|
||||
``ShingleFilter(2)``.
|
||||
"""
|
||||
|
||||
def __init__(self, sep="-"):
|
||||
self.sep = sep
|
||||
|
||||
def __call__(self, tokens):
|
||||
sep = self.sep
|
||||
prev_text = None
|
||||
prev_startchar = None
|
||||
prev_pos = None
|
||||
atleastone = False
|
||||
|
||||
for token in tokens:
|
||||
# Save the original text of this token
|
||||
text = token.text
|
||||
|
||||
# Save the original position
|
||||
positions = token.positions
|
||||
if positions:
|
||||
ps = token.pos
|
||||
|
||||
# Save the original start char
|
||||
chars = token.chars
|
||||
if chars:
|
||||
sc = token.startchar
|
||||
|
||||
if prev_text is not None:
|
||||
# Use the pos and startchar from the previous token
|
||||
if positions:
|
||||
token.pos = prev_pos
|
||||
if chars:
|
||||
token.startchar = prev_startchar
|
||||
|
||||
# Join the previous token text and the current token text to
|
||||
# form the biword token
|
||||
token.text = "".join((prev_text, sep, text))
|
||||
yield token
|
||||
atleastone = True
|
||||
|
||||
# Save the originals and the new "previous" values
|
||||
prev_text = text
|
||||
if chars:
|
||||
prev_startchar = sc
|
||||
if positions:
|
||||
prev_pos = ps
|
||||
|
||||
# If no bi-words were emitted, that is, the token stream only had
|
||||
# a single token, then emit that single token.
|
||||
if not atleastone:
|
||||
yield token
|
||||
|
||||
|
||||
class ShingleFilter(Filter):
|
||||
"""Merges a certain number of adjacent tokens into multi-word tokens, so
|
||||
that for example::
|
||||
|
||||
"better", "a", "witty", "fool", "than", "a", "foolish", "wit"
|
||||
|
||||
with ``ShingleFilter(3, ' ')`` becomes::
|
||||
|
||||
'better a witty', 'a witty fool', 'witty fool than', 'fool than a',
|
||||
'than a foolish', 'a foolish wit'
|
||||
|
||||
This can be used to create fields for pseudo-phrase searching, where if
|
||||
all the terms match the document probably contains the phrase, but the
|
||||
searching is faster than actually doing a phrase search on individual word
|
||||
terms.
|
||||
|
||||
If you're using two-word shingles, you should use the functionally
|
||||
equivalent ``BiWordFilter`` instead because it's faster than
|
||||
``ShingleFilter``.
|
||||
"""
|
||||
|
||||
def __init__(self, size=2, sep="-"):
|
||||
self.size = size
|
||||
self.sep = sep
|
||||
|
||||
def __call__(self, tokens):
|
||||
size = self.size
|
||||
sep = self.sep
|
||||
buf = deque()
|
||||
atleastone = False
|
||||
|
||||
def make_token():
|
||||
tk = buf[0]
|
||||
tk.text = sep.join([t.text for t in buf])
|
||||
if tk.chars:
|
||||
tk.endchar = buf[-1].endchar
|
||||
return tk
|
||||
|
||||
for token in tokens:
|
||||
if not token.stopped:
|
||||
buf.append(token.copy())
|
||||
if len(buf) == size:
|
||||
atleastone = True
|
||||
yield make_token()
|
||||
buf.popleft()
|
||||
|
||||
# If no shingles were emitted, that is, the token stream had fewer than
|
||||
# 'size' tokens, then emit a single token with whatever tokens there
|
||||
# were
|
||||
if not atleastone and buf:
|
||||
yield make_token()
|
||||
|
||||
|
||||
class IntraWordFilter(Filter):
|
||||
"""Splits words into subwords and performs optional transformations on
|
||||
subword groups. This filter is funtionally based on yonik's
|
||||
WordDelimiterFilter in Solr, but shares no code with it.
|
||||
|
||||
* Split on intra-word delimiters, e.g. `Wi-Fi` -> `Wi`, `Fi`.
|
||||
* When splitwords=True, split on case transitions,
|
||||
e.g. `PowerShot` -> `Power`, `Shot`.
|
||||
* When splitnums=True, split on letter-number transitions,
|
||||
e.g. `SD500` -> `SD`, `500`.
|
||||
* Leading and trailing delimiter characters are ignored.
|
||||
* Trailing possesive "'s" removed from subwords,
|
||||
e.g. `O'Neil's` -> `O`, `Neil`.
|
||||
|
||||
The mergewords and mergenums arguments turn on merging of subwords.
|
||||
|
||||
When the merge arguments are false, subwords are not merged.
|
||||
|
||||
* `PowerShot` -> `0`:`Power`, `1`:`Shot` (where `0` and `1` are token
|
||||
positions).
|
||||
|
||||
When one or both of the merge arguments are true, consecutive runs of
|
||||
alphabetic and/or numeric subwords are merged into an additional token with
|
||||
the same position as the last sub-word.
|
||||
|
||||
* `PowerShot` -> `0`:`Power`, `1`:`Shot`, `1`:`PowerShot`
|
||||
* `A's+B's&C's` -> `0`:`A`, `1`:`B`, `2`:`C`, `2`:`ABC`
|
||||
* `Super-Duper-XL500-42-AutoCoder!` -> `0`:`Super`, `1`:`Duper`, `2`:`XL`,
|
||||
`2`:`SuperDuperXL`,
|
||||
`3`:`500`, `4`:`42`, `4`:`50042`, `5`:`Auto`, `6`:`Coder`,
|
||||
`6`:`AutoCoder`
|
||||
|
||||
When using this filter you should use a tokenizer that only splits on
|
||||
whitespace, so the tokenizer does not remove intra-word delimiters before
|
||||
this filter can see them, and put this filter before any use of
|
||||
LowercaseFilter.
|
||||
|
||||
>>> rt = RegexTokenizer(r"\\S+")
|
||||
>>> iwf = IntraWordFilter()
|
||||
>>> lcf = LowercaseFilter()
|
||||
>>> analyzer = rt | iwf | lcf
|
||||
|
||||
One use for this filter is to help match different written representations
|
||||
of a concept. For example, if the source text contained `wi-fi`, you
|
||||
probably want `wifi`, `WiFi`, `wi-fi`, etc. to match. One way of doing this
|
||||
is to specify mergewords=True and/or mergenums=True in the analyzer used
|
||||
for indexing, and mergewords=False / mergenums=False in the analyzer used
|
||||
for querying.
|
||||
|
||||
>>> iwf_i = IntraWordFilter(mergewords=True, mergenums=True)
|
||||
>>> iwf_q = IntraWordFilter(mergewords=False, mergenums=False)
|
||||
>>> iwf = MultiFilter(index=iwf_i, query=iwf_q)
|
||||
>>> analyzer = RegexTokenizer(r"\S+") | iwf | LowercaseFilter()
|
||||
|
||||
(See :class:`MultiFilter`.)
|
||||
"""
|
||||
|
||||
is_morph = True
|
||||
|
||||
__inittypes__ = dict(delims=text_type, splitwords=bool, splitnums=bool,
|
||||
mergewords=bool, mergenums=bool)
|
||||
|
||||
def __init__(self, delims=u("-_'\"()!@#$%^&*[]{}<>\|;:,./?`~=+"),
|
||||
splitwords=True, splitnums=True,
|
||||
mergewords=False, mergenums=False):
|
||||
"""
|
||||
:param delims: a string of delimiter characters.
|
||||
:param splitwords: if True, split at case transitions,
|
||||
e.g. `PowerShot` -> `Power`, `Shot`
|
||||
:param splitnums: if True, split at letter-number transitions,
|
||||
e.g. `SD500` -> `SD`, `500`
|
||||
:param mergewords: merge consecutive runs of alphabetic subwords into
|
||||
an additional token with the same position as the last subword.
|
||||
:param mergenums: merge consecutive runs of numeric subwords into an
|
||||
additional token with the same position as the last subword.
|
||||
"""
|
||||
|
||||
from whoosh.support.unicode import digits, lowercase, uppercase
|
||||
|
||||
self.delims = re.escape(delims)
|
||||
|
||||
# Expression for text between delimiter characters
|
||||
self.between = re.compile(u("[^%s]+") % (self.delims,), re.UNICODE)
|
||||
# Expression for removing "'s" from the end of sub-words
|
||||
dispat = u("(?<=[%s%s])'[Ss](?=$|[%s])") % (lowercase, uppercase,
|
||||
self.delims)
|
||||
self.possessive = re.compile(dispat, re.UNICODE)
|
||||
|
||||
# Expression for finding case and letter-number transitions
|
||||
lower2upper = u("[%s][%s]") % (lowercase, uppercase)
|
||||
letter2digit = u("[%s%s][%s]") % (lowercase, uppercase, digits)
|
||||
digit2letter = u("[%s][%s%s]") % (digits, lowercase, uppercase)
|
||||
if splitwords and splitnums:
|
||||
splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit,
|
||||
digit2letter)
|
||||
self.boundary = re.compile(splitpat, re.UNICODE)
|
||||
elif splitwords:
|
||||
self.boundary = re.compile(text_type(lower2upper), re.UNICODE)
|
||||
elif splitnums:
|
||||
numpat = u("(%s|%s)") % (letter2digit, digit2letter)
|
||||
self.boundary = re.compile(numpat, re.UNICODE)
|
||||
|
||||
self.splitting = splitwords or splitnums
|
||||
self.mergewords = mergewords
|
||||
self.mergenums = mergenums
|
||||
|
||||
def __eq__(self, other):
|
||||
return other and self.__class__ is other.__class__\
|
||||
and self.__dict__ == other.__dict__
|
||||
|
||||
def _split(self, string):
|
||||
bound = self.boundary
|
||||
|
||||
# Yields (startchar, endchar) pairs for each indexable substring in
|
||||
# the given string, e.g. "WikiWord" -> (0, 4), (4, 8)
|
||||
|
||||
# Whether we're splitting on transitions (case changes, letter -> num,
|
||||
# num -> letter, etc.)
|
||||
splitting = self.splitting
|
||||
|
||||
# Make a list (dispos, for "dispossessed") of (startchar, endchar)
|
||||
# pairs for runs of text between "'s"
|
||||
if "'" in string:
|
||||
# Split on possessive 's
|
||||
dispos = []
|
||||
prev = 0
|
||||
for match in self.possessive.finditer(string):
|
||||
dispos.append((prev, match.start()))
|
||||
prev = match.end()
|
||||
if prev < len(string):
|
||||
dispos.append((prev, len(string)))
|
||||
else:
|
||||
# Shortcut if there's no apostrophe in the string
|
||||
dispos = ((0, len(string)),)
|
||||
|
||||
# For each run between 's
|
||||
for sc, ec in dispos:
|
||||
# Split on boundary characters
|
||||
for part_match in self.between.finditer(string, sc, ec):
|
||||
part_start = part_match.start()
|
||||
part_end = part_match.end()
|
||||
|
||||
if splitting:
|
||||
# The point to start splitting at
|
||||
prev = part_start
|
||||
# Find transitions (e.g. "iW" or "a0")
|
||||
for bmatch in bound.finditer(string, part_start, part_end):
|
||||
# The point in the middle of the transition
|
||||
pivot = bmatch.start() + 1
|
||||
# Yield from the previous match to the transition
|
||||
yield (prev, pivot)
|
||||
# Make the transition the new starting point
|
||||
prev = pivot
|
||||
|
||||
# If there's leftover text at the end, yield it too
|
||||
if prev < part_end:
|
||||
yield (prev, part_end)
|
||||
else:
|
||||
# Not splitting on transitions, just yield the part
|
||||
yield (part_start, part_end)
|
||||
|
||||
def _merge(self, parts):
|
||||
mergewords = self.mergewords
|
||||
mergenums = self.mergenums
|
||||
|
||||
# Current type (1=alpah, 2=digit)
|
||||
last = 0
|
||||
# Where to insert a merged term in the original list
|
||||
insertat = 0
|
||||
# Buffer for parts to merge
|
||||
buf = []
|
||||
# Iterate on a copy of the parts list so we can modify the original as
|
||||
# we go
|
||||
|
||||
def insert_item(buf, at, newpos):
|
||||
newtext = "".join(item[0] for item in buf)
|
||||
newsc = buf[0][2] # start char of first item in buffer
|
||||
newec = buf[-1][3] # end char of last item in buffer
|
||||
parts.insert(insertat, (newtext, newpos, newsc, newec))
|
||||
|
||||
for item in list(parts):
|
||||
# item = (text, pos, startchar, endchar)
|
||||
text = item[0]
|
||||
pos = item[1]
|
||||
|
||||
# Set the type of this part
|
||||
if text.isalpha():
|
||||
this = 1
|
||||
elif text.isdigit():
|
||||
this = 2
|
||||
else:
|
||||
this = None
|
||||
|
||||
# Is this the same type as the previous part?
|
||||
if (buf and (this == last == 1 and mergewords)
|
||||
or (this == last == 2 and mergenums)):
|
||||
# This part is the same type as the previous. Add it to the
|
||||
# buffer of parts to merge.
|
||||
buf.append(item)
|
||||
else:
|
||||
# This part is different than the previous.
|
||||
if len(buf) > 1:
|
||||
# If the buffer has at least two parts in it, merge them
|
||||
# and add them to the original list of parts.
|
||||
insert_item(buf, insertat, pos - 1)
|
||||
insertat += 1
|
||||
# Reset the buffer
|
||||
buf = [item]
|
||||
last = this
|
||||
insertat += 1
|
||||
|
||||
# If there are parts left in the buffer at the end, merge them and add
|
||||
# them to the original list.
|
||||
if len(buf) > 1:
|
||||
insert_item(buf, len(parts), pos)

    def __call__(self, tokens):
        mergewords = self.mergewords
        mergenums = self.mergenums

        # This filter renumbers tokens as it expands them. New position
        # counter.
        newpos = None
        for t in tokens:
            text = t.text

            # If this is the first token we've seen, use it to set the new
            # position counter
            if newpos is None:
                if t.positions:
                    newpos = t.pos
                else:
                    # Token doesn't have positions, just use 0
                    newpos = 0

            if ((text.isalpha() and (text.islower() or text.isupper()))
                    or text.isdigit()):
                # Short-circuit the common cases of no delimiters, no case
                # transitions, only digits, etc.
                t.pos = newpos
                yield t
                newpos += 1
            else:
                # Split the token text on delimiters, word and/or number
                # boundaries into a list of (text, pos, startchar, endchar)
                # tuples
                ranges = self._split(text)
                parts = [(text[sc:ec], i + newpos, sc, ec)
                         for i, (sc, ec) in enumerate(ranges)]

                # Did the split yield more than one part?
                if len(parts) > 1:
                    # If the options are set, merge consecutive runs of all-
                    # letters and/or all-numbers.
                    if mergewords or mergenums:
                        self._merge(parts)

                # Yield tokens for the parts
                chars = t.chars
                if chars:
                    base = t.startchar
                for text, pos, startchar, endchar in parts:
                    t.text = text
                    t.pos = pos
                    if t.chars:
                        t.startchar = base + startchar
                        t.endchar = base + endchar
                    yield t

                if parts:
                    # Set the new position counter based on the last part
                    newpos = parts[-1][1] + 1
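

# A usage sketch (illustrative, not part of the original file; assumes this
# hunk belongs to whoosh's IntraWordFilter, which takes the mergewords and
# mergenums keywords used above):
#
#     from whoosh.analysis import RegexTokenizer, IntraWordFilter
#     iwf = RegexTokenizer() | IntraWordFilter(mergewords=True)
#     [t.text for t in iwf(u"PowerShot")]
#     # -> ["Power", "Shot", "PowerShot"]; the merged run follows its pieces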
267
venv/Lib/site-packages/whoosh/analysis/morph.py
Normal file
@@ -0,0 +1,267 @@
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.filters import Filter
from whoosh.compat import integer_types
from whoosh.lang.dmetaphone import double_metaphone
from whoosh.lang.porter import stem
from whoosh.util.cache import lfu_cache, unbound_cache

class StemFilter(Filter):
    """Stems (removes suffixes from) the text of tokens using the Porter
    stemming algorithm. Stemming attempts to reduce multiple forms of the same
    root word (for example, "rendering", "renders", "rendered", etc.) to a
    single word in the index.

    >>> stemmer = RegexTokenizer() | StemFilter()
    >>> [token.text for token in stemmer("fundamentally willows")]
    ["fundament", "willow"]

    You can pass your own stemming function to the StemFilter. The default
    is the Porter stemming algorithm for English.

    >>> stemfilter = StemFilter(stem_function)

    You can also use one of the Snowball stemming functions by passing the
    `lang` keyword argument.

    >>> stemfilter = StemFilter(lang="ru")

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` to check if a given language
    has a stemming function available.

    By default, this class wraps an LFU cache around the stemming function.
    The ``cachesize`` keyword argument sets the size of the cache. To make the
    cache unbounded (the class caches every input), use ``cachesize=-1``. To
    disable caching, use ``cachesize=None``.

    If you compile and install the py-stemmer library, the
    :class:`PyStemmerFilter` provides slightly easier access to the language
    stemmers in that library.
    """

    __inittypes__ = dict(stemfn=object, ignore=list)

    is_morph = True

    def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000):
        """
        :param stemfn: the function to use for stemming.
        :param lang: if not None, overrides the stemfn with a language stemmer
            from the ``whoosh.lang.snowball`` package.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache. Use ``-1`` for
            an unbounded cache, or ``None`` for no caching.
        """

        self.stemfn = stemfn
        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        # clear() sets the _stem attr to a cached wrapper around self.stemfn
        self.clear()

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state
        return dict([(k, self.__dict__[k]) for k in self.__dict__
                     if k != "_stem"])

    def __setstate__(self, state):
        # Check for old instances of StemFilter class, which didn't have a
        # cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 50000
        if "ignores" in state:
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "lang" not in state:
            self.lang = None
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self.clear()

    def clear(self):
        if self.lang:
            from whoosh.lang import stemmer_for_language
            stemfn = stemmer_for_language(self.lang)
        else:
            stemfn = self.stemfn

        if isinstance(self.cachesize, integer_types) and self.cachesize != 0:
            if self.cachesize < 0:
                self._stem = unbound_cache(stemfn)
            elif self.cachesize > 1:
                self._stem = lfu_cache(self.cachesize)(stemfn)
        else:
            self._stem = stemfn

    def cache_info(self):
        if self.cachesize <= 1:
            return None
        return self._stem.cache_info()

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.stemfn == other.stemfn)

    def __call__(self, tokens):
        stemfn = self._stem
        ignore = self.ignore

        for t in tokens:
            if not t.stopped:
                text = t.text
                if text not in ignore:
                    t.text = stemfn(text)
            yield t
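
# Cache-selection sketch (illustrative, not part of the original file):
# cachesize drives which branch of clear() above is taken.
#
#     StemFilter()                # LFU cache of 50000 entries around stem()
#     StemFilter(cachesize=-1)    # unbounded cache (caches every input)
#     StemFilter(cachesize=None)  # no caching; stemfn runs on every token
#     StemFilter(lang="ru")       # Snowball stemmer looked up for Russian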


class PyStemmerFilter(StemFilter):
    """This is a simple subclass of StemFilter that works with the py-stemmer
    third-party library. You must have the py-stemmer library installed to use
    this filter.

    >>> PyStemmerFilter("spanish")
    """

    def __init__(self, lang="english", ignore=None, cachesize=10000):
        """
        :param lang: a string identifying the stemming algorithm to use. You
            can get a list of available algorithms with the
            :meth:`PyStemmerFilter.algorithms` method. The identification
            strings are directly from the py-stemmer library.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache.
        """

        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        self._stem = self._get_stemmer_fn()

    def algorithms(self):
        """Returns a list of stemming algorithms provided by the py-stemmer
        library.
        """

        import Stemmer  # @UnresolvedImport

        return Stemmer.algorithms()

    def cache_info(self):
        return None

    def _get_stemmer_fn(self):
        import Stemmer  # @UnresolvedImport

        stemmer = Stemmer.Stemmer(self.lang)
        stemmer.maxCacheSize = self.cachesize
        return stemmer.stemWord

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state
        return dict([(k, self.__dict__[k]) for k in self.__dict__
                     if k != "_stem"])

    def __setstate__(self, state):
        # Check for old instances of StemFilter class, which didn't have a
        # cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 10000
        if "ignores" in state:
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self._stem = self._get_stemmer_fn()
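
# Usage sketch (illustrative; requires the third-party py-stemmer package,
# which provides the "Stemmer" module imported above):
#
#     pf = PyStemmerFilter("spanish", cachesize=20000)
#     pf.algorithms()  # names accepted for the lang argument
#     ana = RegexTokenizer() | pf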


class DoubleMetaphoneFilter(Filter):
    """Transforms the text of the tokens using Lawrence Philips's Double
    Metaphone algorithm. This algorithm attempts to encode words in such a way
    that similar-sounding words reduce to the same code. This may be useful for
    fields containing the names of people and places, and other uses where
    tolerance of spelling differences is desirable.
    """

    is_morph = True

    def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False):
        """
        :param primary_boost: the boost to apply to the token containing the
            primary code.
        :param secondary_boost: the boost to apply to the token containing the
            secondary code, if any.
        :param combine: if True, the original unencoded tokens are kept in the
            stream, preceding the encoded tokens.
        """

        self.primary_boost = primary_boost
        self.secondary_boost = secondary_boost
        self.combine = combine

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.primary_boost == other.primary_boost)

    def __call__(self, tokens):
        primary_boost = self.primary_boost
        secondary_boost = self.secondary_boost
        combine = self.combine

        for t in tokens:
            if combine:
                yield t

            primary, secondary = double_metaphone(t.text)
            b = t.boost
            # Overwrite the token's text and boost and yield it
            if primary:
                t.text = primary
                t.boost = b * primary_boost
                yield t
            if secondary:
                t.text = secondary
                t.boost = b * secondary_boost
                yield t
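
# Usage sketch (illustrative, not part of the original file): with
# combine=True each original token is yielded first, followed by its primary
# code (boost * 1.0 by default) and, when double_metaphone() returns one, its
# secondary code (boost * 0.5). LowercaseFilter lives in
# whoosh.analysis.filters.
#
#     dmf = DoubleMetaphoneFilter(combine=True)
#     ana = RegexTokenizer() | LowercaseFilter() | dmf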
237
venv/Lib/site-packages/whoosh/analysis/ngrams.py
Normal file
@@ -0,0 +1,237 @@
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.compat import text_type
from whoosh.compat import xrange
from whoosh.analysis.acore import Token
from whoosh.analysis.filters import Filter, LowercaseFilter
from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer


# Tokenizer

class NgramTokenizer(Tokenizer):
    """Splits input text into N-grams instead of words.

    >>> ngt = NgramTokenizer(4)
    >>> [token.text for token in ngt("hi there")]
    ["hi t", "i th", " the", "ther", "here"]

    Note that this tokenizer does NOT use a regular expression to extract
    words, so the grams emitted by it will contain whitespace, punctuation,
    etc. You may want to massage the input or add a custom filter to this
    tokenizer's output.

    Alternatively, if you only want sub-word grams without whitespace, you
    could combine a RegexTokenizer with NgramFilter instead.
    """

    __inittypes__ = dict(minsize=int, maxsize=int)

    def __init__(self, minsize, maxsize=None):
        """
        :param minsize: The minimum size of the N-grams.
        :param maxsize: The maximum size of the N-grams. If you omit
            this parameter, maxsize == minsize.
        """

        self.min = minsize
        self.max = maxsize or minsize

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.min == other.min and self.max == other.max:
                return True
        return False

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query":
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t
                pos += 1
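
# Behavior sketch (illustrative, not part of the original file): at index
# time every size from min to max is emitted at each start; in "query" mode
# only one size (the largest that fits) is used, keeping query grams aligned
# with indexed grams.
#
#     ngt = NgramTokenizer(2, 3)
#     # indexing u"abcd" -> "ab", "abc", "bc", "bcd", "cd"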


# Filter

class NgramFilter(Filter):
    """Splits token text into N-grams.

    >>> rext = RegexTokenizer()
    >>> stream = rext("hello there")
    >>> ngf = NgramFilter(4)
    >>> [token.text for token in ngf(stream)]
    ["hell", "ello", "ther", "here"]
    """

    __inittypes__ = dict(minsize=int, maxsize=int)

    def __init__(self, minsize, maxsize=None, at=None):
        """
        :param minsize: The minimum size of the N-grams.
        :param maxsize: The maximum size of the N-grams. If you omit this
            parameter, maxsize == minsize.
        :param at: If 'start', only take N-grams from the start of each word.
            If 'end', only take N-grams from the end of each word. Otherwise,
            take all N-grams from the word (the default).
        """

        self.min = minsize
        self.max = maxsize or minsize
        self.at = 0
        if at == "start":
            self.at = -1
        elif at == "end":
            self.at = 1

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.min == other.min and self.max == other.max)

    def __call__(self, tokens):
        assert hasattr(tokens, "__iter__")
        at = self.at
        for t in tokens:
            text = t.text
            if len(text) < self.min:
                continue

            chars = t.chars
            if chars:
                startchar = t.startchar
            # Token positions don't mean much for N-grams,
            # so we'll leave the token's original position
            # untouched.

            if t.mode == "query":
                size = min(self.max, len(t.text))
                if at == -1:
                    t.text = text[:size]
                    if chars:
                        t.endchar = startchar + size
                    yield t
                elif at == 1:
                    t.text = text[0 - size:]
                    if chars:
                        t.startchar = t.endchar - size
                    yield t
                else:
                    for start in xrange(0, len(text) - size + 1):
                        t.text = text[start:start + size]
                        if chars:
                            t.startchar = startchar + start
                            t.endchar = startchar + start + size
                        yield t
            else:
                if at == -1:
                    limit = min(self.max, len(text))
                    for size in xrange(self.min, limit + 1):
                        t.text = text[:size]
                        if chars:
                            t.endchar = startchar + size
                        yield t

                elif at == 1:
                    if chars:
                        original_startchar = t.startchar
                    start = max(0, len(text) - self.max)
                    for i in xrange(start, len(text) - self.min + 1):
                        t.text = text[i:]
                        if chars:
                            t.startchar = original_startchar + i
                        yield t
                else:
                    for start in xrange(0, len(text) - self.min + 1):
                        for size in xrange(self.min, self.max + 1):
                            end = start + size
                            if end > len(text):
                                continue

                            t.text = text[start:end]

                            if chars:
                                t.startchar = startchar + start
                                t.endchar = startchar + end

                            yield t
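
# Usage sketch (illustrative, not part of the original file): at="start"
# keeps only prefix grams, a common setup for prefix/autocomplete matching.
#
#     ngf = NgramFilter(2, 4, at="start")
#     # "hello" -> "he", "hel", "hell"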


# Analyzers

def NgramAnalyzer(minsize, maxsize=None):
    """Composes an NgramTokenizer and a LowercaseFilter.

    >>> ana = NgramAnalyzer(4)
    >>> [token.text for token in ana("hi there")]
    ["hi t", "i th", " the", "ther", "here"]
    """

    return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()


def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None):
    if not tokenizer:
        tokenizer = RegexTokenizer()
    return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at)
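
# Usage sketch (illustrative, not part of the original file): unlike
# NgramAnalyzer, NgramWordAnalyzer grams each word separately, so grams never
# span whitespace.
#
#     ana = NgramWordAnalyzer(3)
#     # [t.text for t in ana(u"hi there")] -> ["the", "her", "ere"]
#     # ("hi" is shorter than minsize and is dropped by NgramFilter)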
338
venv/Lib/site-packages/whoosh/analysis/tokenizers.py
Normal file
@@ -0,0 +1,338 @@
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.compat import u, text_type
from whoosh.analysis.acore import Composable, Token
from whoosh.util.text import rcompile


default_pattern = rcompile(r"\w+(\.?\w+)*")


# Tokenizers


class Tokenizer(Composable):
    """Base class for Tokenizers.
    """

    def __eq__(self, other):
        return other and self.__class__ is other.__class__


class IDTokenizer(Tokenizer):
    """Yields the entire input string as a single token. For use in indexed but
    untokenized fields, such as a document's path.

    >>> idt = IDTokenizer()
    >>> [token.text for token in idt("/a/b 123 alpha")]
    ["/a/b 123 alpha"]
    """

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        t.text = value
        t.boost = 1.0
        if keeporiginal:
            t.original = value
        if positions:
            t.pos = start_pos + 1
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t


class RegexTokenizer(Tokenizer):
    """
    Uses a regular expression to extract tokens from text.

    >>> rex = RegexTokenizer()
    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
    ["hi", "there", "3.141", "big", "time", "under_score"]
    """

    def __init__(self, expression=default_pattern, gaps=False):
        """
        :param expression: A regular expression object or string. Each match
            of the expression equals a token. Group 0 (the entire matched text)
            is used as the text of the token. If you require more complicated
            handling of the expression match, simply write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression, rather
            than matching on the expression.
        """

        self.expression = rcompile(expression)
        self.gaps = gaps

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.expression.pattern == other.expression.pattern:
                return True
        return False

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self.expression.finditer(value)):
                t.text = match.group(0)
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + match.start()
                    t.endchar = start_char + match.end()
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t

                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t
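

# Usage sketch (illustrative, not part of the original file): with gaps=True
# the pattern describes the separators rather than the tokens.
#
#     rex = RegexTokenizer(r"[,;]+", gaps=True)
#     # u"a,b;;c" -> "a", "b", "c"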


class CharsetTokenizer(Tokenizer):
    """Tokenizes and translates text according to a character mapping object.
    Characters that map to None are considered token break characters. For all
    other characters the map is used to translate the character. This is useful
    for case and accent folding.

    This tokenizer loops character-by-character and so will likely be much
    slower than :class:`RegexTokenizer`.

    One way to get a character mapping object is to convert a Sphinx charset
    table file using :func:`whoosh.support.charset.charset_table_to_dict`.

    >>> from whoosh.support.charset import charset_table_to_dict
    >>> from whoosh.support.charset import default_charset
    >>> charmap = charset_table_to_dict(default_charset)
    >>> chtokenizer = CharsetTokenizer(charmap)
    >>> [t.text for t in chtokenizer(u'Stra\\xdfe ABC')]
    [u'strase', u'abc']

    The Sphinx charset table format is described at
    http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
    """

    __inittype__ = dict(charmap=str)

    def __init__(self, charmap):
        """
        :param charmap: a mapping from integer character numbers to unicode
            characters, as used by the unicode.translate() method.
        """
        self.charmap = charmap

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.charmap == other.charmap)

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%r is not unicode" % value

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            text = u("")
            charmap = self.charmap
            pos = start_pos
            startchar = currentchar = start_char
            for char in value:
                tchar = charmap[ord(char)]
                if tchar:
                    text += tchar
                else:
                    if currentchar > startchar:
                        t.text = text
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = t.text
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = startchar
                            t.endchar = currentchar
                        yield t
                    startchar = currentchar + 1
                    text = u("")

                currentchar += 1

            if currentchar > startchar:
                t.text = value[startchar:currentchar]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = startchar
                    t.endchar = currentchar
                yield t
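

# Note (illustrative, not part of the original file): the tokenizer indexes
# charmap by ord(char) for every input character, so the mapping must cover
# the whole input alphabet; tables built with charset_table_to_dict (see the
# docstring above) do.
#
#     charmap = charset_table_to_dict(default_charset)
#     tok = CharsetTokenizer(charmap)
#     # u"Stra\xdfe ABC" -> "strase", "abc" (folded and lowercased)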


def SpaceSeparatedTokenizer():
    """Returns a RegexTokenizer that splits tokens by whitespace.

    >>> sst = SpaceSeparatedTokenizer()
    >>> [token.text for token in sst("hi there big-time, what's up")]
    ["hi", "there", "big-time,", "what's", "up"]
    """

    return RegexTokenizer(r"[^ \t\r\n]+")


def CommaSeparatedTokenizer():
    """Splits tokens by commas.

    Note that the tokenizer calls unicode.strip() on each match of the regular
    expression.

    >>> cst = CommaSeparatedTokenizer()
    >>> [token.text for token in cst("hi there, what's , up")]
    ["hi there", "what's", "up"]
    """

    from whoosh.analysis.filters import StripFilter

    return RegexTokenizer(r"[^,]+") | StripFilter()


class PathTokenizer(Tokenizer):
    """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens
    ``["/a", "/a/b", "/a/b/c"]``.
    """

    def __init__(self, expression="[^/]+"):
        self.expr = rcompile(expression)

    def __call__(self, value, positions=False, start_pos=0, **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        token = Token(positions, **kwargs)
        pos = start_pos
        for match in self.expr.finditer(value):
            token.text = value[:match.end()]
            if positions:
                token.pos = pos
                pos += 1
            yield token
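

# Usage sketch (illustrative, not part of the original file): each regex
# match extends the path prefix, so the yielded texts are cumulative. The
# same Token object is reused, so collect t.text while iterating rather than
# keeping token references.
#
#     pt = PathTokenizer()
#     # [t.text for t in pt(u"/a/b/c")] -> ["/a", "/a/b", "/a/b/c"]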