2026-1-6
This commit is contained in:
31
venv/Lib/site-packages/whoosh/matching/__init__.py
Normal file
31
venv/Lib/site-packages/whoosh/matching/__init__.py
Normal file
@@ -0,0 +1,31 @@
|
||||
# Copyright 2012 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
from whoosh.matching.mcore import *
|
||||
from whoosh.matching.binary import *
|
||||
from whoosh.matching.wrappers import *
|
||||
from whoosh.matching.combo import *
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
803
venv/Lib/site-packages/whoosh/matching/binary.py
Normal file
803
venv/Lib/site-packages/whoosh/matching/binary.py
Normal file
@@ -0,0 +1,803 @@
|
||||
# Copyright 2010 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
from whoosh.matching import mcore
|
||||
|
||||
|
||||
class BiMatcher(mcore.Matcher):
    """Common base for matchers built from exactly two sub-matchers.

    The children are kept in ``self.a`` and ``self.b``; subclasses decide how
    the postings and scores of the two are combined.
    """

    def __init__(self, a, b):
        """Remember the two sub-matchers ``a`` and ``b``."""
        super(BiMatcher, self).__init__()
        self.a, self.b = a, b

    def reset(self):
        """Reset sub-matcher ``a`` and then sub-matcher ``b``."""
        self.a.reset()
        self.b.reset()

    def __repr__(self):
        cls_name = self.__class__.__name__
        return "%s(%r, %r)" % (cls_name, self.a, self.b)

    def children(self):
        """Return the two sub-matchers as a new list ``[a, b]``."""
        return list((self.a, self.b))

    def copy(self):
        """Return a new instance of this class wrapping copies of both
        sub-matchers.
        """
        a_copy = self.a.copy()
        b_copy = self.b.copy()
        return self.__class__(a_copy, b_copy)

    def depth(self):
        """Return one more than the larger of the two sub-matcher depths."""
        deepest_child = max(self.a.depth(), self.b.depth())
        return deepest_child + 1

    def skip_to(self, id):
        """Forward ``skip_to(id)`` to both sub-matchers.

        Raises ``mcore.ReadTooFar`` if this matcher is not active. Returns
        the first truthy result of the two forwarded calls (``a`` first).
        """
        if self.is_active():
            moved_a = self.a.skip_to(id)
            moved_b = self.b.skip_to(id)
            return moved_a or moved_b
        raise mcore.ReadTooFar

    def supports_block_quality(self):
        """Block quality is supported only when both sub-matchers support
        it.
        """
        a_supports = self.a.supports_block_quality()
        return a_supports and self.b.supports_block_quality()

    def supports(self, astype):
        """A posting value type is supported only when both sub-matchers
        support it.
        """
        a_supports = self.a.supports(astype)
        return a_supports and self.b.supports(astype)
|
||||
|
||||
|
||||
class AdditiveBiMatcher(BiMatcher):
    """Base for binary matchers whose result is the sum of the scores of the
    two sub-matchers.
    """

    def max_quality(self):
        """Sum the max quality of every sub-matcher that is still active."""
        total = 0.0
        for sub in (self.a, self.b):
            if sub.is_active():
                total += sub.max_quality()
        return total

    def block_quality(self):
        """Sum the block quality of every sub-matcher that is still active."""
        total = 0.0
        for sub in (self.a, self.b):
            if sub.is_active():
                total += sub.block_quality()
        return total

    def weight(self):
        """Return the weight of ``a`` plus the weight of ``b``."""
        a_weight = self.a.weight()
        b_weight = self.b.weight()
        return a_weight + b_weight

    def score(self):
        """Return the score of ``a`` plus the score of ``b``."""
        a_score = self.a.score()
        b_score = self.b.score()
        return a_score + b_score

    # The rich comparisons below only look at the *type* of the other
    # object. Note that __lt__ uses the same test as __eq__.

    def __eq__(self, other):
        return type(other) is self.__class__

    def __lt__(self, other):
        return self.__class__ is type(other)

    def __ne__(self, other):
        if self.__eq__(other):
            return False
        return True

    def __gt__(self, other):
        if self.__lt__(other) or self.__eq__(other):
            return False
        return True

    def __le__(self, other):
        return self.__eq__(other) or self.__lt__(other)

    def __ge__(self, other):
        return self.__eq__(other) or self.__gt__(other)
|
||||
|
||||
|
||||
class UnionMatcher(AdditiveBiMatcher):
    """Matches the union (OR) of the postings in the two sub-matchers.
    """

    # Cache of the current document id; None means "recompute in id()".
    _id = None

    def replace(self, minquality=0):
        """Return a possibly simpler matcher equivalent to this union.

        ``minquality`` is the quality a posting needs in order to contribute;
        0 (the default) disables the quality-based rewrites.
        """
        left = self.a
        right = self.b
        left_on = left.is_active()
        right_on = right.is_active()

        # When a sub-matcher cannot reach minquality on its own it can only
        # contribute together with the other one, so the union is converted
        # to an intersection / and-maybe matcher.
        if minquality and left_on and right_on:
            left_max = left.max_quality()
            right_max = right.max_quality()
            if left_max < minquality:
                if right_max < minquality:
                    return IntersectionMatcher(left, right).replace(minquality)
                return AndMaybeMatcher(right, left)
            if right_max < minquality:
                return AndMaybeMatcher(left, right)

        # With one or both sub-matchers exhausted the union degenerates.
        if not left_on and not right_on:
            return mcore.NullMatcher()
        if not left_on:
            return right.replace(minquality)
        if not right_on:
            return left.replace(minquality)

        if minquality:
            left = left.replace(minquality - right.max_quality())
            # NOTE: this uses the already replaced left matcher.
            right = right.replace(minquality - left.max_quality())
        else:
            left = left.replace(0)
            right = right.replace(0)

        if left is self.a and right is self.b:
            # Nothing changed: keep this object, but drop the cached id.
            self._id = None
            return self
        # At least one sub-matcher changed, so build a new union.
        return self.__class__(left, right)

    def is_active(self):
        """The union is active while either sub-matcher is active."""
        return self.a.is_active() or self.b.is_active()

    def skip_to(self, id):
        """Forward ``skip_to(id)`` to every active sub-matcher."""
        self._id = None
        moved_a = False
        moved_b = False

        if self.a.is_active():
            moved_a = self.a.skip_to(id)
        if self.b.is_active():
            moved_b = self.b.skip_to(id)

        return moved_a or moved_b

    def id(self):
        """Return the current document id (the smaller id of the active
        sub-matchers), caching it until the matcher moves.
        """
        cached = self._id
        if cached is not None:
            return cached

        left, right = self.a, self.b
        if not left.is_active():
            current = right.id()
        elif not right.is_active():
            current = left.id()
        else:
            current = min(left.id(), right.id())
        self._id = current
        return current

    # Using sets is faster in most cases, but could potentially use a lot of
    # memory. Comment out this method override to not use sets.
    #def all_ids(self):
    #    return iter(sorted(set(self.a.all_ids()) | set(self.b.all_ids())))

    def next(self):
        """Advance whichever sub-matcher(s) sit on the current document.

        Raises ``mcore.ReadTooFar`` when both sub-matchers are exhausted.
        """
        self._id = None

        left, right = self.a, self.b
        left_on = left.is_active()
        right_on = right.is_active()

        # Shortcuts for exhausted sub-matchers
        if not left_on and not right_on:
            raise mcore.ReadTooFar
        if not left_on:
            return right.next()
        if not right_on:
            return left.next()

        left_id = left.id()
        right_id = right.id()
        moved_left = None
        moved_right = None

        # Both are advanced when they are on the same document.
        if left_id <= right_id:
            moved_left = left.next()
        if right_id <= left_id:
            moved_right = right.next()
        return moved_left or moved_right

    def spans(self):
        """Return the spans of the sub-matcher(s) on the current document;
        when both are on it, the sorted union of their spans.
        """
        left, right = self.a, self.b
        if not left.is_active():
            return right.spans()
        if not right.is_active():
            return left.spans()

        left_id = left.id()
        right_id = right.id()
        if left_id < right_id:
            return left.spans()
        if right_id < left_id:
            return right.spans()
        merged = set(left.spans()) | set(right.spans())
        return sorted(merged)

    def weight(self):
        """Return the weight of the sub-matcher(s) on the current document,
        added together when both are on it.
        """
        left, right = self.a, self.b

        if not left.is_active():
            return right.weight()
        if not right.is_active():
            return left.weight()

        left_id = left.id()
        right_id = right.id()
        if left_id < right_id:
            return left.weight()
        if right_id < left_id:
            return right.weight()
        return left.weight() + right.weight()

    def score(self):
        """Return the score of the sub-matcher(s) on the current document,
        added together when both are on it.
        """
        left, right = self.a, self.b

        if not left.is_active():
            return right.score()
        if not right.is_active():
            return left.score()

        left_id = left.id()
        right_id = right.id()
        if left_id < right_id:
            return left.score()
        if right_id < left_id:
            return right.score()
        return left.score() + right.score()

    def skip_to_quality(self, minquality):
        """Skip sub-matchers ahead while the sum of their block qualities is
        not greater than ``minquality``; return the accumulated skip count.

        Raises ``mcore.ReadTooFar`` when both sub-matchers are exhausted.
        """
        self._id = None

        left, right = self.a, self.b
        if not (left.is_active() or right.is_active()):
            raise mcore.ReadTooFar

        # With one side exhausted, only the other side matters
        if not left.is_active():
            return right.skip_to_quality(minquality)
        if not right.is_active():
            return left.skip_to_quality(minquality)

        total_skipped = 0
        left_q = left.block_quality()
        right_q = right.block_quality()
        while (left.is_active() and right.is_active()
               and left_q + right_q <= minquality):
            # Move the weaker side; it must make up the balance left over by
            # the other side's block quality.
            if left_q < right_q:
                total_skipped += left.skip_to_quality(minquality - right_q)
                left_q = left.block_quality()
            else:
                total_skipped += right.skip_to_quality(minquality - left_q)
                right_q = right.block_quality()

        return total_skipped
|
||||
|
||||
|
||||
class DisjunctionMaxMatcher(UnionMatcher):
    """Matches the union (OR) of two sub-matchers. Where both sub-matchers
    match the same posting, returns the weight/score of the higher-scoring
    posting.
    """

    # TODO: this class inherits from AdditiveBiMatcher (through UnionMatcher)
    # but it does not add the scores of the sub-matchers together (it
    # overrides all methods that perform addition). Need to clean up the
    # inheritance.

    def __init__(self, a, b, tiebreak=0.0):
        """Store the sub-matchers and the ``tiebreak`` value.

        ``tiebreak`` is only stored and propagated by this class; none of the
        methods defined here read it when computing scores.
        """
        super(DisjunctionMaxMatcher, self).__init__(a, b)
        self.tiebreak = tiebreak

    def copy(self):
        """Return a new matcher over copies of both sub-matchers, keeping the
        tiebreak value.
        """
        return self.__class__(self.a.copy(), self.b.copy(),
                              tiebreak=self.tiebreak)

    def replace(self, minquality=0):
        """Return a possibly simpler matcher equivalent to this one.

        ``minquality`` is the quality a posting needs in order to contribute;
        0 (the default) disables the quality-based rewrites.
        """
        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        # DisMax takes the max of the sub-matcher qualities instead of adding
        # them, so we need special logic here
        if minquality and a_active and b_active:
            a_max = a.max_quality()
            b_max = b.max_quality()

            if a_max < minquality and b_max < minquality:
                # If neither sub-matcher has a high enough max quality to
                # contribute, return an inactive matcher
                return mcore.NullMatcher()
            elif b_max < minquality:
                # If the b matcher can't contribute, return a
                return a.replace(minquality)
            elif a_max < minquality:
                # If the a matcher can't contribute, return b
                return b.replace(minquality)

        if not (a_active or b_active):
            return mcore.NullMatcher()
        elif not a_active:
            return b.replace(minquality)
        elif not b_active:
            return a.replace(minquality)

        # We CAN pass the minquality down here, since we don't add the two
        # scores together
        a = a.replace(minquality)
        b = b.replace(minquality)
        a_active = a.is_active()
        b_active = b.is_active()
        # It's kind of tedious to check for inactive sub-matchers all over
        # again here after we replace them, but it's probably better than
        # returning a replacement with an inactive sub-matcher
        if not (a_active or b_active):
            # Only an OR with *both* sides exhausted is empty. (Testing with
            # "and" here would discard the surviving side and make the two
            # branches below unreachable.)
            return mcore.NullMatcher()
        elif not a_active:
            return b
        elif not b_active:
            return a
        elif a is not self.a or b is not self.b:
            # If one of the sub-matchers changed, return a new DisMax that
            # keeps this matcher's tiebreak (as copy() does)
            return self.__class__(a, b, tiebreak=self.tiebreak)
        else:
            # Same as UnionMatcher.replace(): drop the cached id before
            # handing this object back
            self._id = None
            return self

    def score(self):
        """Return the score of the only active sub-matcher, or the larger of
        the two scores when both are active.
        """
        if not self.a.is_active():
            return self.b.score()
        elif not self.b.is_active():
            return self.a.score()
        else:
            return max(self.a.score(), self.b.score())

    def max_quality(self):
        """Return the larger max quality of the two sub-matchers."""
        return max(self.a.max_quality(), self.b.max_quality())

    def block_quality(self):
        """Return the larger block quality of the two sub-matchers."""
        return max(self.a.block_quality(), self.b.block_quality())

    def skip_to_quality(self, minquality):
        """Skip sub-matchers ahead while neither has a block quality greater
        than ``minquality``; return the accumulated skip count.
        """
        # The sub-matchers are about to move, so the document id cached by
        # UnionMatcher.id() must be invalidated (the inherited
        # skip_to_quality does the same).
        self._id = None

        a = self.a
        b = self.b

        # Short circuit if one matcher is inactive
        if not a.is_active():
            return b.skip_to_quality(minquality)
        elif not b.is_active():
            return a.skip_to_quality(minquality)

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and max(aq, bq) <= minquality:
            if aq <= minquality:
                skipped += a.skip_to_quality(minquality)
                aq = a.block_quality()
            if bq <= minquality:
                skipped += b.skip_to_quality(minquality)
                bq = b.block_quality()
        return skipped
|
||||
|
||||
|
||||
class IntersectionMatcher(AdditiveBiMatcher):
    """Matches the intersection (AND) of the postings in the two sub-matchers.

    Outside of the private helpers, both sub-matchers are kept on the same
    document id while the matcher is active.
    """

    def __init__(self, a, b):
        super(IntersectionMatcher, self).__init__(a, b)
        self._find_first()

    def reset(self):
        """Reset both sub-matchers and re-synchronize them."""
        self.a.reset()
        self.b.reset()
        self._find_first()

    def _find_first(self):
        """Move to the first common document if the sub-matchers do not
        already agree.
        """
        if (self.a.is_active()
                and self.b.is_active()
                and self.a.id() != self.b.id()):
            self._find_next()

    def replace(self, minquality=0):
        """Return a possibly simpler matcher equivalent to this one.

        ``minquality`` is the quality a posting needs in order to contribute;
        0 (the default) disables the quality-based rewrites.
        """
        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if not (a_active and b_active):
            # Intersection matcher requires that both sub-matchers be active
            return mcore.NullMatcher()

        if minquality:
            a_max = a.max_quality()
            b_max = b.max_quality()
            if a_max + b_max < minquality:
                # If the combined quality of the sub-matchers can't contribute,
                # return an inactive matcher
                return mcore.NullMatcher()
            # Require that the replacements be able to contribute results
            # higher than the minquality
            a_min = minquality - b_max
            b_min = minquality - a_max
        else:
            a_min = b_min = 0

        a = a.replace(a_min)
        b = b.replace(b_min)
        if not (a.is_active() and b.is_active()):
            # As above: an AND with an exhausted side matches nothing, so the
            # surviving side must not be returned on its own.
            return mcore.NullMatcher()
        elif a is not self.a or b is not self.b:
            return self.__class__(a, b)
        else:
            return self

    def is_active(self):
        """The intersection is active only while both sub-matchers are."""
        return self.a.is_active() and self.b.is_active()

    def _find_next(self):
        """Leapfrog the sub-matchers until they are on the same document.

        Must be called with the sub-matchers on different ids. Returns a
        truthy value if one of the ``skip_to`` calls returned one, or
        ``None`` if a sub-matcher ran out first.
        """
        a = self.a
        b = self.b
        a_id = a.id()
        b_id = b.id()
        assert a_id != b_id
        r = False

        while a.is_active() and b.is_active() and a_id != b_id:
            if a_id < b_id:
                ra = a.skip_to(b_id)
                if not a.is_active():
                    return
                r = r or ra
                a_id = a.id()
            else:
                rb = b.skip_to(a_id)
                if not b.is_active():
                    return
                r = r or rb
                b_id = b.id()
        return r

    def id(self):
        """Return the current document id (read from sub-matcher ``a``)."""
        return self.a.id()

    # Using sets is faster in some cases, but could potentially use a lot of
    # memory
    def all_ids(self):
        """Return an iterator over the sorted ids present in both
        sub-matchers.
        """
        return iter(sorted(set(self.a.all_ids()) & set(self.b.all_ids())))

    def skip_to(self, id):
        """Move both sub-matchers to ``id`` and re-synchronize them.

        Raises ``mcore.ReadTooFar`` if the matcher is not active.
        """
        if not self.is_active():
            raise mcore.ReadTooFar
        ra = self.a.skip_to(id)
        rb = self.b.skip_to(id)
        # Bound up front: when the skip exhausts a sub-matcher the
        # re-synchronization below is not run.
        rn = False
        if self.is_active() and self.a.id() != self.b.id():
            rn = self._find_next()
        return ra or rb or rn

    def skip_to_quality(self, minquality):
        """Skip ahead while the sum of the sub-matchers' block qualities is
        not greater than ``minquality``; return the accumulated skip count.
        """
        a = self.a
        b = self.b

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and aq + bq <= minquality:
            if aq < bq:
                # If the block quality of A is less than B, skip A ahead until
                # it can contribute at least the balance of the required min
                # quality when added to B
                sk = a.skip_to_quality(minquality - bq)
                skipped += sk
                if not sk and a.is_active():
                    # The matcher couldn't skip ahead for some reason, so just
                    # advance and try again
                    a.next()
            else:
                # And vice-versa
                sk = b.skip_to_quality(minquality - aq)
                skipped += sk
                if not sk and b.is_active():
                    b.next()

            if not a.is_active() or not b.is_active():
                # One of the matchers is exhausted
                break
            if a.id() != b.id():
                # We want to always leave in a state where the matchers are at
                # the same document, so call _find_next() to sync them
                self._find_next()

            # Get the block qualities at the new matcher positions
            aq = a.block_quality()
            bq = b.block_quality()
        return skipped

    def next(self):
        """Advance to the next document present in both sub-matchers.

        Raises ``mcore.ReadTooFar`` if the matcher is not active.
        """
        if not self.is_active():
            raise mcore.ReadTooFar

        # We must assume that the ids are equal whenever next() is called (they
        # should have been made equal by _find_next), so advance them both
        ar = self.a.next()
        # Bound up front: if advancing ``a`` exhausted it, _find_next() is not
        # called.
        nr = False
        if self.is_active():
            nr = self._find_next()
        return ar or nr

    def spans(self):
        """Return the sorted union of both sub-matchers' spans."""
        return sorted(set(self.a.spans()) | set(self.b.spans()))
|
||||
|
||||
|
||||
class AndNotMatcher(BiMatcher):
    """Matches the postings in the first sub-matcher that are NOT present in
    the second sub-matcher.

    ``a`` is the required ("positive") matcher and ``b`` the prohibited
    ("negative") one; weights, scores and values all come from ``a``.
    """

    def __init__(self, a, b):
        super(AndNotMatcher, self).__init__(a, b)
        self._find_first()

    def reset(self):
        """Reset both sub-matchers and move off any prohibited document."""
        self.a.reset()
        self.b.reset()
        self._find_first()

    def _find_first(self):
        """If both sub-matchers start on the same document, advance to the
        first document that is not prohibited.
        """
        if (self.a.is_active()
                and self.b.is_active()
                and self.a.id() == self.b.id()):
            self._find_next()

    def is_active(self):
        """Only the positive matcher decides whether this matcher is
        active.
        """
        return self.a.is_active()

    def _find_next(self):
        """Advance the positive matcher past documents that the negative
        matcher is also on.

        Returns ``None`` when there is nothing to do because either
        sub-matcher is exhausted; otherwise a truthy value if one of the
        ``next()`` calls on the positive matcher returned one.
        """
        pos = self.a
        neg = self.b
        # skip_to() and skip_to_quality() call this right after moving the
        # positive matcher, which may have exhausted it, so it must be
        # checked before its id is read.
        if not (pos.is_active() and neg.is_active()):
            return
        pos_id = pos.id()
        r = False

        if neg.id() < pos_id:
            neg.skip_to(pos_id)

        while pos.is_active() and neg.is_active() and pos_id == neg.id():
            nr = pos.next()
            if not pos.is_active():
                break

            r = r or nr
            pos_id = pos.id()
            neg.skip_to(pos_id)

        return r

    def supports_block_quality(self):
        return self.a.supports_block_quality()

    def replace(self, minquality=0):
        """Return a possibly simpler matcher equivalent to this one.

        ``minquality`` is the quality a posting needs in order to contribute;
        0 (the default) disables the quality-based rewrite.
        """
        if not self.a.is_active():
            # The a matcher is required, so if it's inactive, return an
            # inactive matcher
            return mcore.NullMatcher()
        elif (minquality
              and self.a.max_quality() < minquality):
            # If the quality of the required matcher isn't high enough to
            # contribute, return an inactive matcher
            return mcore.NullMatcher()
        elif not self.b.is_active():
            # If the prohibited matcher is inactive, convert to just the
            # required matcher
            return self.a.replace(minquality)

        a = self.a.replace(minquality)
        b = self.b.replace()
        if a is not self.a or b is not self.b:
            # If one of the sub-matchers was replaced, return a new AndNot
            return self.__class__(a, b)
        else:
            return self

    def max_quality(self):
        return self.a.max_quality()

    def block_quality(self):
        return self.a.block_quality()

    def skip_to_quality(self, minquality):
        """Skip the positive matcher by quality, then move off any
        prohibited document; return the positive matcher's skip count.
        """
        skipped = self.a.skip_to_quality(minquality)
        self._find_next()
        return skipped

    def id(self):
        return self.a.id()

    def next(self):
        """Advance to the next document of ``a`` that ``b`` is not on.

        Raises ``mcore.ReadTooFar`` if the positive matcher is exhausted.
        """
        if not self.a.is_active():
            raise mcore.ReadTooFar
        ar = self.a.next()
        nr = False
        if self.a.is_active() and self.b.is_active():
            nr = self._find_next()
        return ar or nr

    def skip_to(self, id):
        """Move to the first non-prohibited document at or after ``id``.

        Does nothing when ``id`` is before the current document. Raises
        ``mcore.ReadTooFar`` if the positive matcher is exhausted.
        """
        if not self.a.is_active():
            raise mcore.ReadTooFar
        if id < self.a.id():
            return

        self.a.skip_to(id)
        if self.b.is_active():
            self.b.skip_to(id)
            self._find_next()

    def weight(self):
        return self.a.weight()

    def score(self):
        return self.a.score()

    def supports(self, astype):
        return self.a.supports(astype)

    def value(self):
        return self.a.value()

    def value_as(self, astype):
        return self.a.value_as(astype)
|
||||
|
||||
|
||||
class AndMaybeMatcher(AdditiveBiMatcher):
    """Matches postings in the first sub-matcher, and if the same posting is
    in the second sub-matcher, adds their scores.

    ``a`` is the required matcher and drives iteration; ``b`` is optional
    and is only ever moved up to the document ``a`` is on.
    """

    def __init__(self, a, b):
        AdditiveBiMatcher.__init__(self, a, b)
        self._first_b()

    def reset(self):
        """Reset both sub-matchers and bring ``b`` up to ``a``."""
        self.a.reset()
        self.b.reset()
        self._first_b()

    def _first_b(self):
        """Skip the optional matcher to the required matcher's document when
        both are active and on different documents.
        """
        a = self.a
        b = self.b
        if a.is_active() and b.is_active() and a.id() != b.id():
            b.skip_to(a.id())

    def is_active(self):
        return self.a.is_active()

    def id(self):
        return self.a.id()

    def next(self):
        """Advance ``a`` and bring ``b`` up to its new document.

        Raises ``mcore.ReadTooFar`` if the required matcher is exhausted.
        """
        if not self.a.is_active():
            raise mcore.ReadTooFar

        ar = self.a.next()
        br = False
        if self.a.is_active() and self.b.is_active():
            br = self.b.skip_to(self.a.id())
        return ar or br

    def skip_to(self, id):
        """Skip ``a`` to ``id`` and, if both are still active, ``b`` too.

        Raises ``mcore.ReadTooFar`` if the required matcher is exhausted.
        """
        if not self.a.is_active():
            raise mcore.ReadTooFar

        ra = self.a.skip_to(id)
        rb = False
        if self.a.is_active() and self.b.is_active():
            rb = self.b.skip_to(id)
        return ra or rb

    def replace(self, minquality=0):
        """Return a possibly simpler matcher equivalent to this one.

        ``minquality`` is the quality a posting needs in order to contribute;
        0 (the default) disables the quality-based rewrites.
        """
        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if not a_active:
            return mcore.NullMatcher()
        elif minquality and b_active:
            if a.max_quality() + b.max_quality() < minquality:
                # If the combined max quality of the sub-matchers isn't high
                # enough to possibly contribute, return an inactive matcher
                return mcore.NullMatcher()
            elif a.max_quality() < minquality:
                # If the max quality of the main sub-matcher isn't high enough
                # to ever contribute without the optional sub- matcher, change
                # into an IntersectionMatcher
                return IntersectionMatcher(self.a, self.b)
        elif not b_active:
            return a.replace(minquality)

        new_a = a.replace(minquality - b.max_quality())
        new_b = b.replace(minquality - a.max_quality())
        if new_a is not a or new_b is not b:
            # If one of the sub-matchers changed, return a new AndMaybe
            return self.__class__(new_a, new_b)
        else:
            return self

    def skip_to_quality(self, minquality):
        """Skip ahead while the sum of the sub-matchers' block qualities is
        not greater than ``minquality``; return the accumulated skip count.

        Raises ``mcore.ReadTooFar`` if the required matcher is exhausted.
        """
        a = self.a
        b = self.b

        if not a.is_active():
            raise mcore.ReadTooFar
        if not b.is_active():
            return a.skip_to_quality(minquality)

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and aq + bq <= minquality:
            if aq < bq:
                skipped += a.skip_to_quality(minquality - bq)
                aq = a.block_quality()
            else:
                skipped += b.skip_to_quality(minquality - aq)
                bq = b.block_quality()

        return skipped

    def weight(self):
        """Return the weight of ``a``, plus the weight of ``b`` when ``b`` is
        active and on the same document.
        """
        # The optional matcher may already be exhausted, so check that it is
        # active before reading its id (score() does the same).
        if self.b.is_active() and self.a.id() == self.b.id():
            return self.a.weight() + self.b.weight()
        else:
            return self.a.weight()

    def score(self):
        """Return the score of ``a``, plus the score of ``b`` when ``b`` is
        active and on the same document.
        """
        if self.b.is_active() and self.a.id() == self.b.id():
            return self.a.score() + self.b.score()
        else:
            return self.a.score()

    def supports(self, astype):
        return self.a.supports(astype)

    def value(self):
        return self.a.value()

    def value_as(self, astype):
        return self.a.value_as(astype)
|
||||
312
venv/Lib/site-packages/whoosh/matching/combo.py
Normal file
312
venv/Lib/site-packages/whoosh/matching/combo.py
Normal file
@@ -0,0 +1,312 @@
|
||||
# Copyright 2010 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
from __future__ import division
|
||||
from array import array
|
||||
|
||||
from whoosh.compat import xrange
|
||||
from whoosh.matching import mcore
|
||||
|
||||
|
||||
class CombinationMatcher(mcore.Matcher):
    """Base for matchers that combine an arbitrary collection of
    sub-matchers, scaling combined scores by a boost factor.
    """

    def __init__(self, submatchers, boost=1.0):
        """Store the sub-matchers and the boost multiplier."""
        self._submatchers, self._boost = submatchers, boost

    def supports_block_quality(self):
        """Return True only if every sub-matcher supports block quality."""
        for sub in self._submatchers:
            if not sub.supports_block_quality():
                return False
        return True

    def max_quality(self):
        """Return the largest max quality among the active sub-matchers,
        multiplied by the boost.
        """
        best = max(sub.max_quality() for sub in self._submatchers
                   if sub.is_active())
        return best * self._boost

    def supports(self, astype):
        """Return True only if every sub-matcher supports ``astype``."""
        for sub in self._submatchers:
            if not sub.supports(astype):
                return False
        return True

    def children(self):
        """Return an iterator over the sub-matchers."""
        return iter(self._submatchers)

    def score(self):
        """Return the sum of all sub-matcher scores, multiplied by the
        boost.
        """
        total = 0
        for sub in self._submatchers:
            total += sub.score()
        return total * self._boost
|
||||
|
||||
|
||||
class PreloadedUnionMatcher(CombinationMatcher):
|
||||
"""Instead of marching the sub-matchers along in parallel, this
|
||||
matcher pre-reads the scores for EVERY MATCHING DOCUMENT, trading memory
|
||||
for speed.
|
||||
|
||||
This is faster than the implementation using a binary tree of
|
||||
:class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just
|
||||
because of less overhead), but it doesn't allow getting information about
|
||||
the "current" document other than the score, because there isn't really a
|
||||
current document, just an array of scores.
|
||||
"""
|
||||
|
||||
def __init__(self, submatchers, doccount, boost=1.0, scored=True):
|
||||
CombinationMatcher.__init__(self, submatchers, boost=boost)
|
||||
|
||||
self._doccount = doccount
|
||||
|
||||
a = array("d")
|
||||
active = [subm for subm in self._submatchers if subm.is_active()]
|
||||
if active:
|
||||
offset = self._docnum = min(m.id() for m in active)
|
||||
for m in active:
|
||||
while m.is_active():
|
||||
if scored:
|
||||
score = m.score() * boost
|
||||
else:
|
||||
score = boost
|
||||
|
||||
docnum = m.id()
|
||||
place = docnum - offset
|
||||
if len(a) <= place:
|
||||
a.extend(0 for _ in xrange(place - len(a) + 1))
|
||||
a[place] += score
|
||||
m.next()
|
||||
self._a = a
|
||||
self._offset = offset
|
||||
else:
|
||||
self._docnum = 0
|
||||
self._offset = 0
|
||||
self._a = a
|
||||
|
||||
def is_active(self):
|
||||
return self._docnum - self._offset < len(self._a)
|
||||
|
||||
def id(self):
|
||||
return self._docnum
|
||||
|
||||
def score(self):
|
||||
return self._a[self._docnum - self._offset]
|
||||
|
||||
def next(self):
|
||||
a = self._a
|
||||
offset = self._offset
|
||||
place = self._docnum - offset
|
||||
|
||||
place += 1
|
||||
while place < len(a) and a[place] == 0:
|
||||
place += 1
|
||||
self._docnum = place + offset
|
||||
|
||||
def max_quality(self):
|
||||
return max(self._a[self._docnum - self._offset:])
|
||||
|
||||
def block_quality(self):
|
||||
return self.max_quality()
|
||||
|
||||
def skip_to(self, docnum):
|
||||
if docnum < self._docnum:
|
||||
return
|
||||
|
||||
self._docnum = docnum
|
||||
i = docnum - self._offset
|
||||
if i < len(self._a) and self._a[i] == 0:
|
||||
self.next()
|
||||
|
||||
def skip_to_quality(self, minquality):
|
||||
a = self._a
|
||||
offset = self._offset
|
||||
place = self._docnum - offset
|
||||
|
||||
skipped = 0
|
||||
while place < len(a) and a[place] <= minquality:
|
||||
place += 1
|
||||
skipped = 1
|
||||
|
||||
self._docnum = place + offset
|
||||
return skipped
|
||||
|
||||
def supports(self, astype):
|
||||
# This matcher doesn't support any posting values
|
||||
return False
|
||||
|
||||
def all_ids(self):
    """Generate the document numbers, from the current position onward,
    whose slot in the score array is greater than zero.

    The matcher's own position is not modified.
    """

    scores = self._a
    base = self._offset
    index = self._docnum - base

    while index < len(scores):
        if not scores[index] > 0:
            # Empty slot: nothing to report for this document
            index += 1
            continue
        yield index + base
        index += 1
|
||||
|
||||
|
||||
class ArrayUnionMatcher(CombinationMatcher):
    """Instead of marching the sub-matchers along in parallel, this matcher
    pre-reads the scores for a large block of documents at a time from each
    matcher, accumulating the scores in an array.

    This is faster than the implementation using a binary tree of
    :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just
    because of less overhead), but it doesn't allow getting information about
    the "current" document other than the score, because there isn't really a
    current document, just an array of scores.
    """

    def __init__(self, submatchers, doccount, boost=1.0, scored=True,
                 partsize=2048):
        """
        :param submatchers: the matchers whose postings are combined.
        :param doccount: upper bound on document numbers; a current document
            number equal to this value means the matcher is inactive.
        :param boost: multiplier applied to every sub-matcher score.
        :param scored: if False, matching documents are marked with 1
            instead of accumulating scores.
        :param partsize: number of documents buffered at a time. A false
            value means ``doccount`` documents are buffered at once.
        """

        # NOTE(review): self._submatchers and self._boost are read below but
        # never assigned here -- presumably set by CombinationMatcher.
        CombinationMatcher.__init__(self, submatchers, boost=boost)
        self._scored = scored
        self._doccount = doccount

        if not partsize:
            partsize = doccount
        self._partsize = partsize

        self._a = array("d", (0 for _ in xrange(self._partsize)))
        self._docnum = self._min_id()
        self._read_part()

    def __repr__(self):
        return ("%s(%r, boost=%f, scored=%r, partsize=%d)"
                % (self.__class__.__name__, self._submatchers, self._boost,
                   self._scored, self._partsize))

    def _min_id(self):
        """Returns the lowest current ID among the active sub-matchers, or
        the document count if none of them is active.
        """

        active = [subm for subm in self._submatchers if subm.is_active()]
        if active:
            return min(subm.id() for subm in active)
        else:
            return self._doccount

    def _read_part(self):
        """Refills the array with the scores of the documents in the range
        starting at the current document number, advancing every sub-matcher
        past that range.
        """

        scored = self._scored
        boost = self._boost
        limit = min(self._docnum + self._partsize, self._doccount)
        offset = self._docnum
        a = self._a

        # Clear the array
        for i in xrange(self._partsize):
            a[i] = 0

        # Add the scores from the submatchers into the array
        for m in self._submatchers:
            while m.is_active() and m.id() < limit:
                i = m.id() - offset
                if scored:
                    a[i] += m.score() * boost
                else:
                    a[i] = 1
                m.next()

        self._offset = offset
        self._limit = limit

    def _find_next(self):
        """Moves to the first document at or after the current one with a
        score greater than zero, reading the next part if the buffered range
        is exhausted.
        """

        a = self._a
        docnum = self._docnum
        offset = self._offset
        limit = self._limit

        while docnum < limit:
            if a[docnum - offset] > 0:
                break
            docnum += 1

        if docnum == limit:
            self._docnum = self._min_id()
            self._read_part()
        else:
            self._docnum = docnum

    def supports(self, astype):
        # This matcher doesn't support any posting values
        return False

    def is_active(self):
        return self._docnum < self._doccount

    def max_quality(self):
        return max(m.max_quality() for m in self._submatchers)

    def block_quality(self):
        # The buffered part is treated as the current "block"
        return max(self._a)

    def skip_to(self, docnum):
        """Moves forward to the first matching document with a number equal
        to or greater than ``docnum``. A target behind the current document
        is ignored.
        """

        if docnum < self._docnum:
            # We've already passed it. Comparing against the current document
            # (not just the start of the buffered part) keeps the matcher
            # from stepping backwards inside the current part.
            return
        elif docnum < self._limit:
            # It's in the current part
            self._docnum = docnum
            self._find_next()
            return

        # Advance all active submatchers
        submatchers = self._submatchers
        for subm in submatchers:
            if subm.is_active():
                subm.skip_to(docnum)

        if any(subm.is_active() for subm in submatchers):
            # Rebuffer
            self._docnum = self._min_id()
            self._read_part()
        else:
            self._docnum = self._doccount

    def skip_to_quality(self, minquality):
        """Discards buffered parts until one has a block quality greater
        than ``minquality`` (or the matcher becomes inactive), and returns
        the number of parts discarded.
        """

        skipped = 0
        while self.is_active() and self.block_quality() <= minquality:
            skipped += 1
            self._docnum = self._limit
            self._read_part()
        if self.is_active():
            self._find_next()
        return skipped

    def id(self):
        return self._docnum

    def all_ids(self):
        """Returns a generator of the remaining matching document numbers,
        re-reading parts (and so advancing this matcher) as it goes.
        """

        doccount = self._doccount
        docnum = self._docnum
        offset = self._offset
        limit = self._limit

        a = self._a
        while docnum < doccount:
            if a[docnum - offset] > 0:
                yield docnum

            docnum += 1
            if docnum == limit:
                # End of the buffered part: load the next one
                self._docnum = docnum
                self._read_part()
                offset = self._offset
                limit = self._limit

    def next(self):
        self._docnum += 1
        return self._find_next()

    def score(self):
        return self._a[self._docnum - self._offset]
|
||||
622
venv/Lib/site-packages/whoosh/matching/mcore.py
Normal file
622
venv/Lib/site-packages/whoosh/matching/mcore.py
Normal file
@@ -0,0 +1,622 @@
|
||||
# Copyright 2010 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
"""
|
||||
This module contains "matcher" classes. Matchers deal with posting lists. The
|
||||
most basic matcher, which reads the list of postings for a term, will be
|
||||
provided by the backend implementation (for example,
|
||||
:class:`whoosh.filedb.filepostings.FilePostingReader`). The classes in this
|
||||
module provide additional functionality, such as combining the results of two
|
||||
matchers, or modifying the results of a matcher.
|
||||
|
||||
You do not need to deal with the classes in this module unless you need to
|
||||
write your own Matcher implementation to provide some new functionality. These
|
||||
classes are not instantiated by the user. They are usually created by a
|
||||
:class:`~whoosh.query.Query` object's :meth:`~whoosh.query.Query.matcher()`
|
||||
method, which returns the appropriate matcher to implement the query (for
|
||||
example, the :class:`~whoosh.query.Or` query's
|
||||
:meth:`~whoosh.query.Or.matcher()` method returns a
|
||||
:py:class:`~whoosh.matching.UnionMatcher` object).
|
||||
|
||||
Certain backends support "quality" optimizations. These backends have the
|
||||
ability to skip ahead if they know the current block of postings can't
|
||||
contribute to the top N documents. If the matcher tree and backend support
|
||||
these optimizations, the matcher's :meth:`Matcher.supports_block_quality()`
|
||||
method will return ``True``.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from itertools import repeat
|
||||
|
||||
from whoosh.compat import izip, xrange
|
||||
from whoosh.compat import abstractmethod
|
||||
|
||||
|
||||
# Exceptions
|
||||
|
||||
class ReadTooFar(Exception):
    """Signals that :meth:`~whoosh.matching.Matcher.next()` or
    :meth:`~whoosh.matching.Matcher.skip_to()` was called on a matcher that
    is no longer active.
    """
|
||||
|
||||
|
||||
class NoQualityAvailable(Exception):
    """Signals that a quality method was called on a matcher that does not
    support block quality optimizations.
    """
|
||||
|
||||
|
||||
# Classes
|
||||
|
||||
class Matcher(object):
    """Base class for all matchers.

    Methods marked abstract raise ``NotImplementedError`` here; the other
    methods supply default behavior built on top of them.
    """

    @abstractmethod
    def is_active(self):
        """Returns True if this matcher is still "active", that is, it has not
        yet reached the end of the posting list.
        """

        raise NotImplementedError

    @abstractmethod
    def reset(self):
        """Returns to the start of the posting list.

        Note that reset() may not do what you expect after you call
        :meth:`Matcher.replace()`, since this can mean calling reset() not on
        the original matcher, but on an optimized replacement.
        """

        raise NotImplementedError

    def term(self):
        """Returns a ``("fieldname", "termtext")`` tuple for the term this
        matcher matches, or None if this matcher is not a term matcher.
        """

        return None

    def term_matchers(self):
        """Returns an iterator of term matchers in this tree.
        """

        if self.term() is not None:
            yield self
        else:
            for cm in self.children():
                for m in cm.term_matchers():
                    yield m

    def matching_terms(self, id=None):
        """Returns an iterator of ``("fieldname", "termtext")`` tuples for the
        **currently matching** term matchers in this tree.

        :param id: if given, only matchers currently positioned on this ID
            contribute terms; by default this matcher's current ID is used.
        """

        if not self.is_active():
            return

        if id is None:
            id = self.id()
        elif id != self.id():
            return

        t = self.term()
        if t is None:
            for c in self.children():
                for t in c.matching_terms(id):
                    yield t
        else:
            yield t

    def is_leaf(self):
        """Returns True if this matcher has no children.
        """

        return not bool(self.children())

    def children(self):
        """Returns an (possibly empty) list of the submatchers of this
        matcher.
        """

        return []

    def replace(self, minquality=0):
        """Returns a possibly-simplified version of this matcher. For example,
        if one of the children of a UnionMatcher is no longer active, calling
        this method on the UnionMatcher will return the other child.
        """

        return self

    @abstractmethod
    def copy(self):
        """Returns a copy of this matcher.
        """

        raise NotImplementedError

    def depth(self):
        """Returns the depth of the tree under this matcher, or 0 if this
        matcher does not have any children.
        """

        return 0

    def supports_block_quality(self):
        """Returns True if this matcher supports the use of ``quality`` and
        ``block_quality``.
        """

        return False

    def max_quality(self):
        """Returns the maximum possible quality measurement for this matcher,
        according to the current weighting algorithm. Raises
        ``NoQualityAvailable`` if the matcher or weighting do not support
        quality measurements.
        """

        raise NoQualityAvailable(self.__class__)

    def block_quality(self):
        """Returns a quality measurement of the current block of postings,
        according to the current weighting algorithm. Raises
        ``NoQualityAvailable`` if the matcher or weighting do not support
        quality measurements.
        """

        raise NoQualityAvailable(self.__class__)

    @abstractmethod
    def id(self):
        """Returns the ID of the current posting.
        """

        raise NotImplementedError

    def all_ids(self):
        """Returns a generator of all IDs in the matcher.

        What this method returns for a matcher that has already read some
        postings (whether it only yields the remaining postings or all postings
        from the beginning) is undefined, so it's best to only use this method
        on fresh matchers.
        """

        i = 0
        m = self
        while m.is_active():
            yield m.id()
            m.next()
            i += 1
            # Every 10 postings, give the matcher a chance to simplify itself
            if i == 10:
                m = m.replace()
                i = 0

    def all_items(self):
        """Returns a generator of all (ID, encoded value) pairs in the matcher.

        What this method returns for a matcher that has already read some
        postings (whether it only yields the remaining postings or all postings
        from the beginning) is undefined, so it's best to only use this method
        on fresh matchers.
        """

        i = 0
        m = self
        # Test the matcher actually being advanced: once replace() hands back
        # a different object, ``self`` no longer tracks the read position.
        while m.is_active():
            yield (m.id(), m.value())
            m.next()
            i += 1
            # Every 10 postings, give the matcher a chance to simplify itself
            if i == 10:
                m = m.replace()
                i = 0

    def items_as(self, astype):
        """Returns a generator of all (ID, decoded value) pairs in the matcher.

        What this method returns for a matcher that has already read some
        postings (whether it only yields the remaining postings or all postings
        from the beginning) is undefined, so it's best to only use this method
        on fresh matchers.
        """

        while self.is_active():
            yield (self.id(), self.value_as(astype))
            self.next()

    @abstractmethod
    def value(self):
        """Returns the encoded value of the current posting.
        """

        raise NotImplementedError

    @abstractmethod
    def supports(self, astype):
        """Returns True if the field's format supports the named data type,
        for example 'frequency' or 'characters'.
        """

        raise NotImplementedError("supports not implemented in %s"
                                  % self.__class__)

    @abstractmethod
    def value_as(self, astype):
        """Returns the value(s) of the current posting as the given type.
        """

        raise NotImplementedError("value_as not implemented in %s"
                                  % self.__class__)

    def spans(self):
        """Returns a list of :class:`~whoosh.query.spans.Span` objects for the
        matches in this document. Raises an exception if the field being
        searched does not store positions.
        """

        from whoosh.query.spans import Span

        # Prefer character information when available, then plain positions
        if self.supports("characters"):
            return [Span(pos, startchar=startchar, endchar=endchar)
                    for pos, startchar, endchar in self.value_as("characters")]
        elif self.supports("positions"):
            return [Span(pos) for pos in self.value_as("positions")]
        else:
            raise Exception("Field does not support spans")

    def skip_to(self, id):
        """Moves this matcher to the first posting with an ID equal to or
        greater than the given ID.
        """

        while self.is_active() and self.id() < id:
            self.next()

    def skip_to_quality(self, minquality):
        """Moves this matcher to the next block with greater than the given
        minimum quality value.
        """

        raise NotImplementedError(self.__class__.__name__)

    @abstractmethod
    def next(self):
        """Moves this matcher to the next posting.
        """

        raise NotImplementedError(self.__class__.__name__)

    def weight(self):
        """Returns the weight of the current posting.
        """

        return self.value_as("weight")

    @abstractmethod
    def score(self):
        """Returns the score of the current posting.
        """

        raise NotImplementedError(self.__class__.__name__)

    # Comparisons look only at the classes of the two objects. Note that
    # __lt__ applies the same class-identity test as __eq__.

    def __eq__(self, other):
        return self.__class__ is type(other)

    def __lt__(self, other):
        return type(other) is self.__class__

    def __ne__(self, other):
        return not self.__eq__(other)

    def __gt__(self, other):
        return not (self.__lt__(other) or self.__eq__(other))

    def __le__(self, other):
        return self.__eq__(other) or self.__lt__(other)

    def __ge__(self, other):
        return self.__eq__(other) or self.__gt__(other)
|
||||
|
||||
|
||||
# Simple intermediate classes
|
||||
|
||||
class ConstantScoreMatcher(Matcher):
    """Intermediate class for matchers that report one fixed score for
    every posting.

    ``skip_to_quality()`` calls ``self.go_inactive()``, which is not defined
    in this class -- presumably subclasses are expected to provide it.
    """

    def __init__(self, score=1.0):
        """
        :param score: the value returned as score and as both quality
            measurements.
        """

        self._score = score

    def supports_block_quality(self):
        """Quality is always known, since the score never changes.
        """

        return True

    def max_quality(self):
        """Returns the fixed score.
        """

        return self._score

    def block_quality(self):
        """Returns the fixed score.
        """

        return self._score

    def skip_to_quality(self, minquality):
        """Deactivates the matcher if its fixed score is not greater than
        ``minquality``.
        """

        if self._score <= minquality:
            self.go_inactive()

    def score(self):
        """Returns the fixed score.
        """

        return self._score
|
||||
|
||||
|
||||
# Null matcher
|
||||
|
||||
class NullMatcherClass(Matcher):
    """A matcher that has no postings and is therefore never active.
    """

    def __call__(self):
        # Calling the instance hands back the instance itself, so code that
        # writes ``NullMatcher()`` gets the shared object
        return self

    def __repr__(self):
        return "<NullMatcher>"

    def is_active(self):
        """Always False: there is nothing to read.
        """

        return False

    def reset(self):
        """Nothing to rewind.
        """

        pass

    def copy(self):
        """Returns this same object rather than a new one.
        """

        return self

    def all_ids(self):
        """Returns an empty list.
        """

        return []

    def supports_block_quality(self):
        return True

    def max_quality(self):
        return 0

    def block_quality(self):
        return 0

    def skip_to_quality(self, minquality):
        return 0


# Singleton instance
NullMatcher = NullMatcherClass()
|
||||
|
||||
|
||||
class ListMatcher(Matcher):
    """Synthetic matcher backed by a list of IDs.
    """

    def __init__(self, ids, weights=None, values=None, format=None,
                 scorer=None, position=0, all_weights=None, term=None,
                 terminfo=None):
        """
        :param ids: a list of doc IDs.
        :param weights: a list of weights corresponding to the list of IDs.
            If this argument is not supplied, a list of 1.0 values is used.
        :param values: a list of encoded values corresponding to the list of
            IDs.
        :param format: a :class:`whoosh.formats.Format` object representing the
            format of the field.
        :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring
            the postings.
        :param position: the index in ``ids`` to start reading from.
        :param all_weights: a single weight used for every posting; when
            truthy it takes precedence over ``weights``.
        :param term: a ``("fieldname", "text")`` tuple, or None if this is not
            a term matcher.
        :param terminfo: an object providing ``min_length()``,
            ``max_length()`` and ``max_weight()`` for the block statistics
            methods.
        """

        self._ids = ids
        self._weights = weights
        self._all_weights = all_weights
        self._values = values
        self._i = position
        self._format = format
        self._scorer = scorer
        self._term = term
        self._terminfo = terminfo

    def __repr__(self):
        return "<%s>" % self.__class__.__name__

    def is_active(self):
        return self._i < len(self._ids)

    def reset(self):
        self._i = 0

    def skip_to(self, id):
        """Moves to the first posting with an ID equal to or greater than
        ``id``. Raises :class:`ReadTooFar` if the matcher is not active.
        """

        if not self.is_active():
            raise ReadTooFar
        if id < self.id():
            return

        while self._i < len(self._ids) and self._ids[self._i] < id:
            self._i += 1

    def term(self):
        return self._term

    def copy(self):
        """Returns a new matcher over the same lists at the same position.
        """

        # Carry the term and term info along so the copy still identifies as
        # a term matcher and can still answer the block statistics methods
        return self.__class__(self._ids, self._weights, self._values,
                              self._format, self._scorer, self._i,
                              self._all_weights, term=self._term,
                              terminfo=self._terminfo)

    def replace(self, minquality=0):
        """Returns the null matcher if this matcher is exhausted or cannot
        reach ``minquality``, otherwise returns this matcher.
        """

        if not self.is_active():
            return NullMatcher()
        elif minquality and self.max_quality() < minquality:
            return NullMatcher()
        else:
            return self

    def supports_block_quality(self):
        return (self._scorer is not None
                and self._scorer.supports_block_quality())

    def max_quality(self):
        # This matcher treats all postings in the list as one "block", so the
        # block quality is the same as the quality of the entire list
        if self._scorer:
            return self._scorer.block_quality(self)
        else:
            return self.block_max_weight()

    def block_quality(self):
        # Same single-block reasoning and same scorer-less fallback as
        # max_quality(), instead of failing on a missing scorer
        if self._scorer:
            return self._scorer.block_quality(self)
        else:
            return self.block_max_weight()

    def skip_to_quality(self, minquality):
        """Advances while the block quality is not greater than
        ``minquality``. Always returns 0.
        """

        while self._i < len(self._ids) and self.block_quality() <= minquality:
            self._i += 1
        return 0

    def id(self):
        return self._ids[self._i]

    def all_ids(self):
        # Iterates the whole list, regardless of the current position
        return iter(self._ids)

    def all_items(self):
        """Returns an iterator of (ID, encoded value) pairs over the whole
        list, using empty strings when there are no values.
        """

        values = self._values
        if values is None:
            values = repeat('')

        return izip(self._ids, values)

    def value(self):
        """Returns the encoded value of the current posting, or an empty
        string if this matcher has no values.
        """

        if self._values:
            v = self._values[self._i]

            if isinstance(v, list):
                # This object supports "values" that are actually lists of
                # value strings. This is to support combining the results of
                # several different matchers into a single ListMatcher (see the
                # TOO_MANY_CLAUSES functionality of MultiTerm). We combine the
                # values here instead of combining them first and then making
                # the ListMatcher to avoid wasting time combining values if the
                # consumer never asks for them.
                assert len(v) > 0
                if len(v) == 1:
                    v = v[0]
                else:
                    v = self._format.combine(v)
                # Replace the list with the computed value string
                self._values[self._i] = v

            return v
        else:
            return ''

    def value_as(self, astype):
        decoder = self._format.decoder(astype)
        return decoder(self.value())

    def supports(self, astype):
        return self._format.supports(astype)

    def next(self):
        self._i += 1

    def weight(self):
        """Returns ``all_weights`` if set, else the current entry of
        ``weights``, else 1.0.
        """

        if self._all_weights:
            return self._all_weights
        elif self._weights:
            return self._weights[self._i]
        else:
            return 1.0

    def block_min_length(self):
        return self._terminfo.min_length()

    def block_max_length(self):
        return self._terminfo.max_length()

    def block_max_weight(self):
        """Returns the largest weight in the list, taken from (in order of
        preference) ``all_weights``, ``weights``, the term info, or 1.0.
        """

        if self._all_weights:
            return self._all_weights
        elif self._weights:
            return max(self._weights)
        elif self._terminfo is not None:
            return self._terminfo.max_weight()
        else:
            return 1.0

    def score(self):
        """Returns the scorer's score for the current posting, or the weight
        if there is no scorer.
        """

        if self._scorer:
            return self._scorer.score(self)
        else:
            return self.weight()
|
||||
|
||||
|
||||
# Term/vector leaf posting matcher middleware
|
||||
|
||||
class LeafMatcher(Matcher):
    """Shared behavior for leaf matchers that read term/vector postings.
    """

    # Subclasses need to set
    #   self.scorer -- a Scorer object or None
    #   self.format -- Format object for the posting values
    # term() additionally reads self._term.

    def __repr__(self):
        name = self.__class__.__name__
        return "%s(%r, %s)" % (name, self.term(), self.is_active())

    def term(self):
        return self._term

    def supports(self, astype):
        return self.format.supports(astype)

    def value_as(self, astype):
        """Decodes the current posting value as ``astype``.
        """

        decode = self.format.decoder(astype)
        return decode(self.value())

    def items_as(self, astype):
        """Generates (ID, decoded value) pairs from ``all_items()``.
        """

        decode = self.format.decoder(astype)
        for docid, encoded in self.all_items():
            yield (docid, decode(encoded))

    def spans(self):
        """Builds the list of Span objects for the current posting from
        character information if the format supports it, otherwise from
        positions. Raises an exception if neither is supported.
        """

        from whoosh.query.spans import Span

        if self.supports("characters"):
            chars = self.value_as("characters")
            return [Span(pos, startchar=startchar, endchar=endchar)
                    for pos, startchar, endchar in chars]

        if self.supports("positions"):
            return [Span(pos) for pos in self.value_as("positions")]

        raise Exception("Field does not support positions (%r)"
                        % self.term())

    def supports_block_quality(self):
        # Deliberately an and-expression: the result is the scorer's own
        # answer, or the (falsy) scorer itself when there is none
        scorer = self.scorer
        return scorer and scorer.supports_block_quality()

    def max_quality(self):
        return self.scorer.max_quality()

    def block_quality(self):
        return self.scorer.block_quality(self)

    def score(self):
        return self.scorer.score(self)
|
||||
572
venv/Lib/site-packages/whoosh/matching/wrappers.py
Normal file
572
venv/Lib/site-packages/whoosh/matching/wrappers.py
Normal file
@@ -0,0 +1,572 @@
|
||||
# Copyright 2010 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
from __future__ import division
|
||||
|
||||
from whoosh.compat import xrange
|
||||
from whoosh.matching import mcore
|
||||
|
||||
|
||||
class WrappingMatcher(mcore.Matcher):
    """Base class for matchers that wrap a single sub-matcher.

    Most methods forward to the wrapped child; the quality, weight and score
    methods also apply ``boost``.
    """

    def __init__(self, child, boost=1.0):
        """
        :param child: the matcher to wrap.
        :param boost: multiplier applied to the child's weight, score and
            quality values.
        """

        self.child = child
        self.boost = boost

    def __repr__(self):
        name = self.__class__.__name__
        return "%s(%r, boost=%s)" % (name, self.child, self.boost)

    def copy(self):
        """Returns a new wrapper of the same class around a copy of the
        child, passing the boost along if this object has one.
        """

        kwargs = {"boost": self.boost} if hasattr(self, "boost") else {}
        return self.__class__(self.child.copy(), **kwargs)

    def depth(self):
        return self.child.depth() + 1

    def _replacement(self, newchild):
        # Builds the wrapper that stands in for this one around a new child
        return self.__class__(newchild, boost=self.boost)

    def replace(self, minquality=0):
        """Asks the child to replace itself; returns this wrapper unchanged
        if the child did not change, otherwise a new wrapper around the new
        child.
        """

        newchild = self.child.replace(minquality)
        if newchild is self.child:
            return self
        return self._replacement(newchild)

    # Plain delegation to the child

    def id(self):
        return self.child.id()

    def all_ids(self):
        return self.child.all_ids()

    def is_active(self):
        return self.child.is_active()

    def reset(self):
        self.child.reset()

    def children(self):
        return [self.child]

    def supports(self, astype):
        return self.child.supports(astype)

    def value(self):
        return self.child.value()

    def value_as(self, astype):
        return self.child.value_as(astype)

    def spans(self):
        return self.child.spans()

    def skip_to(self, id):
        return self.child.skip_to(id)

    def next(self):
        self.child.next()

    def supports_block_quality(self):
        return self.child.supports_block_quality()

    # Delegation with the boost applied

    def skip_to_quality(self, minquality):
        # The child's quality is unboosted, so the threshold is scaled down
        threshold = minquality / self.boost
        return self.child.skip_to_quality(threshold)

    def max_quality(self):
        return self.child.max_quality() * self.boost

    def block_quality(self):
        return self.child.block_quality() * self.boost

    def weight(self):
        return self.child.weight() * self.boost

    def score(self):
        return self.child.score() * self.boost
|
||||
|
||||
|
||||
class MultiMatcher(mcore.Matcher):
|
||||
"""Serializes the results of a list of sub-matchers.
|
||||
"""
|
||||
|
||||
def __init__(self, matchers, idoffsets, scorer=None, current=0):
|
||||
"""
|
||||
:param matchers: a list of Matcher objects.
|
||||
:param idoffsets: a list of offsets corresponding to items in the
|
||||
``matchers`` list.
|
||||
"""
|
||||
|
||||
self.matchers = matchers
|
||||
self.offsets = idoffsets
|
||||
self.scorer = scorer
|
||||
self.current = current
|
||||
self._next_matcher()
|
||||
|
||||
def __repr__(self):
|
||||
return "%s(%r, %r, current=%s)" % (self.__class__.__name__,
|
||||
self.matchers, self.offsets,
|
||||
self.current)
|
||||
|
||||
def is_active(self):
|
||||
return self.current < len(self.matchers)
|
||||
|
||||
def reset(self):
|
||||
for mr in self.matchers:
|
||||
mr.reset()
|
||||
self.current = 0
|
||||
|
||||
def children(self):
|
||||
return [self.matchers[self.current]]
|
||||
|
||||
def _next_matcher(self):
|
||||
matchers = self.matchers
|
||||
while (self.current < len(matchers)
|
||||
and not matchers[self.current].is_active()):
|
||||
self.current += 1
|
||||
|
||||
def copy(self):
|
||||
return self.__class__([mr.copy() for mr in self.matchers],
|
||||
self.offsets, current=self.current)
|
||||
|
||||
def depth(self):
|
||||
if self.is_active():
|
||||
return 1 + max(mr.depth() for mr in self.matchers[self.current:])
|
||||
else:
|
||||
return 0
|
||||
|
||||
def replace(self, minquality=0):
|
||||
m = self
|
||||
if minquality:
|
||||
# Skip sub-matchers that don't have a high enough max quality to
|
||||
# contribute
|
||||
while (m.is_active()
|
||||
and m.matchers[m.current].max_quality() < minquality):
|
||||
m = self.__class__(self.matchers, self.offsets, self.scorer,
|
||||
m.current + 1)
|
||||
m._next_matcher()
|
||||
|
||||
if not m.is_active():
|
||||
return mcore.NullMatcher()
|
||||
|
||||
# TODO: Possible optimization: if the last matcher is current, replace
|
||||
# this with the last matcher, but wrap it with a matcher that adds the
|
||||
# offset. Have to check whether that's actually faster, though.
|
||||
return m
|
||||
|
||||
def id(self):
|
||||
current = self.current
|
||||
return self.matchers[current].id() + self.offsets[current]
|
||||
|
||||
def all_ids(self):
|
||||
offsets = self.offsets
|
||||
for i, mr in enumerate(self.matchers):
|
||||
for id in mr.all_ids():
|
||||
yield id + offsets[i]
|
||||
|
||||
def spans(self):
|
||||
return self.matchers[self.current].spans()
|
||||
|
||||
def supports(self, astype):
|
||||
return self.matchers[self.current].supports(astype)
|
||||
|
||||
def value(self):
|
||||
return self.matchers[self.current].value()
|
||||
|
||||
def value_as(self, astype):
|
||||
return self.matchers[self.current].value_as(astype)
|
||||
|
||||
def next(self):
|
||||
if not self.is_active():
|
||||
raise mcore.ReadTooFar
|
||||
|
||||
self.matchers[self.current].next()
|
||||
if not self.matchers[self.current].is_active():
|
||||
self._next_matcher()
|
||||
|
||||
def skip_to(self, id):
    """Moves forward until this matcher's ID is at least ``id`` or the
    sub-matchers are used up.

    Each sub-matcher is asked to skip to ``id`` minus its own offset.
    Returns nothing if the matcher is already at or past ``id``; otherwise
    returns the "or" of the values returned by the sub-matchers'
    ``skip_to`` calls.

    :raises ReadTooFar: if this matcher is not active.
    """

    if not self.is_active():
        raise mcore.ReadTooFar
    if id <= self.id():
        return

    subs = self.matchers
    bases = self.offsets
    skipped = False

    while self.current < len(subs) and id > self.id():
        idx = self.current
        sub = subs[idx]
        # Translate the target ID into the sub-matcher's own ID space
        result = sub.skip_to(id - bases[idx])
        skipped = result or skipped
        if sub.is_active():
            break

        self._next_matcher()

    return skipped
|
||||
|
||||
def supports_block_quality(self):
    """Returns True only if every sub-matcher from the current one onward
    reports that it supports block quality.
    """

    for sub in self.matchers[self.current:]:
        if not sub.supports_block_quality():
            return False
    return True
|
||||
|
||||
def max_quality(self):
    """Returns the largest ``max_quality()`` among the sub-matchers from
    the current one onward.
    """

    remaining = self.matchers[self.current:]
    return max(sub.max_quality() for sub in remaining)
|
||||
|
||||
def block_quality(self):
    """Returns the result of ``block_quality()`` on the current
    sub-matcher.
    """

    active = self.matchers[self.current]
    return active.block_quality()
|
||||
|
||||
def weight(self):
    """Returns the result of ``weight()`` on the current sub-matcher."""

    active = self.matchers[self.current]
    return active.weight()
|
||||
|
||||
def score(self):
    """Returns the score the attached scorer computes for this matcher
    (the matcher itself is passed to ``scorer.score``).
    """

    scorer = self.scorer
    return scorer.score(self)
|
||||
|
||||
|
||||
def ExcludeMatcher(child, excluded, boost=1.0):
    """Convenience factory: returns a ``FilterMatcher`` over ``child`` in
    exclude mode, so postings whose IDs are in ``excluded`` are dropped.

    :param child: the matcher to filter.
    :param excluded: the collection of IDs to leave out.
    :param boost: passed through to the ``FilterMatcher``.
    """

    matcher = FilterMatcher(child, excluded, exclude=True, boost=boost)
    return matcher
|
||||
|
||||
|
||||
class FilterMatcher(WrappingMatcher):
    """Wraps a matcher and lets through only the postings whose IDs are
    present in (or, in exclude mode, absent from) a given set.
    """

    def __init__(self, child, ids, exclude=False, boost=1.0):
        """
        :param child: the child matcher.
        :param ids: a set of IDs to filter by.
        :param exclude: by default, only IDs from the wrapped matcher that are
            **in** the set are used. If this argument is True, only IDs from
            the wrapped matcher that are **not in** the set are used.
        :param boost: stored on this matcher as ``self.boost``.
        """

        super(FilterMatcher, self).__init__(child)
        self.boost = boost
        self._exclude = exclude
        self._ids = ids
        # Move the child onto the first posting that passes the filter
        self._find_next()

    def __repr__(self):
        fields = (self.__class__.__name__, self.child, self._ids,
                  self._exclude, self.boost)
        return "%s(%r, %r, %r, boost=%s)" % fields

    def reset(self):
        """Resets the child, then re-applies the filter from the start."""

        self.child.reset()
        self._find_next()

    def copy(self):
        """Returns a new filter over a copy of the child, sharing the same
        ID set, mode and boost.
        """

        cls = self.__class__
        childcopy = self.child.copy()
        return cls(childcopy, self._ids, self._exclude, boost=self.boost)

    def _replacement(self, newchild):
        # Rebuild this wrapper with identical settings around a new child
        cls = self.__class__
        return cls(newchild, self._ids, exclude=self._exclude,
                   boost=self.boost)

    def _find_next(self):
        """Advances the child past every posting the filter rejects.

        Returns the "or" of the values returned by the child's ``next()``
        calls, or False if the child did not need to move.
        """

        child = self.child
        ids = self._ids
        # In exclude mode a posting is rejected when its ID IS in the set;
        # otherwise it is rejected when its ID is NOT in the set.
        reject_members = bool(self._exclude)
        moved = False

        while child.is_active() and (child.id() in ids) == reject_members:
            moved = child.next() or moved
        return moved

    def next(self):
        """Steps the child forward, then skips any filtered-out postings."""

        self.child.next()
        self._find_next()

    def skip_to(self, id):
        """Skips the child to ``id``, then skips any filtered-out
        postings.
        """

        self.child.skip_to(id)
        self._find_next()

    def all_ids(self):
        """Returns a generator of the child's IDs that pass the filter."""

        ids = self._ids
        reject_members = bool(self._exclude)
        return (docnum for docnum in self.child.all_ids()
                if (docnum in ids) != reject_members)

    def all_items(self):
        """Returns a generator of the child's items whose first element
        (the ID) passes the filter.
        """

        ids = self._ids
        reject_members = bool(self._exclude)
        return (item for item in self.child.all_items()
                if (item[0] in ids) != reject_members)
|
||||
|
||||
|
||||
class InverseMatcher(WrappingMatcher):
    """Synthetic matcher, generates postings that are NOT present in the
    wrapped matcher.

    IDs run from the starting ``id`` up to (but not including) ``limit``.
    An ID is produced only if the wrapped matcher does not contain it and
    the ``missing`` callable does not report it as missing.
    """

    def __init__(self, child, limit, missing=None, weight=1.0, id=0):
        """
        :param child: the matcher whose postings are inverted.
        :param limit: this matcher is active only while its current ID is
            below this value.
        :param missing: optional callable taking an ID and returning True
            if that ID must be skipped. Defaults to a function that always
            returns False.
        :param weight: the value returned by both ``weight()`` and
            ``score()``.
        :param id: the ID to start from.
        """

        super(InverseMatcher, self).__init__(child)
        self.limit = limit
        self._weight = weight
        self.missing = missing or (lambda id: False)
        self._id = id
        self._find_next()

    def copy(self):
        """Returns a new matcher over a copy of the child, positioned at
        the same ID.
        """

        return self.__class__(self.child.copy(), self.limit,
                              weight=self._weight, missing=self.missing,
                              id=self._id)

    def _replacement(self, newchild):
        # Rebuild this wrapper with identical settings around a new child
        return self.__class__(newchild, self.limit, missing=self.missing,
                              weight=self._weight, id=self._id)

    def is_active(self):
        """Returns True while the current ID is below the limit."""

        return self._id < self.limit

    def reset(self):
        """Resets the child and restarts this matcher from ID 0."""

        self.child.reset()
        self._id = 0
        self._find_next()

    def supports_block_quality(self):
        return False

    def _find_next(self):
        """Moves ``self._id`` forward (if necessary) to the first ID below
        the limit that is neither missing nor present in the child.

        The scan continues after the child runs out, so a trailing missing
        ID is never produced, and the child is re-synchronized on every
        step, so an ID the child contains is never produced either.
        ``missing`` is only called with IDs below the limit.
        """

        child = self.child
        missing = self.missing
        limit = self.limit

        while self._id < limit:
            # Missing documents are never produced
            if missing(self._id):
                self._id += 1
                continue

            if child.is_active():
                # Catch the child matcher up to where this matcher is
                if child.id() < self._id:
                    child.skip_to(self._id)

                # The child contains this ID, so it is not part of the
                # inverse; step both past it
                if child.is_active() and child.id() == self._id:
                    self._id += 1
                    child.next()
                    continue

            # self._id is not missing and not in the child
            break

    def id(self):
        """Returns the current ID."""

        return self._id

    def all_ids(self):
        # Uses the base Matcher implementation directly
        return mcore.Matcher.all_ids(self)

    def next(self):
        """Moves to the next ID that is not in the child and not missing.

        :raises ReadTooFar: if the matcher is already at or past the limit.
        """

        if self._id >= self.limit:
            raise mcore.ReadTooFar
        self._id += 1
        self._find_next()

    def skip_to(self, id):
        """Moves to the first valid ID that is at least ``id``. Does
        nothing if ``id`` is behind the current position.

        :raises ReadTooFar: if the matcher is already at or past the limit.
        """

        if self._id >= self.limit:
            raise mcore.ReadTooFar
        if id < self._id:
            return
        self._id = id
        self._find_next()

    def weight(self):
        return self._weight

    def score(self):
        return self._weight
|
||||
|
||||
|
||||
class RequireMatcher(WrappingMatcher):
    """Matches postings that are in both sub-matchers, but only uses scores
    from the first.

    The wrapped child is an ``IntersectionMatcher`` of the two
    sub-matchers; quality, weight, score and value queries are answered by
    the first sub-matcher (``a``) alone.
    """

    def __init__(self, a, b):
        """
        :param a: the matcher that supplies scores, weights and values.
        :param b: the matcher whose postings are additionally required.
        """

        from whoosh.matching.binary import IntersectionMatcher

        self.a = a
        self.b = b
        WrappingMatcher.__init__(self, IntersectionMatcher(a, b))

    def copy(self):
        """Returns a new matcher built from copies of both sub-matchers."""

        return self.__class__(self.a.copy(), self.b.copy())

    def supports_block_quality(self):
        return self.a.supports_block_quality()

    def replace(self, minquality=0):
        """Returns a matcher to use in place of this one.

        Returns a ``NullMatcher`` when the intersection is inactive, when
        ``a`` cannot reach ``minquality``, or when the replacement for
        ``a`` is inactive. Returns a new instance wrapping the replaced
        sub-matchers if either changed, and ``self`` otherwise.
        """

        if not self.child.is_active():
            # If one of the sub-matchers is inactive, go inactive
            return mcore.NullMatcher()
        elif minquality and self.a.max_quality() < minquality:
            # If the required matcher doesn't have a high enough max quality
            # to possibly contribute, return an inactive matcher
            return mcore.NullMatcher()

        # Only "a" contributes to scoring, so only it gets the quality bar
        new_a = self.a.replace(minquality)
        new_b = self.b.replace()
        if not new_a.is_active():
            return mcore.NullMatcher()
        elif new_a is not self.a or new_b is not self.b:
            # If one of the sub-matchers changed, return a new Require that
            # wraps BOTH replacements (the replacement for b used to be
            # computed and then thrown away in favor of the old self.b)
            return self.__class__(new_a, new_b)
        else:
            return self

    def max_quality(self):
        return self.a.max_quality()

    def block_quality(self):
        return self.a.block_quality()

    def skip_to_quality(self, minquality):
        """Asks ``a`` to skip to ``minquality``, then has the wrapped
        intersection re-run its ``_find_next()``. Returns whatever ``a``
        returned.
        """

        skipped = self.a.skip_to_quality(minquality)
        self.child._find_next()
        return skipped

    def weight(self):
        return self.a.weight()

    def score(self):
        return self.a.score()

    def supports(self, astype):
        return self.a.supports(astype)

    def value(self):
        return self.a.value()

    def value_as(self, astype):
        return self.a.value_as(astype)
|
||||
|
||||
|
||||
class ConstantScoreWrapperMatcher(WrappingMatcher):
    """Wraps a matcher and reports one fixed number as the score, the
    block quality and the maximum quality.
    """

    def __init__(self, child, score=1.0):
        """
        :param child: the matcher to wrap.
        :param score: the constant to report.
        """

        WrappingMatcher.__init__(self, child)
        self._score = score

    def copy(self):
        """Returns a new wrapper with the same constant around a copy of
        the child.
        """

        cls = self.__class__
        childcopy = self.child.copy()
        return cls(childcopy, score=self._score)

    def _replacement(self, newchild):
        # Rebuild this wrapper with the same constant around a new child
        cls = self.__class__
        return cls(newchild, score=self._score)

    def max_quality(self):
        """Returns the constant score."""

        return self._score

    def block_quality(self):
        """Returns the constant score."""

        return self._score

    def score(self):
        """Returns the constant score."""

        return self._score
|
||||
|
||||
|
||||
class SingleTermMatcher(WrappingMatcher):
    """Makes a tree of matchers act as if they were a matcher for a single
    term for the purposes of "what terms are matching?" questions.
    """

    def __init__(self, child, term):
        """
        :param child: the matcher (tree) to wrap.
        :param term: the value that ``term()`` reports for the whole tree.
        """

        WrappingMatcher.__init__(self, child)
        self._term = term

    def copy(self):
        """Returns a new wrapper reporting the same term around a copy of
        the child.
        """

        # The constructor requires ``term``, so build the copy explicitly,
        # the same way the other wrappers with extra constructor arguments
        # do.
        return self.__class__(self.child.copy(), self._term)

    def term(self):
        """Returns the term given to the constructor."""

        return self._term

    def replace(self, minquality=0):
        """Always returns ``self``, so the wrapper stays in place;
        ``minquality`` is ignored.
        """

        return self
|
||||
|
||||
|
||||
class CoordMatcher(WrappingMatcher):
    """Modifies the computed score to penalize documents that don't match all
    terms in the matcher tree.

    Because this matcher modifies the score, it may give unexpected results
    when compared to another matcher returning the unmodified score.
    """

    def __init__(self, child, scale=1.0):
        """
        :param child: the matcher (tree) to wrap.
        :param scale: scaling factor used by the coordination formula.
        """

        WrappingMatcher.__init__(self, child)
        # Number of term matchers in the wrapped tree, counted once up front
        self._termcount = len(list(child.term_matchers()))
        self._maxqual = child.max_quality()
        self._scale = scale

    def copy(self):
        """Returns a new wrapper with the same scale around a copy of the
        child.
        """

        # Build the copy explicitly so the scale is carried over, mirroring
        # what _replacement() does for a replaced child.
        return self.__class__(self.child.copy(), scale=self._scale)

    def _replacement(self, newchild):
        # Rebuild this wrapper with the same scale around a new child
        return self.__class__(newchild, scale=self._scale)

    def _sqr(self, score, matching):
        # This is the "SQR" (Short Query Ranking) function used by Apple's old
        # V-twin search library, described in the paper "V-Twin: A Lightweight
        # Engine for Interactive Use".
        #
        # http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.56.1916

        # score - document score using the current weighting function
        # matching - number of matching terms in the current document
        termcount = self._termcount  # Number of terms in this tree
        scale = self._scale  # Scaling factor

        # NOTE(review): this divides by (termcount - scale) ** 2 and by
        # termcount, so it raises ZeroDivisionError when the tree has no
        # terms or when the term count equals the scale (e.g. one term with
        # the default scale of 1.0). The formula is left untouched here;
        # confirm the intended result for those cases before guarding.
        sqr = ((score + ((matching - 1) / (termcount - scale) ** 2))
               * ((termcount - 1) / termcount))
        return sqr

    def max_quality(self):
        """Returns the adjusted form of the child's max quality, computed
        as if every term in the tree matched.
        """

        return self._sqr(self.child.max_quality(), self._termcount)

    def block_quality(self):
        """Returns the adjusted form of the child's block quality, computed
        as if every term in the tree matched.
        """

        return self._sqr(self.child.block_quality(), self._termcount)

    def score(self):
        """Returns the child's score for the current document, adjusted by
        how many terms the child reports as matching that document.
        """

        child = self.child

        score = child.score()
        # Count the terms the child reports for the current document
        matching = sum(1 for _ in child.matching_terms(child.id()))

        return self._sqr(score, matching)
|
||||
Reference in New Issue
Block a user