This commit is contained in:
“shengyudong”
2026-01-06 14:18:39 +08:00
commit 5a384b694e
10345 changed files with 2050918 additions and 0 deletions

View File

@@ -0,0 +1,31 @@
# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.matching.mcore import *
from whoosh.matching.binary import *
from whoosh.matching.wrappers import *
from whoosh.matching.combo import *

View File

@@ -0,0 +1,803 @@
# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.matching import mcore
class BiMatcher(mcore.Matcher):
    """Abstract parent for matchers that merge the postings of exactly two
    sub-matchers, ``a`` and ``b``, in some way.
    """

    def __init__(self, a, b):
        super(BiMatcher, self).__init__()
        self.a = a
        self.b = b

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self.a, self.b)

    def reset(self):
        # Rewind both children to the start of their posting lists
        self.a.reset()
        self.b.reset()

    def children(self):
        return [self.a, self.b]

    def copy(self):
        return self.__class__(self.a.copy(), self.b.copy())

    def depth(self):
        # One level for this node plus the deeper of the two subtrees
        deeper = max(self.a.depth(), self.b.depth())
        return 1 + deeper

    def skip_to(self, id):
        if not self.is_active():
            raise mcore.ReadTooFar
        moved_a = self.a.skip_to(id)
        moved_b = self.b.skip_to(id)
        return moved_a or moved_b

    def supports_block_quality(self):
        return (self.a.supports_block_quality()
                and self.b.supports_block_quality())

    def supports(self, astype):
        return self.a.supports(astype) and self.b.supports(astype)
class AdditiveBiMatcher(BiMatcher):
    """Abstract parent for binary matchers whose weight/score is the sum of
    the two sub-matchers' weights/scores.
    """

    def max_quality(self):
        # Sum the max qualities of whichever sub-matchers are still active
        total = 0.0
        for sub in (self.a, self.b):
            if sub.is_active():
                total += sub.max_quality()
        return total

    def block_quality(self):
        # Sum the block qualities of the active sub-matchers
        total = 0.0
        for sub in (self.a, self.b):
            if sub.is_active():
                total += sub.block_quality()
        return total

    def weight(self):
        return self.a.weight() + self.b.weight()

    def score(self):
        return self.a.score() + self.b.score()

    # NOTE(review): the comparisons below are class-identity based, and
    # __eq__ and __lt__ perform the same test, so two instances of the same
    # class compare both "equal" and "less than" -- preserved exactly as in
    # the original; confirm before relying on ordering semantics.
    def __eq__(self, other):
        return self.__class__ is type(other)

    def __lt__(self, other):
        return type(other) is self.__class__

    def __ne__(self, other):
        return not self.__eq__(other)

    def __gt__(self, other):
        return not (self.__lt__(other) or self.__eq__(other))

    def __le__(self, other):
        return self.__eq__(other) or self.__lt__(other)

    def __ge__(self, other):
        return self.__eq__(other) or self.__gt__(other)
class UnionMatcher(AdditiveBiMatcher):
    """Matches the union (OR) of the postings in the two sub-matchers.
    """

    # Cached result of id(); None means "not computed since the sub-matchers
    # last moved"
    _id = None

    def replace(self, minquality=0):
        """Returns an optimized equivalent of this matcher, converting to an
        intersection/and-maybe/single matcher where the quality threshold or
        sub-matcher exhaustion allows it.
        """
        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        # If neither sub-matcher on its own has a high enough max quality to
        # contribute, convert to an intersection matcher
        if minquality and a_active and b_active:
            a_max = a.max_quality()
            b_max = b.max_quality()
            if a_max < minquality and b_max < minquality:
                return IntersectionMatcher(a, b).replace(minquality)
            elif a_max < minquality:
                # Only b can reach minquality alone; a can only add to b's docs
                return AndMaybeMatcher(b, a)
            elif b_max < minquality:
                return AndMaybeMatcher(a, b)

        # If one or both of the sub-matchers are inactive, convert
        if not (a_active or b_active):
            return mcore.NullMatcher()
        elif not a_active:
            return b.replace(minquality)
        elif not b_active:
            return a.replace(minquality)

        # Replace each sub-matcher, requiring it to make up the difference
        # between minquality and what the other side can contribute.
        # NOTE(review): the second call uses the *replaced* a's max_quality(),
        # not the original's -- confirm this asymmetry is intentional
        a = a.replace(minquality - b.max_quality() if minquality else 0)
        b = b.replace(minquality - a.max_quality() if minquality else 0)
        # If one of the sub-matchers changed, return a new union
        if a is not self.a or b is not self.b:
            return self.__class__(a, b)
        else:
            # Invalidate the cached ID in case the sub-matchers moved
            self._id = None
            return self

    def is_active(self):
        # A union is active while either side still has postings
        return self.a.is_active() or self.b.is_active()

    def skip_to(self, id):
        self._id = None
        ra = rb = False

        if self.a.is_active():
            ra = self.a.skip_to(id)
        if self.b.is_active():
            rb = self.b.skip_to(id)

        return ra or rb

    def id(self):
        """Returns the current document ID: the minimum of the sub-matchers'
        current IDs (cached until the matchers move).
        """
        _id = self._id
        if _id is not None:
            return _id

        a = self.a
        b = self.b
        if not a.is_active():
            _id = b.id()
        elif not b.is_active():
            _id = a.id()
        else:
            _id = min(a.id(), b.id())
        self._id = _id
        return _id

    # Using sets is faster in most cases, but could potentially use a lot of
    # memory. Comment out this method override to not use sets.
    #def all_ids(self):
    #    return iter(sorted(set(self.a.all_ids()) | set(self.b.all_ids())))

    def next(self):
        self._id = None

        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        # Shortcut when one matcher is inactive
        if not (a_active or b_active):
            raise mcore.ReadTooFar
        elif not a_active:
            return b.next()
        elif not b_active:
            return a.next()

        a_id = a.id()
        b_id = b.id()
        ar = br = None

        # Advance whichever side(s) sit on the current minimum ID; if both are
        # on the same document, both are advanced
        if a_id <= b_id:
            ar = a.next()
        if b_id <= a_id:
            br = b.next()
        return ar or br

    def spans(self):
        # Delegate to the sub-matcher that is on the current (minimum)
        # document; if both are on the same document, merge their spans
        if not self.a.is_active():
            return self.b.spans()
        if not self.b.is_active():
            return self.a.spans()

        id_a = self.a.id()
        id_b = self.b.id()
        if id_a < id_b:
            return self.a.spans()
        elif id_b < id_a:
            return self.b.spans()
        else:
            return sorted(set(self.a.spans()) | set(self.b.spans()))

    def weight(self):
        # Weight of the sub-matcher(s) on the current document; summed when
        # both match the same document
        a = self.a
        b = self.b

        if not a.is_active():
            return b.weight()
        if not b.is_active():
            return a.weight()

        id_a = a.id()
        id_b = b.id()
        if id_a < id_b:
            return a.weight()
        elif id_b < id_a:
            return b.weight()
        else:
            return (a.weight() + b.weight())

    def score(self):
        # Score of the sub-matcher(s) on the current document; summed when
        # both match the same document
        a = self.a
        b = self.b

        if not a.is_active():
            return b.score()
        if not b.is_active():
            return a.score()

        id_a = a.id()
        id_b = b.id()
        if id_a < id_b:
            return a.score()
        elif id_b < id_a:
            return b.score()
        else:
            return (a.score() + b.score())

    def skip_to_quality(self, minquality):
        """Skips ahead until the combined block quality can exceed
        ``minquality``; returns the number of blocks skipped.
        """
        self._id = None

        a = self.a
        b = self.b
        if not (a.is_active() or b.is_active()):
            raise mcore.ReadTooFar

        # Short circuit if one matcher is inactive
        if not a.is_active():
            return b.skip_to_quality(minquality)
        elif not b.is_active():
            return a.skip_to_quality(minquality)

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and aq + bq <= minquality:
            # Skip ahead in the lower-quality side, requiring it to make up
            # the balance the other side can't provide
            if aq < bq:
                skipped += a.skip_to_quality(minquality - bq)
                aq = a.block_quality()
            else:
                skipped += b.skip_to_quality(minquality - aq)
                bq = b.block_quality()

        return skipped
class DisjunctionMaxMatcher(UnionMatcher):
    """Matches the union (OR) of two sub-matchers. Where both sub-matchers
    match the same posting, returns the weight/score of the higher-scoring
    posting.
    """

    # TODO: this class inherits from AdditiveBiMatcher (through UnionMatcher)
    # but it does not add the scores of the sub-matchers together (it
    # overrides all methods that perform addition). Need to clean up the
    # inheritance.

    def __init__(self, a, b, tiebreak=0.0):
        """
        :param a: the first sub-matcher.
        :param b: the second sub-matcher.
        :param tiebreak: tie-break factor carried on the matcher (NOTE: not
            referenced by the scoring methods visible here -- it is only
            stored and propagated through copy()/replace()).
        """
        super(DisjunctionMaxMatcher, self).__init__(a, b)
        self.tiebreak = tiebreak

    def copy(self):
        return self.__class__(self.a.copy(), self.b.copy(),
                              tiebreak=self.tiebreak)

    def replace(self, minquality=0):
        """Returns an optimized equivalent of this matcher. Because DisMax
        takes the max of the sub-matcher qualities instead of adding them,
        the quality threshold can be passed straight down to each side.
        """
        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        # DisMax takes the max of the sub-matcher qualities instead of adding
        # them, so we need special logic here
        if minquality and a_active and b_active:
            a_max = a.max_quality()
            b_max = b.max_quality()

            if a_max < minquality and b_max < minquality:
                # If neither sub-matcher has a high enough max quality to
                # contribute, return an inactive matcher
                return mcore.NullMatcher()
            elif b_max < minquality:
                # If the b matcher can't contribute, return a
                return a.replace(minquality)
            elif a_max < minquality:
                # If the a matcher can't contribute, return b
                return b.replace(minquality)

        if not (a_active or b_active):
            return mcore.NullMatcher()
        elif not a_active:
            return b.replace(minquality)
        elif not b_active:
            return a.replace(minquality)

        # We CAN pass the minquality down here, since we don't add the two
        # scores together
        a = a.replace(minquality)
        b = b.replace(minquality)
        a_active = a.is_active()
        b_active = b.is_active()

        # It's kind of tedious to check for inactive sub-matchers all over
        # again here after we replace them, but it's probably better than
        # returning a replacement with an inactive sub-matcher.
        # BUG FIX: this test previously used ``and``, which returned a
        # NullMatcher as soon as only ONE side was exhausted (silently
        # dropping the other side's documents) and made the two elif
        # branches below unreachable. ``or`` matches the equivalent check in
        # UnionMatcher.replace() and the check earlier in this method.
        if not (a_active or b_active):
            return mcore.NullMatcher()
        elif not a_active:
            return b
        elif not b_active:
            return a
        elif a is not self.a or b is not self.b:
            # If one of the sub-matchers changed, return a new DisMax.
            # BUG FIX: propagate the tiebreak factor (previously dropped),
            # consistent with copy().
            return self.__class__(a, b, tiebreak=self.tiebreak)
        else:
            return self

    def score(self):
        # Max of the two scores (not the sum, unlike the parent classes)
        if not self.a.is_active():
            return self.b.score()
        elif not self.b.is_active():
            return self.a.score()
        else:
            return max(self.a.score(), self.b.score())

    def max_quality(self):
        return max(self.a.max_quality(), self.b.max_quality())

    def block_quality(self):
        return max(self.a.block_quality(), self.b.block_quality())

    def skip_to_quality(self, minquality):
        a = self.a
        b = self.b

        # Short circuit if one matcher is inactive
        if not a.is_active():
            return b.skip_to_quality(minquality)
        elif not b.is_active():
            return a.skip_to_quality(minquality)

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        # Because this matcher scores with max(), BOTH sides must
        # individually exceed minquality before this matcher's block
        # quality does
        while a.is_active() and b.is_active() and max(aq, bq) <= minquality:
            if aq <= minquality:
                skipped += a.skip_to_quality(minquality)
                aq = a.block_quality()
            if bq <= minquality:
                skipped += b.skip_to_quality(minquality)
                bq = b.block_quality()
        return skipped
class IntersectionMatcher(AdditiveBiMatcher):
    """Matches the intersection (AND) of the postings in the two sub-matchers.
    """

    def __init__(self, a, b):
        super(IntersectionMatcher, self).__init__(a, b)
        # Advance the sub-matchers so they start on a common document
        self._find_first()

    def reset(self):
        self.a.reset()
        self.b.reset()
        self._find_first()

    def _find_first(self):
        # If both sub-matchers are active but not on the same document,
        # leapfrog them forward to their first common document
        if (self.a.is_active()
                and self.b.is_active()
                and self.a.id() != self.b.id()):
            self._find_next()

    def replace(self, minquality=0):
        """Returns an optimized equivalent of this matcher, applying the
        quality threshold to both sides.
        """
        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if not (a_active and b_active):
            # Intersection matcher requires that both sub-matchers be active
            return mcore.NullMatcher()

        if minquality:
            a_max = a.max_quality()
            b_max = b.max_quality()
            if a_max + b_max < minquality:
                # If the combined quality of the sub-matchers can't contribute,
                # return an inactive matcher
                return mcore.NullMatcher()
            # Require that the replacements be able to contribute results
            # higher than the minquality
            a_min = minquality - b_max
            b_min = minquality - a_max
        else:
            a_min = b_min = 0

        a = a.replace(a_min)
        b = b.replace(b_min)
        a_active = a.is_active()
        b_active = b.is_active()
        # NOTE(review): when exactly one replaced side is inactive this
        # returns the OTHER side alone, which is weaker than a strict
        # intersection -- confirm this relaxation is intended
        if not (a_active or b_active):
            return mcore.NullMatcher()
        elif not a_active:
            return b
        elif not b_active:
            return a
        elif a is not self.a or b is not self.b:
            return self.__class__(a, b)
        else:
            return self

    def is_active(self):
        return self.a.is_active() and self.b.is_active()

    def _find_next(self):
        """Leapfrogs the sub-matchers forward until they land on the same
        document (or one is exhausted). Returns True if either sub-matcher
        reported movement from its skip_to(); returns None when a side goes
        inactive mid-search.
        """
        a = self.a
        b = self.b
        a_id = a.id()
        b_id = b.id()
        assert a_id != b_id
        r = False

        while a.is_active() and b.is_active() and a_id != b_id:
            if a_id < b_id:
                ra = a.skip_to(b_id)
                if not a.is_active():
                    return
                r = r or ra
                a_id = a.id()
            else:
                rb = b.skip_to(a_id)
                if not b.is_active():
                    return
                r = r or rb
                b_id = b.id()
        return r

    def id(self):
        # The sub-matchers are kept in sync on the same document, so a's
        # current ID is the matcher's current ID
        return self.a.id()

    # Using sets is faster in some cases, but could potentially use a lot of
    # memory
    def all_ids(self):
        return iter(sorted(set(self.a.all_ids()) & set(self.b.all_ids())))

    def skip_to(self, id):
        if not self.is_active():
            raise mcore.ReadTooFar
        ra = self.a.skip_to(id)
        rb = self.b.skip_to(id)
        if self.is_active():
            rn = False
            if self.a.id() != self.b.id():
                # Re-sync the sub-matchers onto a common document
                rn = self._find_next()
            return ra or rb or rn

    def skip_to_quality(self, minquality):
        """Skips ahead until the combined block quality can exceed
        ``minquality``; returns the total skip count reported by the
        sub-matchers.
        """
        a = self.a
        b = self.b
        minquality = minquality  # (no-op self-assignment; kept as-is)

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and aq + bq <= minquality:
            if aq < bq:
                # If the block quality of A is less than B, skip A ahead until
                # it can contribute at least the balance of the required min
                # quality when added to B
                sk = a.skip_to_quality(minquality - bq)
                skipped += sk
                if not sk and a.is_active():
                    # The matcher couldn't skip ahead for some reason, so just
                    # advance and try again
                    a.next()
            else:
                # And vice-versa
                sk = b.skip_to_quality(minquality - aq)
                skipped += sk
                if not sk and b.is_active():
                    b.next()

            if not a.is_active() or not b.is_active():
                # One of the matchers is exhausted
                break
            if a.id() != b.id():
                # We want to always leave in a state where the matchers are at
                # the same document, so call _find_next() to sync them
                self._find_next()

            # Get the block qualities at the new matcher positions
            aq = a.block_quality()
            bq = b.block_quality()
        return skipped

    def next(self):
        if not self.is_active():
            raise mcore.ReadTooFar

        # The ids are assumed equal whenever next() is called (they should
        # have been made equal by _find_next), so advance a; _find_next()
        # then pulls b forward to the next common document
        ar = self.a.next()
        if self.is_active():
            nr = self._find_next()
            return ar or nr

    def spans(self):
        # Both sub-matchers are on the same document; merge the spans from
        # both sides
        return sorted(set(self.a.spans()) | set(self.b.spans()))
class AndNotMatcher(BiMatcher):
    """Matches the postings in the first sub-matcher that are NOT present in
    the second sub-matcher.
    """

    def __init__(self, a, b):
        super(AndNotMatcher, self).__init__(a, b)
        # Move the required matcher past any initial documents that also
        # appear in the prohibited matcher
        self._find_first()

    def reset(self):
        self.a.reset()
        self.b.reset()
        self._find_first()

    def _find_first(self):
        if (self.a.is_active()
                and self.b.is_active()
                and self.a.id() == self.b.id()):
            self._find_next()

    def is_active(self):
        # Only the required (a) matcher determines whether this is active
        return self.a.is_active()

    def _find_next(self):
        """Advances the required matcher (a) past any documents that also
        appear in the prohibited matcher (b).
        """
        pos = self.a
        neg = self.b
        if not neg.is_active():
            return
        pos_id = pos.id()
        r = False

        if neg.id() < pos_id:
            neg.skip_to(pos_id)

        while pos.is_active() and neg.is_active() and pos_id == neg.id():
            nr = pos.next()
            if not pos.is_active():
                break

            r = r or nr
            pos_id = pos.id()
            neg.skip_to(pos_id)

        return r

    def supports_block_quality(self):
        # Quality comes entirely from the required matcher
        return self.a.supports_block_quality()

    def replace(self, minquality=0):
        """Returns an optimized equivalent of this matcher."""
        if not self.a.is_active():
            # The a matcher is required, so if it's inactive, return an
            # inactive matcher
            return mcore.NullMatcher()
        elif (minquality
              and self.a.max_quality() < minquality):
            # If the quality of the required matcher isn't high enough to
            # contribute, return an inactive matcher
            return mcore.NullMatcher()
        elif not self.b.is_active():
            # If the prohibited matcher is inactive, convert to just the
            # required matcher
            return self.a.replace(minquality)

        a = self.a.replace(minquality)
        b = self.b.replace()
        if a is not self.a or b is not self.b:
            # If one of the sub-matchers was replaced, return a new AndNot
            return self.__class__(a, b)
        else:
            return self

    def max_quality(self):
        return self.a.max_quality()

    def block_quality(self):
        return self.a.block_quality()

    def skip_to_quality(self, minquality):
        skipped = self.a.skip_to_quality(minquality)
        # Re-establish the invariant that a is not on a prohibited document
        self._find_next()
        return skipped

    def id(self):
        return self.a.id()

    def next(self):
        if not self.a.is_active():
            raise mcore.ReadTooFar
        ar = self.a.next()
        nr = False
        if self.a.is_active() and self.b.is_active():
            nr = self._find_next()
        return ar or nr

    def skip_to(self, id):
        if not self.a.is_active():
            raise mcore.ReadTooFar
        if id < self.a.id():
            # Can't skip backwards; nothing to do
            return

        self.a.skip_to(id)
        if self.b.is_active():
            self.b.skip_to(id)
            self._find_next()

    # Weight, score, and values all come from the required (a) matcher only

    def weight(self):
        return self.a.weight()

    def score(self):
        return self.a.score()

    def supports(self, astype):
        return self.a.supports(astype)

    def value(self):
        return self.a.value()

    def value_as(self, astype):
        return self.a.value_as(astype)
class AndMaybeMatcher(AdditiveBiMatcher):
    """Matches postings in the first sub-matcher, and if the same posting is
    in the second sub-matcher, adds their scores.
    """

    def __init__(self, a, b):
        AdditiveBiMatcher.__init__(self, a, b)
        self._first_b()

    def reset(self):
        self.a.reset()
        self.b.reset()
        self._first_b()

    def _first_b(self):
        # Pull the optional matcher (b) forward to the required matcher's
        # current document if they aren't already on the same document
        a = self.a
        b = self.b
        if a.is_active() and b.is_active() and a.id() != b.id():
            b.skip_to(a.id())

    def is_active(self):
        # Only the required (a) matcher determines whether this is active
        return self.a.is_active()

    def id(self):
        return self.a.id()

    def next(self):
        if not self.a.is_active():
            raise mcore.ReadTooFar

        ar = self.a.next()
        br = False
        if self.a.is_active() and self.b.is_active():
            # Keep the optional matcher synced to the required matcher
            br = self.b.skip_to(self.a.id())
        return ar or br

    def skip_to(self, id):
        if not self.a.is_active():
            raise mcore.ReadTooFar

        ra = self.a.skip_to(id)
        rb = False
        if self.a.is_active() and self.b.is_active():
            rb = self.b.skip_to(id)
        return ra or rb

    def replace(self, minquality=0):
        """Returns an optimized equivalent of this matcher."""
        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if not a_active:
            return mcore.NullMatcher()
        elif minquality and b_active:
            if a.max_quality() + b.max_quality() < minquality:
                # If the combined max quality of the sub-matchers isn't high
                # enough to possibly contribute, return an inactive matcher
                return mcore.NullMatcher()
            elif a.max_quality() < minquality:
                # If the max quality of the main sub-matcher isn't high enough
                # to ever contribute without the optional sub-matcher, change
                # into an IntersectionMatcher
                return IntersectionMatcher(self.a, self.b)
        elif not b_active:
            return a.replace(minquality)

        new_a = a.replace(minquality - b.max_quality())
        new_b = b.replace(minquality - a.max_quality())
        if new_a is not a or new_b is not b:
            # If one of the sub-matchers changed, return a new AndMaybe
            return self.__class__(new_a, new_b)
        else:
            return self

    def skip_to_quality(self, minquality):
        """Skips ahead until the combined block quality can exceed
        ``minquality``; returns the number of blocks skipped.
        """
        a = self.a
        b = self.b

        if not a.is_active():
            raise mcore.ReadTooFar
        if not b.is_active():
            return a.skip_to_quality(minquality)

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and aq + bq <= minquality:
            # Skip ahead in the lower-quality side, requiring it to make up
            # the balance the other side can't provide
            if aq < bq:
                skipped += a.skip_to_quality(minquality - bq)
                aq = a.block_quality()
            else:
                skipped += b.skip_to_quality(minquality - aq)
                bq = b.block_quality()
        return skipped

    def weight(self):
        # BUG FIX: guard on b.is_active() before calling b.id(), consistent
        # with score() below; previously an exhausted optional matcher was
        # asked for its current ID
        if self.b.is_active() and self.a.id() == self.b.id():
            return self.a.weight() + self.b.weight()
        else:
            return self.a.weight()

    def score(self):
        # Add the optional matcher's score only when it matches the same
        # document as the required matcher
        if self.b.is_active() and self.a.id() == self.b.id():
            return self.a.score() + self.b.score()
        else:
            return self.a.score()

    # Posting values come from the required (a) matcher only

    def supports(self, astype):
        return self.a.supports(astype)

    def value(self):
        return self.a.value()

    def value_as(self, astype):
        return self.a.value_as(astype)

View File

@@ -0,0 +1,312 @@
# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from __future__ import division
from array import array
from whoosh.compat import xrange
from whoosh.matching import mcore
class CombinationMatcher(mcore.Matcher):
    """Base class for matchers that wrap a list of sub-matchers and combine
    their scores, scaled by a boost factor.
    """

    def __init__(self, submatchers, boost=1.0):
        self._submatchers = submatchers
        self._boost = boost

    def supports_block_quality(self):
        # Block quality is only available if every sub-matcher provides it
        for sub in self._submatchers:
            if not sub.supports_block_quality():
                return False
        return True

    def max_quality(self):
        # Best quality among the still-active sub-matchers, scaled by boost
        best = max(sub.max_quality() for sub in self._submatchers
                   if sub.is_active())
        return best * self._boost

    def supports(self, astype):
        for sub in self._submatchers:
            if not sub.supports(astype):
                return False
        return True

    def children(self):
        return iter(self._submatchers)

    def score(self):
        total = sum(sub.score() for sub in self._submatchers)
        return total * self._boost
class PreloadedUnionMatcher(CombinationMatcher):
    """Instead of marching the sub-matchers along in parallel, this
    matcher pre-reads the scores for EVERY MATCHING DOCUMENT, trading memory
    for speed.

    This is faster than the implementation using a binary tree of
    :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just
    because of less overhead), but it doesn't allow getting information about
    the "current" document other than the score, because there isn't really a
    current document, just an array of scores.
    """

    def __init__(self, submatchers, doccount, boost=1.0, scored=True):
        """
        :param submatchers: the matchers to union.
        :param doccount: the total number of documents in the index.
        :param boost: multiplier applied to every sub-matcher score.
        :param scored: if False, accumulate ``boost`` per match instead of
            real scores.
        """
        CombinationMatcher.__init__(self, submatchers, boost=boost)

        self._doccount = doccount

        # Accumulated score per document, indexed by (docnum - offset);
        # a zero entry means "no sub-matcher matched that document"
        a = array("d")
        active = [subm for subm in self._submatchers if subm.is_active()]
        if active:
            # The array starts at the smallest matching document number
            offset = self._docnum = min(m.id() for m in active)
            for m in active:
                # Exhaust each sub-matcher into the score array
                while m.is_active():
                    if scored:
                        score = m.score() * boost
                    else:
                        score = boost
                    docnum = m.id()
                    place = docnum - offset
                    if len(a) <= place:
                        # Grow the array (zero-filled) up to this document
                        a.extend(0 for _ in xrange(place - len(a) + 1))
                    a[place] += score
                    m.next()
            self._a = a
            self._offset = offset
        else:
            # No active sub-matchers: empty array, matcher starts inactive
            self._docnum = 0
            self._offset = 0
            self._a = a

    def is_active(self):
        return self._docnum - self._offset < len(self._a)

    def id(self):
        return self._docnum

    def score(self):
        return self._a[self._docnum - self._offset]

    def next(self):
        a = self._a
        offset = self._offset
        place = self._docnum - offset

        # Advance to the next nonzero (matching) slot in the array
        place += 1
        while place < len(a) and a[place] == 0:
            place += 1

        self._docnum = place + offset

    def max_quality(self):
        # The best accumulated score at or after the current position
        return max(self._a[self._docnum - self._offset:])

    def block_quality(self):
        return self.max_quality()

    def skip_to(self, docnum):
        if docnum < self._docnum:
            # We've already passed it
            return
        self._docnum = docnum
        i = docnum - self._offset
        if i < len(self._a) and self._a[i] == 0:
            # The target document doesn't match; move to the next one that
            # does
            self.next()

    def skip_to_quality(self, minquality):
        a = self._a
        offset = self._offset
        place = self._docnum - offset

        # NOTE(review): ``skipped`` is assigned 1, not incremented, so the
        # return value reports WHETHER anything was skipped rather than a
        # count -- confirm callers only rely on truthiness
        skipped = 0
        while place < len(a) and a[place] <= minquality:
            place += 1
            skipped = 1

        self._docnum = place + offset
        return skipped

    def supports(self, astype):
        # This matcher doesn't support any posting values
        return False

    def all_ids(self):
        # Yield the document numbers of every nonzero slot from the current
        # position onward
        a = self._a
        offset = self._offset
        place = self._docnum - offset
        while place < len(a):
            if a[place] > 0:
                yield place + offset
            place += 1
class ArrayUnionMatcher(CombinationMatcher):
    """Instead of marching the sub-matchers along in parallel, this matcher
    pre-reads the scores for a large block of documents at a time from each
    matcher, accumulating the scores in an array.

    This is faster than the implementation using a binary tree of
    :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just
    because of less overhead), but it doesn't allow getting information about
    the "current" document other than the score, because there isn't really a
    current document, just an array of scores.
    """

    def __init__(self, submatchers, doccount, boost=1.0, scored=True,
                 partsize=2048):
        """
        :param submatchers: the matchers to union.
        :param doccount: the total number of documents in the index.
        :param boost: multiplier applied to sub-matcher scores.
        :param scored: if False, store 1 for a match instead of real scores.
        :param partsize: how many documents to buffer per part; a falsy value
            means buffer the whole document range at once.
        """
        CombinationMatcher.__init__(self, submatchers, boost=boost)
        self._scored = scored
        self._doccount = doccount

        if not partsize:
            partsize = doccount
        self._partsize = partsize

        # Accumulated scores for the current part, indexed by
        # (docnum - self._offset)
        self._a = array("d", (0 for _ in xrange(self._partsize)))
        self._docnum = self._min_id()
        self._read_part()

    def __repr__(self):
        return ("%s(%r, boost=%f, scored=%r, partsize=%d)"
                % (self.__class__.__name__, self._submatchers, self._boost,
                   self._scored, self._partsize))

    def _min_id(self):
        # The smallest current doc ID among active sub-matchers, or doccount
        # (i.e. "past the end") if none are active
        active = [subm for subm in self._submatchers if subm.is_active()]
        if active:
            return min(subm.id() for subm in active)
        else:
            return self._doccount

    def _read_part(self):
        """Fills the score array with the accumulated scores for the part
        starting at self._docnum, advancing the sub-matchers past the part
        and recording the part's [offset, limit) bounds.
        """
        scored = self._scored
        boost = self._boost
        limit = min(self._docnum + self._partsize, self._doccount)
        offset = self._docnum
        a = self._a

        # Clear the array
        for i in xrange(self._partsize):
            a[i] = 0

        # Add the scores from the submatchers into the array
        for m in self._submatchers:
            while m.is_active() and m.id() < limit:
                i = m.id() - offset
                if scored:
                    a[i] += m.score() * boost
                else:
                    a[i] = 1
                m.next()

        self._offset = offset
        self._limit = limit

    def _find_next(self):
        # Advance self._docnum to the next document with a nonzero score,
        # reading the next part if we hit the end of the current one
        a = self._a
        docnum = self._docnum
        offset = self._offset
        limit = self._limit

        while docnum < limit:
            if a[docnum - offset] > 0:
                break
            docnum += 1

        if docnum == limit:
            self._docnum = self._min_id()
            self._read_part()
        else:
            self._docnum = docnum

    def supports(self, astype):
        # This matcher doesn't support any posting values
        return False

    def is_active(self):
        return self._docnum < self._doccount

    def max_quality(self):
        return max(m.max_quality() for m in self._submatchers)

    def block_quality(self):
        # Quality of the buffered part is its best accumulated score
        return max(self._a)

    def skip_to(self, docnum):
        if docnum < self._offset:
            # We've already passed it
            return
        elif docnum < self._limit:
            # It's in the current part
            self._docnum = docnum
            self._find_next()
            return

        # Advance all active submatchers
        submatchers = self._submatchers
        active = False  # (unused local kept as-is)
        for subm in submatchers:
            if subm.is_active():
                subm.skip_to(docnum)

        if any(subm.is_active() for subm in submatchers):
            # Rebuffer
            self._docnum = self._min_id()
            self._read_part()
        else:
            self._docnum = self._doccount

    def skip_to_quality(self, minquality):
        """Skips whole parts whose best score is <= minquality; returns the
        number of parts skipped.
        """
        skipped = 0
        while self.is_active() and self.block_quality() <= minquality:
            skipped += 1
            self._docnum = self._limit
            self._read_part()
        if self.is_active():
            self._find_next()
        return skipped

    def id(self):
        return self._docnum

    def all_ids(self):
        # Yield every matching document number, rebuffering part by part
        doccount = self._doccount
        docnum = self._docnum
        offset = self._offset
        limit = self._limit

        a = self._a
        while docnum < doccount:
            if a[docnum - offset] > 0:
                yield docnum

            docnum += 1
            if docnum == limit:
                self._docnum = docnum
                self._read_part()
                offset = self._offset
                limit = self._limit

    def next(self):
        self._docnum += 1
        return self._find_next()

    def score(self):
        return self._a[self._docnum - self._offset]

View File

@@ -0,0 +1,622 @@
# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
"""
This module contains "matcher" classes. Matchers deal with posting lists. The
most basic matcher, which reads the list of postings for a term, will be
provided by the backend implementation (for example,
:class:`whoosh.filedb.filepostings.FilePostingReader`). The classes in this
module provide additional functionality, such as combining the results of two
matchers, or modifying the results of a matcher.
You do not need to deal with the classes in this module unless you need to
write your own Matcher implementation to provide some new functionality. These
classes are not instantiated by the user. They are usually created by a
:class:`~whoosh.query.Query` object's :meth:`~whoosh.query.Query.matcher()`
method, which returns the appropriate matcher to implement the query (for
example, the :class:`~whoosh.query.Or` query's
:meth:`~whoosh.query.Or.matcher()` method returns a
:py:class:`~whoosh.matching.UnionMatcher` object).
Certain backends support "quality" optimizations. These backends have the
ability to skip ahead if it knows the current block of postings can't
contribute to the top N documents. If the matcher tree and backend support
these optimizations, the matcher's :meth:`Matcher.supports_block_quality()`
method will return ``True``.
"""
import sys
from itertools import repeat
from whoosh.compat import izip, xrange
from whoosh.compat import abstractmethod
# Exceptions
class ReadTooFar(Exception):
    """Signals that an exhausted matcher was asked to move: raised when
    :meth:`~whoosh.matching.Matcher.next()` or
    :meth:`~whoosh.matching.Matcher.skip_to()` is called on a matcher that
    is no longer active.
    """
class NoQualityAvailable(Exception):
    """Signals that a block-quality optimization method was invoked on a
    matcher whose implementation (or weighting) does not support quality
    measurements.
    """
# Classes
class Matcher(object):
    """Base class for all matchers.

    A matcher iterates over the postings (document IDs and encoded values)
    that match some query. Subclasses must implement at least
    ``is_active()``, ``reset()``, ``copy()``, ``id()``, ``value()``,
    ``supports()``, ``value_as()``, ``next()``, and ``score()``.
    """

    @abstractmethod
    def is_active(self):
        """Returns True if this matcher is still "active", that is, it has not
        yet reached the end of the posting list.
        """

        raise NotImplementedError

    @abstractmethod
    def reset(self):
        """Returns to the start of the posting list.

        Note that reset() may not do what you expect after you call
        :meth:`Matcher.replace()`, since this can mean calling reset() not on
        the original matcher, but on an optimized replacement.
        """

        raise NotImplementedError

    def term(self):
        """Returns a ``("fieldname", "termtext")`` tuple for the term this
        matcher matches, or None if this matcher is not a term matcher.
        """

        return None

    def term_matchers(self):
        """Returns an iterator of term matchers in this tree.
        """

        if self.term() is not None:
            # A matcher that reports a term is itself a term matcher
            yield self
        else:
            for cm in self.children():
                for m in cm.term_matchers():
                    yield m

    def matching_terms(self, id=None):
        """Returns an iterator of ``("fieldname", "termtext")`` tuples for the
        **currently matching** term matchers in this tree.

        :param id: only yield terms from matchers currently on this document
            ID. If None, the current ID of this matcher is used.
        """

        if not self.is_active():
            return

        if id is None:
            id = self.id()
        elif id != self.id():
            # This subtree is not on the requested document, so none of its
            # terms are "currently matching"
            return

        t = self.term()
        if t is None:
            for c in self.children():
                for t in c.matching_terms(id):
                    yield t
        else:
            yield t

    def is_leaf(self):
        """Returns True if this matcher has no sub-matchers."""

        return not bool(self.children())

    def children(self):
        """Returns an (possibly empty) list of the submatchers of this
        matcher.
        """

        return []

    def replace(self, minquality=0):
        """Returns a possibly-simplified version of this matcher. For example,
        if one of the children of a UnionMatcher is no longer active, calling
        this method on the UnionMatcher will return the other child.

        :param minquality: a minimum quality the replacement must be able to
            contribute to the results; implementations may return an inactive
            matcher if they cannot reach it.
        """

        return self

    @abstractmethod
    def copy(self):
        """Returns a copy of this matcher.
        """

        raise NotImplementedError

    def depth(self):
        """Returns the depth of the tree under this matcher, or 0 if this
        matcher does not have any children.
        """

        return 0

    def supports_block_quality(self):
        """Returns True if this matcher supports the use of ``quality`` and
        ``block_quality``.
        """

        return False

    def max_quality(self):
        """Returns the maximum possible quality measurement for this matcher,
        according to the current weighting algorithm. Raises
        ``NoQualityAvailable`` if the matcher or weighting do not support
        quality measurements.
        """

        raise NoQualityAvailable(self.__class__)

    def block_quality(self):
        """Returns a quality measurement of the current block of postings,
        according to the current weighting algorithm. Raises
        ``NoQualityAvailable`` if the matcher or weighting do not support
        quality measurements.
        """

        raise NoQualityAvailable(self.__class__)

    @abstractmethod
    def id(self):
        """Returns the ID of the current posting.
        """

        raise NotImplementedError

    def all_ids(self):
        """Returns a generator of all IDs in the matcher.

        What this method returns for a matcher that has already read some
        postings (whether it only yields the remaining postings or all
        postings from the beginning) is undefined, so it's best to only use
        this method on fresh matchers.
        """

        i = 0
        m = self
        while m.is_active():
            yield m.id()
            m.next()
            i += 1
            # Periodically ask for a simplified replacement matcher, which
            # may iterate faster than the original tree
            if i == 10:
                m = m.replace()
                i = 0

    def all_items(self):
        """Returns a generator of all (ID, encoded value) pairs in the
        matcher.

        What this method returns for a matcher that has already read some
        postings (whether it only yields the remaining postings or all
        postings from the beginning) is undefined, so it's best to only use
        this method on fresh matchers.
        """

        i = 0
        m = self
        # BUG FIX: loop on the (possibly replaced) matcher ``m``, not on
        # ``self`` -- once replace() returns a different object, ``self``
        # stops advancing, which could loop forever or read past the end of
        # the replacement's postings. This now mirrors all_ids().
        while m.is_active():
            yield (m.id(), m.value())
            m.next()
            i += 1
            if i == 10:
                m = m.replace()
                i = 0

    def items_as(self, astype):
        """Returns a generator of all (ID, decoded value) pairs in the
        matcher.

        What this method returns for a matcher that has already read some
        postings (whether it only yields the remaining postings or all
        postings from the beginning) is undefined, so it's best to only use
        this method on fresh matchers.
        """

        while self.is_active():
            yield (self.id(), self.value_as(astype))
            self.next()

    @abstractmethod
    def value(self):
        """Returns the encoded value of the current posting.
        """

        raise NotImplementedError

    @abstractmethod
    def supports(self, astype):
        """Returns True if the field's format supports the named data type,
        for example 'frequency' or 'characters'.
        """

        raise NotImplementedError("supports not implemented in %s"
                                  % self.__class__)

    @abstractmethod
    def value_as(self, astype):
        """Returns the value(s) of the current posting as the given type.
        """

        raise NotImplementedError("value_as not implemented in %s"
                                  % self.__class__)

    def spans(self):
        """Returns a list of :class:`~whoosh.query.spans.Span` objects for the
        matches in this document. Raises an exception if the field being
        searched does not store positions.
        """

        from whoosh.query.spans import Span

        if self.supports("characters"):
            return [Span(pos, startchar=startchar, endchar=endchar)
                    for pos, startchar, endchar in self.value_as("characters")]
        elif self.supports("positions"):
            return [Span(pos) for pos in self.value_as("positions")]
        else:
            raise Exception("Field does not support spans")

    def skip_to(self, id):
        """Moves this matcher to the first posting with an ID equal to or
        greater than the given ID.
        """

        # Default implementation: step forward one posting at a time.
        # Subclasses can override this with a more efficient skip.
        while self.is_active() and self.id() < id:
            self.next()

    def skip_to_quality(self, minquality):
        """Moves this matcher to the next block with greater than the given
        minimum quality value.
        """

        raise NotImplementedError(self.__class__.__name__)

    @abstractmethod
    def next(self):
        """Moves this matcher to the next posting.
        """

        raise NotImplementedError(self.__class__.__name__)

    def weight(self):
        """Returns the weight of the current posting.
        """

        return self.value_as("weight")

    @abstractmethod
    def score(self):
        """Returns the score of the current posting.
        """

        raise NotImplementedError(self.__class__.__name__)

    # Matchers compare by type only: two matchers of the same class are
    # "equal". NOTE(review): __lt__ has the same definition as __eq__ in the
    # original code; preserved as-is since orderings of matchers may rely on
    # it -- confirm intent before changing.
    def __eq__(self, other):
        return self.__class__ is type(other)

    def __lt__(self, other):
        return type(other) is self.__class__

    def __ne__(self, other):
        return not self.__eq__(other)

    def __gt__(self, other):
        return not (self.__lt__(other) or self.__eq__(other))

    def __le__(self, other):
        return self.__eq__(other) or self.__lt__(other)

    def __ge__(self, other):
        return self.__eq__(other) or self.__gt__(other)
# Simple intermediate classes
class ConstantScoreMatcher(Matcher):
    """Intermediate base class for matchers that report one fixed score for
    every matching document.
    """

    def __init__(self, score=1.0):
        # The single score used for every posting
        self._score = score

    def supports_block_quality(self):
        return True

    def max_quality(self):
        return self._score

    def block_quality(self):
        return self._score

    def skip_to_quality(self, minquality):
        # Every "block" has the same quality: either all remaining postings
        # qualify, or none do -- in which case the matcher goes inactive.
        if minquality >= self._score:
            self.go_inactive()

    def score(self):
        return self._score
# Null matcher
class NullMatcherClass(Matcher):
    """A matcher with no postings that is permanently inactive. Use the
    module-level ``NullMatcher`` singleton instead of instantiating this
    class directly.
    """

    def __call__(self):
        # Calling the singleton (e.g. ``NullMatcher()``) just hands it back,
        # so code can treat it as either an instance or a factory
        return self

    def __repr__(self):
        return "<NullMatcher>"

    def supports_block_quality(self):
        return True

    def max_quality(self):
        return 0

    def block_quality(self):
        return 0

    def skip_to_quality(self, minquality):
        return 0

    def is_active(self):
        return False

    def reset(self):
        pass

    def all_ids(self):
        return []

    def copy(self):
        # Stateless, so the instance serves as its own copy
        return self


# Shared singleton instance
NullMatcher = NullMatcherClass()
class ListMatcher(Matcher):
    """Synthetic matcher backed by a list of IDs.
    """

    def __init__(self, ids, weights=None, values=None, format=None,
                 scorer=None, position=0, all_weights=None, term=None,
                 terminfo=None):
        """
        :param ids: a list of doc IDs.
        :param weights: a list of weights corresponding to the list of IDs.
            If this argument is not supplied, a list of 1.0 values is used.
        :param values: a list of encoded values corresponding to the list of
            IDs.
        :param format: a :class:`whoosh.formats.Format` object representing the
            format of the field.
        :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring
            the postings.
        :param position: the index in ``ids`` to start iterating from.
        :param all_weights: if given, a single weight reported for every
            posting (takes precedence over ``weights``).
        :param term: a ``("fieldname", "text")`` tuple, or None if this is not
            a term matcher.
        :param terminfo: a term info object supplying block statistics
            (min/max length, max weight); may be None.
        """

        self._ids = ids
        self._weights = weights
        self._all_weights = all_weights
        self._values = values
        self._i = position
        self._format = format
        self._scorer = scorer
        self._term = term
        self._terminfo = terminfo

    def __repr__(self):
        return "<%s>" % self.__class__.__name__

    def is_active(self):
        return self._i < len(self._ids)

    def reset(self):
        self._i = 0

    def skip_to(self, id):
        if not self.is_active():
            raise ReadTooFar
        if id < self.id():
            # Already at or past the target ID
            return

        while self._i < len(self._ids) and self._ids[self._i] < id:
            self._i += 1

    def term(self):
        return self._term

    def copy(self):
        # BUG FIX: pass term and terminfo through to the copy. Previously
        # they were dropped, so copies lost their term() identity and the
        # block statistics backing block_max_weight() etc.
        return self.__class__(self._ids, self._weights, self._values,
                              self._format, self._scorer, self._i,
                              self._all_weights, term=self._term,
                              terminfo=self._terminfo)

    def replace(self, minquality=0):
        if not self.is_active():
            return NullMatcher()
        elif minquality and self.max_quality() < minquality:
            # Can't possibly contribute to the results
            return NullMatcher()
        else:
            return self

    def supports_block_quality(self):
        return (self._scorer is not None
                and self._scorer.supports_block_quality())

    def max_quality(self):
        # This matcher treats all postings in the list as one "block", so the
        # block quality is the same as the quality of the entire list
        if self._scorer:
            return self._scorer.block_quality(self)
        else:
            return self.block_max_weight()

    def block_quality(self):
        return self._scorer.block_quality(self)

    def skip_to_quality(self, minquality):
        # The whole list is one block, so just advance until exhausted or the
        # quality threshold is met
        while self._i < len(self._ids) and self.block_quality() <= minquality:
            self._i += 1
        return 0

    def id(self):
        return self._ids[self._i]

    def all_ids(self):
        return iter(self._ids)

    def all_items(self):
        values = self._values
        if values is None:
            # No stored values; pair every ID with an empty value
            values = repeat('')

        return izip(self._ids, values)

    def value(self):
        if self._values:
            v = self._values[self._i]

            if isinstance(v, list):
                # This object supports "values" that are actually lists of
                # value strings. This is to support combining the results of
                # several different matchers into a single ListMatcher (see the
                # TOO_MANY_CLAUSES functionality of MultiTerm). We combine the
                # values here instead of combining them first and then making
                # the ListMatcher to avoid wasting time combining values if the
                # consumer never asks for them.
                assert len(v) > 0
                if len(v) == 1:
                    v = v[0]
                else:
                    v = self._format.combine(v)
                # Replace the list with the computed value string
                self._values[self._i] = v

            return v
        else:
            return ''

    def value_as(self, astype):
        decoder = self._format.decoder(astype)
        return decoder(self.value())

    def supports(self, astype):
        return self._format.supports(astype)

    def next(self):
        self._i += 1

    def weight(self):
        if self._all_weights:
            return self._all_weights
        elif self._weights:
            return self._weights[self._i]
        else:
            return 1.0

    def block_min_length(self):
        return self._terminfo.min_length()

    def block_max_length(self):
        return self._terminfo.max_length()

    def block_max_weight(self):
        if self._all_weights:
            return self._all_weights
        elif self._weights:
            return max(self._weights)
        elif self._terminfo is not None:
            return self._terminfo.max_weight()
        else:
            return 1.0

    def score(self):
        if self._scorer:
            return self._scorer.score(self)
        else:
            return self.weight()
# Term/vector leaf posting matcher middleware
class LeafMatcher(Matcher):
    """Intermediate base class for matchers that read postings directly from
    a term or vector posting list.

    Subclasses need to set:

    * ``self.scorer`` -- a Scorer object or None
    * ``self.format`` -- Format object for the posting values
    """

    def __repr__(self):
        return "%s(%r, %s)" % (self.__class__.__name__, self.term(),
                               self.is_active())

    def term(self):
        return self._term

    def items_as(self, astype):
        decoder = self.format.decoder(astype)
        return ((id, decoder(value)) for id, value in self.all_items())

    def supports(self, astype):
        return self.format.supports(astype)

    def value_as(self, astype):
        return self.format.decoder(astype)(self.value())

    def spans(self):
        from whoosh.query.spans import Span

        if self.supports("characters"):
            return [Span(pos, startchar=sc, endchar=ec)
                    for pos, sc, ec in self.value_as("characters")]
        if self.supports("positions"):
            return [Span(pos) for pos in self.value_as("positions")]
        raise Exception("Field does not support positions (%r)"
                        % self.term())

    def supports_block_quality(self):
        return self.scorer and self.scorer.supports_block_quality()

    def max_quality(self):
        return self.scorer.max_quality()

    def block_quality(self):
        return self.scorer.block_quality(self)

    def score(self):
        return self.scorer.score(self)

View File

@@ -0,0 +1,572 @@
# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from __future__ import division
from whoosh.compat import xrange
from whoosh.matching import mcore
class WrappingMatcher(mcore.Matcher):
    """Base class for matchers that wrap sub-matchers.

    By default every operation is delegated to the wrapped child, with score
    and quality values multiplied by ``self.boost``.
    """

    def __init__(self, child, boost=1.0):
        self.child = child
        self.boost = boost

    def __repr__(self):
        return "%s(%r, boost=%s)" % (self.__class__.__name__, self.child,
                                     self.boost)

    def copy(self):
        # Some subclasses remove the boost attribute, so only forward it
        # when it is still present
        if hasattr(self, "boost"):
            return self.__class__(self.child.copy(), boost=self.boost)
        return self.__class__(self.child.copy())

    def depth(self):
        return self.child.depth() + 1

    def _replacement(self, newchild):
        # Subclasses with extra constructor arguments override this to
        # rebuild themselves around a replacement child
        return self.__class__(newchild, boost=self.boost)

    def replace(self, minquality=0):
        # Ask the child to simplify itself; if it produced a different
        # matcher, rewrap the result, otherwise keep this wrapper
        simplified = self.child.replace(minquality)
        if simplified is self.child:
            return self
        return self._replacement(simplified)

    def id(self):
        return self.child.id()

    def all_ids(self):
        return self.child.all_ids()

    def is_active(self):
        return self.child.is_active()

    def reset(self):
        self.child.reset()

    def children(self):
        return [self.child]

    def supports(self, astype):
        return self.child.supports(astype)

    def value(self):
        return self.child.value()

    def value_as(self, astype):
        return self.child.value_as(astype)

    def spans(self):
        return self.child.spans()

    def skip_to(self, id):
        return self.child.skip_to(id)

    def next(self):
        self.child.next()

    def supports_block_quality(self):
        return self.child.supports_block_quality()

    def skip_to_quality(self, minquality):
        # Undo the boost so the child compares against its own scale
        return self.child.skip_to_quality(minquality / self.boost)

    def max_quality(self):
        return self.child.max_quality() * self.boost

    def block_quality(self):
        return self.child.block_quality() * self.boost

    def weight(self):
        return self.child.weight() * self.boost

    def score(self):
        return self.child.score() * self.boost
class MultiMatcher(mcore.Matcher):
    """Serializes the results of a list of sub-matchers.

    The sub-matchers are iterated one after another; each sub-matcher's doc
    IDs are shifted by the corresponding offset so the combined stream has
    globally unique, increasing IDs.
    """

    def __init__(self, matchers, idoffsets, scorer=None, current=0):
        """
        :param matchers: a list of Matcher objects.
        :param idoffsets: a list of offsets corresponding to items in the
            ``matchers`` list.
        :param scorer: object used by score(); assumed to have a
            ``score(matcher)`` method -- may be None, in which case calling
            score() will fail. TODO confirm against callers.
        :param current: index of the sub-matcher to start at.
        """
        self.matchers = matchers
        self.offsets = idoffsets
        self.scorer = scorer
        self.current = current
        # Advance past any initially-exhausted sub-matchers
        self._next_matcher()

    def __repr__(self):
        return "%s(%r, %r, current=%s)" % (self.__class__.__name__,
                                           self.matchers, self.offsets,
                                           self.current)

    def is_active(self):
        # Active as long as some sub-matcher remains to be consumed
        return self.current < len(self.matchers)

    def reset(self):
        for mr in self.matchers:
            mr.reset()
        self.current = 0

    def children(self):
        # Only the currently-consumed sub-matcher is reported as a child
        return [self.matchers[self.current]]

    def _next_matcher(self):
        # Move ``current`` forward past inactive sub-matchers
        matchers = self.matchers
        while (self.current < len(matchers)
               and not matchers[self.current].is_active()):
            self.current += 1

    def copy(self):
        # NOTE(review): the scorer is not passed to the copy -- confirm
        # whether copies are ever used for scoring
        return self.__class__([mr.copy() for mr in self.matchers],
                              self.offsets, current=self.current)

    def depth(self):
        if self.is_active():
            return 1 + max(mr.depth() for mr in self.matchers[self.current:])
        else:
            return 0

    def replace(self, minquality=0):
        m = self
        if minquality:
            # Skip sub-matchers that don't have a high enough max quality to
            # contribute
            while (m.is_active()
                   and m.matchers[m.current].max_quality() < minquality):
                m = self.__class__(self.matchers, self.offsets, self.scorer,
                                   m.current + 1)
                m._next_matcher()

        if not m.is_active():
            return mcore.NullMatcher()

        # TODO: Possible optimization: if the last matcher is current, replace
        # this with the last matcher, but wrap it with a matcher that adds the
        # offset. Have to check whether that's actually faster, though.
        return m

    def id(self):
        # The current sub-matcher's local ID plus its global offset
        current = self.current
        return self.matchers[current].id() + self.offsets[current]

    def all_ids(self):
        offsets = self.offsets
        for i, mr in enumerate(self.matchers):
            for id in mr.all_ids():
                yield id + offsets[i]

    def spans(self):
        return self.matchers[self.current].spans()

    def supports(self, astype):
        return self.matchers[self.current].supports(astype)

    def value(self):
        return self.matchers[self.current].value()

    def value_as(self, astype):
        return self.matchers[self.current].value_as(astype)

    def next(self):
        if not self.is_active():
            raise mcore.ReadTooFar

        self.matchers[self.current].next()
        # If the current sub-matcher ran out, move to the next active one
        if not self.matchers[self.current].is_active():
            self._next_matcher()

    def skip_to(self, id):
        if not self.is_active():
            raise mcore.ReadTooFar
        if id <= self.id():
            return

        matchers = self.matchers
        offsets = self.offsets
        r = False

        # Skip within each sub-matcher (translating to its local ID space)
        # until one lands on or past the target ID
        while self.current < len(matchers) and id > self.id():
            mr = matchers[self.current]
            sr = mr.skip_to(id - offsets[self.current])
            r = sr or r
            if mr.is_active():
                break

            self._next_matcher()

        return r

    def supports_block_quality(self):
        return all(mr.supports_block_quality() for mr
                   in self.matchers[self.current:])

    def max_quality(self):
        return max(m.max_quality() for m in self.matchers[self.current:])

    def block_quality(self):
        return self.matchers[self.current].block_quality()

    def weight(self):
        return self.matchers[self.current].weight()

    def score(self):
        return self.scorer.score(self)
def ExcludeMatcher(child, excluded, boost=1.0):
    """Convenience factory returning a :class:`FilterMatcher` configured to
    drop (rather than keep) postings whose IDs are in ``excluded``.
    """

    return FilterMatcher(child, excluded, exclude=True, boost=boost)
class FilterMatcher(WrappingMatcher):
    """Filters the postings from the wrapped matcher based on whether their
    IDs are present in (or absent from) a set.
    """

    def __init__(self, child, ids, exclude=False, boost=1.0):
        """
        :param child: the child matcher.
        :param ids: a set of IDs to filter by.
        :param exclude: by default, only IDs from the wrapped matcher that are
            **in** the set are used. If this argument is True, only IDs from
            the wrapped matcher that are **not in** the set are used.
        """

        super(FilterMatcher, self).__init__(child)
        self._ids = ids
        self._exclude = exclude
        self.boost = boost
        # Position the child on the first posting that passes the filter
        self._find_next()

    def __repr__(self):
        return "%s(%r, %r, %r, boost=%s)" % (self.__class__.__name__,
                                             self.child, self._ids,
                                             self._exclude, self.boost)

    def reset(self):
        self.child.reset()
        self._find_next()

    def copy(self):
        return self.__class__(self.child.copy(), self._ids, self._exclude,
                              boost=self.boost)

    def _replacement(self, newchild):
        return self.__class__(newchild, self._ids, exclude=self._exclude,
                              boost=self.boost)

    def _find_next(self):
        # Advance the child until its current posting passes the
        # include/exclude test, accumulating any truthy value its next()
        # returns
        child = self.child
        idset = self._ids
        advanced = False
        if self._exclude:
            while child.is_active() and child.id() in idset:
                advanced = child.next() or advanced
        else:
            while child.is_active() and child.id() not in idset:
                advanced = child.next() or advanced
        return advanced

    def next(self):
        self.child.next()
        self._find_next()

    def skip_to(self, id):
        self.child.skip_to(id)
        self._find_next()

    def all_ids(self):
        idset = self._ids
        child_ids = self.child.all_ids()
        if self._exclude:
            return (docid for docid in child_ids if docid not in idset)
        return (docid for docid in child_ids if docid in idset)

    def all_items(self):
        idset = self._ids
        items = self.child.all_items()
        if self._exclude:
            return (item for item in items if item[0] not in idset)
        return (item for item in items if item[0] in idset)
class InverseMatcher(WrappingMatcher):
    """Synthetic matcher, generates postings that are NOT present in the
    wrapped matcher.

    Iterates doc numbers ``0..limit-1``, skipping any number the child
    matches and any number for which ``missing(id)`` is true.
    """

    def __init__(self, child, limit, missing=None, weight=1.0, id=0):
        """
        :param child: the matcher whose postings are inverted.
        :param limit: one past the highest doc number to generate.
        :param missing: optional predicate; doc numbers for which it returns
            true are never generated (e.g. deleted documents -- TODO confirm
            against callers).
        :param weight: the weight/score reported for every generated posting.
        :param id: the doc number to start at.
        """
        super(InverseMatcher, self).__init__(child)
        self.limit = limit
        self._weight = weight
        # Default predicate: no doc numbers are considered missing
        self.missing = missing or (lambda id: False)
        self._id = id
        self._find_next()

    def copy(self):
        return self.__class__(self.child.copy(), self.limit,
                              weight=self._weight, missing=self.missing,
                              id=self._id)

    def _replacement(self, newchild):
        return self.__class__(newchild, self.limit, missing=self.missing,
                              weight=self._weight, id=self._id)

    def is_active(self):
        return self._id < self.limit

    def reset(self):
        self.child.reset()
        self._id = 0
        self._find_next()

    def supports_block_quality(self):
        return False

    def _find_next(self):
        # Advance self._id to the next doc number that is neither "missing"
        # nor matched by the child
        child = self.child
        missing = self.missing

        # If the current docnum isn't missing and the child matcher is
        # exhausted (so we don't have to worry about skipping its matches), we
        # don't have to do anything
        if not child.is_active() and not missing(self._id):
            return

        # Skip missing documents
        while self._id < self.limit and missing(self._id):
            self._id += 1

        # Catch the child matcher up to where this matcher is
        if child.is_active() and child.id() < self._id:
            child.skip_to(self._id)

        # While self._id is missing or is in the child matcher, increase it
        while child.is_active() and self._id < self.limit:
            if missing(self._id):
                self._id += 1
                continue

            if self._id == child.id():
                self._id += 1
                child.next()
                continue

            break

    def id(self):
        return self._id

    def all_ids(self):
        # Use the generic implementation, which iterates via next()
        return mcore.Matcher.all_ids(self)

    def next(self):
        if self._id >= self.limit:
            raise mcore.ReadTooFar
        self._id += 1
        self._find_next()

    def skip_to(self, id):
        if self._id >= self.limit:
            raise mcore.ReadTooFar
        if id < self._id:
            return
        self._id = id
        self._find_next()

    def weight(self):
        return self._weight

    def score(self):
        # Every synthesized posting gets the same constant score
        return self._weight
class RequireMatcher(WrappingMatcher):
    """Matches postings that are in both sub-matchers, but only uses scores
    from the first.
    """

    def __init__(self, a, b):
        """
        :param a: the matcher to take IDs, values, and scores from.
        :param b: a matcher the document must also match; its scores are
            ignored.
        """

        from whoosh.matching.binary import IntersectionMatcher

        self.a = a
        self.b = b
        # The wrapped child enforces "must match both"
        WrappingMatcher.__init__(self, IntersectionMatcher(a, b))

    def copy(self):
        return self.__class__(self.a.copy(), self.b.copy())

    def supports_block_quality(self):
        # Only a's scores are used, so only a's quality matters
        return self.a.supports_block_quality()

    def replace(self, minquality=0):
        if not self.child.is_active():
            # If one of the sub-matchers is inactive, go inactive
            return mcore.NullMatcher()
        elif minquality and self.a.max_quality() < minquality:
            # If the required matcher doesn't have a high enough max quality
            # to possibly contribute, return an inactive matcher
            return mcore.NullMatcher()

        new_a = self.a.replace(minquality)
        new_b = self.b.replace()
        if not new_a.is_active():
            return mcore.NullMatcher()
        elif new_a is not self.a or new_b is not self.b:
            # If one of the sub-matchers changed, return a new Require built
            # from BOTH replacements. BUG FIX: previously the old ``self.b``
            # was reused here, silently discarding the simplification
            # computed in ``new_b``.
            return self.__class__(new_a, new_b)
        else:
            return self

    def max_quality(self):
        return self.a.max_quality()

    def block_quality(self):
        return self.a.block_quality()

    def skip_to_quality(self, minquality):
        # Skip the scored matcher forward, then re-align the intersection
        skipped = self.a.skip_to_quality(minquality)
        self.child._find_next()
        return skipped

    def weight(self):
        return self.a.weight()

    def score(self):
        return self.a.score()

    def supports(self, astype):
        return self.a.supports(astype)

    def value(self):
        return self.a.value()

    def value_as(self, astype):
        return self.a.value_as(astype)
class ConstantScoreWrapperMatcher(WrappingMatcher):
    """Wraps a matcher and replaces its scores (and quality measurements)
    with a single constant value.
    """

    def __init__(self, child, score=1.0):
        WrappingMatcher.__init__(self, child)
        # The fixed score reported for every posting
        self._score = score

    def copy(self):
        return self.__class__(self.child.copy(), score=self._score)

    def _replacement(self, newchild):
        return self.__class__(newchild, score=self._score)

    def max_quality(self):
        return self._score

    def block_quality(self):
        return self._score

    def score(self):
        return self._score
class SingleTermMatcher(WrappingMatcher):
    """Makes a tree of matchers act as if they were a matcher for a single
    term for the purposes of "what terms are matching?" questions.
    """

    def __init__(self, child, term):
        WrappingMatcher.__init__(self, child)
        # The ("fieldname", "text") tuple the whole tree stands in for
        self._term = term

    def term(self):
        return self._term

    def replace(self, minquality=0):
        # Never simplify away: the wrapper's term() identity must survive
        return self
class CoordMatcher(WrappingMatcher):
    """Modifies the computed score to penalize documents that don't match all
    terms in the matcher tree.

    Because this matcher modifies the score, it may give unexpected results
    when compared to another matcher returning the unmodified score.
    """

    def __init__(self, child, scale=1.0):
        WrappingMatcher.__init__(self, child)
        # Total number of term matchers in the wrapped tree
        self._termcount = len(list(child.term_matchers()))
        self._maxqual = child.max_quality()
        self._scale = scale

    def _replacement(self, newchild):
        return self.__class__(newchild, scale=self._scale)

    def _sqr(self, score, matching):
        # This is the "SQR" (Short Query Ranking) function used by Apple's old
        # V-twin search library, described in the paper "V-Twin: A Lightweight
        # Engine for Interactive Use".
        #
        # http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.56.1916
        #
        # score -- document score using the current weighting function
        # matching -- number of matching terms in the current document
        termcount = self._termcount  # Number of terms in this tree
        scale = self._scale  # Scaling factor

        sqr = ((score + ((matching - 1) / (termcount - scale) ** 2))
               * ((termcount - 1) / termcount))
        return sqr

    def max_quality(self):
        return self._sqr(self.child.max_quality(), self._termcount)

    def block_quality(self):
        return self._sqr(self.child.block_quality(), self._termcount)

    def score(self):
        child = self.child
        # Count how many of the tree's term matchers match this document
        matching = sum(1 for _ in child.matching_terms(child.id()))
        return self._sqr(child.score(), matching)