This commit is contained in:
“shengyudong”
2026-01-06 14:18:39 +08:00
commit 5a384b694e
10345 changed files with 2050918 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.qparser.default import *
from whoosh.qparser.plugins import *
from whoosh.qparser.syntax import *

View File

@@ -0,0 +1,65 @@
# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
"""
This module contains common utility objects/functions for the other query
parser modules.
"""
import sys
from whoosh.compat import string_type
class QueryParserError(Exception):
    """Raised when the query parser hits a problem it cannot recover from.

    The triggering exception (or message) is kept on the ``cause``
    attribute for inspection by callers.
    """

    def __init__(self, cause, msg=None):
        # The base Exception message is the string form of the cause.
        # ``msg`` is accepted for interface compatibility but is unused.
        Exception.__init__(self, str(cause))
        self.cause = cause
def get_single_text(field, text, **kwargs):
"""Returns the first token from an analyzer's output.
"""
for t in field.process_text(text, mode="query", **kwargs):
return t
def attach(q, stxnode):
    """Copy the character range of syntax node ``stxnode`` onto query ``q``.

    Falsy queries are passed through untouched. If the query object does
    not allow setting the attributes, a more descriptive AttributeError is
    raised.
    """
    if not q:
        return q
    try:
        q.startchar = stxnode.startchar
        q.endchar = stxnode.endchar
    except AttributeError:
        raise AttributeError("Can't set attribute on %s"
                             % q.__class__.__name__)
    return q
def print_debug(level, msg, out=sys.stderr):
if level:
out.write("%s%s\n" % (" " * (level - 1), msg))

View File

@@ -0,0 +1,922 @@
# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
import re
import sys
from datetime import datetime, timedelta
from whoosh.compat import string_type, iteritems
from whoosh.qparser import plugins, syntax
from whoosh.qparser.taggers import Tagger
from whoosh.support.relativedelta import relativedelta
from whoosh.util.text import rcompile
from whoosh.util.times import adatetime, timespan
from whoosh.util.times import fill_in, is_void, relative_days
from whoosh.util.times import TimeError
class DateParseError(Exception):
    """Raised when date text cannot be parsed into a date/time value."""
# Utility functions
def print_debug(level, msg, *args):
if level > 0:
print((" " * (level - 1)) + (msg % args))
# Parser element objects
class Props(object):
    """A trivial attribute bag used by :class:`Regex`.

    Copies keyword arguments straight into instance attributes so values
    can be read with dot syntax instead of dictionary item lookup.
    """

    def __init__(self, **args):
        # Replace the instance dict wholesale with the keyword arguments.
        self.__dict__ = args

    def __repr__(self):
        return repr(self.__dict__)

    def get(self, key, default=None):
        """Dictionary-style lookup with a default for missing attributes."""
        return self.__dict__.get(key, default)
class ParserBase(object):
    """Base class for date parser elements.

    Subclasses implement :meth:`parse`, which returns a ``(date, newpos)``
    tuple (or ``(None, None)`` on failure).
    """

    def to_parser(self, e):
        """Convert a bare pattern string into a :class:`Regex` element;
        any other element is passed through unchanged.
        """
        return Regex(e) if isinstance(e, string_type) else e

    def parse(self, text, dt, pos=0, debug=-9999):
        """Try to parse ``text`` at ``pos`` relative to the base datetime
        ``dt``. Must be implemented by subclasses.
        """
        raise NotImplementedError

    def date_from(self, text, dt=None, pos=0, debug=-9999):
        """Parse ``text`` and return just the date, discarding the ending
        position. When ``dt`` is None, the current local time is the base.
        """
        basedt = datetime.now() if dt is None else dt
        d, _ = self.parse(text, basedt, pos, debug + 1)
        return d
class MultiBase(ParserBase):
    """Base class for date parser elements, such as Sequence and Bag, that
    contain sub-elements.
    """

    def __init__(self, elements, name=None):
        """
        :param elements: the sub-elements to match.
        :param name: a name for this element (for debugging purposes only).
        """
        # Normalize the sub-elements: bare strings become Regex elements.
        self.elements = [self.to_parser(element) for element in elements]
        self.name = name

    def __repr__(self):
        return "%s<%s>%r" % (self.__class__.__name__, self.name or '',
                             self.elements)
class Sequence(MultiBase):
    """Merges the dates parsed by a sequence of sub-elements.
    """

    def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", name=None,
                 progressive=False):
        """
        :param elements: the sequence of sub-elements to parse.
        :param sep: a separator regular expression to match between elements,
            or None to not have separators.
        :param name: a name for this element (for debugging purposes only).
        :param progressive: if True, elements after the first do not need to
            match. That is, for elements (a, b, c) and progressive=True, the
            sequence matches like ``a[b[c]]``.
        """
        super(Sequence, self).__init__(elements, name)
        self.sep_pattern = sep
        if sep:
            self.sep_expr = rcompile(sep, re.IGNORECASE)
        else:
            self.sep_expr = None
        self.progressive = progressive

    def parse(self, text, dt, pos=0, debug=-9999):
        # Partially-specified date accumulated across the sub-elements.
        d = adatetime()
        first = True
        foundall = False
        failed = False

        print_debug(debug, "Seq %s sep=%r text=%r", self.name,
                    self.sep_pattern, text[pos:])
        for e in self.elements:
            print_debug(debug, "Seq %s text=%r", self.name, text[pos:])
            # Between elements (but not before the first), require the
            # configured separator, if any.
            if self.sep_expr and not first:
                print_debug(debug, "Seq %s looking for sep", self.name)
                m = self.sep_expr.match(text, pos)
                if m:
                    pos = m.end()
                else:
                    print_debug(debug, "Seq %s didn't find sep", self.name)
                    break

            print_debug(debug, "Seq %s trying=%r at=%s", self.name, e, pos)
            try:
                at, newpos = e.parse(text, dt, pos=pos, debug=debug + 1)
            except TimeError:
                failed = True
                break

            print_debug(debug, "Seq %s result=%r", self.name, at)
            if not at:
                # Element failed to match: stop (progressive mode may still
                # succeed below if at least the first element matched).
                break
            pos = newpos

            print_debug(debug, "Seq %s adding=%r to=%r", self.name, at, d)
            try:
                # Merge the element's fields into the accumulated date;
                # conflicting fields raise TimeError.
                d = fill_in(d, at)
            except TimeError:
                print_debug(debug, "Seq %s Error in fill_in", self.name)
                failed = True
                break
            print_debug(debug, "Seq %s filled date=%r", self.name, d)

            first = False
        else:
            # The for loop ran to completion: every element matched.
            foundall = True

        # Succeed when all elements matched, or (progressive mode) at least
        # the first did -- as long as nothing raised or conflicted.
        if not failed and (foundall or (not first and self.progressive)):
            print_debug(debug, "Seq %s final=%r", self.name, d)
            return (d, pos)
        else:
            print_debug(debug, "Seq %s failed", self.name)
            return (None, None)
class Combo(Sequence):
    """Parses a sequence of elements in order and combines the dates parsed
    by the sub-elements somehow. The default behavior is to accept two dates
    from the sub-elements and turn them into a range.
    """

    def __init__(self, elements, fn=None, sep="(\\s+|\\s*,\\s*)", min=2, max=2,
                 name=None):
        """
        :param elements: the sequence of sub-elements to parse.
        :param fn: a function to run on all dates found. It should return a
            datetime, adatetime, or timespan object. If this argument is None,
            the default behavior accepts two dates and returns a timespan.
        :param sep: a separator regular expression to match between elements,
            or None to not have separators.
        :param min: the minimum number of dates required from the sub-elements.
        :param max: the maximum number of dates allowed from the sub-elements.
        :param name: a name for this element (for debugging purposes only).
        """
        super(Combo, self).__init__(elements, sep=sep, name=name)
        self.fn = fn
        self.min = min
        self.max = max

    def parse(self, text, dt, pos=0, debug=-9999):
        # Non-void dates collected from the sub-elements, in order.
        dates = []
        first = True

        print_debug(debug, "Combo %s sep=%r text=%r", self.name,
                    self.sep_pattern, text[pos:])
        for e in self.elements:
            # Between elements (but not before the first), require the
            # configured separator, if any.
            if self.sep_expr and not first:
                print_debug(debug, "Combo %s looking for sep at %r",
                            self.name, text[pos:])
                m = self.sep_expr.match(text, pos)
                if m:
                    pos = m.end()
                else:
                    print_debug(debug, "Combo %s didn't find sep", self.name)
                    return (None, None)

            print_debug(debug, "Combo %s trying=%r", self.name, e)
            try:
                at, pos = e.parse(text, dt, pos, debug + 1)
            except TimeError:
                at, pos = None, None

            print_debug(debug, "Combo %s result=%r", self.name, at)
            if at is None:
                # Unlike Sequence, every element must match.
                return (None, None)

            first = False
            if is_void(at):
                # Completely empty results (e.g. from a literal separator
                # element such as "to") are not collected.
                continue
            if len(dates) == self.max:
                print_debug(debug, "Combo %s length > %s", self.name, self.max)
                return (None, None)
            dates.append(at)

        print_debug(debug, "Combo %s dates=%r", self.name, dates)
        if len(dates) < self.min:
            print_debug(debug, "Combo %s length < %s", self.name, self.min)
            return (None, None)

        return (self.dates_to_timespan(dates), pos)

    def dates_to_timespan(self, dates):
        """Combine the collected dates using ``fn`` if one was supplied,
        otherwise turn exactly two dates into a :class:`timespan`.
        """
        if self.fn:
            return self.fn(dates)
        elif len(dates) == 2:
            return timespan(dates[0], dates[1])
        else:
            raise DateParseError("Don't know what to do with %r" % (dates,))
class Choice(MultiBase):
    """Returns the date from the first of its sub-elements that matches.
    """

    def parse(self, text, dt, pos=0, debug=-9999):
        print_debug(debug, "Choice %s text=%r", self.name, text[pos:])
        for element in self.elements:
            print_debug(debug, "Choice %s trying=%r", self.name, element)
            try:
                date, newpos = element.parse(text, dt, pos, debug + 1)
            except TimeError:
                # A time error counts the same as a non-match.
                date, newpos = None, None
            if not date:
                continue
            print_debug(debug, "Choice %s matched", self.name)
            return (date, newpos)
        print_debug(debug, "Choice %s no match", self.name)
        return (None, None)
class Bag(MultiBase):
    """Parses its sub-elements in any order and merges the dates.
    """

    def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", onceper=True,
                 requireall=False, allof=None, anyof=None, name=None):
        """
        :param elements: the sub-elements to parse.
        :param sep: a separator regular expression to match between elements,
            or None to not have separators.
        :param onceper: only allow each element to match once.
        :param requireall: if True, the sub-elements can match in any order,
            but they must all match.
        :param allof: a list of indexes into the list of elements. When this
            argument is not None, this element matches only if all the
            indicated sub-elements match.
        :param anyof: a list of indexes into the list of elements. When this
            argument is not None, this element matches only if any of the
            indicated sub-elements match.
        :param name: a name for this element (for debugging purposes only).
        """
        super(Bag, self).__init__(elements, name)
        self.sep_expr = rcompile(sep, re.IGNORECASE)
        self.onceper = onceper
        self.requireall = requireall
        self.allof = allof
        self.anyof = anyof

    def parse(self, text, dt, pos=0, debug=-9999):
        first = True
        d = adatetime()
        # seen[i] is True once element i has matched.
        seen = [False] * len(self.elements)

        while True:
            newpos = pos
            print_debug(debug, "Bag %s text=%r", self.name, text[pos:])
            if not first:
                # Require the separator between successive matches.
                print_debug(debug, "Bag %s looking for sep", self.name)
                m = self.sep_expr.match(text, pos)
                if m:
                    newpos = m.end()
                else:
                    print_debug(debug, "Bag %s didn't find sep", self.name)
                    break

            # Try each element at the current position; take the first match.
            for i, e in enumerate(self.elements):
                print_debug(debug, "Bag %s trying=%r", self.name, e)
                try:
                    at, xpos = e.parse(text, dt, newpos, debug + 1)
                except TimeError:
                    at, xpos = None, None

                print_debug(debug, "Bag %s result=%r", self.name, at)
                if at:
                    if self.onceper and seen[i]:
                        # Same element matched twice: the whole bag fails.
                        return (None, None)

                    d = fill_in(d, at)
                    newpos = xpos
                    seen[i] = True
                    break
            else:
                # No element matched at this position: stop scanning.
                break

            pos = newpos
            if self.onceper and all(seen):
                # Everything has matched once; no point continuing.
                break

            first = False

        # Validate the set of matched elements against the constraints.
        if (not any(seen)
            or (self.allof and not all(seen[pos] for pos in self.allof))
            or (self.anyof and not any(seen[pos] for pos in self.anyof))
            or (self.requireall and not all(seen))):
            return (None, None)

        print_debug(debug, "Bag %s final=%r", self.name, d)
        return (d, pos)
class Optional(ParserBase):
    """Wraps a sub-element to indicate that the sub-element is optional.
    """

    def __init__(self, element):
        self.element = self.to_parser(element)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.element)

    def parse(self, text, dt, pos=0, debug=-9999):
        # Note: ``pos`` is rebound to whatever the sub-parse returned (which
        # may be None on failure) -- a failed/erroring sub-parse yields an
        # empty adatetime together with that returned position.
        try:
            date, pos = self.element.parse(text, dt, pos, debug + 1)
        except TimeError:
            date, pos = None, None
        if date:
            return (date, pos)
        return (adatetime(), pos)
class ToEnd(ParserBase):
    """Wraps a sub-element and requires that the end of the sub-element's
    match be the end of the text.
    """

    def __init__(self, element):
        # NOTE: unlike other wrappers, the element is stored as given
        # (no to_parser() conversion of bare strings).
        self.element = element

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.element)

    def parse(self, text, dt, pos=0, debug=-9999):
        try:
            date, endpos = self.element.parse(text, dt, pos, debug + 1)
        except TimeError:
            date, endpos = None, None
        # Only accept a result that consumed the text through to its end.
        if date and endpos == len(text):
            return (date, endpos)
        return (None, None)
class Regex(ParserBase):
    """Matches a regular expression and maps named groups in the pattern to
    datetime attributes using a function or overridden method.

    There are two points at which you can customize the behavior of this
    class, either by supplying functions to the initializer or overriding
    methods.

    * The ``modify`` function or ``modify_props`` method takes a ``Props``
      object containing the named groups and modifies its values (in place).
    * The ``fn`` function or ``props_to_date`` method takes a ``Props``
      object and the base datetime and returns an adatetime/datetime.
    """

    # Class-level defaults so subclasses can override either hook without
    # defining __init__.
    fn = None
    modify = None

    def __init__(self, pattern, fn=None, modify=None):
        self.pattern = pattern
        self.expr = rcompile(pattern, re.IGNORECASE)
        self.fn = fn
        self.modify = modify

    def __repr__(self):
        return "<%r>" % (self.pattern,)

    def parse(self, text, dt, pos=0, debug=-9999):
        # Pipeline: match -> extract named groups -> modify -> build date.
        m = self.expr.match(text, pos)
        if not m:
            return (None, None)

        props = self.extract(m)
        self.modify_props(props)

        try:
            d = self.props_to_date(props, dt)
        except TimeError:
            d = None

        if d:
            return (d, m.end())
        else:
            return (None, None)

    def extract(self, match):
        """Turn the match's named groups into a :class:`Props`, converting
        numeric strings to ints where possible.
        """
        d = match.groupdict()
        for key, value in iteritems(d):
            try:
                value = int(value)
                d[key] = value
            except (ValueError, TypeError):
                # Unmatched (None) or non-numeric groups stay as-is.
                pass
        return Props(**d)

    def modify_props(self, props):
        # Hook: apply the user-supplied modify function, if any.
        if self.modify:
            self.modify(props)

    def props_to_date(self, props, dt):
        # Hook: convert the extracted properties to a date, either via the
        # user-supplied fn or by building an adatetime from the standard
        # unit fields.
        if self.fn:
            return self.fn(props, dt)
        else:
            args = {}
            for key in adatetime.units:
                args[key] = props.get(key)
            return adatetime(**args)
class Month(Regex):
    """Matches one of a list of month-name patterns and converts the matched
    text into a 1-based month number (the index of the matching pattern
    plus one).
    """

    def __init__(self, *patterns):
        self.patterns = patterns
        # One compiled expression per month, used by modify_props() to work
        # out which month the combined pattern matched.
        self.exprs = [rcompile(pat, re.IGNORECASE) for pat in self.patterns]

        alternatives = "|".join("(%s)" % pat for pat in self.patterns)
        self.pattern = "(?P<month>" + alternatives + ")"
        self.expr = rcompile(self.pattern, re.IGNORECASE)

    def modify_props(self, p):
        # Replace the matched month text with its 1-based month number.
        for monthnum, expr in enumerate(self.exprs, 1):
            if expr.match(p.month):
                p.month = monthnum
                break
class PlusMinus(Regex):
    """Matches relative-offset expressions such as ``+2 weeks`` or
    ``-1 year 3 days``, and applies the offset to the base datetime.

    The constructor takes one alternation pattern per unit naming the words
    that denote that unit.
    """

    def __init__(self, years, months, weeks, days, hours, minutes, seconds):
        # Each unit becomes an optional "<number> <unit-word>" group.
        units = (("years", years), ("months", months), ("weeks", weeks),
                 ("days", days), ("hours", hours), ("mins", minutes),
                 ("secs", seconds))
        parts = ["((?P<%s>[0-9]+) *(%s))?" % (group, pat)
                 for group, pat in units]
        self.pattern = ("(?P<dir>[+-]) *%s *%s *%s *%s *%s *%s *%s(?=(\\W|$))"
                        % tuple(parts))
        self.expr = rcompile(self.pattern, re.IGNORECASE)

    def props_to_date(self, p, dt):
        # "-" moves into the past, "+" (or anything else) into the future.
        sign = -1 if p.dir == "-" else 1

        def amount(key):
            # Unmatched groups are None; treat them as zero.
            return (p.get(key) or 0) * sign

        delta = relativedelta(years=amount("years"), months=amount("months"),
                              weeks=amount("weeks"), days=amount("days"),
                              hours=amount("hours"), minutes=amount("mins"),
                              seconds=amount("secs"))
        return dt + delta
class Daynames(Regex):
    """Matches "next/last <dayname>" expressions (e.g. ``next tuesday``) and
    converts them into the date of that weekday relative to the base
    datetime.
    """

    def __init__(self, next, last, daynames):
        """
        :param next: pattern matching the "next" direction word.
        :param last: pattern matching the "last" direction word.
        :param daynames: sequence of day-name patterns, Monday first (so the
            index matches ``datetime.weekday()`` numbering).
        """
        self.next_pattern = next
        self.last_pattern = last
        # One compiled expression per day name, used in props_to_date() to
        # find the weekday number of the matched day text.
        self._dayname_exprs = tuple(rcompile(pat, re.IGNORECASE)
                                    for pat in daynames)
        dn_pattern = "|".join(daynames)
        self.pattern = ("(?P<dir>%s|%s) +(?P<day>%s)(?=(\\W|$))"
                        % (next, last, dn_pattern))
        self.expr = rcompile(self.pattern, re.IGNORECASE)

    def props_to_date(self, p, dt):
        # BUG FIX: the original called re.match(p.dir, self.last_pattern),
        # i.e. it used the matched text as the *pattern* and the configured
        # pattern as the *string*. That breaks for alternation patterns
        # (e.g. last="last|prev" never matches "prev") and for case variants
        # ("Last"). Match the configured pattern against the matched text,
        # case-insensitively like self.expr.
        if re.match(self.last_pattern, p.dir, re.IGNORECASE):
            direction = -1
        else:
            direction = 1

        # Find which day-name pattern matched; its index is the weekday
        # number. (If none matches, the last index is used, as before.)
        daynum = 0
        for daynum, expr in enumerate(self._dayname_exprs):
            if expr.match(p.day):
                break

        days_delta = relative_days(dt.weekday(), daynum, direction)
        d = dt.date() + timedelta(days=days_delta)
        return adatetime(year=d.year, month=d.month, day=d.day)
class Time12(Regex):
    """Matches 12-hour clock times such as ``1:30pm`` and converts them to
    24-hour values.
    """

    def __init__(self):
        self.pattern = ("(?P<hour>[1-9]|10|11|12)(:(?P<mins>[0-5][0-9])"
                        "(:(?P<secs>[0-5][0-9])(\\.(?P<usecs>[0-9]{1,5}))?)?)?"
                        "\\s*(?P<ampm>am|pm)(?=(\\W|$))")
        self.expr = rcompile(self.pattern, re.IGNORECASE)

    def props_to_date(self, p, dt):
        # 12am -> 0, 12pm -> 12, any other pm hour gets 12 added.
        isam = p.ampm.lower().startswith("a")
        if p.hour == 12:
            hr = 0 if isam else 12
        else:
            hr = p.hour if isam else p.hour + 12
        return adatetime(hour=hr, minute=p.mins, second=p.secs,
                         microsecond=p.usecs)
# Top-level parser classes
class DateParser(object):
    """Base class for locale-specific parser classes.
    """

    # Day-of-month number 1-31, not followed by ":" (to avoid times).
    day = Regex("(?P<day>([123][0-9])|[1-9])(?=(\\W|$))(?!=:)",
                lambda p, dt: adatetime(day=p.day))
    # Four-digit year.
    year = Regex("(?P<year>[0-9]{4})(?=(\\W|$))",
                 lambda p, dt: adatetime(year=p.year))
    # 24-hour time: hh:mm[:ss[.usecs]].
    time24 = Regex("(?P<hour>([0-1][0-9])|(2[0-3])):(?P<mins>[0-5][0-9])"
                   "(:(?P<secs>[0-5][0-9])(\\.(?P<usecs>[0-9]{1,5}))?)?"
                   "(?=(\\W|$))",
                   lambda p, dt: adatetime(hour=p.hour, minute=p.mins,
                                           second=p.secs,
                                           microsecond=p.usecs))
    # 12-hour time with am/pm suffix.
    time12 = Time12()

    def __init__(self):
        # "Simple" compact dates such as 20050912 or 2005-09-12 13:30,
        # matched progressively from year down to microseconds.
        simple_year = "(?P<year>[0-9]{4})"
        simple_month = "(?P<month>[0-1][0-9])"
        simple_day = "(?P<day>[0-3][0-9])"
        simple_hour = "(?P<hour>([0-1][0-9])|(2[0-3]))"
        simple_minute = "(?P<minute>[0-5][0-9])"
        simple_second = "(?P<second>[0-5][0-9])"
        simple_usec = "(?P<microsecond>[0-9]{6})"

        tup = (simple_year, simple_month, simple_day, simple_hour,
               simple_minute, simple_second, simple_usec)
        simple_seq = Sequence(tup, sep="[- .:/]*", name="simple",
                              progressive=True)
        # Require whitespace or end-of-string after the simple date.
        self.simple = Sequence((simple_seq, "(?=(\\s|$))"), sep='')

        self.setup()

    def setup(self):
        """Build the locale-specific parser elements. Must be implemented by
        subclasses, which set ``self.all`` (returned by get_parser()).
        """
        raise NotImplementedError

    # Top-level parsing methods

    def get_parser(self):
        return self.all

    def parse(self, text, dt, pos=0, debug=-9999):
        """Parse a date/time at ``pos`` in ``text`` and return a
        ``(date, newpos)`` tuple. Partial dates/spans are disambiguated
        against the base datetime ``dt``.
        """
        parser = self.get_parser()
        d, newpos = parser.parse(text, dt, pos=pos, debug=debug)
        if isinstance(d, (adatetime, timespan)):
            d = d.disambiguated(dt)
        return (d, newpos)

    def date_from(self, text, basedate=None, pos=0, debug=-9999, toend=True):
        """Parse ``text`` and return only the resulting date object.

        :param basedate: base datetime for relative dates. Defaults to
            ``datetime.utcnow()`` (a naive datetime).
        :param toend: if True, the match must extend to the end of the text.
        """
        if basedate is None:
            basedate = datetime.utcnow()

        parser = self.get_parser()
        if toend:
            parser = ToEnd(parser)

        d = parser.date_from(text, basedate, pos=pos, debug=debug)
        if isinstance(d, (adatetime, timespan)):
            d = d.disambiguated(basedate)
        return d
class English(DateParser):
    """Date parser for English-language date and time expressions.
    """

    # English allows ordinal suffixes on day numbers ("5th", "22nd").
    day = Regex("(?P<day>([123][0-9])|[1-9])(st|nd|rd|th)?(?=(\\W|$))",
                lambda p, dt: adatetime(day=p.day))

    def setup(self):
        # Relative offsets such as "+2 weeks" or "-3 days 12 hours".
        self.plusdate = PlusMinus("years|year|yrs|yr|ys|y",
                                  "months|month|mons|mon|mos|mo",
                                  "weeks|week|wks|wk|ws|w",
                                  "days|day|dys|dy|ds|d",
                                  "hours|hour|hrs|hr|hs|h",
                                  "minutes|minute|mins|min|ms|m",
                                  "seconds|second|secs|sec|s")

        # "next/last <weekday>"; day-name patterns are Monday-first.
        self.dayname = Daynames("next", "last",
                                ("monday|mon|mo", "tuesday|tues|tue|tu",
                                 "wednesday|wed|we", "thursday|thur|thu|th",
                                 "friday|fri|fr", "saturday|sat|sa",
                                 "sunday|sun|su"))

        # Named times of day.
        midnight_l = lambda p, dt: adatetime(hour=0, minute=0, second=0,
                                             microsecond=0)
        midnight = Regex("midnight", midnight_l)

        noon_l = lambda p, dt: adatetime(hour=12, minute=0, second=0,
                                         microsecond=0)
        noon = Regex("noon", noon_l)

        now = Regex("now", lambda p, dt: dt)

        self.time = Choice((self.time12, self.time24, midnight, noon, now),
                           name="time")

        # Relative-day keywords.
        def tomorrow_to_date(p, dt):
            d = dt.date() + timedelta(days=+1)
            return adatetime(year=d.year, month=d.month, day=d.day)
        tomorrow = Regex("tomorrow", tomorrow_to_date)

        def yesterday_to_date(p, dt):
            d = dt.date() + timedelta(days=-1)
            return adatetime(year=d.year, month=d.month, day=d.day)
        yesterday = Regex("yesterday", yesterday_to_date)

        thisyear = Regex("this year", lambda p, dt: adatetime(year=dt.year))
        thismonth = Regex("this month",
                          lambda p, dt: adatetime(year=dt.year,
                                                  month=dt.month))
        today = Regex("today",
                      lambda p, dt: adatetime(year=dt.year, month=dt.month,
                                              day=dt.day))

        # Month names; list position determines the month number.
        # (The "febuary" misspelling is accepted deliberately.)
        self.month = Month("january|jan", "february|febuary|feb", "march|mar",
                           "april|apr", "may", "june|jun", "july|jul",
                           "august|aug", "september|sept|sep", "october|oct",
                           "november|nov", "december|dec")

        # If you specify a day number you must also specify a month... this
        # Choice captures that constraint
        self.dmy = Choice((Sequence((self.day, self.month, self.year),
                                    name="dmy"),
                           Sequence((self.month, self.day, self.year),
                                    name="mdy"),
                           Sequence((self.year, self.month, self.day),
                                    name="ymd"),
                           Sequence((self.year, self.day, self.month),
                                    name="ydm"),
                           Sequence((self.day, self.month), name="dm"),
                           Sequence((self.month, self.day), name="md"),
                           Sequence((self.month, self.year), name="my"),
                           self.month, self.year, self.dayname, tomorrow,
                           yesterday, thisyear, thismonth, today, now,
                           ), name="date")

        # A "datetime" is a date and/or a time, in either order.
        self.datetime = Bag((self.time, self.dmy), name="datetime")
        self.bundle = Choice((self.plusdate, self.datetime, self.simple),
                             name="bundle")
        # "<bundle> to <bundle>" ranges.
        self.torange = Combo((self.bundle, "to", self.bundle), name="torange")
        self.all = Choice((self.torange, self.bundle), name="all")
# QueryParser plugin
class DateParserPlugin(plugins.Plugin):
    """Adds more powerful parsing of DATETIME fields.

    >>> parser.add_plugin(DateParserPlugin())
    >>> parser.parse(u"date:'last tuesday'")
    """

    def __init__(self, basedate=None, dateparser=None, callback=None,
                 free=False, free_expr="([A-Za-z][A-Za-z_0-9]*):([^^]+)"):
        """
        :param basedate: a datetime object representing the current time
            against which to measure relative dates. If you do not supply this
            argument, the plugin uses ``datetime.utcnow()``.
        :param dateparser: an instance of
            :class:`whoosh.qparser.dateparse.DateParser`. If you do not supply
            this argument, the plugin automatically uses
            :class:`whoosh.qparser.dateparse.English`.
        :param callback: a callback function for parsing errors. This allows
            you to provide feedback to the user about problems parsing dates.
        :param free: if True, this plugin will install a filter early in the
            parsing process and try to find undelimited dates such as
            ``date:last tuesday``. Note that allowing this could result in
            normal query words accidentally being parsed as dates sometimes.
        :param free_expr: the regular expression (with two groups: field
            name and date text) used to recognize undelimited dates when
            ``free`` is True.
        """
        self.basedate = basedate
        if dateparser is None:
            dateparser = English()
        self.dateparser = dateparser
        self.callback = callback
        self.free = free
        self.freeexpr = free_expr

    def taggers(self, parser):
        if self.free:
            # If we're tokenizing, we have to go before the FieldsPlugin
            return [(DateTagger(self, self.freeexpr), -1)]
        else:
            return ()

    def filters(self, parser):
        # Run the filter after the FieldsPlugin assigns field names
        return [(self.do_dates, 110)]

    def errorize(self, message, node):
        # Report the problem via the callback (if any) and replace the node
        # with an ErrorNode carrying the message.
        if self.callback:
            self.callback(message)
        return syntax.ErrorNode(message, node)

    def text_to_dt(self, node):
        # Convert a text node in a DATETIME field into a DateTimeNode, or
        # an ErrorNode if the text can't be parsed as a date.
        text = node.text
        try:
            dt = self.dateparser.date_from(text, self.basedate)
            if dt is None:
                return self.errorize(text, node)
            else:
                n = DateTimeNode(node.fieldname, dt, node.boost)
        except DateParseError:
            e = sys.exc_info()[1]
            n = self.errorize(e, node)

        n.startchar = node.startchar
        n.endchar = node.endchar
        return n

    def range_to_dt(self, node):
        # Convert a range node in a DATETIME field into a DateRangeNode,
        # parsing each end of the range independently.
        start = end = None
        dp = self.dateparser.get_parser()

        if node.start:
            start = dp.date_from(node.start, self.basedate)
            if start is None:
                return self.errorize(node.start, node)
        if node.end:
            end = dp.date_from(node.end, self.basedate)
            if end is None:
                return self.errorize(node.end, node)

        if start and end:
            # Disambiguate partial dates as a span, so e.g. "2005 to 2009"
            # covers the whole of both end years.
            ts = timespan(start, end).disambiguated(self.basedate)
            start, end = ts.start, ts.end
        elif start:
            start = start.disambiguated(self.basedate)
            if isinstance(start, timespan):
                start = start.start
        elif end:
            end = end.disambiguated(self.basedate)
            if isinstance(end, timespan):
                end = end.end

        drn = DateRangeNode(node.fieldname, start, end, boost=node.boost)
        drn.startchar = node.startchar
        drn.endchar = node.endchar
        return drn

    def do_dates(self, parser, group):
        """Query-tree filter: replace text and range nodes in DATETIME
        fields with date nodes, recursing into sub-groups.
        """
        schema = parser.schema
        if not schema:
            return group

        from whoosh.fields import DATETIME
        datefields = frozenset(fieldname for fieldname, field
                               in parser.schema.items()
                               if isinstance(field, DATETIME))

        for i, node in enumerate(group):
            if node.has_fieldname:
                fname = node.fieldname or parser.fieldname
            else:
                fname = None

            if isinstance(node, syntax.GroupNode):
                group[i] = self.do_dates(parser, node)
            elif fname in datefields:
                if node.has_text:
                    group[i] = self.text_to_dt(node)
                elif isinstance(node, syntax.RangeNode):
                    group[i] = self.range_to_dt(node)
        return group
class DateTimeNode(syntax.SyntaxNode):
    """Syntax node representing a parsed datetime (or timespan) value in a
    query.
    """

    has_fieldname = True
    has_boost = True

    def __init__(self, fieldname, dt, boost=1.0):
        self.fieldname = fieldname
        self.dt = dt
        # BUG FIX: the boost argument was previously ignored (the attribute
        # was hard-coded to 1.0), so text_to_dt()'s node.boost was lost.
        self.boost = boost

    def r(self):
        return repr(self.dt)

    def query(self, parser):
        """Convert this node into a query object: a Term for a single
        datetime, or a DateRange for a timespan.
        """
        from whoosh import query

        fieldname = self.fieldname or parser.fieldname
        field = parser.schema[fieldname]
        dt = self.dt
        if isinstance(self.dt, datetime):
            btext = field.to_bytes(dt)
            return query.Term(fieldname, btext, boost=self.boost)
        elif isinstance(self.dt, timespan):
            return query.DateRange(fieldname, dt.start, dt.end,
                                   boost=self.boost)
        else:
            raise Exception("Unknown time object: %r" % dt)
class DateRangeNode(syntax.SyntaxNode):
    """Syntax node representing a parsed date range (start/end pair) in a
    query. Either end may be None for an open-ended range.
    """

    has_fieldname = True
    has_boost = True

    def __init__(self, fieldname, start, end, boost=1.0):
        self.fieldname = fieldname
        self.start = start
        self.end = end
        # BUG FIX: the boost argument was previously ignored (the attribute
        # was hard-coded to 1.0), so range_to_dt()'s node.boost was lost.
        self.boost = boost

    def r(self):
        return "%r-%r" % (self.start, self.end)

    def query(self, parser):
        """Convert this node into a DateRange query."""
        from whoosh import query

        fieldname = self.fieldname or parser.fieldname
        return query.DateRange(fieldname, self.start, self.end,
                               boost=self.boost)
class DateTagger(Tagger):
    """Tagger that recognizes undelimited ``field:datetext`` dates in the
    query string; installed by DateParserPlugin when its ``free`` option
    is on.
    """

    def __init__(self, plugin, expr):
        self.plugin = plugin
        self.expr = rcompile(expr, re.IGNORECASE)

    def match(self, parser, text, pos):
        from whoosh.fields import DATETIME

        match = self.expr.match(text, pos)
        if not match:
            return None

        fieldname = match.group(1)
        dtext = match.group(2)

        # Only fields that exist in the schema and are DATETIME qualify.
        if not (parser.schema and fieldname in parser.schema):
            return None
        if not isinstance(parser.schema[fieldname], DATETIME):
            return None

        plugin = self.plugin
        d, newpos = plugin.dateparser.parse(dtext, plugin.basedate)
        if not d:
            return None

        node = DateTimeNode(fieldname, d)
        node.startchar = match.start()
        # End of the consumed date text, offset into the original string.
        node.endchar = newpos + match.start(2)
        return node

View File

@@ -0,0 +1,439 @@
# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
import sys
from whoosh import query
from whoosh.compat import text_type
from whoosh.qparser import syntax
from whoosh.qparser.common import print_debug, QueryParserError
# Query parser object
class QueryParser(object):
    """A hand-written query parser built on modular plug-ins. The default
    configuration implements a powerful fielded query language similar to
    Lucene's.

    You can use the ``plugins`` argument when creating the object to override
    the default list of plug-ins, and/or use ``add_plugin()`` and/or
    ``remove_plugin_class()`` to change the plug-ins included in the parser.

    >>> from whoosh import qparser
    >>> parser = qparser.QueryParser("content", schema)
    >>> parser.remove_plugin_class(qparser.WildcardPlugin)
    >>> parser.add_plugin(qparser.PrefixPlugin())
    >>> parser.parse(u"hello there")
    And([Term("content", u"hello"), Term("content", u"there")])
    """

    def __init__(self, fieldname, schema, plugins=None, termclass=query.Term,
                 phraseclass=query.Phrase, group=syntax.AndGroup):
        """
        :param fieldname: the default field -- the parser uses this as the
            field for any terms without an explicit field.
        :param schema: a :class:`whoosh.fields.Schema` object to use when
            parsing. The appropriate fields in the schema will be used to
            tokenize terms/phrases before they are turned into query objects.
            You can specify None for the schema to create a parser that does
            not analyze the text of the query, usually for testing purposes.
        :param plugins: a list of plugins to use. WhitespacePlugin is
            automatically included, do not put it in this list. This
            overrides the default list of plugins. Classes in the list will
            be automatically instantiated.
        :param termclass: the query class to use for individual search terms.
            The default is :class:`whoosh.query.Term`.
        :param phraseclass: the query class to use for phrases. The default
            is :class:`whoosh.query.Phrase`.
        :param group: the default grouping. ``AndGroup`` makes terms required
            by default. ``OrGroup`` makes terms optional by default.
        """
        self.fieldname = fieldname
        self.schema = schema
        self.termclass = termclass
        self.phraseclass = phraseclass
        self.group = group
        self.plugins = []

        if plugins is None:
            plugins = self.default_set()
        # Whitespace handling is required by the tagging machinery, so a
        # WhitespacePlugin is always added even for a custom plugin list.
        self._add_ws_plugin()
        self.add_plugins(plugins)

    def default_set(self):
        """Returns the default list of plugins to use.
        """
        from whoosh.qparser import plugins

        return [plugins.WhitespacePlugin(),
                plugins.SingleQuotePlugin(),
                plugins.FieldsPlugin(),
                plugins.WildcardPlugin(),
                plugins.PhrasePlugin(),
                plugins.RangePlugin(),
                plugins.GroupPlugin(),
                plugins.OperatorsPlugin(),
                plugins.BoostPlugin(),
                plugins.EveryPlugin(),
                ]

    def add_plugins(self, pins):
        """Adds the given list of plugins to the list of plugins in this
        parser.
        """
        for pin in pins:
            self.add_plugin(pin)

    def add_plugin(self, pin):
        """Adds the given plugin to the list of plugins in this parser.
        """
        # Accept a plugin class as well as a plugin instance.
        if isinstance(pin, type):
            pin = pin()
        self.plugins.append(pin)

    def _add_ws_plugin(self):
        # Internal: unconditionally add the mandatory whitespace plugin.
        from whoosh.qparser.plugins import WhitespacePlugin

        self.add_plugin(WhitespacePlugin())

    def remove_plugin(self, pi):
        """Removes the given plugin object from the list of plugins in this
        parser.
        """
        self.plugins.remove(pi)

    def remove_plugin_class(self, cls):
        """Removes any plugins of the given class from this parser.
        """
        self.plugins = [pi for pi in self.plugins if not isinstance(pi, cls)]

    def replace_plugin(self, plugin):
        """Removes any plugins of the class of the given plugin and then adds
        it. This is a convenience method to keep from having to call
        ``remove_plugin_class`` followed by ``add_plugin`` each time you want
        to reconfigure a default plugin.

        >>> qp = qparser.QueryParser("content", schema)
        >>> qp.replace_plugin(qparser.NotPlugin("(^| )-"))
        """
        self.remove_plugin_class(plugin.__class__)
        self.add_plugin(plugin)

    def _priorized(self, methodname):
        # methodname is "taggers" or "filters". Returns a priorized list of
        # tagger objects or filter functions.
        items_and_priorities = []
        for plugin in self.plugins:
            # Call either .taggers() or .filters() on the plugin
            method = getattr(plugin, methodname)
            for item in method(self):
                items_and_priorities.append(item)

        # Sort the list of (item, priority) pairs by priority (lower
        # priority runs first)
        items_and_priorities.sort(key=lambda x: x[1])

        # Return the sorted list without the priorities
        return [item for item, _ in items_and_priorities]

    def multitoken_query(self, spec, texts, fieldname, termclass, boost):
        """Returns a query for multiple texts. This method implements the
        intention specified in the field's ``multitoken_query`` attribute,
        which specifies what to do when strings that look like single terms
        to the parser turn out to yield multiple tokens when analyzed.

        :param spec: a string describing how to join the text strings into a
            query. This is usually the value of the field's
            ``multitoken_query`` attribute.
        :param texts: a list of token strings.
        :param fieldname: the name of the field.
        :param termclass: the query class to use for single terms.
        :param boost: the original term's boost in the query string, should
            be applied to the returned query object.
        """
        spec = spec.lower()
        if spec == "first":
            # Throw away all but the first token
            return termclass(fieldname, texts[0], boost=boost)
        elif spec == "phrase":
            # Turn the tokens into a phrase
            return self.phraseclass(fieldname, texts, boost=boost)
        else:
            if spec == "default":
                qclass = self.group.qclass
            elif spec == "and":
                qclass = query.And
            elif spec == "or":
                qclass = query.Or
            else:
                raise QueryParserError("Unknown multitoken_query value %r"
                                       % spec)
            return qclass([termclass(fieldname, t, boost=boost)
                           for t in texts])

    def term_query(self, fieldname, text, termclass, boost=1.0, tokenize=True,
                   removestops=True):
        """Returns the appropriate query object for a single term in the
        query string.
        """
        if self.schema and fieldname in self.schema:
            field = self.schema[fieldname]

            # If this field type wants to parse queries itself, let it do so
            # and return early
            if field.self_parsing():
                try:
                    return field.parse_query(fieldname, text, boost=boost)
                except Exception:
                    # BUG FIX: narrowed from a bare ``except:``, which also
                    # swallowed SystemExit/KeyboardInterrupt. sys.exc_info()
                    # (rather than ``as e``) matches the Python 2/3
                    # compatibility style used elsewhere in the package.
                    e = sys.exc_info()[1]
                    return query.error_query(e)

            # Otherwise, ask the field to process the text into a list of
            # tokenized strings
            texts = list(field.process_text(text, mode="query",
                                            tokenize=tokenize,
                                            removestops=removestops))

            # If the analyzer returned more than one token, use the field's
            # multitoken_query attribute to decide what query class, if any,
            # to use to put the tokens together
            if len(texts) > 1:
                return self.multitoken_query(field.multitoken_query, texts,
                                             fieldname, termclass, boost)

            # It's possible field.process_text() will return an empty list
            # (for example, on a stop word)
            if not texts:
                return None

            text = texts[0]

        return termclass(fieldname, text, boost=boost)

    def taggers(self):
        """Returns a priorized list of tagger objects provided by the
        parser's currently configured plugins.
        """
        return self._priorized("taggers")

    def filters(self):
        """Returns a priorized list of filter functions provided by the
        parser's currently configured plugins.
        """
        return self._priorized("filters")

    def tag(self, text, pos=0, debug=False):
        """Returns a group of syntax nodes corresponding to the given text,
        created by matching the Taggers provided by the parser's plugins.

        :param text: the text to tag.
        :param pos: the position in the text to start tagging at.
        """
        # The list of output tags
        stack = []
        # End position of the previous match
        prev = pos
        # Priorized list of taggers provided by the parser's plugins
        taggers = self.taggers()
        if debug:
            print_debug(debug, "Taggers: %r" % taggers)

        # Define a function that will make a WordNode from the "interstitial"
        # text between matches
        def inter(startchar, endchar):
            n = syntax.WordNode(text[startchar:endchar])
            n.startchar = startchar
            n.endchar = endchar
            return n

        while pos < len(text):
            node = None
            # Try each tagger to see if it matches at the current position
            for tagger in taggers:
                node = tagger.match(self, text, pos)
                if node is not None:
                    # A tagger that matches but does not advance the cursor
                    # would loop forever; fail loudly instead.
                    if node.endchar <= pos:
                        raise Exception("Token %r did not move cursor forward."
                                        " (%r, %s)" % (tagger, text, pos))
                    if prev < pos:
                        # Capture the unmatched text before this match
                        tween = inter(prev, pos)
                        if debug:
                            print_debug(debug, "Tween: %r" % tween)
                        stack.append(tween)

                    if debug:
                        print_debug(debug, "Tagger: %r at %s: %r"
                                    % (tagger, pos, node))
                    stack.append(node)
                    prev = pos = node.endchar
                    break

            if not node:
                # No taggers matched, move forward
                pos += 1

        # If there's unmatched text left over on the end, put it in a
        # WordNode
        if prev < len(text):
            stack.append(inter(prev, len(text)))

        # Wrap the list of nodes in a group node
        group = self.group(stack)
        if debug:
            print_debug(debug, "Tagged group: %r" % group)
        return group

    def filterize(self, nodes, debug=False):
        """Takes a group of nodes and runs the filters provided by the
        parser's plugins.
        """
        # Call each filter in the priorized list of plugin filters
        if debug:
            print_debug(debug, "Pre-filtered group: %r" % nodes)
        for f in self.filters():
            if debug:
                print_debug(debug, "..Applying: %r" % f)
            nodes = f(self, nodes)
            if debug:
                print_debug(debug, "..Result: %r" % nodes)
            if nodes is None:
                raise Exception("Filter %r did not return anything" % f)
        return nodes

    def process(self, text, pos=0, debug=False):
        """Returns a group of syntax nodes corresponding to the given text,
        tagged by the plugin Taggers and filtered by the plugin filters.

        :param text: the text to tag.
        :param pos: the position in the text to start tagging at.
        """
        nodes = self.tag(text, pos=pos, debug=debug)
        nodes = self.filterize(nodes, debug=debug)
        return nodes

    def parse(self, text, normalize=True, debug=False):
        """Parses the input string and returns a :class:`whoosh.query.Query`
        object/tree.

        :param text: the unicode string to parse.
        :param normalize: whether to call normalize() on the query
            object/tree before returning it. This should be left on unless
            you're trying to debug the parser output.
        :rtype: :class:`whoosh.query.Query`
        """
        if not isinstance(text, text_type):
            # NOTE(review): bytes input is decoded as latin-1, which never
            # raises but may mangle non-latin text; callers should pass
            # unicode strings.
            text = text.decode("latin1")

        nodes = self.process(text, debug=debug)
        if debug:
            print_debug(debug, "Syntax tree: %r" % nodes)

        q = nodes.query(self)
        if not q:
            q = query.NullQuery
        if debug:
            print_debug(debug, "Pre-normalized query: %r" % q)

        if normalize:
            q = q.normalize()
            if debug:
                print_debug(debug, "Normalized query: %r" % q)
        return q

    def parse_(self, text, normalize=True):
        # Unimplemented stub; kept only so the public interface is
        # unchanged.
        pass
# Premade parser configurations
def MultifieldParser(fieldnames, schema, fieldboosts=None, **kwargs):
    """Returns a QueryParser configured to search in multiple fields.

    Instead of assigning unfielded clauses to a default field, this parser
    transforms them into an OR clause that searches a list of fields. For
    example, if the list of multi-fields is "f1", "f2" and the query string
    is "hello there", the class will parse "(f1:hello OR f2:hello) (f1:there
    OR f2:there)". This is very useful when you have two textual fields
    (e.g. "title" and "content") you want to search by default.

    :param fieldnames: a list of field names to search.
    :param fieldboosts: an optional dictionary mapping field names to boosts.
    """
    from whoosh.qparser.plugins import MultifieldPlugin

    # No default field: the MultifieldPlugin distributes unfielded terms
    # across the given field names instead.
    parser = QueryParser(None, schema, **kwargs)
    parser.add_plugin(MultifieldPlugin(fieldnames, fieldboosts=fieldboosts))
    return parser
def SimpleParser(fieldname, schema, **kwargs):
    """Returns a QueryParser configured to support only +, -, and phrase
    syntax.
    """
    from whoosh.qparser import plugins, syntax

    # Terms are optional by default (OrGroup); only the whitespace,
    # plus/minus, and phrase plugins are installed.
    return QueryParser(fieldname, schema,
                       plugins=[plugins.WhitespacePlugin,
                                plugins.PlusMinusPlugin,
                                plugins.PhrasePlugin],
                       group=syntax.OrGroup, **kwargs)
def DisMaxParser(fieldboosts, schema, tiebreak=0.0, **kwargs):
    """Returns a QueryParser configured to support only +, -, and phrase
    syntax, and which converts individual terms into DisjunctionMax queries
    across a set of fields.

    :param fieldboosts: a dictionary mapping field names to boosts.
    """
    # NOTE(review): ``tiebreak`` is accepted but not used anywhere in this
    # function; confirm whether it should be forwarded to the DisMax group.
    from whoosh.qparser import plugins, syntax

    multifield = plugins.MultifieldPlugin(list(fieldboosts.keys()),
                                          fieldboosts=fieldboosts,
                                          group=syntax.DisMaxGroup)
    parser_plugins = [plugins.WhitespacePlugin,
                      plugins.PlusMinusPlugin,
                      plugins.PhrasePlugin,
                      multifield]
    return QueryParser(None, schema, plugins=parser_plugins,
                       group=syntax.OrGroup, **kwargs)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,645 @@
# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
import sys, weakref
from whoosh import query
from whoosh.qparser.common import get_single_text, QueryParserError, attach
class SyntaxNode(object):
    """Base class for nodes that make up the abstract syntax tree (AST) of a
    parsed user query string. The AST is an intermediate step, generated
    from the query string, then converted into a :class:`whoosh.query.Query`
    tree by calling the ``query()`` method on the nodes.

    Instances have the following required attributes:

    ``has_fieldname``
        True if this node has a ``fieldname`` attribute.
    ``has_text``
        True if this node has a ``text`` attribute.
    ``has_boost``
        True if this node has a ``boost`` attribute.
    ``startchar``
        The character position in the original text at which this node
        started.
    ``endchar``
        The character position in the original text at which this node
        ended.
    """

    # Capability flags: subclasses set these to True when they define the
    # corresponding attribute.
    has_fieldname = False
    has_text = False
    has_boost = False
    # Weak reference to the parent group node, set by bake(); None for an
    # unbaked or root node.
    _parent = None

    def __repr__(self):
        # Assemble something like <'field':repr ^2.0>; r() supplies the
        # node-specific middle part.
        r = "<"
        if self.has_fieldname:
            r += "%r:" % self.fieldname
        r += self.r()
        if self.has_boost and self.boost != 1.0:
            r += " ^%s" % self.boost
        r += ">"
        return r

    def r(self):
        """Returns a basic representation of this node. The base class's
        ``__repr__`` method calls this, then does the extra busy work of
        adding fieldname and boost where appropriate.
        """
        return "%s %r" % (self.__class__.__name__, self.__dict__)

    def apply(self, fn):
        # Leaf nodes have no children to transform, so return self
        # unchanged; group nodes override this.
        return self

    def accept(self, fn):
        # Applies fn to the tree bottom-up: children are transformed first
        # (via apply), then the node itself is passed to fn.
        def fn_wrapper(n):
            return fn(n.apply(fn_wrapper))

        return fn_wrapper(self)

    def query(self, parser):
        """Returns a :class:`whoosh.query.Query` instance corresponding to
        this syntax tree node.
        """
        raise NotImplementedError(self.__class__.__name__)

    def is_ws(self):
        """Returns True if this node is ignorable whitespace.
        """
        return False

    def is_text(self):
        # True for nodes that represent searchable text (see TextNode).
        return False

    def set_fieldname(self, name, override=False):
        """Sets the fieldname associated with this node. If ``override`` is
        False (the default), the fieldname will only be replaced if this
        node does not already have a fieldname set.

        For nodes that don't have a fieldname, this is a no-op (and returns
        None instead of self).
        """
        if not self.has_fieldname:
            return
        if self.fieldname is None or override:
            self.fieldname = name
        return self

    def set_boost(self, boost):
        """Sets the boost associated with this node.

        For nodes that don't have a boost, this is a no-op (and returns
        None instead of self).
        """
        if not self.has_boost:
            return
        self.boost = boost
        return self

    def set_range(self, startchar, endchar):
        """Sets the character range associated with this node and returns
        self.
        """
        self.startchar = startchar
        self.endchar = endchar
        return self

    # Navigation methods

    def parent(self):
        # Dereference the weakref; returns None if this node was never
        # baked or the parent has been garbage collected.
        if self._parent:
            return self._parent()

    def next_sibling(self):
        # Returns the node after this one in the parent group, if any.
        p = self.parent()
        if p:
            return p.node_after(self)

    def prev_sibling(self):
        # Returns the node before this one in the parent group, if any.
        p = self.parent()
        if p:
            return p.node_before(self)

    def bake(self, parent):
        # Record the parent as a weak reference so parent/child links don't
        # create reference cycles that keep the tree alive.
        self._parent = weakref.ref(parent)
class MarkerNode(SyntaxNode):
    """Base class for nodes that only exist to mark places in the tree;
    they carry no text and produce no query of their own.
    """

    def r(self):
        # The class name is the most useful representation for a marker.
        return type(self).__name__
class Whitespace(MarkerNode):
    """Abstract syntax tree node for ignorable whitespace.
    """

    def r(self):
        # Represent whitespace as a literal space in debug output.
        return " "

    def is_ws(self):
        # Whitespace nodes are always ignorable.
        return True
class FieldnameNode(SyntaxNode):
    """Abstract syntax tree node for an explicit field name assignment
    (e.g. the ``title:`` part of ``title:hello``).
    """

    has_fieldname = True

    def __init__(self, fieldname, original):
        # The canonical field name this node assigns.
        self.fieldname = fieldname
        # The raw text matched in the query string.
        self.original = original

    def __repr__(self):
        return f"<{self.fieldname!r}:>"
class GroupNode(SyntaxNode):
    """Base class for abstract syntax tree node types that group together
    sub-nodes.

    Instances have the following attributes:

    ``merging``
        True if side-by-side instances of this group can be merged into a
        single group.
    ``qclass``
        If a subclass doesn't override ``query()``, the base class will
        simply wrap this class around the queries returned by the subnodes.

    This class implements a number of list methods for operating on the
    subnodes.
    """

    has_boost = True
    merging = True
    qclass = None

    def __init__(self, nodes=None, boost=1.0, **kwargs):
        # Extra keyword arguments are remembered and forwarded to the query
        # class and to copies made by apply()/empty_copy().
        self.nodes = nodes or []
        self.boost = boost
        self.kwargs = kwargs

    def r(self):
        return "%s %s" % (self.__class__.__name__,
                          ", ".join(repr(n) for n in self.nodes))

    @property
    def startchar(self):
        # A group's range is derived from its first and last sub-nodes.
        if not self.nodes:
            return None
        return self.nodes[0].startchar

    @property
    def endchar(self):
        if not self.nodes:
            return None
        return self.nodes[-1].endchar

    def apply(self, fn):
        """Returns a copy of this group with ``fn`` applied to each
        sub-node.
        """
        # BUG FIX: the original passed a nonexistent ``self.type`` attribute
        # as the first constructor argument, raising AttributeError whenever
        # apply() was called; the constructor takes the node list first.
        return self.__class__([fn(node) for node in self.nodes],
                              boost=self.boost, **self.kwargs)

    def query(self, parser):
        """Builds the queries of the sub-nodes (skipping any that produce
        None) and wraps them in this group's ``qclass``.
        """
        subs = []
        for node in self.nodes:
            subq = node.query(parser)
            if subq is not None:
                subs.append(subq)

        q = self.qclass(subs, boost=self.boost, **self.kwargs)
        return attach(q, self)

    def empty_copy(self):
        """Returns an empty copy of this group.

        This is used in the common pattern where a filter creates a new
        group and then adds nodes from the input group to it if they meet
        certain criteria, then returns the new group::

            def remove_whitespace(parser, group):
                newgroup = group.empty_copy()
                for node in group:
                    if not node.is_ws():
                        newgroup.append(node)
                return newgroup
        """
        c = self.__class__(**self.kwargs)
        if self.has_boost:
            c.boost = self.boost
        if self.has_fieldname:
            c.fieldname = self.fieldname
        if self.has_text:
            c.text = self.text
        return c

    def set_fieldname(self, name, override=False):
        # Propagate the field name assignment to every sub-node.
        SyntaxNode.set_fieldname(self, name, override=override)
        for node in self.nodes:
            node.set_fieldname(name, override=override)

    def set_range(self, startchar, endchar):
        # Groups derive their own range from their sub-nodes (see the
        # startchar/endchar properties), so only the children are updated.
        for node in self.nodes:
            node.set_range(startchar, endchar)
        return self

    # List-like methods

    def __nonzero__(self):
        return bool(self.nodes)

    __bool__ = __nonzero__

    def __iter__(self):
        return iter(self.nodes)

    def __len__(self):
        return len(self.nodes)

    def __getitem__(self, n):
        return self.nodes.__getitem__(n)

    def __setitem__(self, n, v):
        self.nodes.__setitem__(n, v)

    def __delitem__(self, n):
        self.nodes.__delitem__(n)

    def insert(self, n, v):
        self.nodes.insert(n, v)

    def append(self, v):
        self.nodes.append(v)

    def extend(self, vs):
        self.nodes.extend(vs)

    def pop(self, *args, **kwargs):
        return self.nodes.pop(*args, **kwargs)

    def reverse(self):
        self.nodes.reverse()

    def index(self, v):
        return self.nodes.index(v)

    # Navigation methods

    def bake(self, parent):
        # Bake this group, then recursively bake the children with this
        # group as their parent.
        SyntaxNode.bake(self, parent)
        for node in self.nodes:
            node.bake(self)

    def node_before(self, n):
        """Returns the sub-node immediately before ``n``, or None."""
        try:
            i = self.nodes.index(n)
        except ValueError:
            return
        if i > 0:
            return self.nodes[i - 1]

    def node_after(self, n):
        """Returns the sub-node immediately after ``n``, or None."""
        try:
            i = self.nodes.index(n)
        except ValueError:
            return
        # BUG FIX: was ``i < len(self.nodes) - 2``, which wrongly reported
        # no sibling after the second-to-last node.
        if i < len(self.nodes) - 1:
            return self.nodes[i + 1]
class BinaryGroup(GroupNode):
    """Intermediate base class for group nodes that have two subnodes and
    whose ``qclass`` initializer takes two arguments instead of a list.

    If one side produces no query (None), the other side's query is
    returned on its own; if both are None, a NullQuery results.
    """

    merging = False
    has_boost = False

    def query(self, parser):
        assert len(self.nodes) == 2

        qa = self.nodes[0].query(parser)
        qb = self.nodes[1].query(parser)
        if qa is None and qb is None:
            q = query.NullQuery
        elif qa is None:
            q = qb
        elif qb is None:
            q = qa
        else:
            # Reuse the sub-queries computed above; the original rebuilt
            # both by calling node.query() a second time, doing the query
            # construction work twice.
            q = self.qclass(qa, qb)

        return attach(q, self)
class Wrapper(GroupNode):
    """Intermediate base class for group nodes that wrap exactly one
    sub-node in a single-argument query class.
    """

    merging = False

    def query(self, parser):
        subquery = self.nodes[0].query(parser)
        if not subquery:
            # A falsy sub-query produces no query at all.
            return None
        return attach(self.qclass(subquery), self)
class ErrorNode(SyntaxNode):
    """Syntax node representing a parse error. It generates an error query
    wrapping whatever query (if any) could still be built from the
    offending node.
    """

    def __init__(self, message, node=None):
        # Human-readable description of the error.
        self.message = message
        # The node that caused the error, if any.
        self.node = node

    def r(self):
        return "ERR %r %r" % (self.node, self.message)

    @property
    def startchar(self):
        # Delegate the character range to the offending node.
        return self.node.startchar

    @property
    def endchar(self):
        return self.node.endchar

    def query(self, parser):
        # Wrap the node's query (or a null query) in an error query that
        # carries the message.
        sub = self.node.query(parser) if self.node else query.NullQuery
        return attach(query.error_query(self.message, sub), self)
class AndGroup(GroupNode):
    """Group whose sub-queries are combined with :class:`whoosh.query.And`
    (all terms required).
    """

    qclass = query.And
class OrGroup(GroupNode):
    """Group whose sub-queries are combined with :class:`whoosh.query.Or`
    (terms optional).
    """

    qclass = query.Or

    @classmethod
    def factory(cls, scale=1.0):
        """Returns an OrGroup subclass that passes the given ``scale``
        keyword to the query class for every instance it creates.
        """

        class ScaledOrGroup(OrGroup):
            def __init__(self, nodes=None, **kwargs):
                # Discard any caller-supplied scale in favor of the factory
                # value.
                kwargs.pop("scale", None)
                super(ScaledOrGroup, self).__init__(nodes=nodes, scale=scale,
                                                    **kwargs)

        return ScaledOrGroup
class DisMaxGroup(GroupNode):
    """Group whose sub-queries are combined with
    :class:`whoosh.query.DisjunctionMax`.
    """

    qclass = query.DisjunctionMax
class OrderedGroup(GroupNode):
    """Group whose sub-queries are combined with
    :class:`whoosh.query.Ordered`.
    """

    qclass = query.Ordered
class AndNotGroup(BinaryGroup):
    """Binary group corresponding to :class:`whoosh.query.AndNot`."""

    qclass = query.AndNot
class AndMaybeGroup(BinaryGroup):
    """Binary group corresponding to :class:`whoosh.query.AndMaybe`."""

    qclass = query.AndMaybe
class RequireGroup(BinaryGroup):
    """Binary group corresponding to :class:`whoosh.query.Require`."""

    qclass = query.Require
class NotGroup(Wrapper):
    """Wrapper group corresponding to :class:`whoosh.query.Not`."""

    qclass = query.Not
class RangeNode(SyntaxNode):
    """Syntax node for range queries (e.g. ``[alfa TO bravo]``).
    """

    has_fieldname = True

    def __init__(self, start, end, startexcl, endexcl):
        # Start/end text of the range; either may be None for an open end.
        self.start = start
        self.end = end
        # Whether each end of the range is exclusive ({ } vs [ ]).
        self.startexcl = startexcl
        self.endexcl = endexcl
        # NOTE(review): boost is stored and used in query(), but has_boost
        # is False, so set_boost() is a no-op on ranges — confirm intended.
        self.boost = 1.0
        self.fieldname = None
        # Extra keyword arguments (currently unused here) passed along.
        self.kwargs = {}

    def r(self):
        # Use the conventional bracket style to show exclusivity.
        b1 = "{" if self.startexcl else "["
        b2 = "}" if self.endexcl else "]"
        return "%s%r %r%s" % (b1, self.start, self.end, b2)

    def query(self, parser):
        """Builds a range query, letting a self-parsing field (e.g. numeric
        or date fields) construct it first, and falling back to a plain
        TermRange.
        """
        fieldname = self.fieldname or parser.fieldname
        start = self.start
        end = self.end
        if parser.schema and fieldname in parser.schema:
            field = parser.schema[fieldname]
            # Give a self-parsing field first crack at building the range
            # query itself
            if field.self_parsing():
                try:
                    q = field.parse_range(fieldname, start, end,
                                          self.startexcl, self.endexcl,
                                          boost=self.boost)
                    if q is not None:
                        return attach(q, self)
                except QueryParserError:
                    # sys.exc_info() instead of ``as e`` for Python 2/3
                    # compatibility; report the error as an error query.
                    e = sys.exc_info()[1]
                    return attach(query.error_query(e), self)
            # Normalize the endpoint texts through the field (without
            # tokenizing or removing stop words)
            if start:
                start = get_single_text(field, start, tokenize=False,
                                        removestops=False)
            if end:
                end = get_single_text(field, end, tokenize=False,
                                      removestops=False)
        # Fall back to a plain term range query
        q = query.TermRange(fieldname, start, end, self.startexcl,
                            self.endexcl, boost=self.boost)
        return attach(q, self)
class TextNode(SyntaxNode):
    """Intermediate base class for basic nodes that search for text, such
    as term queries, wildcards, prefixes, etc.

    Instances have the following attributes:

    ``qclass``
        If a subclass does not override ``query()``, the base class will use
        this class to construct the query.
    ``tokenize``
        If True and the subclass does not override ``query()``, the node's
        text will be tokenized before constructing the query.
    ``removestops``
        If True and the subclass does not override ``query()``, and the
        field's analyzer has a stop word filter, stop words will be removed
        from the text before constructing the query.
    """

    has_fieldname = True
    has_text = True
    has_boost = True
    qclass = None
    tokenize = False
    removestops = False

    def __init__(self, text):
        self.fieldname = None
        self.text = text
        self.boost = 1.0

    def r(self):
        return "%s %r" % (self.__class__.__name__, self.text)

    def is_text(self):
        return True

    def query(self, parser):
        # Fall back to the parser's defaults when this node does not carry
        # its own field name or query class.
        fname = self.fieldname or parser.fieldname
        qcls = self.qclass or parser.termclass
        built = parser.term_query(fname, self.text, qcls,
                                  boost=self.boost,
                                  tokenize=self.tokenize,
                                  removestops=self.removestops)
        return attach(built, self)
class WordNode(TextNode):
    """Syntax node for term queries.
    """

    # Unlike the TextNode base class, plain words are run through the
    # field's analyzer and have stop words removed.
    tokenize = True
    removestops = True

    def r(self):
        return repr(self.text)
# Operators
class Operator(SyntaxNode):
    """Base class for PrefixOperator, PostfixOperator, and InfixOperator.

    Operators work by moving the nodes they apply to (e.g. for prefix
    operator, the previous node, for infix operator, the nodes on either
    side, etc.) into a group node. The group provides the code for what to
    do with the nodes.
    """

    def __init__(self, text, grouptype, leftassoc=True):
        """
        :param text: the text of the operator in the query string.
        :param grouptype: the type of group to create in place of the
            operator and the node(s) it operates on.
        :param leftassoc: for infix operators, whether the operator is left
            associative. use ``leftassoc=False`` for right-associative infix
            operators.
        """
        self.text = text
        self.grouptype = grouptype
        self.leftassoc = leftassoc

    def r(self):
        return "OP %r" % self.text

    def replace_self(self, parser, group, position):
        """Called with the parser, a group, and the position at which the
        operator occurs in that group. Should return a group with the
        operator replaced by whatever effect the operator has (e.g. for an
        infix op, replace the op and the nodes on either side with a
        sub-group).
        """
        raise NotImplementedError
class PrefixOperator(Operator):
    """Operator that wraps the single node FOLLOWING it in the group type.
    """

    def replace_self(self, parser, group, position):
        # Check for a following operand before the deletion shifts indexes.
        had_operand = position < len(group) - 1
        del group[position]
        if had_operand:
            # After deletion the operand occupies the operator's old slot.
            group[position] = self.grouptype([group[position]])
        return position
class PostfixOperator(Operator):
    """Operator that wraps the single node PRECEDING it in the group type.
    """

    def replace_self(self, parser, group, position):
        # Remove the operator itself.
        del group[position]
        # Wrap the preceding node, if there is one, in the group type.
        if position:
            group[position - 1] = self.grouptype([group[position - 1]])
        return position
class InfixOperator(Operator):
    """Operator that joins the nodes on either side of it into a group.
    """

    def replace_self(self, parser, group, position):
        la = self.leftassoc
        gtype = self.grouptype
        merging = gtype.merging

        # Only act when there is a node on BOTH sides of the operator.
        if position > 0 and position < len(group) - 1:
            left = group[position - 1]
            right = group[position + 1]
            # The first two clauses check whether the "strong" side is
            # already a group of the type we are going to create. If it is,
            # we just append the "weak" side to the "strong" side instead of
            # creating a new group inside the existing one. This is
            # necessary because we can quickly run into Python's recursion
            # limit otherwise.
            if merging and la and isinstance(left, gtype):
                # Left-associative merge: fold the right node into the
                # existing left group, then delete the operator and right.
                left.append(right)
                del group[position:position + 2]
            elif merging and not la and isinstance(right, gtype):
                # Right-associative merge: fold the left node into the
                # existing right group; the group shrank to the left, so
                # resume scanning one position earlier.
                right.insert(0, left)
                del group[position - 1:position + 1]
                return position - 1
            else:
                # Replace the operator and the two surrounding objects
                group[position - 1:position + 2] = [gtype([left, right])]
        else:
            # The operator is at the start or end of the group with nothing
            # to join on one side, so simply drop it.
            del group[position]
        return position
# Functions
def to_word(n):
    """Converts a node carrying an ``original`` attribute (such as a
    FieldnameNode) into a plain WordNode covering the same character range.
    """
    word = WordNode(n.original)
    # set_range() copies the character span and returns the node.
    return word.set_range(n.startchar, n.endchar)

View File

@@ -0,0 +1,93 @@
# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.util.text import rcompile
# Tagger objects
class Tagger(object):
    """Base class for taggers, objects which match syntax in the query
    string and translate it into a
    :class:`whoosh.qparser.syntax.SyntaxNode` object.
    """

    def match(self, parser, text, pos):
        """This method should see if this tagger matches the query string at
        the given position. If it matches, it should return a
        :class:`whoosh.qparser.syntax.SyntaxNode`; otherwise it should
        return None.

        :param parser: the :class:`whoosh.qparser.default.QueryParser`
            object.
        :param text: the text being parsed.
        :param pos: the position in the text at which the tagger should try
            to match.
        """
        raise NotImplementedError
class RegexTagger(Tagger):
    """Tagger class that uses regular expressions to match the query string.
    Subclasses should override ``create()`` instead of ``match()``.
    """

    def __init__(self, expr):
        # rcompile normalizes the expression into a compiled regex.
        self.expr = rcompile(expr)

    def match(self, parser, text, pos):
        m = self.expr.match(text, pos)
        if not m:
            return None
        node = self.create(parser, m)
        if node is not None:
            # Record the matched character span on the node.
            node = node.set_range(m.start(), m.end())
        return node

    def create(self, parser, match):
        """When the regular expression matches, this method is called to
        translate the regex match object into a syntax node.

        :param parser: the :class:`whoosh.qparser.default.QueryParser`
            object.
        :param match: the regex match object.
        """
        raise NotImplementedError
class FnTagger(RegexTagger):
    """Tagger that takes a regular expression and a class or function, and
    for matches calls the class/function with the regex match's named groups
    as keyword arguments.
    """

    def __init__(self, expr, fn, memo=""):
        # fn: the class/function to call on a match.
        # memo: a label included in repr() output for debugging.
        RegexTagger.__init__(self, expr)
        self.fn = fn
        self.memo = memo

    def __repr__(self):
        return "<%s %r (%s)>" % (self.__class__.__name__, self.expr, self.memo)

    def create(self, parser, match):
        # Forward the match's named groups as keyword arguments.
        return self.fn(**match.groupdict())