2026-1-6
This commit is contained in:
388
venv/Lib/site-packages/whoosh/automata/nfa.py
Normal file
388
venv/Lib/site-packages/whoosh/automata/nfa.py
Normal file
@@ -0,0 +1,388 @@
|
||||
# Copyright 2012 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
from whoosh.automata.fst import Arc
|
||||
|
||||
|
||||
class Instruction(object):
|
||||
def __repr__(self):
|
||||
return "%s()" % (self.__class__.__name__, )
|
||||
|
||||
|
||||
class Char(Instruction):
|
||||
"""
|
||||
Matches a literal character.
|
||||
"""
|
||||
|
||||
def __init__(self, c):
|
||||
self.c = c
|
||||
|
||||
def __repr__(self):
|
||||
return "Char(%r)" % self.c
|
||||
|
||||
class Lit(Instruction):
|
||||
"""
|
||||
Matches a literal string.
|
||||
"""
|
||||
|
||||
def __init__(self, c):
|
||||
self.c = c
|
||||
|
||||
def __repr__(self):
|
||||
return "Lit(%r)" % self.c
|
||||
|
||||
|
||||
class Any(Instruction):
|
||||
"""
|
||||
Matches any character.
|
||||
"""
|
||||
|
||||
|
||||
class Match(Instruction):
|
||||
"""
|
||||
Stop this thread: the string matched.
|
||||
"""
|
||||
|
||||
def __repr__(self):
|
||||
return "Match()"
|
||||
|
||||
|
||||
class Jmp(Instruction):
|
||||
"""
|
||||
Jump to a specified instruction.
|
||||
"""
|
||||
|
||||
def __init__(self, x):
|
||||
self.x = x
|
||||
|
||||
def __repr__(self):
|
||||
return "Jmp(%s)" % self.x
|
||||
|
||||
|
||||
class Split(Instruction):
|
||||
"""
|
||||
Split execution: continue at two separate specified instructions.
|
||||
"""
|
||||
|
||||
def __init__(self, x, y):
|
||||
self.x = x
|
||||
self.y = y
|
||||
|
||||
def __repr__(self):
|
||||
return "Split(%s, %s)" % (self.x, self.y)
|
||||
|
||||
|
||||
class Label(Instruction):
|
||||
"""
|
||||
Placeholder to act as a target for JMP instructions
|
||||
"""
|
||||
|
||||
def __hash__(self):
|
||||
return id(self)
|
||||
|
||||
def __repr__(self):
|
||||
return "L(%s)" % hex(id(self))
|
||||
|
||||
|
||||
def concat(e1, e2):
|
||||
return e1 + e2
|
||||
|
||||
|
||||
def alt(e1, e2):
|
||||
L1, L2, L3 = Label(), Label(), Label()
|
||||
return [L1] + e1 + [Jmp(L3), L2] + e2 + [L3]
|
||||
|
||||
|
||||
def zero_or_one(e):
|
||||
L1, L2 = Label(), Label()
|
||||
return [Split(L1, L2), L1] + e + [L2]
|
||||
|
||||
|
||||
def zero_or_more(e):
|
||||
L1, L2, L3 = Label(), Label(), Label()
|
||||
return [L1, Split(L2, L3), L2] + e + [Jmp(L1), L3]
|
||||
|
||||
|
||||
def one_or_more(e):
|
||||
L1, L2 = Label(), Label()
|
||||
return [L1] + e + [Split(L1, L2), L2]
|
||||
|
||||
|
||||
def fixup(program):
|
||||
refs = {}
|
||||
i = 0
|
||||
while i < len(program):
|
||||
op = program[i]
|
||||
if isinstance(op, Label):
|
||||
refs[op] = i
|
||||
program.pop(i)
|
||||
else:
|
||||
i += 1
|
||||
|
||||
if refs:
|
||||
for op in program:
|
||||
if isinstance(op, (Jmp, Split)):
|
||||
op.x = refs[op.x]
|
||||
if isinstance(op, Split):
|
||||
op.y = refs[op.y]
|
||||
|
||||
return program + [Match]
|
||||
|
||||
|
||||
class ThreadList(object):
|
||||
def __init__(self, program, max=1000):
|
||||
self.program = program
|
||||
self.max = max
|
||||
self.threads = []
|
||||
|
||||
def __nonzero__(self):
|
||||
return bool(self.threads)
|
||||
|
||||
def current(self):
|
||||
return self.threads.pop()
|
||||
|
||||
def add(self, thread):
|
||||
op = self.program[thread.pc]
|
||||
optype = type(op)
|
||||
if optype is Jmp:
|
||||
self.add(thread.at(op.x))
|
||||
elif optype is Split:
|
||||
self.add(thread.copy_at(op.x))
|
||||
self.add(thread.at(op.y))
|
||||
else:
|
||||
self.threads.append(thread)
|
||||
|
||||
|
||||
class Thread(object):
|
||||
def __init__(self, pc, address, sofar='', accept=False):
|
||||
self.pc = pc
|
||||
self.address = address
|
||||
self.sofar = sofar
|
||||
self.accept = accept
|
||||
|
||||
def at(self, pc):
|
||||
self.pc = pc
|
||||
return self
|
||||
|
||||
def copy_at(self, pc):
|
||||
return Thread(pc, self.address, self.sofar, self.accept)
|
||||
|
||||
def __repr__(self):
|
||||
d = self.__dict__
|
||||
return "Thread(%s)" % ",".join("%s=%r" % (k, v) for k, v in d.items())
|
||||
|
||||
|
||||
def advance(thread, arc, c):
|
||||
thread.pc += 1
|
||||
thread.address = arc.target
|
||||
thread.sofar += c
|
||||
thread.accept = arc.accept
|
||||
|
||||
|
||||
def run(graph, program, address):
|
||||
threads = ThreadList(program)
|
||||
threads.add(Thread(0, address))
|
||||
arc = Arc()
|
||||
while threads:
|
||||
thread = threads.current()
|
||||
address = thread.address
|
||||
op = program[thread.pc]
|
||||
optype = type(op)
|
||||
|
||||
if optype is Char:
|
||||
if address:
|
||||
arc = graph.find_arc(address, op.c, arc)
|
||||
if arc:
|
||||
advance(thread, arc)
|
||||
threads.add(thread)
|
||||
elif optype is Lit:
|
||||
if address:
|
||||
c = op.c
|
||||
arc = graph.find_path(c, arc, address)
|
||||
if arc:
|
||||
advance(thread, arc, c)
|
||||
threads.add(thread)
|
||||
elif optype is Any:
|
||||
if address:
|
||||
sofar = thread.sofar
|
||||
pc = thread.pc + 1
|
||||
for arc in graph.iter_arcs(address, arc):
|
||||
t = Thread(pc, arc.target, sofar + arc.label, arc.accept)
|
||||
threads.add(t)
|
||||
elif op is Match:
|
||||
if thread.accept:
|
||||
yield thread.sofar
|
||||
else:
|
||||
raise Exception("Don't know what to do with %r" % op)
|
||||
|
||||
|
||||
LO = 0
|
||||
HI = 1
|
||||
|
||||
|
||||
def regex_limit(graph, mode, program, address):
|
||||
low = mode == LO
|
||||
output = []
|
||||
threads = ThreadList(program)
|
||||
threads.add(Thread(0, address))
|
||||
arc = Arc()
|
||||
while threads:
|
||||
thread = threads.current()
|
||||
address = thread.address
|
||||
op = program[thread.pc]
|
||||
optype = type(op)
|
||||
|
||||
if optype is Char:
|
||||
if address:
|
||||
arc = graph.find_arc(address, op.c, arc)
|
||||
if arc:
|
||||
if low and arc.accept:
|
||||
return thread.sofar + thread.label
|
||||
advance(thread, arc)
|
||||
threads.add(thread)
|
||||
elif optype is Lit:
|
||||
if address:
|
||||
labels = op.c
|
||||
for label in labels:
|
||||
arc = graph.find_arc(address, label)
|
||||
if arc is None:
|
||||
return thread.sofar
|
||||
elif thread.accept:
|
||||
return thread.sofar
|
||||
elif optype is Any:
|
||||
if address:
|
||||
if low:
|
||||
arc = graph.arc_at(address, arc)
|
||||
else:
|
||||
for arc in graph.iter_arcs(address):
|
||||
pass
|
||||
advance(thread, arc, arc.label)
|
||||
threads.add(thread)
|
||||
elif thread.accept:
|
||||
return thread.sofar
|
||||
elif op is Match:
|
||||
return thread.sofar
|
||||
else:
|
||||
raise Exception("Don't know what to do with %r" % op)
|
||||
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# from whoosh import index, query
|
||||
# from whoosh.filedb.filestore import RamStorage
|
||||
# from whoosh.automata import fst
|
||||
# from whoosh.util.testing import timing
|
||||
#
|
||||
# st = RamStorage()
|
||||
# gw = fst.GraphWriter(st.create_file("test"))
|
||||
# gw.start_field("test")
|
||||
# for key in ["aaaa", "aaab", "aabb", "abbb", "babb", "bbab", "bbba"]:
|
||||
# gw.insert(key)
|
||||
# gw.close()
|
||||
# gr = fst.GraphReader(st.open_file("test"))
|
||||
#
|
||||
# program = one_or_more([Lit("a")])
|
||||
# print program
|
||||
# program = fixup(program)
|
||||
# print program
|
||||
# print list(run(gr, program, gr.root("test")))
|
||||
#
|
||||
# ix = index.open_dir("e:/dev/src/houdini/help/index")
|
||||
# r = ix.reader()
|
||||
# gr = r._get_graph()
|
||||
#
|
||||
# # program = fixup([Any(), Any(), Any(), Any(), Any()])
|
||||
# # program = fixup(concat(zero_or_more([Any()]), [Char("/")]))
|
||||
# # with timing():
|
||||
# # x = list(run(gr, program, gr.root("path")))
|
||||
# # print len(x)
|
||||
#
|
||||
# q = query.Regex("path", "^.[abc].*/$")
|
||||
# with timing():
|
||||
# y = list(q._btexts(r))
|
||||
# print len(y)
|
||||
# print y[0], y[-1]
|
||||
#
|
||||
# pr = [Any()] + alt([Lit("c")], alt([Lit("b")], [Lit("a")])) + zero_or_more([Any()]) + [Lit("/")]
|
||||
# program = fixup(pr)
|
||||
# # with timing():
|
||||
# # x = list(run(gr, program, gr.root("path")))
|
||||
# # print len(x), x
|
||||
#
|
||||
# with timing():
|
||||
# print "lo=", regex_limit(gr, LO, program, gr.root("path"))
|
||||
# print "hi=", regex_limit(gr, HI, program, gr.root("path"))
|
||||
#
|
||||
#
|
||||
#
|
||||
# #int
|
||||
# #backtrackingvm(Inst *prog, char *input)
|
||||
# #{
|
||||
# # enum { MAXTHREAD = 1000 };
|
||||
# # Thread ready[MAXTHREAD];
|
||||
# # int nready;
|
||||
# # Inst *pc;
|
||||
# # char *sp;
|
||||
# #
|
||||
# # /* queue initial thread */
|
||||
# # ready[0] = thread(prog, input);
|
||||
# # nready = 1;
|
||||
# #
|
||||
# # /* run threads in stack order */
|
||||
# # while(nready > 0){
|
||||
# # --nready; /* pop state for next thread to run */
|
||||
# # pc = ready[nready].pc;
|
||||
# # sp = ready[nready].sp;
|
||||
# # for(;;){
|
||||
# # switch(pc->opcode){
|
||||
# # case Char:
|
||||
# # if(*sp != pc->c)
|
||||
# # goto Dead;
|
||||
# # pc++;
|
||||
# # sp++;
|
||||
# # continue;
|
||||
# # case Match:
|
||||
# # return 1;
|
||||
# # case Jmp:
|
||||
# # pc = pc->x;
|
||||
# # continue;
|
||||
# # case Split:
|
||||
# # if(nready >= MAXTHREAD){
|
||||
# # fprintf(stderr, "regexp overflow");
|
||||
# # return -1;
|
||||
# # }
|
||||
# # /* queue new thread */
|
||||
# # ready[nready++] = thread(pc->y, sp);
|
||||
# # pc = pc->x; /* continue current thread */
|
||||
# # continue;
|
||||
# # }
|
||||
# # }
|
||||
# # Dead:;
|
||||
# # }
|
||||
# # return 0;
|
||||
# #}
|
||||
#
|
||||
#
|
||||
Reference in New Issue
Block a user