2026-1-6
This commit is contained in:
0
venv/Lib/site-packages/whoosh/filedb/__init__.py
Normal file
0
venv/Lib/site-packages/whoosh/filedb/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
331
venv/Lib/site-packages/whoosh/filedb/compound.py
Normal file
331
venv/Lib/site-packages/whoosh/filedb/compound.py
Normal file
@@ -0,0 +1,331 @@
|
||||
# Copyright 2011 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
import errno
|
||||
import os
|
||||
import sys
|
||||
from threading import Lock
|
||||
from shutil import copyfileobj
|
||||
|
||||
try:
|
||||
import mmap
|
||||
except ImportError:
|
||||
mmap = None
|
||||
|
||||
from whoosh.compat import BytesIO, memoryview_
|
||||
from whoosh.filedb.structfile import BufferFile, StructFile
|
||||
from whoosh.filedb.filestore import FileStorage, StorageError
|
||||
from whoosh.system import emptybytes
|
||||
from whoosh.util import random_name
|
||||
|
||||
|
||||
class CompoundStorage(FileStorage):
    """Read-only storage backed by a single "compound" file.

    The layout read here is the one produced by :meth:`assemble` and
    :meth:`write_dir`: at ``basepos`` a long holding the directory position
    and an int holding the directory length, then the contents of the member
    files, then the pickled directory followed by a pickled options dict.

    The directory maps each member name to a dict with ``"offset"`` and
    ``"length"`` entries (and ``"modified"`` when the file was written by
    :meth:`assemble`).
    """

    readonly = True

    def __init__(self, dbfile, use_mmap=True, basepos=0):
        """
        :param dbfile: the open compound file. It must provide ``seek``,
            ``tell``, ``read_long``, ``read_int`` and ``read_pickle``.
        :param use_mmap: if True (the default), try to memory-map the whole
            file; on success the given file handle is closed.
        :param basepos: the position in ``dbfile`` where the compound header
            (directory position and length) starts.
        """

        self._file = dbfile
        self.is_closed = False
        # Best-effort label used only by __repr__; None when the file object
        # does not expose a ``name`` attribute.
        self._name = getattr(dbfile, "name", None)

        # Seek to the end to get total file size (to check if mmap is OK)
        dbfile.seek(0, os.SEEK_END)
        filesize = self._file.tell()
        dbfile.seek(basepos)

        self._diroffset = self._file.read_long()
        self._dirlength = self._file.read_int()
        self._file.seek(self._diroffset)
        # NOTE(review): read_pickle presumably unpickles data from the file;
        # unpickling is unsafe on untrusted input, so only open compound
        # files from a trusted source.
        self._dir = self._file.read_pickle()
        self._options = self._file.read_pickle()
        self._locks = {}
        self._source = None

        use_mmap = (
            use_mmap
            and hasattr(self._file, "fileno")  # check file is a real file
            and filesize < sys.maxsize  # check fit on 32-bit Python
        )
        if mmap and use_mmap:
            # Try to open the entire segment as a memory-mapped object
            try:
                fileno = self._file.fileno()
                self._source = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)
            except (mmap.error, OSError):
                e = sys.exc_info()[1]
                # If we got an error because there wasn't enough memory to
                # open the map, ignore it and fall through, we'll just use
                # the (slower) "sub-file" implementation
                if e.errno != errno.ENOMEM:
                    raise
            else:
                # If that worked, we can close the file handle we were given
                self._file.close()
                self._file = None

    def __repr__(self):
        # getattr() keeps repr() usable even on an instance whose __init__
        # did not run to completion.
        return "<%s (%s)>" % (self.__class__.__name__,
                              getattr(self, "_name", None))

    def close(self):
        """Closes the memory map and/or the underlying file.

        :raises Exception: if the storage was already closed.
        """

        if self.is_closed:
            raise Exception("Already closed")
        self.is_closed = True

        if self._source:
            try:
                self._source.close()
            except BufferError:
                # The map cannot be closed while buffers exported from it
                # are still alive; drop our reference instead, but keep the
                # attribute so later attribute access does not fail.
                self._source = None
        if self._file:
            self._file.close()

    def range(self, name):
        """Returns the ``(offset, length)`` of the named member file.

        :raises NameError: if the directory has no entry for ``name``.
        """

        try:
            fileinfo = self._dir[name]
        except KeyError:
            raise NameError("Unknown file %r" % (name,))
        return fileinfo["offset"], fileinfo["length"]

    def open_file(self, name, *args, **kwargs):
        """Opens the named member file for reading.

        :raises StorageError: if the storage has been closed.
        :raises NameError: if the directory has no entry for ``name``.
        """

        if self.is_closed:
            raise StorageError("Storage was closed")

        offset, length = self.range(name)
        if self._source:
            # Create a memoryview/buffer from the mmap
            buf = memoryview_(self._source, offset, length)
            f = BufferFile(buf, name=name)
        elif hasattr(self._file, "subset"):
            f = self._file.subset(offset, length, name=name)
        else:
            f = StructFile(SubFile(self._file, offset, length), name=name)
        return f

    def list(self):
        """Returns the names of the member files."""

        return list(self._dir.keys())

    def file_exists(self, name):
        """Returns True if the directory has an entry for ``name``."""

        return name in self._dir

    def file_length(self, name):
        """Returns the recorded length of the named member file.

        :raises KeyError: if the directory has no entry for ``name``.
        """

        info = self._dir[name]
        return info["length"]

    def file_modified(self, name):
        """Returns the recorded ``"modified"`` value of the named member.

        :raises KeyError: if the directory has no entry for ``name``, or the
            entry was written without a ``"modified"`` value.
        """

        info = self._dir[name]
        return info["modified"]

    def lock(self, name):
        """Returns the ``threading.Lock`` for ``name``, creating it on first
        use.
        """

        if name not in self._locks:
            self._locks[name] = Lock()
        return self._locks[name]

    @staticmethod
    def assemble(dbfile, store, names, **options):
        """Copies the named files from ``store`` into ``dbfile`` as a
        compound file, then writes the directory and closes ``dbfile``.

        :param dbfile: the output file; must provide ``tell``, ``write``,
            ``write_long``, ``write_int`` and ``write_pickle``.
        :param store: the storage object to read the source files from.
        :param names: a non-empty sequence of file names to copy.
        :param options: stored in the compound file after the directory.
        :raises Exception: if any name ends with ``.toc`` or ``.seg``.
        """

        assert names, names

        directory = {}
        basepos = dbfile.tell()
        dbfile.write_long(0)  # Directory position
        dbfile.write_int(0)  # Directory length

        # Reject disallowed names before anything is copied
        for name in names:
            if name.endswith((".toc", ".seg")):
                raise Exception(name)

        # Copy the files into the compound file
        for name in names:
            offset = dbfile.tell()
            length = store.file_length(name)
            modified = store.file_modified(name)
            directory[name] = {"offset": offset, "length": length,
                               "modified": modified}
            f = store.open_file(name)
            try:
                copyfileobj(f, dbfile)
            finally:
                # Release the source file even if the copy fails
                f.close()

        CompoundStorage.write_dir(dbfile, basepos, directory, options)

    @staticmethod
    def write_dir(dbfile, basepos, directory, options=None):
        """Writes the directory and options at the current position, patches
        the header at ``basepos`` to point at them, and closes ``dbfile``.

        :param dbfile: the output file, positioned just after the last
            member file's contents.
        :param basepos: the position of the header written earlier.
        :param directory: dict mapping member names to their info dicts.
        :param options: optional dict stored after the directory.
        """

        options = options or {}

        dirpos = dbfile.tell()  # Remember the start of the directory
        dbfile.write_pickle(directory)  # Write the directory
        dbfile.write_pickle(options)
        endpos = dbfile.tell()  # Remember the end of the directory
        dbfile.flush()
        dbfile.seek(basepos)  # Seek back to the start
        dbfile.write_long(dirpos)  # Directory position
        dbfile.write_int(endpos - dirpos)  # Directory length

        dbfile.close()
|
||||
|
||||
|
||||
class SubFile(object):
    """File-like, read-only window onto ``length`` bytes of a parent file
    starting at ``offset``.

    Positions passed to and returned by this object are relative to the
    start of the window. Every read re-seeks the parent file, so several
    windows can share one parent as long as reads are not interleaved
    mid-call.
    """

    def __init__(self, parentfile, offset, length, name=None):
        """
        :param parentfile: the underlying file object (needs ``seek``,
            ``read`` and ``readline``).
        :param offset: position in the parent where the window starts.
        :param length: size of the window in bytes.
        :param name: optional name, exposed as the ``name`` attribute.
        """

        self._file = parentfile
        self._offset = offset
        self._length = length
        self._end = offset + length
        self._pos = 0

        self.name = name
        self.closed = False

    def close(self):
        """Marks this window as closed. The parent file is left open."""

        self.closed = True

    def subset(self, position, length, name=None):
        """Returns a new :class:`SubFile` covering ``length`` bytes starting
        ``position`` bytes into this window.

        :raises AssertionError: if the requested range is not inside this
            window.
        """

        start = self._offset + position
        end = start + length
        name = name or self.name
        # The requested range must lie inside [self._offset, self._end]
        assert self._offset <= start <= self._end
        assert self._offset <= end <= self._end
        return SubFile(self._file, start, length, name=name)

    def read(self, size=None):
        """Reads up to ``size`` bytes (or everything left in the window when
        ``size`` is None), never reading past the end of the window.
        """

        remaining = self._length - self._pos
        if size is None:
            size = remaining
        else:
            size = min(size, remaining)
        if size < 0:
            size = 0

        if size > 0:
            self._file.seek(self._offset + self._pos)
            data = self._file.read(size)
            # Advance by what was actually returned, so a short read from
            # the parent does not desynchronize the position.
            self._pos += len(data)
            return data
        else:
            return emptybytes

    def readline(self):
        """Reads one line from the parent, truncated at the window's end."""

        maxsize = self._length - self._pos
        self._file.seek(self._offset + self._pos)
        data = self._file.readline()
        if len(data) > maxsize:
            data = data[:maxsize]
        self._pos += len(data)
        return data

    def seek(self, where, whence=0):
        """Moves the read position.

        :param where: the offset to move to or by.
        :param whence: 0 = from the start of the window, 1 = relative to
            the current position, 2 = relative to the end of the window
            (``where`` is normally zero or negative, as with regular file
            objects).
        :raises ValueError: if ``whence`` is not 0, 1 or 2.
        """

        if whence == 0:  # Absolute
            pos = where
        elif whence == 1:  # Relative
            pos = self._pos + where
        elif whence == 2:  # From end
            pos = self._length + where
        else:
            raise ValueError

        self._pos = pos

    def tell(self):
        """Returns the current position relative to the window start."""

        return self._pos
|
||||
|
||||
|
||||
class CompoundWriter(object):
    """Collects the output of several logical files in one temporary file
    (plus small in-memory buffers), then writes them out either as a single
    compound file or as individual files.
    """

    def __init__(self, tempstorage, buffersize=32 * 1024):
        """
        :param tempstorage: storage object used to create (and later
            delete) the temporary file.
        :param buffersize: number of bytes a sub-stream may accumulate in
            memory before its data is written to the temporary file.
        """

        assert isinstance(buffersize, int)
        self._tempstorage = tempstorage
        self._tempname = "%s.ctmp" % random_name()
        self._temp = tempstorage.create_file(self._tempname, mode="w+b")
        self._buffersize = buffersize
        self._streams = {}

    def create_file(self, name):
        """Returns a new writable file registered under ``name``."""

        ss = self.SubStream(self._temp, self._buffersize)
        self._streams[name] = ss
        return StructFile(ss)

    def _readback(self):
        """Yields ``(name, blocks)`` pairs, where ``blocks()`` returns an
        iterator over the bytes written to that file, in order.

        The ``blocks`` callables read from the temporary file, so each must
        be consumed before advancing this generator: once it finishes (or
        is closed) the temporary file is closed and deleted.
        """

        temp = self._temp
        try:
            for name, substream in self._streams.items():
                substream.close()

                # Bind the current substream as a default argument so the
                # callable does not depend on the loop variable's later
                # values.
                def gen(substream=substream):
                    for f, offset, length in substream.blocks:
                        # A block with no file of its own lives in the
                        # shared temporary file.
                        if f is None:
                            f = temp
                        f.seek(offset)
                        yield f.read(length)

                yield (name, gen)
        finally:
            # Clean up the temporary file even if the consumer stops early
            temp.close()
            self._tempstorage.delete_file(self._tempname)

    def save_as_compound(self, dbfile):
        """Writes all collected files into ``dbfile`` as one compound file
        and closes it (via ``CompoundStorage.write_dir``).

        The directory entries written here carry ``"offset"`` and
        ``"length"`` only.
        """

        basepos = dbfile.tell()
        dbfile.write_long(0)  # Directory offset
        dbfile.write_int(0)  # Directory length

        directory = {}
        for name, blocks in self._readback():
            filestart = dbfile.tell()
            for block in blocks():
                dbfile.write(block)
            directory[name] = {"offset": filestart,
                               "length": dbfile.tell() - filestart}

        CompoundStorage.write_dir(dbfile, basepos, directory)

    def save_as_files(self, storage, name_fn):
        """Writes each collected file as a separate file in ``storage``.

        :param storage: the storage object to create the files in.
        :param name_fn: called with each registered name; returns the name
            to use in ``storage``.
        """

        for name, blocks in self._readback():
            f = storage.create_file(name_fn(name))
            try:
                for block in blocks():
                    f.write(block)
            finally:
                f.close()

    class SubStream(object):
        """Write-only stream that buffers small writes in memory and spills
        to a shared file once the buffer would reach ``buffersize``.

        ``blocks`` is a list of ``(file, offset, length)`` tuples in write
        order; ``file`` is None for data stored in the shared file.
        """

        def __init__(self, dbfile, buffersize):
            self._dbfile = dbfile
            self._buffersize = buffersize
            self._buffer = BytesIO()
            self._closed = False
            self.blocks = []

        def tell(self):
            """Returns the total number of bytes written to this stream."""

            total = sum(b[2] for b in self.blocks)
            if not self._closed:
                # After close() the buffered tail is already in ``blocks``
                total += self._buffer.tell()
            return total

        def write(self, inbytes):
            """Appends ``inbytes`` to the stream."""

            bio = self._buffer
            buflen = bio.tell()
            length = buflen + len(inbytes)
            if length >= self._buffersize:
                offset = self._dbfile.tell()
                # The buffer is rewound rather than truncated, so only its
                # first ``buflen`` bytes are current data.
                self._dbfile.write(bio.getvalue()[:buflen])
                self._dbfile.write(inbytes)

                self.blocks.append((None, offset, length))
                self._buffer.seek(0)
            else:
                bio.write(inbytes)

        def close(self):
            """Records any still-buffered bytes as the final block.

            Calling this more than once is harmless; only the first call
            records the block, so the tail is never duplicated.
            """

            if self._closed:
                return
            self._closed = True

            bio = self._buffer
            length = bio.tell()
            if length:
                self.blocks.append((bio, 0, length))
|
||||
662
venv/Lib/site-packages/whoosh/filedb/filestore.py
Normal file
662
venv/Lib/site-packages/whoosh/filedb/filestore.py
Normal file
@@ -0,0 +1,662 @@
|
||||
# Copyright 2009 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
from __future__ import with_statement
|
||||
import errno, os, sys, tempfile
|
||||
from threading import Lock
|
||||
|
||||
from whoosh.compat import BytesIO, memoryview_
|
||||
from whoosh.filedb.structfile import BufferFile, StructFile
|
||||
from whoosh.index import _DEF_INDEX_NAME, EmptyIndexError
|
||||
from whoosh.util import random_name
|
||||
from whoosh.util.filelock import FileLock
|
||||
|
||||
|
||||
# Exceptions
|
||||
|
||||
class StorageError(Exception):
    """Base class for errors raised by storage objects."""


class ReadOnlyError(StorageError):
    """Raised when a modifying operation is attempted on a storage object
    whose ``readonly`` flag is set.
    """
|
||||
|
||||
|
||||
# Base class
|
||||
|
||||
class Storage(object):
    """Abstract base class for storage objects.

    A storage object is a virtual flat filesystem, allowing the creation and
    retrieval of file-like objects
    (:class:`~whoosh.filedb.structfile.StructFile` objects). The default
    implementation (:class:`FileStorage`) uses actual files in a directory.

    All access to files in Whoosh goes through this object. This allows more
    different forms of storage (for example, in RAM, in a database, in a
    single file) to be used transparently.

    For example, to create a :class:`FileStorage` object::

        # Create a storage object
        st = FileStorage("indexdir")
        # Create the directory if it doesn't already exist
        st.create()

    The :meth:`Storage.create` method makes it slightly easier to swap
    storage implementations. The ``create()`` method handles set-up of the
    storage object. For example, ``FileStorage.create()`` creates the
    directory. A database implementation might create tables. This is
    designed to let you avoid putting implementation-specific setup code in
    your application.

    Storage objects can be used as context managers: entering the ``with``
    block calls :meth:`create` and leaving it calls :meth:`close`.
    """

    readonly = False
    supports_mmap = False

    def __iter__(self):
        return iter(self.list())

    def __enter__(self):
        self.create()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def create(self):
        """Creates any required implementation-specific resources. For
        example, a filesystem-based implementation might create a directory,
        while a database implementation might create tables. For example::

            from whoosh.filedb.filestore import FileStorage
            # Create a storage object
            st = FileStorage("indexdir")
            # Create any necessary resources
            st.create()

        This method returns ``self`` so you can also say::

            st = FileStorage("indexdir").create()

        Storage implementations should be written so that calling create() a
        second time on the same storage is harmless (for example,
        ``FileStorage.create()`` ignores a directory that already exists).

        :return: a :class:`Storage` instance.
        """

        return self

    def destroy(self, *args, **kwargs):
        """Removes any implementation-specific resources related to this
        storage object. For example, a filesystem-based implementation might
        delete a directory, and a database implementation might drop tables.

        The arguments are implementation-specific.
        """

        pass

    def create_index(self, schema, indexname=_DEF_INDEX_NAME, indexclass=None):
        """Creates a new index in this storage.

        >>> from whoosh import fields
        >>> from whoosh.filedb.filestore import FileStorage
        >>> schema = fields.Schema(content=fields.TEXT)
        >>> # Create the storage directory
        >>> st = FileStorage("indexdir").create()
        >>> # Create an index in the storage
        >>> ix = st.create_index(schema)

        :param schema: the :class:`whoosh.fields.Schema` object to use for
            the new index.
        :param indexname: the name of the index within the storage object.
            You can use this option to store multiple indexes in the same
            storage.
        :param indexclass: an optional custom ``Index`` sub-class to use to
            create the index files. The default is
            :class:`whoosh.index.FileIndex`. This method will call the
            ``create`` class method on the given class to create the index.
        :return: a :class:`whoosh.index.Index` instance.
        :raises ReadOnlyError: if this storage is read-only.
        """

        if self.readonly:
            raise ReadOnlyError
        if indexclass is None:
            import whoosh.index
            indexclass = whoosh.index.FileIndex
        return indexclass.create(self, schema, indexname)

    def open_index(self, indexname=_DEF_INDEX_NAME, schema=None, indexclass=None):
        """Opens an existing index (created using :meth:`create_index`) in
        this storage.

        >>> from whoosh.filedb.filestore import FileStorage
        >>> st = FileStorage("indexdir")
        >>> # Open an index in the storage
        >>> ix = st.open_index()

        :param indexname: the name of the index within the storage object.
            You can use this option to store multiple indexes in the same
            storage.
        :param schema: if you pass in a :class:`whoosh.fields.Schema` object
            using this argument, it will override the schema that was stored
            with the index.
        :param indexclass: an optional custom ``Index`` sub-class to use to
            open the index files. The default is
            :class:`whoosh.index.FileIndex`. This method will instantiate
            the class with this storage object.
        :return: a :class:`whoosh.index.Index` instance.
        """

        if indexclass is None:
            import whoosh.index
            indexclass = whoosh.index.FileIndex
        return indexclass(self, schema=schema, indexname=indexname)

    def index_exists(self, indexname=None):
        """Returns True if a non-empty index exists in this storage.

        :param indexname: the name of the index within the storage object.
            You can use this option to store multiple indexes in the same
            storage.
        :rtype: bool
        """

        if indexname is None:
            indexname = _DEF_INDEX_NAME
        try:
            ix = self.open_index(indexname)
            try:
                gen = ix.latest_generation()
            finally:
                # Always release the index, even if reading the generation
                # fails.
                ix.close()
            return gen > -1
        except EmptyIndexError:
            pass
        return False

    def create_file(self, name):
        """Creates a file with the given name in this storage.

        :param name: the name for the new file.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        """

        raise NotImplementedError

    def open_file(self, name, *args, **kwargs):
        """Opens a file with the given name in this storage.

        :param name: the name of the file to open.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        """

        raise NotImplementedError

    def list(self):
        """Returns a list of file names in this storage.

        :return: a list of strings
        """

        raise NotImplementedError

    def file_exists(self, name):
        """Returns True if the given file exists in this storage.

        :param name: the name to check.
        :rtype: bool
        """

        raise NotImplementedError

    def file_modified(self, name):
        """Returns the last-modified time of the given file in this storage
        (as a "ctime" UNIX timestamp).

        :param name: the name to check.
        :return: a "ctime" number.
        """

        raise NotImplementedError

    def file_length(self, name):
        """Returns the size (in bytes) of the given file in this storage.

        :param name: the name to check.
        :rtype: int
        """

        raise NotImplementedError

    def delete_file(self, name):
        """Removes the given file from this storage.

        :param name: the name to delete.
        """

        raise NotImplementedError

    def rename_file(self, frm, to, safe=False):
        """Renames a file in this storage.

        :param frm: The current name of the file.
        :param to: The new name for the file.
        :param safe: if True, raise an exception if a file with the new name
            already exists.
        """

        raise NotImplementedError

    def lock(self, name):
        """Return a named lock object (implementing ``.acquire()`` and
        ``.release()`` methods). Different storage implementations may use
        different lock types with different guarantees. For example, the
        RamStorage object uses Python thread locks, while the FileStorage
        object uses filesystem-based locks that are valid across different
        processes.

        :param name: a name for the lock.
        :return: a lock-like object.
        """

        raise NotImplementedError

    def close(self):
        """Closes any resources opened by this storage object. For some
        storage implementations this will be a no-op, but for others it is
        necessary to release locks and/or prevent leaks, so it's a good idea
        to call it when you're done with a storage object.
        """

        pass

    def optimize(self):
        """Optimizes the storage object. The meaning and cost of
        "optimizing" will vary by implementation. For example, a database
        implementation might run a garbage collection procedure on the
        underlying database.
        """

        pass

    def temp_storage(self, name=None):
        """Creates a new storage object for temporary files. You can call
        :meth:`Storage.destroy` on the new storage when you're finished with
        it.

        :param name: a name for the new storage. This may be optional or
            required depending on the storage implementation.
        :rtype: :class:`Storage`
        """

        raise NotImplementedError
|
||||
|
||||
|
||||
class OverlayStorage(Storage):
    """Overlays two storage objects. Reads are processed from the first if
    it has the named file, otherwise the second. Writes always go to the
    second.
    """

    def __init__(self, a, b):
        """
        :param a: the storage consulted first for reads.
        :param b: the fallback storage for reads; receives all writes.
        """

        self.a = a
        self.b = b

    def create_index(self, *args, **kwargs):
        """Creates an index in the second storage and returns it."""

        return self.b.create_index(*args, **kwargs)

    def open_index(self, *args, **kwargs):
        """Opens an index from the first storage and returns it."""

        return self.a.open_index(*args, **kwargs)

    def create_file(self, *args, **kwargs):
        """Creates a file in the second storage."""

        return self.b.create_file(*args, **kwargs)

    def open_file(self, name, *args, **kwargs):
        """Opens ``name`` from the first storage if it exists there,
        otherwise from the second.
        """

        if self.a.file_exists(name):
            return self.a.open_file(name, *args, **kwargs)
        else:
            return self.b.open_file(name, *args, **kwargs)

    def list(self):
        """Returns the file names of both storages, without duplicates (in
        no particular order).
        """

        return list(set(self.a.list()) | set(self.b.list()))

    def file_exists(self, name):
        """Returns True if either storage has the named file."""

        return self.a.file_exists(name) or self.b.file_exists(name)

    def file_modified(self, name):
        """Returns the modified time from whichever storage serves reads
        for ``name``.
        """

        if self.a.file_exists(name):
            return self.a.file_modified(name)
        else:
            return self.b.file_modified(name)

    def file_length(self, name):
        """Returns the length from whichever storage serves reads for
        ``name``.
        """

        if self.a.file_exists(name):
            return self.a.file_length(name)
        else:
            return self.b.file_length(name)

    def delete_file(self, name):
        """Deletes ``name`` from the second storage only."""

        return self.b.delete_file(name)

    def rename_file(self, *args, **kwargs):
        """Renaming is not supported on an overlay.

        :raises NotImplementedError: always.
        """

        raise NotImplementedError

    def lock(self, name):
        """Returns a named lock from the second storage."""

        return self.b.lock(name)

    def close(self):
        """Closes both storages."""

        self.a.close()
        self.b.close()

    def optimize(self):
        """Optimizes both storages."""

        self.a.optimize()
        self.b.optimize()

    def temp_storage(self, name=None):
        """Creates a temporary storage via the second storage."""

        return self.b.temp_storage(name=name)
|
||||
|
||||
|
||||
class FileStorage(Storage):
    """Storage object that stores the index as files in a directory on disk.

    Prior to version 3, the initializer would raise an IOError if the
    directory did not exist. As of version 3, the object does not check if
    the directory exists at initialization. This change is to support using
    the :meth:`FileStorage.create` method.
    """

    supports_mmap = True

    def __init__(self, path, supports_mmap=True, readonly=False, debug=False):
        """
        :param path: a path to a directory.
        :param supports_mmap: if True (the default), use the ``mmap`` module
            to open memory mapped files. You can open the storage object
            with ``supports_mmap=False`` to force Whoosh to open files
            normally instead of with ``mmap``.
        :param readonly: If ``True``, the object will raise an exception if
            you attempt to create, delete or rename a file.
        :param debug: stored on the object as ``_debug``.
        """

        self.folder = path
        self.supports_mmap = supports_mmap
        self.readonly = readonly
        self._debug = debug
        self.locks = {}

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.folder)

    def create(self):
        """Creates this storage object's directory path using
        ``os.makedirs`` if it doesn't already exist.

        >>> from whoosh.filedb.filestore import FileStorage
        >>> st = FileStorage("indexdir")
        >>> st.create()

        This method returns ``self``, so you can say::

            st = FileStorage("indexdir").create()

        Note that you can simply handle the creation of the directory
        yourself and open the storage object using the initializer::

            dirname = "indexdir"
            os.mkdir(dirname)
            st = FileStorage(dirname)

        However, using the ``create()`` method allows you to potentially
        swap in other storage implementations more easily.

        :return: a :class:`Storage` instance.
        :raises IOError: (with ``errno.ENOTDIR``) if the path exists but is
            not a directory.
        """

        dirpath = os.path.abspath(self.folder)
        # If the given directory does not already exist, try to create it
        try:
            os.makedirs(dirpath)
        except OSError:
            # This is necessary for compatibility between Py2 and Py3
            e = sys.exc_info()[1]
            # If we get an error because the path already exists, ignore it
            if e.errno != errno.EEXIST:
                raise

        # Raise an exception if the given path is not a directory
        if not os.path.isdir(dirpath):
            e = IOError("%r is not a directory" % dirpath)
            e.errno = errno.ENOTDIR
            raise e

        return self

    def destroy(self):
        """Removes any files in this storage object and then removes the
        storage object's directory. What happens if any of the files or the
        directory are in use depends on the underlying platform.

        :raises ReadOnlyError: if this storage is read-only.
        """

        # Remove all files
        self.clean()
        try:
            # Try to remove the directory
            os.rmdir(self.folder)
        except (IOError, OSError):
            # os.rmdir raises OSError, which is distinct from IOError on
            # Python 2, so both are caught.
            e = sys.exc_info()[1]
            # A directory that is already gone is not an error
            if e.errno != errno.ENOENT:
                raise

    def create_file(self, name, excl=False, mode="wb", **kwargs):
        """Creates a file with the given name in this storage.

        :param name: the name for the new file.
        :param excl: if True, try to open the file in "exclusive" mode.
        :param mode: the mode flags with which to open the file. The default
            is ``"wb"``.
        :param kwargs: additional keyword arguments are passed through to
            the :class:`~whoosh.filedb.structfile.StructFile` initializer.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        :raises ReadOnlyError: if this storage is read-only.
        """

        if self.readonly:
            raise ReadOnlyError

        path = self._fpath(name)
        if excl:
            flags = os.O_CREAT | os.O_EXCL | os.O_RDWR
            if hasattr(os, "O_BINARY"):
                flags |= os.O_BINARY
            fd = os.open(path, flags)
            try:
                fileobj = os.fdopen(fd, mode)
            except Exception:
                # Don't leak the raw descriptor if wrapping it fails
                os.close(fd)
                raise
        else:
            fileobj = open(path, mode)

        f = StructFile(fileobj, name=name, **kwargs)
        return f

    def open_file(self, name, **kwargs):
        """Opens an existing file in this storage.

        :param name: the name of the file to open.
        :param kwargs: additional keyword arguments are passed through to
            the :class:`~whoosh.filedb.structfile.StructFile` initializer.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        """

        f = StructFile(open(self._fpath(name), "rb"), name=name, **kwargs)
        return f

    def _fpath(self, fname):
        """Returns the absolute path of ``fname`` inside this storage's
        directory.
        """

        return os.path.abspath(os.path.join(self.folder, fname))

    def clean(self, ignore=False):
        """Removes every file listed in this storage's directory.

        :param ignore: if True, files that cannot be removed are skipped
            instead of raising.
        :raises ReadOnlyError: if this storage is read-only.
        """

        if self.readonly:
            raise ReadOnlyError

        path = self.folder
        files = self.list()
        for fname in files:
            try:
                os.remove(os.path.join(path, fname))
            except OSError:
                if not ignore:
                    raise

    def list(self):
        """Returns the names in this storage's directory, or an empty list
        if the directory cannot be listed.
        """

        try:
            files = os.listdir(self.folder)
        except (IOError, OSError):
            # os.listdir raises OSError, which is distinct from IOError on
            # Python 2, so both are caught.
            files = []

        return files

    def file_exists(self, name):
        return os.path.exists(self._fpath(name))

    def file_modified(self, name):
        return os.path.getmtime(self._fpath(name))

    def file_length(self, name):
        return os.path.getsize(self._fpath(name))

    def delete_file(self, name):
        """Removes the named file.

        :raises ReadOnlyError: if this storage is read-only.
        """

        if self.readonly:
            raise ReadOnlyError

        os.remove(self._fpath(name))

    def rename_file(self, oldname, newname, safe=False):
        """Renames a file, replacing any existing file called ``newname``
        unless ``safe`` is True.

        :raises ReadOnlyError: if this storage is read-only.
        :raises NameError: if ``safe`` is True and ``newname`` exists.
        """

        if self.readonly:
            raise ReadOnlyError

        if os.path.exists(self._fpath(newname)):
            if safe:
                raise NameError("File %r exists" % newname)
            else:
                os.remove(self._fpath(newname))
        os.rename(self._fpath(oldname), self._fpath(newname))

    def lock(self, name):
        """Returns a ``FileLock`` on the named path inside this storage."""

        return FileLock(self._fpath(name))

    def temp_storage(self, name=None):
        """Creates and returns a new :class:`FileStorage` in a
        sub-directory of this storage's directory.

        :param name: the sub-directory name; a random ``*.tmp`` name is used
            if not given.
        """

        name = name or "%s.tmp" % random_name()
        path = os.path.join(self.folder, name)
        tempstore = FileStorage(path)
        return tempstore.create()
|
||||
|
||||
|
||||
class RamStorage(Storage):
    """Storage implementation that holds every file's contents in an
    in-memory dictionary instead of on disk.
    """

    supports_mmap = False

    def __init__(self):
        # Maps file name -> contents captured when the file was closed
        self.files = {}
        # Maps lock name -> threading.Lock
        self.locks = {}
        self.folder = ''

    def destroy(self):
        """Drops the file and lock tables from this object."""

        del self.files
        del self.locks

    def list(self):
        """Returns the names of the stored files."""

        return list(self.files)

    def clean(self):
        """Forgets all stored files."""

        self.files = {}

    def total_size(self):
        """Returns the combined length of all stored files."""

        total = 0
        for fname in self.list():
            total += self.file_length(fname)
        return total

    def file_exists(self, name):
        return name in self.files

    def file_length(self, name):
        """Returns the length of the named file's contents.

        :raises NameError: if there is no such file.
        """

        if name in self.files:
            return len(self.files[name])
        raise NameError(name)

    def file_modified(self, name):
        # Modification times are not tracked for in-memory files
        return -1

    def delete_file(self, name):
        """Removes the named file.

        :raises NameError: if there is no such file.
        """

        if name not in self.files:
            raise NameError(name)
        del self.files[name]

    def rename_file(self, name, newname, safe=False):
        """Moves the contents stored under ``name`` to ``newname``.

        :raises NameError: if ``name`` does not exist, or if ``safe`` is
            True and ``newname`` already exists.
        """

        if name not in self.files:
            raise NameError(name)
        if safe and newname in self.files:
            raise NameError("File %r exists" % newname)

        self.files[newname] = self.files.pop(name)

    def create_file(self, name, **kwargs):
        """Returns a new in-memory file; its contents are stored under
        ``name`` when the file is closed. Extra keyword arguments are
        ignored.
        """

        def store_contents(structfile):
            self.files[name] = structfile.file.getvalue()

        return StructFile(BytesIO(), name=name, onclose=store_contents)

    def open_file(self, name, **kwargs):
        """Returns a read-only buffer file over the stored contents.

        :raises NameError: if there is no such file.
        """

        if name not in self.files:
            raise NameError(name)
        view = memoryview_(self.files[name])
        return BufferFile(view, name=name, **kwargs)

    def lock(self, name):
        """Returns the thread lock for ``name``, creating it on first use."""

        if name in self.locks:
            return self.locks[name]
        newlock = Lock()
        self.locks[name] = newlock
        return newlock

    def temp_storage(self, name=None):
        """Creates and returns a :class:`FileStorage` in the system
        temporary directory.

        :param name: the directory name; a random ``*.tmp`` name is used if
            not given.
        """

        tdir = tempfile.gettempdir()
        name = name or "%s.tmp" % random_name()
        return FileStorage(os.path.join(tdir, name)).create()
|
||||
|
||||
|
||||
def copy_storage(sourcestore, deststore):
    """Copy every file listed by ``sourcestore`` into ``deststore``.

    Each file is opened in the source, created under the same name in the
    destination, and streamed across with ``shutil.copyfileobj``.
    """
    from shutil import copyfileobj

    for filename in sourcestore.list():
        with sourcestore.open_file(filename) as infile, \
                deststore.create_file(filename) as outfile:
            copyfileobj(infile, outfile)
|
||||
|
||||
|
||||
def copy_to_ram(storage):
    """Return a new in-memory storage holding a copy of ``storage``'s files.

    :param storage: the storage object whose files are copied.
    :rtype: :class:`RamStorage`
    """
    ram_copy = RamStorage()
    copy_storage(storage, ram_copy)
    return ram_copy
|
||||
735
venv/Lib/site-packages/whoosh/filedb/filetables.py
Normal file
735
venv/Lib/site-packages/whoosh/filedb/filetables.py
Normal file
@@ -0,0 +1,735 @@
|
||||
# Copyright 2009 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
"""This module defines writer and reader classes for a fast, immutable
|
||||
on-disk key-value database format. The current format is based heavily on
|
||||
D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
|
||||
"""
|
||||
|
||||
import os, struct
|
||||
from binascii import crc32
|
||||
from bisect import bisect_left
|
||||
from hashlib import md5 # @UnresolvedImport
|
||||
|
||||
from whoosh.compat import b, bytes_type
|
||||
from whoosh.compat import xrange
|
||||
from whoosh.util.numlists import GrowableArray
|
||||
from whoosh.system import _INT_SIZE, emptybytes
|
||||
|
||||
|
||||
# Exceptions
|
||||
|
||||
class FileFormatError(Exception):
    """Raised when a file's format tag is not the one the reader expects."""
|
||||
|
||||
|
||||
# Hash functions
|
||||
|
||||
def cdb_hash(key):
    """Return the 32-bit CDB-style hash of ``key``.

    Starting from 5381, each element updates the hash as
    ``((h * 33) mod 2**32) XOR element``.

    :param key: the key to hash. Iterating a bytes object yields ints on
        Python 3 and one-character strings on Python 2, so both element
        kinds are accepted; text strings keep working through ``ord``.
    :returns: the hash value as an int.
    """
    h = 5381
    for c in key:
        # ord() rejects ints, which is what a Python 3 bytes key yields
        code = c if isinstance(c, int) else ord(c)
        h = ((h + (h << 5)) & 0xffffffff) ^ code
    return h
|
||||
|
||||
|
||||
def md5_hash(key):
    """Return the low 32 bits of the MD5 digest of ``key`` as an int."""
    # The last four digest bytes, read big-endian, are the low 32 bits of the
    # digest interpreted as one large hexadecimal number.
    tail = md5(key).digest()[-4:]
    return struct.unpack("!I", tail)[0]
|
||||
|
||||
|
||||
def crc_hash(key):
    """Return the CRC32 checksum of ``key`` masked to 32 bits."""
    checksum = crc32(key)
    return checksum & 0xffffffff
|
||||
|
||||
|
||||
# Hash functions selectable by index: HashWriter and HashReader look up their
# hash function as ``_hash_functions[hashtype]`` (0 = MD5, 1 = CRC32, 2 = CDB).
_hash_functions = (md5_hash, crc_hash, cdb_hash)
|
||||
|
||||
|
||||
# Structs
|
||||
|
||||
# Two signed 32-bit ints before the key/value pair giving the length of the key
# and value
_lengths = struct.Struct("!ii")
# A pointer in a hash table, giving the hash value and the key position
_pointer = struct.Struct("!Iq")
# A pointer in the hash table directory, giving the position and number of slots
_dir_entry = struct.Struct("!qi")

# Size in bytes of the whole directory: one entry for each of the 256 hash tables
_directory_size = 256 * _dir_entry.size
|
||||
|
||||
|
||||
# Basic hash file
|
||||
|
||||
class HashWriter(object):
    """Writer for a fast, immutable on-disk key/value hash file.

    A key is hashed and the low eight bits of the hash pick one of 256 hash
    tables (basically the CDB algorithm). Unlike CDB, everything is written
    serially with no seeking backwards: first the key/value records, then the
    hash tables, then a directory of the tables, and finally the pickled
    ``extras`` followed by their length.

    Also unlike CDB, file pointers are 64-bit, so the file length is
    essentially unlimited. Each key and each value must still be shorter than
    2 GB.
    """

    def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to write to.
        :param magic: the format tag bytes to write at the start of the file.
        :param hashtype: an integer indicating which hashing algorithm to use.
            Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
        """

        self.dbfile = dbfile
        self.hashtype = hashtype
        self.hashfn = _hash_functions[hashtype]
        # Subclasses may stash additional metadata here; it is pickled on close
        self.extras = {}

        self.startoffset = dbfile.tell()
        # Header: format tag, hash type byte, then two ints reserved for
        # future expansion
        dbfile.write(magic)
        dbfile.write_byte(hashtype)
        dbfile.write_int(0)
        dbfile.write_int(0)

        # One list of (hash, record position) pairs per low-byte value
        self.buckets = [[] for _ in xrange(256)]
        # Filled in by _write_hashes() with (table position, slot count)
        self.directory = []

    def tell(self):
        """Return the current position in the underlying file."""
        return self.dbfile.tell()

    def add(self, key, value):
        """Appends a key/value record to the file. Keys DO NOT have to be
        unique: several values may be stored under one key and read back with
        :meth:`HashReader.all`.
        """

        assert isinstance(key, bytes_type)
        assert isinstance(value, bytes_type)

        out = self.dbfile
        record_pos = out.tell()
        # Record layout: the two lengths, then the key bytes, then the value
        out.write(_lengths.pack(len(key), len(value)))
        out.write(key)
        out.write(value)

        # File the record under the bucket chosen by the hash's low byte
        keyhash = self.hashfn(key)
        self.buckets[keyhash & 255].append((keyhash, record_pos))

    def add_all(self, items):
        """Convenience method that calls :meth:`HashWriter.add` for each
        ``(key, value)`` pair in ``items``.
        """

        for key, value in items:
            self.add(key, value)

    def _write_hashes(self):
        # Writes the 256 hash tables of pointers to the key/value records

        out = self.dbfile
        # (0, 0) marks an empty slot; no record can start at position 0
        # because the header is there
        empty = (0, 0)

        for bucket in self.buckets:
            # Record where this table starts and how many slots it has
            table_pos = out.tell()
            slot_count = 2 * len(bucket)
            self.directory.append((table_pos, slot_count))

            # Place each entry by open addressing with linear probing
            table = [empty] * slot_count
            for keyhash, record_pos in bucket:
                slot = (keyhash >> 8) % slot_count
                while table[slot] != empty:
                    slot = (slot + 1) % slot_count
                table[slot] = (keyhash, record_pos)

            # Serialize this bucket's table
            for keyhash, record_pos in table:
                out.write(_pointer.pack(keyhash, record_pos))

    def _write_directory(self):
        # Writes the directory of (position, slot count) for the hash tables

        out = self.dbfile
        for table_pos, slot_count in self.directory:
            out.write(_dir_entry.pack(table_pos, slot_count))

    def _write_extras(self):
        # Writes the pickled extras dictionary
        self.dbfile.write_pickle(self.extras)

    def close(self):
        """Write the hash tables, directory and extras, close the file, and
        return the file position reached just before closing.
        """
        out = self.dbfile

        self._write_hashes()
        self._write_directory()

        # The extras go last, followed by their own length so that a reader
        # can locate them starting from the end of the file
        extras_start = out.tell()
        self._write_extras()
        extras_length = out.tell() - extras_start
        out.write_int(extras_length)

        end = out.tell()
        out.close()
        return end
|
||||
|
||||
|
||||
class HashReader(object):
    """Reader for the on-disk key/value hash files produced by
    :class:`HashWriter`.
    """

    def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to read from.
        :param length: the length of the file data. This is needed because
            the hashing information is written at the end of the file. When
            omitted it is measured by seeking to the end of ``dbfile``.
        :param magic: the format tag bytes expected at the start of the file.
            If the file's tag differs, :class:`FileFormatError` is raised.
        :param startoffset: the starting point of the file data.
        """

        self.dbfile = dbfile
        self.startoffset = startoffset
        self.is_closed = False

        if length is None:
            dbfile.seek(0, os.SEEK_END)
            length = dbfile.tell() - startoffset

        # Header: format tag, hash type byte, two reserved ints
        dbfile.seek(startoffset)
        tag = dbfile.read(4)
        if tag != magic:
            raise FileFormatError("Unknown file header %r" % tag)
        self.hashtype = dbfile.read_byte()
        self.hashfn = _hash_functions[self.hashtype]
        dbfile.read_int()
        dbfile.read_int()
        self.startofdata = dbfile.tell()

        # The final int of the data is the length of the extras, which sit
        # immediately before it
        length_pos = startoffset + length - _INT_SIZE
        extras_pos = length_pos - dbfile.get_int(length_pos)
        dbfile.seek(extras_pos)
        self._read_extras()

        # The directory of hash tables sits immediately before the extras;
        # each entry is a (position, numslots) pair
        dbfile.seek(extras_pos - _directory_size)
        entry_size = _dir_entry.size
        unpack_entry = _dir_entry.unpack
        self.tables = [unpack_entry(dbfile.read(entry_size))
                       for _ in xrange(256)]
        # The key/value records end where the first hash table begins
        self.endofdata = self.tables[0][0]

    @classmethod
    def open(cls, storage, name):
        """Convenience constructor: open ``name`` from a
        :class:`whoosh.filedb.filestore.Storage` object, passing the file's
        length to the initializer.
        """

        file_length = storage.file_length(name)
        return cls(storage.open_file(name), file_length)

    def file(self):
        """Return the underlying file object."""
        return self.dbfile

    def _read_extras(self):
        # Loads the pickled extras; an EOFError leaves an empty dictionary
        try:
            self.extras = self.dbfile.read_pickle()
        except EOFError:
            self.extras = {}

    def close(self):
        """Close the underlying file. Closing twice raises ``Exception``."""
        if self.is_closed:
            raise Exception("Tried to close %r twice" % self)
        self.dbfile.close()
        self.is_closed = True

    def key_at(self, pos):
        # Returns the key bytes of the record starting at the given position

        source = self.dbfile
        key_length = source.get_uint(pos)
        return source.get(pos + _lengths.size, key_length)

    def key_and_range_at(self, pos):
        # Returns a (keybytes, datapos, datalen) tuple for the record at the
        # given position, or None at or beyond the end of the records
        source = self.dbfile
        header_size = _lengths.size

        if pos >= self.endofdata:
            return None

        keylen, datalen = _lengths.unpack(source.get(pos, header_size))
        keystart = pos + header_size
        keybytes = source.get(keystart, keylen)
        return keybytes, keystart + keylen, datalen

    def _ranges(self, pos=None, eod=None):
        # Yields a series of (keypos, keylength, datapos, datalength) tuples
        # for the records between pos and eod (a falsy value selects the
        # start/end of the record area)
        get = self.dbfile.get
        pos = pos or self.startofdata
        eod = eod or self.endofdata
        header_size = _lengths.size
        unpack_header = _lengths.unpack

        while pos < eod:
            keylen, datalen = unpack_header(get(pos, header_size))
            keypos = pos + header_size
            datapos = keypos + keylen
            yield (keypos, keylen, datapos, datalen)
            # The next record begins right after this record's value
            pos = datapos + datalen

    def __getitem__(self, key):
        missing = object()
        found = next(self.all(key), missing)
        if found is missing:
            raise KeyError(key)
        return found

    def __iter__(self):
        get = self.dbfile.get
        for keypos, keylen, datapos, datalen in self._ranges():
            yield (get(keypos, keylen), get(datapos, datalen))

    def __contains__(self, key):
        # ranges_for_key only ever yields tuples, so None means "no match"
        return next(self.ranges_for_key(key), None) is not None

    def keys(self):
        """Yield every key in file order."""
        get = self.dbfile.get
        for keypos, keylen, _, _ in self._ranges():
            yield get(keypos, keylen)

    def values(self):
        """Yield every value in file order."""
        get = self.dbfile.get
        for _, _, datapos, datalen in self._ranges():
            yield get(datapos, datalen)

    def items(self):
        """Yield every ``(key, value)`` pair in file order."""
        get = self.dbfile.get
        for keypos, keylen, datapos, datalen in self._ranges():
            yield (get(keypos, keylen), get(datapos, datalen))

    def get(self, key, default=None):
        """Return the first value stored under ``key``, or ``default`` when
        the key is absent.
        """
        return next(self.all(key), default)

    def all(self, key):
        """Yields each value stored under the given key.
        """

        get = self.dbfile.get
        for datapos, datalen in self.ranges_for_key(key):
            yield get(datapos, datalen)

    def ranges_for_key(self, key):
        """Yields a ``(datapos, datalength)`` tuple for each value stored
        under the given key.

        :raises TypeError: if ``key`` is not a bytes object.
        """

        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        get = self.dbfile.get

        # The low byte of the hash selects the table the key may be in
        keyhash = self.hashfn(key)
        tablestart, numslots = self.tables[keyhash & 255]
        # An empty table means the key cannot exist
        if not numslots:
            return

        ptrsize = _pointer.size
        unpack_pointer = _pointer.unpack
        header_size = _lengths.size
        unpack_header = _lengths.unpack

        # Probe from the key's home slot, wrapping around at the table's end
        slot = (keyhash >> 8) % numslots
        for _ in xrange(numslots):
            slothash, itempos = unpack_pointer(
                get(tablestart + slot * ptrsize, ptrsize))
            # An empty slot ends the probe sequence
            if not itempos:
                return

            # Equal hashes may still be a collision: compare the lengths
            # first, and read the stored key only when they agree
            if slothash == keyhash:
                keylen, datalen = unpack_header(get(itempos, header_size))
                if keylen == len(key):
                    keystart = itempos + header_size
                    if key == get(keystart, keylen):
                        yield (keystart + keylen, datalen)

            slot = (slot + 1) % numslots

    def range_for_key(self, key):
        """Return the first ``(datapos, datalength)`` tuple for ``key``.

        :raises KeyError: if the key is not in the file.
        """
        first = next(self.ranges_for_key(key), None)
        if first is None:
            raise KeyError(key)
        return first
|
||||
|
||||
|
||||
# Ordered hash file
|
||||
|
||||
class OrderedHashWriter(HashWriter):
    """On-disk hash writer that requires keys to be added in increasing
    order. An :class:`OrderedHashReader` can then look up the "nearest key"
    using that ordering.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # File position of every record, in the order the keys were added
        self.index = GrowableArray("H")
        # Most recently added key, used to enforce the ordering
        self.lastkey = emptybytes

    def add(self, key, value):
        """Add a key/value pair; ``key`` must be greater than the previous
        key.

        :raises ValueError: if ``key`` is not greater than the last key added.
        """
        previous = self.lastkey
        if key <= previous:
            raise ValueError("Keys must increase: %r..%r"
                             % (previous, key))
        # Note the record's position before the base class writes it
        self.index.append(self.dbfile.tell())
        HashWriter.add(self, key, value)
        self.lastkey = key

    def _write_extras(self):
        positions = self.index

        # Describe the position array in the pickled extras...
        self.extras["indextype"] = positions.typecode
        self.extras["indexlen"] = len(positions)
        HashWriter._write_extras(self)
        # ...and write the array itself directly after the pickle
        positions.to_file(self.dbfile)
|
||||
|
||||
|
||||
class OrderedHashReader(HashReader):
    """Reader for hash files that carry an ordered array of record
    positions, supporting "closest key" lookups by binary search.
    """

    def closest_key(self, key):
        """Returns the closest key equal to or greater than the given key, or
        None if the file holds no such key.
        """

        pos = self.closest_key_pos(key)
        return None if pos is None else self.key_at(pos)

    def ranges_from(self, key):
        """Yields ``(keypos, keylen, datapos, datalen)`` tuples for the
        ordered series of keys equal to or greater than the given key.
        """

        start = self.closest_key_pos(key)
        if start is not None:
            for entry in self._ranges(pos=start):
                yield entry

    def keys_from(self, key):
        """Yields, in order, the keys equal to or greater than the given key.
        """

        get = self.dbfile.get
        for keypos, keylen, _, _ in self.ranges_from(key):
            yield get(keypos, keylen)

    def items_from(self, key):
        """Yields, in order, ``(key, value)`` tuples for keys equal to or
        greater than the given key.
        """

        get = self.dbfile.get
        for keypos, keylen, datapos, datalen in self.ranges_from(key):
            yield (get(keypos, keylen), get(datapos, datalen))

    def _read_extras(self):
        dbfile = self.dbfile

        # Load the pickled extras first
        HashReader._read_extras(self)

        # The position array starts where the pickle ended
        indextype = self.extras["indextype"]
        self.indexbase = dbfile.tell()
        self.indexlen = self.extras["indexlen"]
        self.indexsize = struct.calcsize(indextype)

        # Choose the file method that reads one array item of this typecode
        reader_names = {
            "B": "get_byte",
            "H": "get_ushort",
            "i": "get_int",
            "I": "get_uint",
            "q": "get_long",
        }
        if indextype not in reader_names:
            raise Exception("Unknown index type %r" % indextype)
        self._get_pos = getattr(dbfile, reader_names[indextype])

    def closest_key_pos(self, key):
        # Given a key, return the position of that key OR the next highest key
        # if the given key does not exist; None if there is no such key
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)

        base = self.indexbase
        itemsize = self.indexsize
        count = self.indexlen
        key_at = self.key_at
        read_pos = self._get_pos

        # Binary search over the position array for the leftmost key that is
        # not less than the wanted key
        low, high = 0, count
        while low < high:
            middle = (low + high) // 2
            if key_at(read_pos(base + middle * itemsize)) < key:
                low = middle + 1
            else:
                high = middle

        # Every key in the file is smaller than the wanted key
        if low == count:
            return None
        return read_pos(base + low * itemsize)
|
||||
|
||||
|
||||
# Fielded Ordered hash file
|
||||
|
||||
class FieldedOrderedHashWriter(HashWriter):
    """On-disk hash writer that writes a separate position index for each
    field. Keys are added between :meth:`start_field` and :meth:`end_field`
    calls and must increase within a field.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # Maps field names to (startpos, indexpos, length, typecode); stored
        # in the extras so that it is pickled when the file is closed
        fieldmap = {}
        self.extras["fieldmap"] = fieldmap
        self.fieldmap = fieldmap

        # Most recently added key, used to enforce the ordering
        self.lastkey = emptybytes

    def start_field(self, fieldname):
        """Begin a new field at the current file position."""
        self.fieldstart = self.dbfile.tell()
        self.fieldname = fieldname
        # Record positions for this field, relative to the field's start
        self.poses = GrowableArray("H")
        # The ordering requirement restarts with each field
        self.lastkey = emptybytes

    def add(self, key, value):
        """Add a key/value pair to the current field.

        :raises ValueError: if ``key`` is not greater than the last key added
            to this field.
        """
        previous = self.lastkey
        if key <= previous:
            raise ValueError("Keys must increase: %r..%r"
                             % (previous, key))
        self.poses.append(self.dbfile.tell() - self.fieldstart)
        HashWriter.add(self, key, value)
        self.lastkey = key

    def end_field(self):
        """Finish the current field: register it in the field map and write
        its position array at the current file position.
        """
        out = self.dbfile
        positions = self.poses
        self.fieldmap[self.fieldname] = (self.fieldstart, out.tell(),
                                         len(positions), positions.typecode)
        positions.to_file(out)
|
||||
|
||||
|
||||
class FieldedOrderedHashReader(HashReader):
    """Reader for hash files whose extras contain a ``"fieldmap"`` mapping
    each field name to a ``(startpos, indexpos, indexlen, typecode)`` tuple,
    as written by :class:`FieldedOrderedHashWriter`.
    """

    def __init__(self, *args, **kwargs):
        HashReader.__init__(self, *args, **kwargs)
        self.fieldmap = self.extras["fieldmap"]
        # Make a sorted list of the field names with their start and end ranges
        self.fieldlist = []
        for fieldname in sorted(self.fieldmap.keys()):
            startpos, ixpos, ixlen, ixtype = self.fieldmap[fieldname]
            self.fieldlist.append((fieldname, startpos, ixpos))

    def field_start(self, fieldname):
        """Return the file position at which the given field starts."""
        return self.fieldmap[fieldname][0]

    def fielded_ranges(self, pos=None, eod=None):
        """Yield ``(fieldname, keypos, keylen, datapos, datalen)`` tuples for
        the records between ``pos`` and ``eod``.
        """
        flist = self.fieldlist
        fpos = 0
        fieldname, start, end = flist[fpos]
        for keypos, keylen, datapos, datalen in self._ranges(pos, eod):
            # NOTE(review): this advances at most one field per key, so it
            # appears to assume every field holds at least one key and that
            # fields were written in sorted-name order -- confirm.
            if keypos >= end:
                fpos += 1
                fieldname, start, end = flist[fpos]
            yield fieldname, keypos, keylen, datapos, datalen

    def iter_terms(self):
        """Yield ``(fieldname, keybytes)`` for every record."""
        get = self.dbfile.get
        for fieldname, keypos, keylen, _, _ in self.fielded_ranges():
            yield fieldname, get(keypos, keylen)

    def iter_term_items(self):
        """Yield ``(fieldname, keybytes, valuebytes)`` for every record."""
        get = self.dbfile.get
        for item in self.fielded_ranges():
            fieldname, keypos, keylen, datapos, datalen = item
            yield fieldname, get(keypos, keylen), get(datapos, datalen)

    def contains_term(self, fieldname, btext):
        """Report whether ``btext`` is stored in the given field."""
        try:
            self.range_for_term(fieldname, btext)
        except KeyError:
            return False
        return True

    def range_for_term(self, fieldname, btext):
        """Return ``(datapos, datalen)`` for ``btext`` in the given field.

        :raises KeyError: if the field is unknown or the term is not found
            between the field's start and its index.
        """
        start, ixpos, ixlen, code = self.fieldmap[fieldname]
        # The same key bytes may occur in several fields; keep only the match
        # whose data lies inside this field's span
        for datapos, datalen in self.ranges_for_key(btext):
            if start < datapos < ixpos:
                return datapos, datalen
        raise KeyError((fieldname, btext))

    def term_data(self, fieldname, btext):
        """Return the value bytes stored for ``btext`` in the given field.

        :raises KeyError: if the term is not found.
        """
        datapos, datalen = self.range_for_term(fieldname, btext)
        return self.dbfile.get(datapos, datalen)

    def term_get(self, fieldname, btext, default=None):
        """Like :meth:`term_data`, but return ``default`` when the term is
        not found.
        """
        try:
            return self.term_data(fieldname, btext)
        except KeyError:
            return default

    def closest_term_pos(self, fieldname, key):
        # Given a key, return the position of that key OR the next highest key
        # if the given key does not exist; None if there is no such key
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)

        dbfile = self.dbfile
        key_at = self.key_at
        startpos, ixpos, ixlen, ixtype = self.fieldmap[fieldname]

        if ixtype == "B":
            get_pos = dbfile.get_byte
        elif ixtype == "H":
            get_pos = dbfile.get_ushort
        elif ixtype == "i":
            get_pos = dbfile.get_int
        elif ixtype == "I":
            get_pos = dbfile.get_uint
        elif ixtype == "q":
            get_pos = dbfile.get_long
        else:
            raise Exception("Unknown index type %r" % ixtype)

        # The field map stores the NUMBER of index entries; the byte stride
        # between entries is the size of one item of the array's typecode
        # (the same computation OrderedHashReader uses for its index).
        itemsize = struct.calcsize(ixtype)

        # Do a binary search of the positions in the index array
        lo = 0
        hi = ixlen
        while lo < hi:
            mid = (lo + hi) // 2
            # Stored positions are relative to the start of the field
            midkey = key_at(startpos + get_pos(ixpos + mid * itemsize))
            if midkey < key:
                lo = mid + 1
            else:
                hi = mid

        # If we went off the end, return None
        if lo == ixlen:
            return None
        # Return the closest key
        return startpos + get_pos(ixpos + lo * itemsize)

    def closest_term(self, fieldname, btext):
        """Return the closest key equal to or greater than ``btext`` in the
        given field, or None if there is none.
        """
        pos = self.closest_term_pos(fieldname, btext)
        if pos is None:
            return None
        return self.key_at(pos)

    def term_ranges_from(self, fieldname, btext):
        """Yield ``(keypos, keylen, datapos, datalen)`` tuples for the keys
        of the given field equal to or greater than ``btext``.
        """
        pos = self.closest_term_pos(fieldname, btext)
        if pos is None:
            return

        startpos, ixpos, ixlen, ixtype = self.fieldmap[fieldname]
        # The field's records end where its index array begins
        for item in self._ranges(pos, ixpos):
            yield item

    def terms_from(self, fieldname, btext):
        """Yield the keys of the given field equal to or greater than
        ``btext``.
        """
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext):
            yield dbfile.get(keypos, keylen)

    def term_items_from(self, fieldname, btext):
        """Yield ``(key, value)`` tuples of the given field for keys equal to
        or greater than ``btext``.
        """
        dbfile = self.dbfile
        for item in self.term_ranges_from(fieldname, btext):
            keypos, keylen, datapos, datalen = item
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))
|
||||
|
||||
|
||||
|
||||
164
venv/Lib/site-packages/whoosh/filedb/gae.py
Normal file
164
venv/Lib/site-packages/whoosh/filedb/gae.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
This module contains EXPERIMENTAL support for storing a Whoosh index's files in
|
||||
the Google App Engine blobstore. This will use a lot of RAM since all files are
|
||||
loaded into RAM, but it is potentially useful as a workaround for the lack of file
|
||||
storage in Google App Engine.
|
||||
|
||||
Use at your own risk, but please report any problems to me so I can fix them.
|
||||
|
||||
To create a new index::
|
||||
|
||||
from whoosh.filedb.gae import DatastoreStorage
|
||||
|
||||
ix = DatastoreStorage().create_index(schema)
|
||||
|
||||
To open an existing index::
|
||||
|
||||
ix = DatastoreStorage().open_index()
|
||||
"""
|
||||
|
||||
import time
|
||||
|
||||
from google.appengine.api import memcache # @UnresolvedImport
|
||||
from google.appengine.ext import db # @UnresolvedImport
|
||||
|
||||
from whoosh.compat import BytesIO
|
||||
from whoosh.index import TOC, FileIndex, _DEF_INDEX_NAME
|
||||
from whoosh.filedb.filestore import ReadOnlyError, Storage
|
||||
from whoosh.filedb.structfile import StructFile
|
||||
|
||||
|
||||
class DatastoreFile(db.Model):
    """A file-like object backed by a BytesIO() buffer whose contents are
    loaded from, and saved to, a BlobProperty in the app engine datastore.
    """

    # File contents as last saved
    value = db.BlobProperty()
    # Time of the last content change, in whole seconds (0 if never saved)
    mtime = db.IntegerProperty(default=0)

    def __init__(self, *args, **kwargs):
        super(DatastoreFile, self).__init__(*args, **kwargs)
        # Working buffer that the file-like methods below operate on
        self.data = BytesIO()

    @classmethod
    def loadfile(cls, name):
        """Return a file object holding the contents stored under ``name``.

        Memcache is consulted first; on a miss the entity is fetched from the
        datastore and its contents are put into memcache.

        :raises NameError: if neither memcache nor the datastore holds a file
            called ``name``.
        """
        value = memcache.get(name, namespace="DatastoreFile")
        if value is None:
            entity = cls.get_by_key_name(name)
            if entity is None:
                # No such file: fail with a clear error instead of an
                # AttributeError on None
                raise NameError(name)
            memcache.set(name, entity.value, namespace="DatastoreFile")
        else:
            entity = cls(value=value)
        entity.data = BytesIO(entity.value)
        return entity

    def close(self):
        """Save the buffer's contents if they differ from the saved value,
        updating ``mtime``, the datastore entity and the memcache entry.
        """
        oldvalue = self.value
        self.value = self.getvalue()
        if oldvalue != self.value:
            self.mtime = int(time.time())
            self.put()
            memcache.set(self.key().id_or_name(), self.value,
                         namespace="DatastoreFile")

    def tell(self):
        return self.data.tell()

    def write(self, data):
        return self.data.write(data)

    def read(self, length):
        return self.data.read(length)

    def seek(self, *args):
        return self.data.seek(*args)

    def readline(self):
        return self.data.readline()

    def getvalue(self):
        return self.data.getvalue()
|
||||
|
||||
|
||||
class MemcacheLock(object):
    """Lock implemented as a memcache entry in the ``"whooshlocks"``
    namespace: adding the entry acquires the lock, deleting it releases it.
    """

    def __init__(self, name):
        self.name = name

    def acquire(self, blocking=False):
        """Try to take the lock and return the result of the last
        ``memcache.add`` call.

        :param blocking: when true, retry every 0.1 seconds until the add
            succeeds.
        """
        acquired = memcache.add(self.name, "L", 360, namespace="whooshlocks")
        if not blocking:
            return acquired

        # Simulate blocking by retrying the add until it goes through
        while not acquired:
            time.sleep(0.1)
            acquired = memcache.add(self.name, "", 360,
                                    namespace="whooshlocks")
        return acquired

    def release(self):
        """Release the lock by deleting its memcache entry."""
        memcache.delete(self.name, namespace="whooshlocks")
|
||||
|
||||
|
||||
class DatastoreStorage(Storage):
    """An implementation of :class:`whoosh.store.Storage` that stores files in
    the app engine datastore as blob properties.
    """

    def create_index(self, schema, indexname=_DEF_INDEX_NAME):
        """Create a new index in this storage and return it.

        :raises ReadOnlyError: if ``self.readonly`` is set.
        """
        if self.readonly:
            raise ReadOnlyError

        TOC.create(self, schema, indexname)
        return FileIndex(self, schema, indexname)

    def open_index(self, indexname=_DEF_INDEX_NAME, schema=None):
        """Return a ``FileIndex`` over this storage."""
        return FileIndex(self, schema=schema, indexname=indexname)

    def list(self):
        """Return the key names of all ``DatastoreFile`` entities."""
        return [entity.key().id_or_name() for entity in DatastoreFile.all()]

    def clean(self):
        """Do nothing."""
        pass

    def total_size(self):
        """Return the summed length of all stored files."""
        return sum(self.file_length(f) for f in self.list())

    def file_exists(self, name):
        """Report whether an entity called ``name`` exists."""
        return DatastoreFile.get_by_key_name(name) is not None

    def file_modified(self, name):
        """Return the ``mtime`` recorded on the entity called ``name``."""
        return DatastoreFile.get_by_key_name(name).mtime

    def file_length(self, name):
        """Return the length of the contents stored under ``name``."""
        return len(DatastoreFile.get_by_key_name(name).value)

    def delete_file(self, name):
        """Delete ``name`` from memcache and from the datastore."""
        memcache.delete(name, namespace="DatastoreFile")
        return DatastoreFile.get_by_key_name(name).delete()

    def rename_file(self, name, newname, safe=False):
        """Store the contents of ``name`` under ``newname`` and delete the
        old entity.

        :param safe: when true, refuse to overwrite an existing ``newname``.
        :raises NameError: if ``safe`` is true and ``newname`` already exists.
        """
        if safe and self.file_exists(newname):
            raise NameError("File %r exists" % newname)

        oldfile = DatastoreFile.get_by_key_name(name)
        newfile = DatastoreFile(key_name=newname)
        newfile.value = oldfile.value
        newfile.mtime = oldfile.mtime
        newfile.put()
        oldfile.delete()

        # DatastoreFile.loadfile() serves contents from memcache first, so
        # drop the entries for both names: the old name no longer exists and
        # any cached contents for the new name are now out of date.
        memcache.delete(name, namespace="DatastoreFile")
        memcache.delete(newname, namespace="DatastoreFile")

    def create_file(self, name, **kwargs):
        """Return a ``StructFile`` over a new ``DatastoreFile`` keyed by
        ``name``; closing it closes (and thereby saves) the datastore file.
        """
        f = StructFile(DatastoreFile(key_name=name), name=name,
                       onclose=lambda sfile: sfile.file.close())
        return f

    def open_file(self, name, *args, **kwargs):
        """Return a ``StructFile`` over the loaded contents of ``name``."""
        return StructFile(DatastoreFile.loadfile(name))

    def lock(self, name):
        """Return a ``MemcacheLock`` for ``name``."""
        return MemcacheLock(name)

    def temp_storage(self, name=None):
        """Create and return another ``DatastoreStorage``; ``name`` is not
        used.
        """
        tempstore = DatastoreStorage()
        return tempstore.create()
|
||||
402
venv/Lib/site-packages/whoosh/filedb/structfile.py
Normal file
402
venv/Lib/site-packages/whoosh/filedb/structfile.py
Normal file
@@ -0,0 +1,402 @@
|
||||
# Copyright 2009 Matt Chaput. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# The views and conclusions contained in the software and documentation are
|
||||
# those of the authors and should not be interpreted as representing official
|
||||
# policies, either expressed or implied, of Matt Chaput.
|
||||
|
||||
from array import array
|
||||
from copy import copy
|
||||
from struct import calcsize
|
||||
|
||||
from whoosh.compat import BytesIO, bytes_type
|
||||
from whoosh.compat import dump as dump_pickle
|
||||
from whoosh.compat import load as load_pickle
|
||||
from whoosh.compat import array_frombytes, array_tobytes
|
||||
from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE
|
||||
from whoosh.system import IS_LITTLE
|
||||
from whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte
|
||||
from whoosh.system import pack_ushort, unpack_ushort
|
||||
from whoosh.system import pack_ushort_le, unpack_ushort_le
|
||||
from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint
|
||||
from whoosh.system import pack_uint_le, unpack_uint_le
|
||||
from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong
|
||||
from whoosh.system import pack_float, unpack_float
|
||||
from whoosh.util.varints import varint, read_varint
|
||||
from whoosh.util.varints import signed_varint, decode_signed_varint
|
||||
|
||||
|
||||
_SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf")
|
||||
_ORDERMAP = {"little": "<", "big": ">"}
|
||||
|
||||
_types = (("sbyte", "b"), ("ushort", "H"), ("int", "i"),
|
||||
("long", "q"), ("float", "f"))
|
||||
|
||||
|
||||
# Main function
|
||||
|
||||
class StructFile(object):
    """Returns a "structured file" object that wraps the given file object and
    provides numerous additional methods for writing structured data, such as
    "write_varint" and "write_long".

    The wrapped file is expected to be binary: the ``write_*`` helpers hand
    packed byte strings to ``file.write`` and the ``read_*`` helpers unpack
    what ``file.read`` returns.
    """

    def __init__(self, fileobj, name=None, onclose=None):
        """
        :param fileobj: the file-like object to wrap.
        :param name: optional name, used by ``repr()``/``str()`` and inherited
            by subsets.
        :param onclose: optional callable invoked with this object at the
            start of :meth:`close`.
        """
        self.file = fileobj
        self._name = name
        self.onclose = onclose
        self.is_closed = False

        # Objects exposing fileno() are treated as "real" files, which lets
        # the array helpers use array.tofile()/fromfile() directly.
        self.is_real = hasattr(fileobj, "fileno")
        if self.is_real:
            self.fileno = fileobj.fileno

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._name)

    def __str__(self):
        return self._name

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __iter__(self):
        return iter(self.file)

    def raw_file(self):
        """Returns the wrapped file object."""
        return self.file

    # Plain pass-throughs to the wrapped file

    def read(self, *args, **kwargs):
        return self.file.read(*args, **kwargs)

    def readline(self, *args, **kwargs):
        return self.file.readline(*args, **kwargs)

    def write(self, *args, **kwargs):
        return self.file.write(*args, **kwargs)

    def tell(self, *args, **kwargs):
        return self.file.tell(*args, **kwargs)

    def seek(self, *args, **kwargs):
        return self.file.seek(*args, **kwargs)

    def truncate(self, *args, **kwargs):
        return self.file.truncate(*args, **kwargs)

    def flush(self):
        """Flushes the buffer of the wrapped file. This is a no-op if the
        wrapped file does not have a flush method.
        """

        if hasattr(self.file, "flush"):
            self.file.flush()

    def close(self):
        """Closes the wrapped file.

        Calls ``onclose(self)`` first if a callback was given, then the
        wrapped file's ``close`` method if it has one.

        :raises Exception: if this object was already closed.
        """

        if self.is_closed:
            raise Exception("This file is already closed")
        if self.onclose:
            self.onclose(self)
        if hasattr(self.file, "close"):
            self.file.close()
        self.is_closed = True

    def subset(self, offset, length, name=None):
        """Returns a new StructFile over a ``SubFile`` window of ``length``
        bytes starting at ``offset`` in the wrapped file. The new object uses
        ``name`` if given, otherwise this file's name.
        """
        # Imported here rather than at module level, as in the original code
        # (presumably to avoid a circular import -- TODO confirm).
        from whoosh.filedb.compound import SubFile

        name = name or self._name
        return StructFile(SubFile(self.file, offset, length), name=name)

    # Length-prefixed strings

    def write_string(self, s):
        """Writes a string to the wrapped file. This method writes the length
        of the string first, so you can read the string back without having to
        know how long it was.
        """
        self.write_varint(len(s))
        self.write(s)

    def write_string2(self, s):
        """Writes ``s`` prefixed with its length packed by ``pack_ushort``."""
        self.write(pack_ushort(len(s)) + s)

    def write_string4(self, s):
        """Writes ``s`` prefixed with its length packed by ``pack_int``."""
        self.write(pack_int(len(s)) + s)

    def read_string(self):
        """Reads a string from the wrapped file.
        """
        return self.read(self.read_varint())

    def read_string2(self):
        """Reads a string written by :meth:`write_string2`."""
        l = self.read_ushort()
        return self.read(l)

    def read_string4(self):
        """Reads a string written by :meth:`write_string4`."""
        l = self.read_int()
        return self.read(l)

    def get_string2(self, pos):
        """Reads a ushort-prefixed string at absolute position ``pos``.

        :returns: a ``(data, end)`` tuple where ``end`` is the position just
            past the string.
        """
        l = self.get_ushort(pos)
        base = pos + _SHORT_SIZE
        return self.get(base, l), base + l

    def get_string4(self, pos):
        """Reads an int-prefixed string at absolute position ``pos``.

        :returns: a ``(data, end)`` tuple where ``end`` is the position just
            past the string.
        """
        l = self.get_int(pos)
        base = pos + _INT_SIZE
        return self.get(base, l), base + l

    def skip_string(self):
        """Reads a varint length and seeks forward that many bytes."""
        l = self.read_varint()
        self.seek(l, 1)

    # Variable-length integers

    def write_varint(self, i):
        """Writes a variable-length unsigned integer to the wrapped file.
        """
        self.write(varint(i))

    def write_svarint(self, i):
        """Writes a variable-length signed integer to the wrapped file.
        """
        self.write(signed_varint(i))

    def read_varint(self):
        """Reads a variable-length encoded unsigned integer from the wrapped
        file.
        """
        return read_varint(self.read)

    def read_svarint(self):
        """Reads a variable-length encoded signed integer from the wrapped
        file.
        """
        return decode_signed_varint(read_varint(self.read))

    def write_tagint(self, i):
        """Writes a sometimes-compressed unsigned integer to the wrapped file.
        This is similar to the varint methods but uses a less compressed but
        faster format.
        """

        # Store numbers 0-253 in one byte. Byte 254 means "an unsigned 16-bit
        # int follows." Byte 255 means "An unsigned 32-bit int follows."
        #
        # The tag must be a byte string: chr()/"\xFE" are text on Python 3
        # and cannot be written to (or concatenated with) binary data.
        if i <= 253:
            self.write(pack_byte(i))
        elif i <= 65535:
            self.write(b"\xFE" + pack_ushort(i))
        else:
            self.write(b"\xFF" + pack_uint(i))

    def read_tagint(self):
        """Reads a sometimes-compressed unsigned integer from the wrapped file.
        This is similar to the varint methods but uses a less compressed but
        faster format.
        """

        tb = ord(self.read(1))
        if tb == 254:
            return self.read_ushort()
        elif tb == 255:
            return self.read_uint()
        else:
            return tb

    def write_byte(self, n):
        """Writes a single byte to the wrapped file (``n`` packed with
        ``pack_byte``).
        """
        self.write(pack_byte(n))

    def read_byte(self):
        """Reads one byte and returns it as an integer."""
        return ord(self.read(1))

    def write_pickle(self, obj, protocol=-1):
        """Writes a pickled representation of obj to the wrapped file.
        """
        dump_pickle(obj, self.file, protocol)

    def read_pickle(self):
        """Reads a pickled object from the wrapped file.
        """
        # NOTE(review): unpickling can execute arbitrary code; only call this
        # on files whose contents are trusted.
        return load_pickle(self.file)

    # Fixed-size writers: each packs ``n`` with the matching whoosh.system
    # packer and writes the result.

    def write_sbyte(self, n):
        self.write(pack_sbyte(n))

    def write_int(self, n):
        self.write(pack_int(n))

    def write_uint(self, n):
        self.write(pack_uint(n))

    def write_uint_le(self, n):
        self.write(pack_uint_le(n))

    def write_ushort(self, n):
        self.write(pack_ushort(n))

    def write_ushort_le(self, n):
        self.write(pack_ushort_le(n))

    def write_long(self, n):
        self.write(pack_long(n))

    def write_ulong(self, n):
        self.write(pack_ulong(n))

    def write_float(self, n):
        self.write(pack_float(n))

    def write_array(self, arry):
        """Writes the contents of ``arry`` (an ``array.array``) to the file.

        When ``IS_LITTLE`` is set, a byte-swapped copy is written so the
        caller's array is left untouched.
        """
        if IS_LITTLE:
            arry = copy(arry)
            arry.byteswap()
        if self.is_real:
            arry.tofile(self.file)
        else:
            self.write(array_tobytes(arry))

    # Fixed-size readers: each reads the matching number of bytes from the
    # current position and returns the single unpacked value.

    def read_sbyte(self):
        return unpack_sbyte(self.read(1))[0]

    def read_int(self):
        return unpack_int(self.read(_INT_SIZE))[0]

    def read_uint(self):
        return unpack_uint(self.read(_INT_SIZE))[0]

    def read_uint_le(self):
        return unpack_uint_le(self.read(_INT_SIZE))[0]

    def read_ushort(self):
        return unpack_ushort(self.read(_SHORT_SIZE))[0]

    def read_ushort_le(self):
        return unpack_ushort_le(self.read(_SHORT_SIZE))[0]

    def read_long(self):
        return unpack_long(self.read(_LONG_SIZE))[0]

    def read_ulong(self):
        return unpack_ulong(self.read(_LONG_SIZE))[0]

    def read_float(self):
        return unpack_float(self.read(_FLOAT_SIZE))[0]

    def read_array(self, typecode, length):
        """Reads ``length`` items of ``typecode`` from the current position
        and returns them as an ``array.array``; the items are byte-swapped
        after reading when ``IS_LITTLE`` is set (the inverse of
        :meth:`write_array`).
        """
        a = array(typecode)
        if self.is_real:
            a.fromfile(self.file, length)
        else:
            array_frombytes(a, self.read(length * _SIZEMAP[typecode]))
        if IS_LITTLE:
            a.byteswap()
        return a

    # Positional readers: seek to ``position`` first, then read. These move
    # the file pointer.

    def get(self, position, length):
        """Seeks to ``position`` and returns the next ``length`` bytes."""
        self.seek(position)
        return self.read(length)

    def get_byte(self, position):
        return unpack_byte(self.get(position, 1))[0]

    def get_sbyte(self, position):
        return unpack_sbyte(self.get(position, 1))[0]

    def get_int(self, position):
        return unpack_int(self.get(position, _INT_SIZE))[0]

    def get_uint(self, position):
        return unpack_uint(self.get(position, _INT_SIZE))[0]

    def get_ushort(self, position):
        return unpack_ushort(self.get(position, _SHORT_SIZE))[0]

    def get_long(self, position):
        return unpack_long(self.get(position, _LONG_SIZE))[0]

    def get_ulong(self, position):
        return unpack_ulong(self.get(position, _LONG_SIZE))[0]

    def get_float(self, position):
        return unpack_float(self.get(position, _FLOAT_SIZE))[0]

    def get_array(self, position, typecode, length):
        """Seeks to ``position`` and reads an array as :meth:`read_array`."""
        self.seek(position)
        return self.read_array(typecode, length)
|
||||
|
||||
|
||||
class BufferFile(StructFile):
    """StructFile variant over an in-memory buffer.

    Sequential reads go through a ``BytesIO`` built from the buffer, while
    the positional ``get``/``get_array`` helpers slice the buffer directly
    and therefore do not move the file pointer.
    """

    def __init__(self, buf, name=None, onclose=None):
        # Sets the same attributes as StructFile.__init__ without calling it;
        # a buffer is never a "real" file.
        self._buf = buf
        self._name = name
        self.onclose = onclose
        self.file = BytesIO(buf)

        self.is_closed = False
        self.is_real = False

    def subset(self, position, length, name=None):
        """Return a new BufferFile over ``length`` bytes starting at
        ``position``, named ``name`` or, failing that, this file's name.
        """
        chunk = self.get(position, length)
        return BufferFile(chunk, name=name or self._name)

    def get(self, position, length):
        """Return ``length`` bytes starting at ``position`` as a byte string."""
        stop = position + length
        return bytes_type(self._buf[position:stop])

    def get_array(self, position, typecode, length):
        """Return ``length`` items of ``typecode`` decoded from the buffer at
        ``position``; items are byte-swapped when ``IS_LITTLE`` is set.
        """
        result = array(typecode)
        nbytes = length * _SIZEMAP[typecode]
        array_frombytes(result, self.get(position, nbytes))
        if IS_LITTLE:
            result.byteswap()
        return result
|
||||
|
||||
|
||||
class ChecksumFile(StructFile):
    """StructFile that keeps a running CRC32 of the data passing through
    ``read``, ``write`` and iteration. Seeking is not allowed.
    """

    def __init__(self, *args, **kwargs):
        StructFile.__init__(self, *args, **kwargs)
        zlib_module = __import__("zlib")
        self._crc32 = zlib_module.crc32
        # Running checksum, fed back into crc32 as the starting value.
        self._check = 0

    def __iter__(self):
        update = self._crc32
        for chunk in self.file:
            self._check = update(chunk, self._check)
            yield chunk

    def seek(self, *args):
        raise Exception("Cannot seek on a ChecksumFile")

    def read(self, *args, **kwargs):
        data = self.file.read(*args, **kwargs)
        self._check = self._crc32(data, self._check)
        return data

    def write(self, b):
        # Fold the data into the checksum before handing it to the file.
        self._check = self._crc32(b, self._check)
        self.file.write(b)

    def checksum(self):
        """Return the current checksum masked to an unsigned 32-bit value."""
        mask = 0xffffffff
        return self._check & mask
|
||||
Reference in New Issue
Block a user