This commit is contained in:
“shengyudong”
2026-01-06 14:18:39 +08:00
commit 5a384b694e
10345 changed files with 2050918 additions and 0 deletions

View File

@@ -0,0 +1,331 @@
# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
import errno
import os
import sys
from threading import Lock
from shutil import copyfileobj
try:
import mmap
except ImportError:
mmap = None
from whoosh.compat import BytesIO, memoryview_
from whoosh.filedb.structfile import BufferFile, StructFile
from whoosh.filedb.filestore import FileStorage, StorageError
from whoosh.system import emptybytes
from whoosh.util import random_name
class CompoundStorage(FileStorage):
    """Presents the "sub-files" packed inside a single compound file (as
    written by :meth:`CompoundStorage.assemble` or :class:`CompoundWriter`)
    as a read-only :class:`FileStorage`.
    """

    readonly = True

    def __init__(self, dbfile, use_mmap=True, basepos=0):
        """
        :param dbfile: a file-like object containing the compound data.
        :param use_mmap: if True, try to memory-map the compound file instead
            of reading sub-files through the shared handle.
        :param basepos: offset in ``dbfile`` at which the compound data
            starts.
        """
        self._file = dbfile
        # Fixed: capture the file's name (if any) now -- __repr__ referenced
        # self._name but it was never assigned, raising AttributeError. The
        # name must be saved here because the handle may be closed and
        # discarded below if mmap succeeds.
        self._name = getattr(dbfile, "name", None)
        self.is_closed = False

        # Seek to the end to get total file size (to check if mmap is OK)
        dbfile.seek(0, os.SEEK_END)
        filesize = self._file.tell()
        dbfile.seek(basepos)

        # The header points to the directory, which is written at the END of
        # the compound data
        self._diroffset = self._file.read_long()
        self._dirlength = self._file.read_int()
        self._file.seek(self._diroffset)
        # The directory maps sub-file name -> {"offset", "length", "modified"}
        self._dir = self._file.read_pickle()
        self._options = self._file.read_pickle()
        self._locks = {}
        self._source = None

        use_mmap = (
            use_mmap
            and hasattr(self._file, "fileno")  # check file is a real file
            and filesize < sys.maxsize  # check fit on 32-bit Python
        )
        if mmap and use_mmap:
            # Try to open the entire segment as a memory-mapped object
            try:
                fileno = self._file.fileno()
                self._source = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)
            except (mmap.error, OSError):
                e = sys.exc_info()[1]
                # If we got an error because there wasn't enough memory to
                # open the map, ignore it and fall through, we'll just use the
                # (slower) "sub-file" implementation
                if e.errno == errno.ENOMEM:
                    pass
                else:
                    raise
            else:
                # If that worked, we can close the file handle we were given
                self._file.close()
                self._file = None

    def __repr__(self):
        return "<%s (%s)>" % (self.__class__.__name__, self._name)

    def close(self):
        """Closes the mmap and/or the underlying file handle. Raises if the
        storage is already closed."""
        if self.is_closed:
            raise Exception("Already closed")
        self.is_closed = True

        if self._source:
            try:
                self._source.close()
            except BufferError:
                # An exported buffer (e.g. an open sub-file) still references
                # the map; just drop our reference so GC can reclaim it later.
                # (The original used ``del self._source``, which left the
                # attribute undefined afterwards.)
                self._source = None
        if self._file:
            self._file.close()

    def range(self, name):
        """Returns the ``(offset, length)`` of the named sub-file within the
        compound file.

        :raises NameError: if no sub-file with that name exists.
        """
        try:
            fileinfo = self._dir[name]
        except KeyError:
            raise NameError("Unknown file %r" % (name,))
        return fileinfo["offset"], fileinfo["length"]

    def open_file(self, name, *args, **kwargs):
        if self.is_closed:
            raise StorageError("Storage was closed")

        offset, length = self.range(name)
        if self._source:
            # Create a memoryview/buffer from the mmap
            buf = memoryview_(self._source, offset, length)
            f = BufferFile(buf, name=name)
        elif hasattr(self._file, "subset"):
            f = self._file.subset(offset, length, name=name)
        else:
            f = StructFile(SubFile(self._file, offset, length), name=name)
        return f

    def list(self):
        return list(self._dir.keys())

    def file_exists(self, name):
        return name in self._dir

    def file_length(self, name):
        info = self._dir[name]
        return info["length"]

    def file_modified(self, name):
        info = self._dir[name]
        return info["modified"]

    def lock(self, name):
        # Per-name thread locks, created lazily
        if name not in self._locks:
            self._locks[name] = Lock()
        return self._locks[name]

    @staticmethod
    def assemble(dbfile, store, names, **options):
        """Copies the files ``names`` from ``store`` into the single file
        ``dbfile``, followed by a pickled directory of offsets/lengths, and
        closes ``dbfile``.
        """
        assert names, names

        directory = {}
        basepos = dbfile.tell()
        dbfile.write_long(0)  # Directory position
        dbfile.write_int(0)  # Directory length

        # TOC and segment files should never themselves be packed into a
        # compound file; check before copying anything
        for name in names:
            if name.endswith(".toc") or name.endswith(".seg"):
                raise Exception(name)

        # Copy the files into the compound file
        for name in names:
            offset = dbfile.tell()
            length = store.file_length(name)
            modified = store.file_modified(name)
            directory[name] = {"offset": offset, "length": length,
                               "modified": modified}
            f = store.open_file(name)
            copyfileobj(f, dbfile)
            f.close()

        CompoundStorage.write_dir(dbfile, basepos, directory, options)

    @staticmethod
    def write_dir(dbfile, basepos, directory, options=None):
        """Writes the pickled ``directory`` and ``options`` at the current
        position, then seeks back to ``basepos`` to fill in the header, and
        closes ``dbfile``.
        """
        options = options or {}

        dirpos = dbfile.tell()  # Remember the start of the directory
        dbfile.write_pickle(directory)  # Write the directory
        dbfile.write_pickle(options)
        endpos = dbfile.tell()  # Remember the end of the directory

        dbfile.flush()
        dbfile.seek(basepos)  # Seek back to the start
        dbfile.write_long(dirpos)  # Directory position
        dbfile.write_int(endpos - dirpos)  # Directory length

        dbfile.close()
class SubFile(object):
    """A read-only file-like object exposing a window of ``length`` bytes
    starting at ``offset`` within a shared parent file. Each SubFile keeps
    its own position and re-seeks the parent before every read; closing a
    SubFile does NOT close the shared parent.
    """

    def __init__(self, parentfile, offset, length, name=None):
        """
        :param parentfile: the shared underlying file object.
        :param offset: absolute start of this window in the parent file.
        :param length: length of the window in bytes.
        :param name: an optional name for this sub-file.
        """
        self._file = parentfile
        self._offset = offset
        self._length = length
        self._end = offset + length
        self._pos = 0  # Position relative to the start of the window

        self.name = name
        self.closed = False

    def close(self):
        # Deliberately does not close the shared parent file
        self.closed = True

    def subset(self, position, length, name=None):
        """Returns a new SubFile for a sub-window of this window.

        :param position: start of the sub-window, relative to this window.
        :param length: length of the sub-window in bytes.
        """
        start = self._offset + position
        end = start + length
        name = name or self.name
        # Fixed: the original comparisons were inverted (``>=`` instead of
        # ``<=``), so these asserts failed for every non-degenerate subset
        assert self._offset <= start <= self._end
        assert self._offset <= end <= self._end
        return SubFile(self._file, start, length, name=name)

    def read(self, size=None):
        """Reads up to ``size`` bytes (or to the end of the window if
        ``size`` is None), never reading past the window."""
        if size is None:
            size = self._length - self._pos
        else:
            size = min(size, self._length - self._pos)
        if size < 0:
            size = 0

        if size > 0:
            self._file.seek(self._offset + self._pos)
            self._pos += size
            return self._file.read(size)
        else:
            return emptybytes

    def readline(self):
        """Reads one line, truncated at the end of the window."""
        maxsize = self._length - self._pos
        self._file.seek(self._offset + self._pos)
        data = self._file.readline()
        if len(data) > maxsize:
            # Don't let a line read past the end of the window
            data = data[:maxsize]
        self._pos += len(data)
        return data

    def seek(self, where, whence=0):
        if whence == 0:  # Absolute
            pos = where
        elif whence == 1:  # Relative
            pos = self._pos + where
        elif whence == 2:  # From end
            # Fixed: standard SEEK_END semantics are length + offset (with a
            # negative offset); the original computed ``length - where``,
            # which sent negative offsets past the end of the window
            pos = self._length + where
        else:
            raise ValueError("Unknown whence %r" % (whence,))
        self._pos = pos

    def tell(self):
        return self._pos
class CompoundWriter(object):
    """Collects writes to multiple virtual files. Each stream is buffered in
    memory and spilled into a shared temporary file when its buffer fills;
    the collected data can then be written out as a single compound file
    (:meth:`save_as_compound`) or as separate files (:meth:`save_as_files`).
    """

    def __init__(self, tempstorage, buffersize=32 * 1024):
        """
        :param tempstorage: a storage object in which to create the shared
            temporary spill file.
        :param buffersize: per-stream in-memory buffer size in bytes.
        """
        assert isinstance(buffersize, int)
        self._tempstorage = tempstorage
        self._tempname = "%s.ctmp" % random_name()
        self._temp = tempstorage.create_file(self._tempname, mode="w+b")
        self._buffersize = buffersize
        self._streams = {}

    def create_file(self, name):
        """Returns a StructFile that buffers writes for the virtual file
        ``name``."""
        ss = self.SubStream(self._temp, self._buffersize)
        self._streams[name] = ss
        return StructFile(ss)

    def _readback(self):
        # Yields (name, gen) pairs, where gen() re-yields the bytes written
        # to that stream, in order, from the temp file and/or the stream's
        # in-memory buffer. Each generator must be consumed before the outer
        # loop advances, because all generators share the temp file handle,
        # which is closed when the outer loop finishes.
        temp = self._temp
        for name, substream in self._streams.items():
            substream.close()

            # Fixed: bind the substream as a default argument -- a closure
            # over the loop variable would late-bind and make every
            # generator read the last stream's blocks
            def gen(substream=substream):
                for f, offset, length in substream.blocks:
                    if f is None:
                        f = temp
                    f.seek(offset)
                    yield f.read(length)

            yield (name, gen)
        temp.close()
        self._tempstorage.delete_file(self._tempname)

    def save_as_compound(self, dbfile):
        """Writes all collected streams into ``dbfile`` as a compound file
        (see :meth:`CompoundStorage.write_dir`) and closes it."""
        basepos = dbfile.tell()
        dbfile.write_long(0)  # Directory offset
        dbfile.write_int(0)  # Directory length

        directory = {}
        for name, blocks in self._readback():
            filestart = dbfile.tell()
            for block in blocks():
                dbfile.write(block)
            directory[name] = {"offset": filestart,
                               "length": dbfile.tell() - filestart}

        CompoundStorage.write_dir(dbfile, basepos, directory)

    def save_as_files(self, storage, name_fn):
        """Writes each collected stream to its own file in ``storage``.

        :param name_fn: a callable mapping a virtual file name to the name
            to use in ``storage``.
        """
        for name, blocks in self._readback():
            f = storage.create_file(name_fn(name))
            for block in blocks():
                f.write(block)
            f.close()

    class SubStream(object):
        """A write-only stream that buffers up to ``buffersize`` bytes in
        memory, spilling to the shared ``dbfile`` when the buffer fills.
        ``blocks`` records ``(file_or_None, offset, length)`` triples in
        write order (``None`` means the shared temp file)."""

        def __init__(self, dbfile, buffersize):
            self._dbfile = dbfile
            self._buffersize = buffersize
            self._buffer = BytesIO()
            self.blocks = []

        def tell(self):
            # Total bytes written so far: flushed blocks plus current buffer
            return sum(b[2] for b in self.blocks) + self._buffer.tell()

        def write(self, inbytes):
            bio = self._buffer
            buflen = bio.tell()
            length = buflen + len(inbytes)
            if length >= self._buffersize:
                # Spill the buffered bytes plus the new bytes to the shared
                # temp file as a single block, then rewind the buffer
                offset = self._dbfile.tell()
                self._dbfile.write(bio.getvalue()[:buflen])
                self._dbfile.write(inbytes)

                self.blocks.append((None, offset, length))
                self._buffer.seek(0)
            else:
                bio.write(inbytes)

        def close(self):
            # Record any remaining buffered bytes as an in-memory block
            bio = self._buffer
            length = bio.tell()
            if length:
                self.blocks.append((bio, 0, length))

View File

@@ -0,0 +1,662 @@
# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from __future__ import with_statement
import errno, os, sys, tempfile
from threading import Lock
from whoosh.compat import BytesIO, memoryview_
from whoosh.filedb.structfile import BufferFile, StructFile
from whoosh.index import _DEF_INDEX_NAME, EmptyIndexError
from whoosh.util import random_name
from whoosh.util.filelock import FileLock
# Exceptions
class StorageError(Exception):
    """Base exception for errors raised by storage objects."""
    pass
class ReadOnlyError(StorageError):
    """Raised when a write operation (create/delete/rename) is attempted on
    a read-only storage object."""
    pass
# Base class
class Storage(object):
    """Abstract base class for storage objects.

    A storage object is a virtual flat filesystem, allowing the creation and
    retrieval of file-like objects
    (:class:`~whoosh.filedb.structfile.StructFile` objects). The default
    implementation (:class:`FileStorage`) uses actual files in a directory.

    All access to files in Whoosh goes through this object. This allows more
    different forms of storage (for example, in RAM, in a database, in a single
    file) to be used transparently.

    For example, to create a :class:`FileStorage` object::

        # Create a storage object
        st = FileStorage("indexdir")
        # Create the directory if it doesn't already exist
        st.create()

    The :meth:`Storage.create` method makes it slightly easier to swap storage
    implementations. The ``create()`` method handles set-up of the storage
    object. For example, ``FileStorage.create()`` creates the directory. A
    database implementation might create tables. This is designed to let you
    avoid putting implementation-specific setup code in your application.
    """

    # If True, write operations (create_file, delete_file, ...) should raise
    # ReadOnlyError
    readonly = False
    # If True, this implementation can open files via the mmap module
    supports_mmap = False

    def __iter__(self):
        # Iterating over a storage object iterates over its file names
        return iter(self.list())

    def __enter__(self):
        # Entering the context sets up the storage's resources (see create())
        self.create()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def create(self):
        """Creates any required implementation-specific resources. For example,
        a filesystem-based implementation might create a directory, while a
        database implementation might create tables. For example::

            from whoosh.filedb.filestore import FileStorage
            # Create a storage object
            st = FileStorage("indexdir")
            # Create any necessary resources
            st.create()

        This method returns ``self`` so you can also say::

            st = FileStorage("indexdir").create()

        Storage implementations should be written so that calling create() a
        second time on the same storage is harmless (for example,
        ``FileStorage.create()`` simply ignores an already-existing
        directory).

        :return: a :class:`Storage` instance.
        """

        return self

    def destroy(self, *args, **kwargs):
        """Removes any implementation-specific resources related to this storage
        object. For example, a filesystem-based implementation might delete a
        directory, and a database implementation might drop tables.

        The arguments are implementation-specific.
        """

        pass

    def create_index(self, schema, indexname=_DEF_INDEX_NAME, indexclass=None):
        """Creates a new index in this storage.

        >>> from whoosh import fields
        >>> from whoosh.filedb.filestore import FileStorage
        >>> schema = fields.Schema(content=fields.TEXT)
        >>> # Create the storage directory
        >>> st = FileStorage.create("indexdir")
        >>> # Create an index in the storage
        >>> ix = st.create_index(schema)

        :param schema: the :class:`whoosh.fields.Schema` object to use for the
            new index.
        :param indexname: the name of the index within the storage object. You
            can use this option to store multiple indexes in the same storage.
        :param indexclass: an optional custom ``Index`` sub-class to use to
            create the index files. The default is
            :class:`whoosh.index.FileIndex`. This method will call the
            ``create`` class method on the given class to create the index.
        :return: a :class:`whoosh.index.Index` instance.
        """

        if self.readonly:
            raise ReadOnlyError
        if indexclass is None:
            # Imported lazily to avoid a circular import at module load time
            import whoosh.index
            indexclass = whoosh.index.FileIndex
        return indexclass.create(self, schema, indexname)

    def open_index(self, indexname=_DEF_INDEX_NAME, schema=None, indexclass=None):
        """Opens an existing index (created using :meth:`create_index`) in this
        storage.

        >>> from whoosh.filedb.filestore import FileStorage
        >>> st = FileStorage("indexdir")
        >>> # Open an index in the storage
        >>> ix = st.open_index()

        :param indexname: the name of the index within the storage object. You
            can use this option to store multiple indexes in the same storage.
        :param schema: if you pass in a :class:`whoosh.fields.Schema` object
            using this argument, it will override the schema that was stored
            with the index.
        :param indexclass: an optional custom ``Index`` sub-class to use to
            open the index files. The default is
            :class:`whoosh.index.FileIndex`. This method will instantiate the
            class with this storage object.
        :return: a :class:`whoosh.index.Index` instance.
        """

        if indexclass is None:
            # Imported lazily to avoid a circular import at module load time
            import whoosh.index
            indexclass = whoosh.index.FileIndex
        return indexclass(self, schema=schema, indexname=indexname)

    def index_exists(self, indexname=None):
        """Returns True if a non-empty index exists in this storage.

        :param indexname: the name of the index within the storage object. You
            can use this option to store multiple indexes in the same storage.
        :rtype: bool
        """

        if indexname is None:
            indexname = _DEF_INDEX_NAME
        try:
            ix = self.open_index(indexname)
            # A generation > -1 means the index has been written at least once
            gen = ix.latest_generation()
            ix.close()
            return gen > -1
        except EmptyIndexError:
            pass
        return False

    def create_file(self, name):
        """Creates a file with the given name in this storage.

        :param name: the name for the new file.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        """

        raise NotImplementedError

    def open_file(self, name, *args, **kwargs):
        """Opens an existing file with the given name in this storage.

        :param name: the name of the file to open.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        """

        raise NotImplementedError

    def list(self):
        """Returns a list of file names in this storage.

        :return: a list of strings
        """

        raise NotImplementedError

    def file_exists(self, name):
        """Returns True if the given file exists in this storage.

        :param name: the name to check.
        :rtype: bool
        """

        raise NotImplementedError

    def file_modified(self, name):
        """Returns the last-modified time of the given file in this storage (as
        a "ctime" UNIX timestamp).

        :param name: the name to check.
        :return: a "ctime" number.
        """

        raise NotImplementedError

    def file_length(self, name):
        """Returns the size (in bytes) of the given file in this storage.

        :param name: the name to check.
        :rtype: int
        """

        raise NotImplementedError

    def delete_file(self, name):
        """Removes the given file from this storage.

        :param name: the name to delete.
        """

        raise NotImplementedError

    def rename_file(self, frm, to, safe=False):
        """Renames a file in this storage.

        :param frm: The current name of the file.
        :param to: The new name for the file.
        :param safe: if True, raise an exception if a file with the new name
            already exists.
        """

        raise NotImplementedError

    def lock(self, name):
        """Return a named lock object (implementing ``.acquire()`` and
        ``.release()`` methods). Different storage implementations may use
        different lock types with different guarantees. For example, the
        RamStorage object uses Python thread locks, while the FileStorage
        object uses filesystem-based locks that are valid across different
        processes.

        :param name: a name for the lock.
        :return: a lock-like object.
        """

        raise NotImplementedError

    def close(self):
        """Closes any resources opened by this storage object. For some storage
        implementations this will be a no-op, but for others it is necessary
        to release locks and/or prevent leaks, so it's a good idea to call it
        when you're done with a storage object.
        """

        pass

    def optimize(self):
        """Optimizes the storage object. The meaning and cost of "optimizing"
        will vary by implementation. For example, a database implementation
        might run a garbage collection procedure on the underlying database.
        """

        pass

    def temp_storage(self, name=None):
        """Creates a new storage object for temporary files. You can call
        :meth:`Storage.destroy` on the new storage when you're finished with
        it.

        :param name: a name for the new storage. This may be optional or
            required depending on the storage implementation.
        :rtype: :class:`Storage`
        """

        raise NotImplementedError
class OverlayStorage(Storage):
    """Overlays two storage objects. Reads are processed from the first if it
    has the named file, otherwise the second. Writes always go to the second.
    """

    def __init__(self, a, b):
        """
        :param a: the read-preferred storage object.
        :param b: the storage object that receives all writes.
        """
        self.a = a
        self.b = b

    def create_index(self, *args, **kwargs):
        # Indexes are always created in the write storage.
        # Fixed: return the created index (the return statement was missing,
        # so callers always got None)
        return self.b.create_index(*args, **kwargs)

    def open_index(self, *args, **kwargs):
        # Fixed: return the opened index (the return statement was missing,
        # so callers always got None)
        return self.a.open_index(*args, **kwargs)

    def create_file(self, *args, **kwargs):
        return self.b.create_file(*args, **kwargs)

    def open_file(self, name, *args, **kwargs):
        # Prefer the read storage if it has the file
        if self.a.file_exists(name):
            return self.a.open_file(name, *args, **kwargs)
        else:
            return self.b.open_file(name, *args, **kwargs)

    def list(self):
        # The union of file names in both storages
        return list(set(self.a.list()) | set(self.b.list()))

    def file_exists(self, name):
        return self.a.file_exists(name) or self.b.file_exists(name)

    def file_modified(self, name):
        if self.a.file_exists(name):
            return self.a.file_modified(name)
        else:
            return self.b.file_modified(name)

    def file_length(self, name):
        if self.a.file_exists(name):
            return self.a.file_length(name)
        else:
            return self.b.file_length(name)

    def delete_file(self, name):
        # Deletions only affect the write storage
        return self.b.delete_file(name)

    def rename_file(self, *args, **kwargs):
        raise NotImplementedError

    def lock(self, name):
        return self.b.lock(name)

    def close(self):
        self.a.close()
        self.b.close()

    def optimize(self):
        self.a.optimize()
        self.b.optimize()

    def temp_storage(self, name=None):
        return self.b.temp_storage(name=name)
class FileStorage(Storage):
    """Storage object that stores the index as files in a directory on disk.

    Prior to version 3, the initializer would raise an IOError if the directory
    did not exist. As of version 3, the object does not check if the
    directory exists at initialization. This change is to support using the
    :meth:`FileStorage.create` method.
    """

    supports_mmap = True

    def __init__(self, path, supports_mmap=True, readonly=False, debug=False):
        """
        :param path: a path to a directory.
        :param supports_mmap: if True (the default), use the ``mmap`` module to
            open memory mapped files. You can open the storage object with
            ``supports_mmap=False`` to force Whoosh to open files normally
            instead of with ``mmap``.
        :param readonly: If ``True``, the object will raise an exception if you
            attempt to create or rename a file.
        :param debug: stored on the object for implementation-specific
            debugging use.
        """

        self.folder = path
        self.supports_mmap = supports_mmap
        self.readonly = readonly
        self._debug = debug
        self.locks = {}

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.folder)

    def create(self):
        """Creates this storage object's directory path using ``os.makedirs`` if
        it doesn't already exist.

        >>> from whoosh.filedb.filestore import FileStorage
        >>> st = FileStorage("indexdir")
        >>> st.create()

        This method returns ``self``, you can say::

            st = FileStorage("indexdir").create()

        Note that you can simply handle the creation of the directory
        yourself and open the storage object using the initializer::

            dirname = "indexdir"
            os.mkdir(dirname)
            st = FileStorage(dirname)

        However, using the ``create()`` method allows you to potentially swap in
        other storage implementations more easily.

        :return: a :class:`Storage` instance.
        """

        dirpath = os.path.abspath(self.folder)
        # If the given directory does not already exist, try to create it
        try:
            os.makedirs(dirpath)
        except OSError:
            # This is necessary for compatibility between Py2 and Py3
            e = sys.exc_info()[1]
            # If we get an error because the path already exists, ignore it
            if e.errno != errno.EEXIST:
                raise

        # Raise an exception if the given path is not a directory
        if not os.path.isdir(dirpath):
            e = IOError("%r is not a directory" % dirpath)
            e.errno = errno.ENOTDIR
            raise e

        return self

    def destroy(self):
        """Removes any files in this storage object and then removes the
        storage object's directory. What happens if any of the files or the
        directory are in use depends on the underlying platform.
        """

        # Remove all files
        self.clean()
        try:
            # Try to remove the directory
            os.rmdir(self.folder)
        except OSError:
            # Fixed: os.rmdir raises OSError, not IOError (they are only
            # aliases of each other on Python 3), so the original handler
            # never fired on Python 2
            e = sys.exc_info()[1]
            # Ignore the error if the directory is already gone
            if e.errno == errno.ENOENT:
                pass
            else:
                raise e

    def create_file(self, name, excl=False, mode="wb", **kwargs):
        """Creates a file with the given name in this storage.

        :param name: the name for the new file.
        :param excl: if True, try to open the file in "exclusive" mode.
        :param mode: the mode flags with which to open the file. The default is
            ``"wb"``.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        """

        if self.readonly:
            raise ReadOnlyError

        path = self._fpath(name)
        if excl:
            # O_EXCL makes the open fail if the file already exists
            flags = os.O_CREAT | os.O_EXCL | os.O_RDWR
            if hasattr(os, "O_BINARY"):
                flags |= os.O_BINARY
            fd = os.open(path, flags)
            fileobj = os.fdopen(fd, mode)
        else:
            fileobj = open(path, mode)

        f = StructFile(fileobj, name=name, **kwargs)
        return f

    def open_file(self, name, **kwargs):
        """Opens an existing file in this storage.

        :param name: the name of the file to open.
        :param kwargs: additional keyword arguments are passed through to the
            :class:`~whoosh.filedb.structfile.StructFile` initializer.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        """

        f = StructFile(open(self._fpath(name), "rb"), name=name, **kwargs)
        return f

    def _fpath(self, fname):
        # Absolute path of the named file inside this storage's directory
        return os.path.abspath(os.path.join(self.folder, fname))

    def clean(self, ignore=False):
        """Removes all files in this storage.

        :param ignore: if True, ignore errors while deleting individual
            files.
        """

        if self.readonly:
            raise ReadOnlyError

        path = self.folder
        files = self.list()
        for fname in files:
            try:
                os.remove(os.path.join(path, fname))
            except OSError:
                if not ignore:
                    raise

    def list(self):
        try:
            files = os.listdir(self.folder)
        except OSError:
            # Fixed: os.listdir raises OSError, not IOError; a missing
            # directory is treated as an empty storage
            files = []

        return files

    def file_exists(self, name):
        return os.path.exists(self._fpath(name))

    def file_modified(self, name):
        return os.path.getmtime(self._fpath(name))

    def file_length(self, name):
        return os.path.getsize(self._fpath(name))

    def delete_file(self, name):
        if self.readonly:
            raise ReadOnlyError

        os.remove(self._fpath(name))

    def rename_file(self, oldname, newname, safe=False):
        if self.readonly:
            raise ReadOnlyError

        if os.path.exists(self._fpath(newname)):
            if safe:
                raise NameError("File %r exists" % newname)
            else:
                os.remove(self._fpath(newname))
        os.rename(self._fpath(oldname), self._fpath(newname))

    def lock(self, name):
        # Filesystem-based lock, valid across processes
        return FileLock(self._fpath(name))

    def temp_storage(self, name=None):
        name = name or "%s.tmp" % random_name()
        path = os.path.join(self.folder, name)
        tempstore = FileStorage(path)
        return tempstore.create()
class RamStorage(Storage):
    """Storage object that keeps the index in memory, as a mapping of file
    name to bytes content.
    """

    supports_mmap = False

    def __init__(self):
        self.files = {}  # Maps file name -> bytes content
        self.locks = {}  # Maps lock name -> threading.Lock
        self.folder = ''

    def destroy(self):
        """Releases the in-memory files and locks. The object is not usable
        afterwards."""
        del self.files
        del self.locks

    def list(self):
        return list(self.files.keys())

    def clean(self, ignore=False):
        # The ``ignore`` argument is accepted for interface consistency with
        # FileStorage.clean(); deleting in-memory files cannot fail
        self.files = {}

    def total_size(self):
        """Returns the total size in bytes of all files in this storage."""
        return sum(self.file_length(f) for f in self.list())

    def file_exists(self, name):
        return name in self.files

    def file_length(self, name):
        if name not in self.files:
            raise NameError(name)
        return len(self.files[name])

    def file_modified(self, name):
        # In-memory files have no modification time
        return -1

    def delete_file(self, name):
        if name not in self.files:
            raise NameError(name)
        del self.files[name]

    def rename_file(self, name, newname, safe=False):
        if name not in self.files:
            raise NameError(name)
        if safe and newname in self.files:
            raise NameError("File %r exists" % newname)

        content = self.files[name]
        del self.files[name]
        self.files[newname] = content

    def create_file(self, name, **kwargs):
        # The file's bytes are captured into self.files when the returned
        # StructFile is closed
        def onclose_fn(sfile):
            self.files[name] = sfile.file.getvalue()
        f = StructFile(BytesIO(), name=name, onclose=onclose_fn)
        return f

    def open_file(self, name, **kwargs):
        if name not in self.files:
            raise NameError(name)
        buf = memoryview_(self.files[name])
        return BufferFile(buf, name=name, **kwargs)

    def lock(self, name):
        # Thread locks only; not valid across processes
        if name not in self.locks:
            self.locks[name] = Lock()
        return self.locks[name]

    def temp_storage(self, name=None):
        # Temporary files go to an on-disk FileStorage in the system temp
        # directory (presumably so they can be mmapped and don't grow RAM
        # usage -- the in-memory storage itself is not used for temp files)
        tdir = tempfile.gettempdir()
        name = name or "%s.tmp" % random_name()
        path = os.path.join(tdir, name)
        tempstore = FileStorage(path)
        return tempstore.create()
def copy_storage(sourcestore, deststore):
    """Copies every file in ``sourcestore`` into ``deststore``.

    Each file is streamed with ``shutil.copyfileobj``; the source and
    destination file objects are closed as each copy finishes.
    """
    from shutil import copyfileobj

    for fname in sourcestore.list():
        with sourcestore.open_file(fname) as src, \
                deststore.create_file(fname) as dst:
            copyfileobj(src, dst)
def copy_to_ram(storage):
    """Returns a new :class:`RamStorage` containing a copy of every file in
    the given storage object.

    :rtype: :class:`RamStorage`
    """
    target = RamStorage()
    copy_storage(storage, target)
    return target

View File

@@ -0,0 +1,735 @@
# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
"""This module defines writer and reader classes for a fast, immutable
on-disk key-value database format. The current format is based heavily on
D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
"""
import os, struct
from binascii import crc32
from bisect import bisect_left
from hashlib import md5 # @UnresolvedImport
from whoosh.compat import b, bytes_type
from whoosh.compat import xrange
from whoosh.util.numlists import GrowableArray
from whoosh.system import _INT_SIZE, emptybytes
# Exceptions
class FileFormatError(Exception):
    """Raised when file contents are not in the expected format (e.g. a bad
    magic tag)."""
    pass
# Hash functions
def cdb_hash(key):
    """Computes D. J. Bernstein's CDB hash of ``key`` (h = h * 33 ^ c,
    seeded with 5381), masked to 32 bits.

    :param key: the key to hash, as ``bytes`` or ``str``.
    """
    h = 5381
    for c in key:
        # Iterating bytes yields ints on Python 3 but 1-character strings on
        # Python 2; the original unconditional ord(c) broke for bytes keys
        # on Python 3 (and HashWriter.add only accepts bytes keys)
        if not isinstance(c, int):
            c = ord(c)
        h = (h + (h << 5)) & 0xffffffff ^ c
    return h
def md5_hash(key):
    """Returns the low 32 bits of the MD5 digest of ``key`` as an unsigned
    integer."""
    hexdigest = md5(key).hexdigest()
    return int(hexdigest, 16) & 0xffffffff
def crc_hash(key):
    """Returns the CRC-32 checksum of ``key`` masked to an unsigned 32-bit
    integer (``crc32`` can return signed values on Python 2)."""
    checksum = crc32(key)
    return checksum & 0xffffffff
# Registry of hash functions, indexed by the "hashtype" byte stored in the
# file header (0 = MD5, 1 = CRC32, 2 = CDB; see HashWriter)
_hash_functions = (md5_hash, crc_hash, cdb_hash)

# Structs

# Two uints before the key/value pair giving the length of the key and value
_lengths = struct.Struct("!ii")
# A pointer in a hash table, giving the hash value and the key position
_pointer = struct.Struct("!Iq")
# A pointer in the hash table directory, giving the position and number of slots
_dir_entry = struct.Struct("!qi")
# Total size of the directory: one entry for each of the 256 hash tables
_directory_size = 256 * _dir_entry.size
# Basic hash file
class HashWriter(object):
"""Implements a fast on-disk key-value store. This hash uses a two-level
hashing scheme, where a key is hashed, the low eight bits of the hash value
are used to index into one of 256 hash tables. This is basically the CDB
algorithm, but unlike CDB this object writes all data serially (it doesn't
seek backwards to overwrite information at the end).
Also unlike CDB, this format uses 64-bit file pointers, so the file length
is essentially unlimited. However, each key and value must be less than
2 GB in length.
"""
def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
"""
:param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
to write to.
:param magic: the format tag bytes to write at the start of the file.
:param hashtype: an integer indicating which hashing algorithm to use.
Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
"""
self.dbfile = dbfile
self.hashtype = hashtype
self.hashfn = _hash_functions[self.hashtype]
# A place for subclasses to put extra metadata
self.extras = {}
self.startoffset = dbfile.tell()
# Write format tag
dbfile.write(magic)
# Write hash type
dbfile.write_byte(self.hashtype)
# Unused future expansion bits
dbfile.write_int(0)
dbfile.write_int(0)
# 256 lists of hashed keys and positions
self.buckets = [[] for _ in xrange(256)]
# List to remember the positions of the hash tables
self.directory = []
def tell(self):
return self.dbfile.tell()
def add(self, key, value):
"""Adds a key/value pair to the file. Note that keys DO NOT need to be
unique. You can store multiple values under the same key and retrieve
them using :meth:`HashReader.all`.
"""
assert isinstance(key, bytes_type)
assert isinstance(value, bytes_type)
dbfile = self.dbfile
pos = dbfile.tell()
dbfile.write(_lengths.pack(len(key), len(value)))
dbfile.write(key)
dbfile.write(value)
# Get hash value for the key
h = self.hashfn(key)
# Add hash and on-disk position to appropriate bucket
self.buckets[h & 255].append((h, pos))
def add_all(self, items):
"""Convenience method to add a sequence of ``(key, value)`` pairs. This
is the same as calling :meth:`HashWriter.add` on each pair in the
sequence.
"""
add = self.add
for key, value in items:
add(key, value)
    def _write_hashes(self):
        # Writes 256 hash tables containing pointers to the key/value pairs
        dbfile = self.dbfile
        # Represent an empty slot in the hash table using 0,0 (no key can
        # start at position 0 because of the header)
        null = (0, 0)
        for entries in self.buckets:
            # Start position of this bucket's hash table
            pos = dbfile.tell()
            # Remember the start position and the number of slots
            # (twice the number of entries, so the load factor is 0.5 and
            # linear-probe chains stay short)
            numslots = 2 * len(entries)
            self.directory.append((pos, numslots))
            # Create the empty hash table
            hashtable = [null] * numslots
            # For each (hash value, key position) tuple in the bucket
            for hashval, position in entries:
                # Bitshift and wrap to get the slot for this entry
                # (the low 8 bits already selected the bucket)
                slot = (hashval >> 8) % numslots
                # If the slot is taken, keep going until we find an empty slot
                while hashtable[slot] != null:
                    slot = (slot + 1) % numslots
                # Insert the entry into the hashtable
                hashtable[slot] = (hashval, position)
            # Write the hash table for this bucket to disk
            for hashval, position in hashtable:
                dbfile.write(_pointer.pack(hashval, position))
def _write_directory(self):
# Writes a directory of pointers to the 256 hash tables
dbfile = self.dbfile
for position, numslots in self.directory:
dbfile.write(_dir_entry.pack(position, numslots))
    def _write_extras(self):
        # Pickle the extras dict so subclass metadata survives in the file;
        # subclasses override this to append extra data after the pickle
        self.dbfile.write_pickle(self.extras)
    def close(self):
        """Finishes the file by writing the hash tables, the directory, and
        the extras, then closes the underlying file. Returns the end position
        of the file. The ordering of writes here is part of the on-disk
        format and must not change.
        """
        dbfile = self.dbfile
        # Write hash tables
        self._write_hashes()
        # Write directory of pointers to hash tables
        self._write_directory()
        expos = dbfile.tell()
        # Write extra information
        self._write_extras()
        # Write length of pickle (everything written by _write_extras) so the
        # reader can locate the extras from the end of the file
        dbfile.write_int(dbfile.tell() - expos)
        endpos = dbfile.tell()
        dbfile.close()
        return endpos
class HashReader(object):
    """Reader for the fast on-disk key-value files created by
    :class:`HashWriter`.
    """
    def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to read from.
        :param length: the length of the file data. This is necessary since the
            hashing information is written at the end of the file.
        :param magic: the format tag bytes to look for at the start of the
            file. If the file's format tag does not match these bytes, the
            object raises a :class:`FileFormatError` exception.
        :param startoffset: the starting point of the file data.
        """
        self.dbfile = dbfile
        self.startoffset = startoffset
        self.is_closed = False
        # If no length was given, measure to the end of the file
        if length is None:
            dbfile.seek(0, os.SEEK_END)
            length = dbfile.tell() - startoffset
        dbfile.seek(startoffset)
        # Check format tag
        filemagic = dbfile.read(4)
        if filemagic != magic:
            raise FileFormatError("Unknown file header %r" % filemagic)
        # Read hash type
        self.hashtype = dbfile.read_byte()
        self.hashfn = _hash_functions[self.hashtype]
        # Skip unused future expansion bits
        dbfile.read_int()
        dbfile.read_int()
        # First byte after the header: where the key/value records begin
        self.startofdata = dbfile.tell()
        exptr = startoffset + length - _INT_SIZE
        # Get the length of extras from the end of the file
        exlen = dbfile.get_int(exptr)
        # Read the extras
        expos = exptr - exlen
        dbfile.seek(expos)
        self._read_extras()
        # Calculate the directory base from the beginning of the extras
        dbfile.seek(expos - _directory_size)
        # Read directory of hash tables
        self.tables = []
        entrysize = _dir_entry.size
        unpackentry = _dir_entry.unpack
        for _ in xrange(256):
            # position, numslots
            self.tables.append(unpackentry(dbfile.read(entrysize)))
        # The position of the first hash table is the end of the key/value pairs
        self.endofdata = self.tables[0][0]
    @classmethod
    def open(cls, storage, name):
        """Convenience method to open a hash file given a
        :class:`whoosh.filedb.filestore.Storage` object and a name. This takes
        care of opening the file and passing its length to the initializer.
        """
        length = storage.file_length(name)
        dbfile = storage.open_file(name)
        return cls(dbfile, length)
    def file(self):
        """Returns the underlying file object."""
        return self.dbfile
    def _read_extras(self):
        # Load the pickled extras dict written by HashWriter._write_extras;
        # fall back to an empty dict if there is nothing to read
        try:
            self.extras = self.dbfile.read_pickle()
        except EOFError:
            self.extras = {}
    def close(self):
        """Closes the underlying file. Raises if called twice."""
        if self.is_closed:
            raise Exception("Tried to close %r twice" % self)
        self.dbfile.close()
        self.is_closed = True
    def key_at(self, pos):
        # Returns the key bytes at the given position. The unsigned int read
        # here is the first field of the _lengths record header (key length).
        dbfile = self.dbfile
        keylen = dbfile.get_uint(pos)
        return dbfile.get(pos + _lengths.size, keylen)
    def key_and_range_at(self, pos):
        # Returns a (keybytes, datapos, datalen) tuple for the key at the given
        # position, or None if pos is past the end of the key/value records
        dbfile = self.dbfile
        lenssize = _lengths.size
        if pos >= self.endofdata:
            return None
        keylen, datalen = _lengths.unpack(dbfile.get(pos, lenssize))
        keybytes = dbfile.get(pos + lenssize, keylen)
        datapos = pos + lenssize + keylen
        return keybytes, datapos, datalen
    def _ranges(self, pos=None, eod=None):
        # Yields a series of (keypos, keylength, datapos, datalength) tuples
        # for the key/value pairs in the file, walking the records
        # sequentially from pos (default: start of data) to eod (default:
        # end of data)
        dbfile = self.dbfile
        pos = pos or self.startofdata
        eod = eod or self.endofdata
        lenssize = _lengths.size
        unpacklens = _lengths.unpack
        while pos < eod:
            keylen, datalen = unpacklens(dbfile.get(pos, lenssize))
            keypos = pos + lenssize
            datapos = keypos + keylen
            yield (keypos, keylen, datapos, datalen)
            pos = datapos + datalen
    def __getitem__(self, key):
        # Returns the first value stored under the key; raises KeyError if
        # the key is not in the file
        for value in self.all(key):
            return value
        raise KeyError(key)
    def __iter__(self):
        # Yields (key, value) tuples for every pair in the file, in file order
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self._ranges():
            key = dbfile.get(keypos, keylen)
            value = dbfile.get(datapos, datalen)
            yield (key, value)
    def __contains__(self, key):
        for _ in self.ranges_for_key(key):
            return True
        return False
    def keys(self):
        """Yields every key in the file, in file order."""
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self._ranges():
            yield dbfile.get(keypos, keylen)
    def values(self):
        """Yields every value in the file, in file order."""
        dbfile = self.dbfile
        for _, _, datapos, datalen in self._ranges():
            yield dbfile.get(datapos, datalen)
    def items(self):
        """Yields every (key, value) pair in the file, in file order."""
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self._ranges():
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))
    def get(self, key, default=None):
        """Returns the first value stored under the key, or ``default`` if
        the key is not in the file.
        """
        for value in self.all(key):
            return value
        return default
    def all(self, key):
        """Yields a sequence of values associated with the given key.
        """
        dbfile = self.dbfile
        for datapos, datalen in self.ranges_for_key(key):
            yield dbfile.get(datapos, datalen)
    def ranges_for_key(self, key):
        """Yields a sequence of ``(datapos, datalength)`` tuples associated
        with the given key.
        """
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        dbfile = self.dbfile
        # Hash the key
        keyhash = self.hashfn(key)
        # Get the position and number of slots for the hash table in which the
        # key may be found
        tablestart, numslots = self.tables[keyhash & 255]
        # If the hash table is empty, we know the key doesn't exists
        if not numslots:
            return
        ptrsize = _pointer.size
        unpackptr = _pointer.unpack
        lenssize = _lengths.size
        unpacklens = _lengths.unpack
        # Calculate where the key's slot should be
        slotpos = tablestart + (((keyhash >> 8) % numslots) * ptrsize)
        # Read slots looking for our key's hash value, following the same
        # linear-probe order the writer used to place entries
        for _ in xrange(numslots):
            slothash, itempos = unpackptr(dbfile.get(slotpos, ptrsize))
            # If this slot is empty, we're done
            if not itempos:
                return
            # If the key hash in this slot matches our key's hash, we might have
            # a match, so read the actual key and see if it's our key
            if slothash == keyhash:
                # Read the key and value lengths
                keylen, datalen = unpacklens(dbfile.get(itempos, lenssize))
                # Only bother reading the actual key if the lengths match
                if keylen == len(key):
                    keystart = itempos + lenssize
                    if key == dbfile.get(keystart, keylen):
                        # The keys match, so yield (datapos, datalen)
                        yield (keystart + keylen, datalen)
            slotpos += ptrsize
            # If we reach the end of the hashtable, wrap around
            if slotpos == tablestart + (numslots * ptrsize):
                slotpos = tablestart
    def range_for_key(self, key):
        """Returns the first ``(datapos, datalength)`` tuple for the given
        key; raises KeyError if the key is not in the file.
        """
        for item in self.ranges_for_key(key):
            return item
        raise KeyError(key)
# Ordered hash file
class OrderedHashWriter(HashWriter):
    """A :class:`HashWriter` that additionally requires keys to be added in
    increasing order, and records the position of every key so an
    :class:`OrderedHashReader` can look up "nearest keys" based on the
    ordering.
    """
    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # Growable array recording the file position of each added key
        self.index = GrowableArray("H")
        # The most recently added key, used to enforce the ordering
        self.lastkey = emptybytes

    def add(self, key, value):
        """Add a key/value pair. Raises ValueError if ``key`` is not greater
        than the previously added key.
        """
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        # Record where this key's record begins before writing it
        pos = self.dbfile.tell()
        self.index.append(pos)
        HashWriter.add(self, key, value)
        self.lastkey = key

    def _write_extras(self):
        # Describe the key-position index in the extras, write the pickled
        # extras via the base class, then append the index array itself.
        index = self.index
        self.extras["indextype"] = index.typecode
        self.extras["indexlen"] = len(index)
        HashWriter._write_extras(self)
        index.to_file(self.dbfile)
class OrderedHashReader(HashReader):
    # Reader counterpart of OrderedHashWriter: adds "closest key" lookups on
    # top of HashReader using the key-position index stored in the extras.
    def closest_key(self, key):
        """Returns the closest key equal to or greater than the given key. If
        there is no key in the file equal to or greater than the given key,
        returns None.
        """
        pos = self.closest_key_pos(key)
        if pos is None:
            return None
        return self.key_at(pos)
    def ranges_from(self, key):
        """Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples
        for the ordered series of keys equal or greater than the given key.
        """
        pos = self.closest_key_pos(key)
        if pos is None:
            return
        for item in self._ranges(pos=pos):
            yield item
    def keys_from(self, key):
        """Yields an ordered series of keys equal to or greater than the given
        key.
        """
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self.ranges_from(key):
            yield dbfile.get(keypos, keylen)
    def items_from(self, key):
        """Yields an ordered series of ``(key, value)`` tuples for keys equal
        to or greater than the given key.
        """
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self.ranges_from(key):
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))
    def _read_extras(self):
        # In addition to the pickled extras, set up access to the key-position
        # index array that OrderedHashWriter appended after the pickle
        dbfile = self.dbfile
        # Read the extras
        HashReader._read_extras(self)
        # Set up for reading the index array
        indextype = self.extras["indextype"]
        self.indexbase = dbfile.tell()
        self.indexlen = self.extras["indexlen"]
        # Byte size of one index entry, derived from the array typecode
        self.indexsize = struct.calcsize(indextype)
        # Set up the function to read values from the index array
        if indextype == "B":
            self._get_pos = dbfile.get_byte
        elif indextype == "H":
            self._get_pos = dbfile.get_ushort
        elif indextype == "i":
            self._get_pos = dbfile.get_int
        elif indextype == "I":
            self._get_pos = dbfile.get_uint
        elif indextype == "q":
            self._get_pos = dbfile.get_long
        else:
            raise Exception("Unknown index type %r" % indextype)
    def closest_key_pos(self, key):
        # Given a key, return the position of that key OR the next highest key
        # if the given key does not exist
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        indexbase = self.indexbase
        indexsize = self.indexsize
        key_at = self.key_at
        _get_pos = self._get_pos
        # Do a binary search of the positions in the index array
        lo = 0
        hi = self.indexlen
        while lo < hi:
            mid = (lo + hi) // 2
            midkey = key_at(_get_pos(indexbase + mid * indexsize))
            if midkey < key:
                lo = mid + 1
            else:
                hi = mid
        # If we went off the end, return None
        if lo == self.indexlen:
            return None
        # Return the closest key
        return _get_pos(indexbase + lo * indexsize)
# Fielded Ordered hash file
class FieldedOrderedHashWriter(HashWriter):
    """Implements an on-disk hash like :class:`HashWriter`, but keeps a
    separate ordered key-position index for each named field.
    """
    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # Map field names to (startpos, indexpos, length, typecode)
        self.fieldmap = self.extras["fieldmap"] = {}
        # The most recently added key, used to enforce the ordering
        self.lastkey = emptybytes

    def start_field(self, fieldname):
        """Begin writing keys for the given field."""
        dbfile = self.dbfile
        self.fieldstart = dbfile.tell()
        self.fieldname = fieldname
        # Positions of this field's keys, relative to the field start
        self.poses = GrowableArray("H")
        self.lastkey = emptybytes

    def add(self, key, value):
        """Add a key/value pair to the current field. Raises ValueError if
        ``key`` is not greater than the previously added key.
        """
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        self.poses.append(self.dbfile.tell() - self.fieldstart)
        HashWriter.add(self, key, value)
        self.lastkey = key

    def end_field(self):
        """Finish the current field: record its extent and index metadata in
        the field map and write the position index array to the file.
        """
        dbfile = self.dbfile
        poses = self.poses
        self.fieldmap[self.fieldname] = (self.fieldstart, dbfile.tell(),
                                         len(poses), poses.typecode)
        poses.to_file(dbfile)
class FieldedOrderedHashReader(HashReader):
    """Reader counterpart of :class:`FieldedOrderedHashWriter`: adds per-field
    term lookups and ordered per-field scans on top of :class:`HashReader`.
    """
    def __init__(self, *args, **kwargs):
        HashReader.__init__(self, *args, **kwargs)
        # Map field names to (startpos, indexpos, length, typecode)
        self.fieldmap = self.extras["fieldmap"]
        # Make a sorted list of the field names with their start and end ranges
        self.fieldlist = []
        for fieldname in sorted(self.fieldmap.keys()):
            startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
            self.fieldlist.append((fieldname, startpos, ixpos))
    def field_start(self, fieldname):
        """Returns the file position where the given field's records begin."""
        return self.fieldmap[fieldname][0]
    def fielded_ranges(self, pos=None, eod=None):
        # Yields (fieldname, keypos, keylen, datapos, datalen) tuples,
        # attributing each record to a field by comparing positions against
        # the sorted field list.
        # NOTE(review): _ranges() walks records sequentially, but each
        # field's position-index array is written between fields by
        # end_field(); this loop does not appear to skip those arrays —
        # verify against the writer's layout.
        flist = self.fieldlist
        fpos = 0
        fieldname, start, end = flist[fpos]
        for keypos, keylen, datapos, datalen in self._ranges(pos, eod):
            if keypos >= end:
                fpos += 1
                fieldname, start, end = flist[fpos]
            yield fieldname, keypos, keylen, datapos, datalen
    def iter_terms(self):
        """Yields (fieldname, termbytes) for every term in the file."""
        get = self.dbfile.get
        for fieldname, keypos, keylen, _, _ in self.fielded_ranges():
            yield fieldname, get(keypos, keylen)
    def iter_term_items(self):
        """Yields (fieldname, termbytes, valuebytes) for every term."""
        get = self.dbfile.get
        for item in self.fielded_ranges():
            fieldname, keypos, keylen, datapos, datalen = item
            yield fieldname, get(keypos, keylen), get(datapos, datalen)
    def contains_term(self, fieldname, btext):
        """Returns True if the given term exists in the given field."""
        try:
            self.range_for_term(fieldname, btext)
            return True
        except KeyError:
            return False
    def range_for_term(self, fieldname, btext):
        """Returns the (datapos, datalen) range for the given term in the
        given field; raises KeyError if not found.
        """
        start, ixpos, ixsize, code = self.fieldmap[fieldname]
        # The same key bytes may exist in several fields; pick the occurrence
        # whose data falls inside this field's extent
        for datapos, datalen in self.ranges_for_key(btext):
            if start < datapos < ixpos:
                return datapos, datalen
        raise KeyError((fieldname, btext))
    def term_data(self, fieldname, btext):
        """Returns the value bytes stored for the given term."""
        datapos, datalen = self.range_for_term(fieldname, btext)
        return self.dbfile.get(datapos, datalen)
    def term_get(self, fieldname, btext, default=None):
        """Returns the value for the given term, or ``default`` if the term
        is not in the field.
        """
        try:
            return self.term_data(fieldname, btext)
        except KeyError:
            return default
    def closest_term_pos(self, fieldname, key):
        # Given a key, return the position of that key OR the next highest key
        # if the given key does not exist
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        dbfile = self.dbfile
        key_at = self.key_at
        startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
        if ixtype == "B":
            get_pos = dbfile.get_byte
        elif ixtype == "H":
            get_pos = dbfile.get_ushort
        elif ixtype == "i":
            get_pos = dbfile.get_int
        elif ixtype == "I":
            get_pos = dbfile.get_uint
        elif ixtype == "q":
            get_pos = dbfile.get_long
        else:
            raise Exception("Unknown index type %r" % ixtype)
        # BUG FIX: index entries are spaced by the byte size of the typecode,
        # not by the entry count (ixsize is len(poses) per end_field). The
        # original multiplied by ixsize; OrderedHashReader.closest_key_pos
        # correctly uses struct.calcsize for the stride.
        itemsize = struct.calcsize(ixtype)
        # Do a binary search of the positions in the index array
        lo = 0
        hi = ixsize
        while lo < hi:
            mid = (lo + hi) // 2
            midkey = key_at(startpos + get_pos(ixpos + mid * itemsize))
            if midkey < key:
                lo = mid + 1
            else:
                hi = mid
        # If we went off the end, return None
        if lo == ixsize:
            return None
        # Return the closest key
        return startpos + get_pos(ixpos + lo * itemsize)
    def closest_term(self, fieldname, btext):
        """Returns the closest term equal to or greater than the given term
        in the given field, or None if there is none.
        """
        pos = self.closest_term_pos(fieldname, btext)
        if pos is None:
            return None
        return self.key_at(pos)
    def term_ranges_from(self, fieldname, btext):
        """Yields (keypos, keylen, datapos, datalen) tuples for terms in the
        field equal to or greater than the given term.
        """
        pos = self.closest_term_pos(fieldname, btext)
        if pos is None:
            return
        startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
        # Stop at ixpos: the end of this field's key/value records
        for item in self._ranges(pos, ixpos):
            yield item
    def terms_from(self, fieldname, btext):
        """Yields terms in the field equal to or greater than the given
        term.
        """
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext):
            yield dbfile.get(keypos, keylen)
    def term_items_from(self, fieldname, btext):
        """Yields (termbytes, valuebytes) tuples for terms in the field equal
        to or greater than the given term.
        """
        dbfile = self.dbfile
        for item in self.term_ranges_from(fieldname, btext):
            keypos, keylen, datapos, datalen = item
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))

View File

@@ -0,0 +1,164 @@
"""
This module contains EXPERIMENTAL support for storing a Whoosh index's files in
the Google App Engine blobstore. This will use a lot of RAM since all files are
loaded into RAM, but it is potentially useful as a workaround for the lack of file
storage in Google App Engine.
Use at your own risk, but please report any problems to me so I can fix them.
To create a new index::
from whoosh.filedb.gae import DatastoreStorage
ix = DatastoreStorage().create_index(schema)
To open an existing index::
ix = DatastoreStorage().open_index()
"""
import time
from google.appengine.api import memcache # @UnresolvedImport
from google.appengine.ext import db # @UnresolvedImport
from whoosh.compat import BytesIO
from whoosh.index import TOC, FileIndex, _DEF_INDEX_NAME
from whoosh.filedb.filestore import ReadOnlyError, Storage
from whoosh.filedb.structfile import StructFile
class DatastoreFile(db.Model):
    """A file-like object that is backed by a BytesIO() object whose contents
    is loaded from a BlobProperty in the app engine datastore.
    """
    # Raw file contents persisted in the datastore
    value = db.BlobProperty()
    # Last-modified time as a Unix timestamp; 0 means never saved
    mtime = db.IntegerProperty(default=0)
    def __init__(self, *args, **kwargs):
        super(DatastoreFile, self).__init__(*args, **kwargs)
        self.data = BytesIO()
    @classmethod
    def loadfile(cls, name):
        """Load the named file, preferring the memcache copy and falling back
        to (and re-populating memcache from) the datastore entity.
        """
        value = memcache.get(name, namespace="DatastoreFile")
        if value is None:
            file = cls.get_by_key_name(name)
            memcache.set(name, file.value, namespace="DatastoreFile")
        else:
            # NOTE(review): this instance is built without a key_name, so it
            # is only suitable for reading — confirm callers never put() it
            file = cls(value=value)
        file.data = BytesIO(file.value)
        return file
    def close(self):
        # Persist the in-memory buffer back to the datastore and memcache,
        # but only if the contents actually changed
        oldvalue = self.value
        self.value = self.getvalue()
        if oldvalue != self.value:
            self.mtime = int(time.time())
            self.put()
            memcache.set(self.key().id_or_name(), self.value,
                         namespace="DatastoreFile")
    def tell(self):
        return self.data.tell()
    def write(self, data):
        return self.data.write(data)
    def read(self, length):
        return self.data.read(length)
    def seek(self, *args):
        return self.data.seek(*args)
    def readline(self):
        return self.data.readline()
    def getvalue(self):
        # Entire current contents of the in-memory buffer
        return self.data.getvalue()
class MemcacheLock(object):
    """A crude lock implemented on top of memcache: the lock is "held" while
    a marker entry exists under the lock's name in the "whooshlocks"
    namespace. The 360-second expiry keeps a crashed holder from wedging the
    lock forever.
    """
    def __init__(self, name):
        self.name = name

    def acquire(self, blocking=False):
        """Try to take the lock; returns a truthy value on success.

        :param blocking: if True, poll until the lock becomes available
            instead of returning immediately on failure.
        """
        # memcache.add only succeeds if the key does not already exist,
        # making it usable as an atomic test-and-set. Only key existence
        # matters; the stored value ("L") is never read.
        val = memcache.add(self.name, "L", 360, namespace="whooshlocks")
        if blocking and not val:
            # Simulate blocking by retrying the acquire over and over
            # (uses the module-level time import; the original re-imported
            # time locally and stored an inconsistent "" marker on retries)
            while not val:
                time.sleep(0.1)
                val = memcache.add(self.name, "L", 360,
                                   namespace="whooshlocks")
        return val

    def release(self):
        """Release the lock by deleting its memcache marker."""
        memcache.delete(self.name, namespace="whooshlocks")
class DatastoreStorage(Storage):
    """An implementation of :class:`whoosh.store.Storage` that stores files in
    the app engine datastore as blob properties.
    """
    def create_index(self, schema, indexname=_DEF_INDEX_NAME):
        """Creates a new index in this storage with the given schema."""
        # self.readonly is presumably provided by the Storage base class —
        # TODO confirm
        if self.readonly:
            raise ReadOnlyError
        TOC.create(self, schema, indexname)
        return FileIndex(self, schema, indexname)
    def open_index(self, indexname=_DEF_INDEX_NAME, schema=None):
        """Opens an existing index in this storage."""
        return FileIndex(self, schema=schema, indexname=indexname)
    def list(self):
        """Returns the key names of all DatastoreFile entities."""
        query = DatastoreFile.all()
        keys = []
        for file in query:
            keys.append(file.key().id_or_name())
        return keys
    def clean(self):
        # No-op: there are no temporary artifacts to clean up in this backend
        pass
    def total_size(self):
        """Returns the total size in bytes of all files in this storage."""
        return sum(self.file_length(f) for f in self.list())
    def file_exists(self, name):
        return DatastoreFile.get_by_key_name(name) is not None
    def file_modified(self, name):
        # Unix timestamp of the last saved change (0 if never saved)
        return DatastoreFile.get_by_key_name(name).mtime
    def file_length(self, name):
        return len(DatastoreFile.get_by_key_name(name).value)
    def delete_file(self, name):
        # Evict any cached copy before removing the datastore entity
        memcache.delete(name, namespace="DatastoreFile")
        return DatastoreFile.get_by_key_name(name).delete()
    def rename_file(self, name, newname, safe=False):
        # Datastore keys are immutable, so "rename" copies to a new entity
        # and deletes the old one
        file = DatastoreFile.get_by_key_name(name)
        newfile = DatastoreFile(key_name=newname)
        newfile.value = file.value
        newfile.mtime = file.mtime
        newfile.put()
        file.delete()
    def create_file(self, name, **kwargs):
        # The onclose hook persists the DatastoreFile when the StructFile is
        # closed
        f = StructFile(DatastoreFile(key_name=name), name=name,
                       onclose=lambda sfile: sfile.file.close())
        return f
    def open_file(self, name, *args, **kwargs):
        return StructFile(DatastoreFile.loadfile(name))
    def lock(self, name):
        return MemcacheLock(name)
    def temp_storage(self, name=None):
        # NOTE(review): relies on Storage.create() from the base class —
        # confirm it returns the storage object
        tempstore = DatastoreStorage()
        return tempstore.create()

View File

@@ -0,0 +1,402 @@
# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from array import array
from copy import copy
from struct import calcsize
from whoosh.compat import BytesIO, bytes_type
from whoosh.compat import dump as dump_pickle
from whoosh.compat import load as load_pickle
from whoosh.compat import array_frombytes, array_tobytes
from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE
from whoosh.system import IS_LITTLE
from whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte
from whoosh.system import pack_ushort, unpack_ushort
from whoosh.system import pack_ushort_le, unpack_ushort_le
from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint
from whoosh.system import pack_uint_le, unpack_uint_le
from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong
from whoosh.system import pack_float, unpack_float
from whoosh.util.varints import varint, read_varint
from whoosh.util.varints import signed_varint, decode_signed_varint
# Byte size of each struct/array typecode used by the read/write helpers
_SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf")
# Map sys.byteorder names to struct module byte-order prefix characters
_ORDERMAP = {"little": "<", "big": ">"}
# (method name suffix, struct typecode) pairs for the numeric accessors
_types = (("sbyte", "b"), ("ushort", "H"), ("int", "i"),
          ("long", "q"), ("float", "f"))
# Main class
class StructFile(object):
"""Returns a "structured file" object that wraps the given file object and
provides numerous additional methods for writing structured data, such as
"write_varint" and "write_long".
"""
def __init__(self, fileobj, name=None, onclose=None):
self.file = fileobj
self._name = name
self.onclose = onclose
self.is_closed = False
self.is_real = hasattr(fileobj, "fileno")
if self.is_real:
self.fileno = fileobj.fileno
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self._name)
def __str__(self):
return self._name
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def __iter__(self):
return iter(self.file)
def raw_file(self):
return self.file
def read(self, *args, **kwargs):
return self.file.read(*args, **kwargs)
def readline(self, *args, **kwargs):
return self.file.readline(*args, **kwargs)
def write(self, *args, **kwargs):
return self.file.write(*args, **kwargs)
def tell(self, *args, **kwargs):
return self.file.tell(*args, **kwargs)
def seek(self, *args, **kwargs):
return self.file.seek(*args, **kwargs)
def truncate(self, *args, **kwargs):
return self.file.truncate(*args, **kwargs)
def flush(self):
"""Flushes the buffer of the wrapped file. This is a no-op if the
wrapped file does not have a flush method.
"""
if hasattr(self.file, "flush"):
self.file.flush()
def close(self):
"""Closes the wrapped file.
"""
if self.is_closed:
raise Exception("This file is already closed")
if self.onclose:
self.onclose(self)
if hasattr(self.file, "close"):
self.file.close()
self.is_closed = True
def subset(self, offset, length, name=None):
from whoosh.filedb.compound import SubFile
name = name or self._name
return StructFile(SubFile(self.file, offset, length), name=name)
def write_string(self, s):
"""Writes a string to the wrapped file. This method writes the length
of the string first, so you can read the string back without having to
know how long it was.
"""
self.write_varint(len(s))
self.write(s)
def write_string2(self, s):
self.write(pack_ushort(len(s)) + s)
def write_string4(self, s):
self.write(pack_int(len(s)) + s)
def read_string(self):
"""Reads a string from the wrapped file.
"""
return self.read(self.read_varint())
def read_string2(self):
l = self.read_ushort()
return self.read(l)
def read_string4(self):
l = self.read_int()
return self.read(l)
def get_string2(self, pos):
l = self.get_ushort(pos)
base = pos + _SHORT_SIZE
return self.get(base, l), base + l
def get_string4(self, pos):
l = self.get_int(pos)
base = pos + _INT_SIZE
return self.get(base, l), base + l
def skip_string(self):
l = self.read_varint()
self.seek(l, 1)
def write_varint(self, i):
"""Writes a variable-length unsigned integer to the wrapped file.
"""
self.write(varint(i))
def write_svarint(self, i):
"""Writes a variable-length signed integer to the wrapped file.
"""
self.write(signed_varint(i))
def read_varint(self):
"""Reads a variable-length encoded unsigned integer from the wrapped
file.
"""
return read_varint(self.read)
def read_svarint(self):
"""Reads a variable-length encoded signed integer from the wrapped
file.
"""
return decode_signed_varint(read_varint(self.read))
def write_tagint(self, i):
"""Writes a sometimes-compressed unsigned integer to the wrapped file.
This is similar to the varint methods but uses a less compressed but
faster format.
"""
# Store numbers 0-253 in one byte. Byte 254 means "an unsigned 16-bit
# int follows." Byte 255 means "An unsigned 32-bit int follows."
if i <= 253:
self.write(chr(i))
elif i <= 65535:
self.write("\xFE" + pack_ushort(i))
else:
self.write("\xFF" + pack_uint(i))
def read_tagint(self):
"""Reads a sometimes-compressed unsigned integer from the wrapped file.
This is similar to the varint methods but uses a less compressed but
faster format.
"""
tb = ord(self.read(1))
if tb == 254:
return self.read_ushort()
elif tb == 255:
return self.read_uint()
else:
return tb
def write_byte(self, n):
"""Writes a single byte to the wrapped file, shortcut for
``file.write(chr(n))``.
"""
self.write(pack_byte(n))
def read_byte(self):
return ord(self.read(1))
def write_pickle(self, obj, protocol=-1):
"""Writes a pickled representation of obj to the wrapped file.
"""
dump_pickle(obj, self.file, protocol)
def read_pickle(self):
"""Reads a pickled object from the wrapped file.
"""
return load_pickle(self.file)
def write_sbyte(self, n):
self.write(pack_sbyte(n))
def write_int(self, n):
self.write(pack_int(n))
def write_uint(self, n):
self.write(pack_uint(n))
def write_uint_le(self, n):
self.write(pack_uint_le(n))
def write_ushort(self, n):
self.write(pack_ushort(n))
def write_ushort_le(self, n):
self.write(pack_ushort_le(n))
def write_long(self, n):
self.write(pack_long(n))
def write_ulong(self, n):
self.write(pack_ulong(n))
def write_float(self, n):
self.write(pack_float(n))
def write_array(self, arry):
if IS_LITTLE:
arry = copy(arry)
arry.byteswap()
if self.is_real:
arry.tofile(self.file)
else:
self.write(array_tobytes(arry))
def read_sbyte(self):
return unpack_sbyte(self.read(1))[0]
def read_int(self):
return unpack_int(self.read(_INT_SIZE))[0]
def read_uint(self):
return unpack_uint(self.read(_INT_SIZE))[0]
def read_uint_le(self):
return unpack_uint_le(self.read(_INT_SIZE))[0]
def read_ushort(self):
return unpack_ushort(self.read(_SHORT_SIZE))[0]
def read_ushort_le(self):
return unpack_ushort_le(self.read(_SHORT_SIZE))[0]
def read_long(self):
return unpack_long(self.read(_LONG_SIZE))[0]
def read_ulong(self):
return unpack_ulong(self.read(_LONG_SIZE))[0]
def read_float(self):
return unpack_float(self.read(_FLOAT_SIZE))[0]
def read_array(self, typecode, length):
a = array(typecode)
if self.is_real:
a.fromfile(self.file, length)
else:
array_frombytes(a, self.read(length * _SIZEMAP[typecode]))
if IS_LITTLE:
a.byteswap()
return a
def get(self, position, length):
self.seek(position)
return self.read(length)
def get_byte(self, position):
return unpack_byte(self.get(position, 1))[0]
def get_sbyte(self, position):
return unpack_sbyte(self.get(position, 1))[0]
def get_int(self, position):
return unpack_int(self.get(position, _INT_SIZE))[0]
def get_uint(self, position):
return unpack_uint(self.get(position, _INT_SIZE))[0]
def get_ushort(self, position):
return unpack_ushort(self.get(position, _SHORT_SIZE))[0]
def get_long(self, position):
return unpack_long(self.get(position, _LONG_SIZE))[0]
def get_ulong(self, position):
return unpack_ulong(self.get(position, _LONG_SIZE))[0]
def get_float(self, position):
return unpack_float(self.get(position, _FLOAT_SIZE))[0]
def get_array(self, position, typecode, length):
self.seek(position)
return self.read_array(typecode, length)
class BufferFile(StructFile):
    """A :class:`StructFile` over an in-memory buffer. Positioned reads
    (:meth:`get`, :meth:`get_array`) slice the buffer directly instead of
    seeking the wrapped BytesIO.
    """
    def __init__(self, buf, name=None, onclose=None):
        self._buf = buf
        self._name = name
        self.file = BytesIO(buf)
        self.onclose = onclose
        self.is_real = False
        self.is_closed = False

    def subset(self, position, length, name=None):
        """Returns a new BufferFile over a slice of this buffer."""
        return BufferFile(self.get(position, length),
                          name=name or self._name)

    def get(self, position, length):
        """Returns ``length`` bytes at ``position`` without moving the file
        pointer."""
        end = position + length
        return bytes_type(self._buf[position:end])

    def get_array(self, position, typecode, length):
        """Returns an array of ``length`` items of ``typecode`` read from
        ``position``, converting from big-endian byte order."""
        arry = array(typecode)
        nbytes = length * _SIZEMAP[typecode]
        array_frombytes(arry, self.get(position, nbytes))
        if IS_LITTLE:
            arry.byteswap()
        return arry
class ChecksumFile(StructFile):
    """A :class:`StructFile` wrapper that maintains a running CRC32 checksum
    of every byte read from or written to the file. Seeking is disallowed
    because it would make the running checksum meaningless.
    """
    def __init__(self, *args, **kwargs):
        StructFile.__init__(self, *args, **kwargs)
        # Running CRC32 accumulator
        self._check = 0
        # Plain import instead of the original __import__("zlib") string hack
        import zlib
        self._crc32 = zlib.crc32

    def __iter__(self):
        # Fold each line read through iteration into the checksum
        for line in self.file:
            self._check = self._crc32(line, self._check)
            yield line

    def seek(self, *args):
        raise Exception("Cannot seek on a ChecksumFile")

    def read(self, *args, **kwargs):
        b = self.file.read(*args, **kwargs)
        self._check = self._crc32(b, self._check)
        return b

    def write(self, b):
        self._check = self._crc32(b, self._check)
        self.file.write(b)

    def checksum(self):
        """Returns the current checksum as an unsigned 32-bit integer."""
        # zlib.crc32 may return signed values on Python 2; mask to unsigned
        return self._check & 0xffffffff