mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Update docstrings and API docs for StringStore
This commit is contained in:
parent
251346b59f
commit
2c5cfe8bbf
|
@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t
|
|||
from .typedefs cimport hash_t
|
||||
from libc.stdint cimport uint32_t
|
||||
|
||||
import ujson
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
chars = string.encode('utf8')
|
||||
|
@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
|||
|
||||
|
||||
cdef class StringStore:
|
||||
"""
|
||||
Map strings to and from integer IDs.
|
||||
"""
|
||||
"""Map strings to and from integer IDs."""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
"""
|
||||
Create the StringStore.
|
||||
"""Create the StringStore.
|
||||
|
||||
Arguments:
|
||||
strings: A sequence of unicode strings to add to the store.
|
||||
strings (iterable): A sequence of unicode strings to add to the store.
|
||||
RETURNS (StringStore): The newly constructed object.
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
|
@ -106,23 +101,17 @@ cdef class StringStore:
|
|||
return (StringStore, (list(self),))
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
The number of strings in the store.
|
||||
"""The number of strings in the store.
|
||||
|
||||
Returns:
|
||||
int The number of strings in the store.
|
||||
RETURNS (int): The number of strings in the store.
|
||||
"""
|
||||
return self.size-1
|
||||
|
||||
def __getitem__(self, object string_or_id):
|
||||
"""
|
||||
Retrieve a string from a given integer ID, or vice versa.
|
||||
"""Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
Arguments:
|
||||
string_or_id (bytes or unicode or int):
|
||||
The value to encode.
|
||||
Returns:
|
||||
unicode or int: The value to retrieved.
|
||||
string_or_id (bytes or unicode or int): The value to encode.
|
||||
RETURNS (unicode or int): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
return 0
|
||||
|
@ -163,13 +152,10 @@ cdef class StringStore:
|
|||
return utf8str - self.c
|
||||
|
||||
def __contains__(self, unicode string not None):
|
||||
"""
|
||||
Check whether a string is in the store.
|
||||
"""Check whether a string is in the store.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to check.
|
||||
Returns bool:
|
||||
Whether the store contains the string.
|
||||
string (unicode): The string to check.
|
||||
RETURNS (bool): Whether the store contains the string.
|
||||
"""
|
||||
if len(string) == 0:
|
||||
return True
|
||||
|
@ -177,10 +163,9 @@ cdef class StringStore:
|
|||
return self._map.get(key) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
Iterate over the strings in the store, in order.
|
||||
"""Iterate over the strings in the store, in order.
|
||||
|
||||
Yields: unicode A string in the store.
|
||||
YIELDS (unicode): A string in the store.
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
|
@ -195,6 +180,41 @@ cdef class StringStore:
|
|||
strings.append(py_string)
|
||||
return (StringStore, (strings,), None, None, None)
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (StringStore): The modified `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (StringStore): The `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def set_frozen(self, bint is_frozen):
|
||||
# TODO
|
||||
self.is_frozen = is_frozen
|
||||
|
@ -235,40 +255,6 @@ cdef class StringStore:
|
|||
self.size += 1
|
||||
return &self.c[self.size-1]
|
||||
|
||||
def dump(self, file_):
|
||||
"""
|
||||
Save the strings to a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file to save the strings.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
string_data = ujson.dumps(list(self))
|
||||
if not isinstance(string_data, unicode):
|
||||
string_data = string_data.decode('utf8')
|
||||
# TODO: OOV?
|
||||
file_.write(string_data)
|
||||
|
||||
def load(self, file_):
|
||||
"""
|
||||
Load the strings from a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file from which to load the strings.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
strings = ujson.load(file_)
|
||||
if strings == ['']:
|
||||
return None
|
||||
cdef unicode string
|
||||
for string in strings:
|
||||
# explicit None/len check instead of simple truth testing
|
||||
# (bug in Cython <= 0.23.4)
|
||||
if string is not None and len(string):
|
||||
self.intern_unicode(string)
|
||||
|
||||
def _realloc(self):
|
||||
# We want to map straight to pointers, but they'll be invalidated if
|
||||
# we resize our array. So, first we remap to indices, then we resize,
|
||||
|
|
|
@ -7,12 +7,18 @@ p Map strings to and from integer IDs.
|
|||
+h(2, "init") StringStore.__init__
|
||||
+tag method
|
||||
|
||||
p Create the #[code StringStore].
|
||||
p
|
||||
| Create the #[code StringStore]. Note that a newly initialised store will
|
||||
| always include an empty string #[code ''] at position #[code 0].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell -
|
||||
+cell iterable
|
||||
+cell A sequence of unicode strings to add to the store.
|
||||
|
||||
+footrow
|
||||
|
@ -25,6 +31,10 @@ p Create the #[code StringStore].
|
|||
|
||||
p Get the number of strings in the store.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
assert len(stringstore) == 2
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
|
@ -36,22 +46,32 @@ p Get the number of strings in the store.
|
|||
|
||||
p Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
int_id = stringstore[u'apple'] # 1
|
||||
assert stringstore[int_id] == u'apple'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string_or_id]
|
||||
+cell bytes / unicode / int
|
||||
+cell bytes, unicode or int
|
||||
+cell The value to encode.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell unicode / int
|
||||
+cell The value to retrieved.
|
||||
+cell unicode or int
|
||||
+cell The value to be retrieved.
|
||||
|
||||
+h(2, "contains") StringStore.__contains__
|
||||
+tag method
|
||||
|
||||
p Check whether a string is in the store.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
assert u'apple' in stringstore
|
||||
assert u'cherry' not in stringstore
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
|
@ -66,10 +86,101 @@ p Check whether a string is in the store.
|
|||
+h(2, "iter") StringStore.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over the strings in the store, in order.
|
||||
p
|
||||
| Iterate over the strings in the store, in order. Note that a newly
|
||||
| initialised store will always include an empty string #[code ''] at
|
||||
| position #[code 0].
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
all_strings = [s for s in stringstore]
|
||||
assert all_strings == [u'', u'apple', u'orange']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell unicode
|
||||
+cell A string in the store.
|
||||
|
||||
+h(2, "to_disk") StringStore.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore.to_disk('/path/to/strings')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+h(2, "from_disk") StringStore.from_disk
|
||||
+tag method
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
stringstore = StringStore().from_disk('/path/to/strings')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code StringStore]
|
||||
+cell The modified #[code StringStore] object.
|
||||
|
||||
+h(2, "to_bytes") StringStore.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
store_bytes = stringstore.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code StringStore] object.
|
||||
|
||||
+h(2, "from_bytes") StringStore.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
store_bytes = stringstore.to_bytes()
|
||||
new_store = StringStore().from_bytes(store_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code StringStore]
|
||||
+cell The #[code StringStore] object.
|
||||
|
|
Loading…
Reference in New Issue
Block a user