Update docstrings and API docs for Tokenizer

ines 2017-05-21 13:18:14 +02:00
parent f216422ac5
commit c5a653fa48
2 changed files with 200 additions and 74 deletions

spacy/tokenizer.pyx

@@ -2,8 +2,6 @@
# coding: utf8
from __future__ import unicode_literals

-import ujson
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
@@ -12,32 +10,31 @@
from preshed.maps cimport PreshMap
from .strings cimport hash_string
cimport cython

-from . import util
from .tokens.doc cimport Doc


cdef class Tokenizer:
    """Segment text, and create Doc objects with the discovered segment
    boundaries.
    """
    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
-        """
-        Create a Tokenizer, to create Doc objects given unicode text.
-        Arguments:
-            vocab (Vocab):
-                A storage container for lexical types.
-            rules (dict):
-                Exceptions and special-cases for the tokenizer.
-            prefix_search:
-                A function matching the signature of re.compile(string).search
-                to match prefixes.
-            suffix_search:
-                A function matching the signature of re.compile(string).search
-                to match suffixes.
-            infix_finditer:
-                A function matching the signature of re.compile(string).finditer
-                to find infixes.
-            token_match:
-                A boolean function matching strings that becomes tokens.
-        """
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.
        rules (dict): Exceptions and special-cases for the tokenizer.
        prefix_search (callable): A function matching the signature of
            `re.compile(string).search` to match prefixes.
        suffix_search (callable): A function matching the signature of
            `re.compile(string).search` to match suffixes.
        infix_finditer (callable): A function matching the signature of
            `re.compile(string).finditer` to find infixes.
        token_match (callable): A boolean function matching strings to be
            recognised as tokens.
        RETURNS (Tokenizer): The newly constructed object.

        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
            >>> tokenizer = English().Defaults.create_tokenizer(nlp)
        """
        self.mem = Pool()
        self._cache = PreshMap()
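
For illustration only (not part of this commit): a tokenizer wired up with the callables documented above might look roughly like this. The regular expressions are deliberately minimal stand-ins, not spaCy's real prefix, suffix and infix language data.

    import re
    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    # Simplified, illustrative patterns -- not the real language data.
    prefix_re = re.compile(r'''^[\["']''')
    suffix_re = re.compile(r'''[\]"'.]$''')
    infix_re = re.compile(r'[-~]')

    tokenizer = Tokenizer(nlp.vocab, rules={},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer,
                          token_match=None)
    doc = tokenizer(u'"hello-world."')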
@@ -69,13 +66,10 @@
    @cython.boundscheck(False)
    def __call__(self, unicode string):
-        """
-        Tokenize a string.
-        Arguments:
-            string (unicode): The string to tokenize.
-        Returns:
-            Doc A container for linguistic annotations.
-        """
        """Tokenize a string.

        string (unicode): The string to tokenize.
        RETURNS (Doc): A container for linguistic annotations.
        """
        if len(string) >= (2 ** 30):
            raise ValueError(
@@ -123,18 +117,13 @@
        return tokens

    def pipe(self, texts, batch_size=1000, n_threads=2):
-        """
-        Tokenize a stream of texts.
-        Arguments:
-            texts: A sequence of unicode texts.
-            batch_size (int):
-                The number of texts to accumulate in an internal buffer.
-            n_threads (int):
-                The number of threads to use, if the implementation supports
-                multi-threading. The default tokenizer is single-threaded.
-        Yields:
-            Doc A sequence of Doc objects, in order.
-        """
        """Tokenize a stream of texts.

        texts: A sequence of unicode texts.
        batch_size (int): The number of texts to accumulate in an internal buffer.
        n_threads (int): The number of threads to use, if the implementation
            supports multi-threading. The default tokenizer is single-threaded.
        YIELDS (Doc): A sequence of Doc objects, in order.
        """
        for text in texts:
            yield self(text)
@@ -278,27 +267,23 @@
        self._cache.set(key, cached)

    def find_infix(self, unicode string):
-        """
-        Find internal split points of the string, such as hyphens.
-        string (unicode): The string to segment.
-        Returns List[re.MatchObject]
-            A list of objects that have .start() and .end() methods, denoting the
-            placement of internal segment separators, e.g. hyphens.
-        """
        """Find internal split points of the string, such as hyphens.

        string (unicode): The string to segment.
        RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
            and `.end()` methods, denoting the placement of internal segment
            separators, e.g. hyphens.
        """
        if self.infix_finditer is None:
            return 0
        return list(self.infix_finditer(string))

    def find_prefix(self, unicode string):
-        """
-        Find the length of a prefix that should be segmented from the string,
-        or None if no prefix rules match.
-        Arguments:
-            string (unicode): The string to segment.
-        Returns (int or None): The length of the prefix if present, otherwise None.
-        """
        """Find the length of a prefix that should be segmented from the string,
        or None if no prefix rules match.

        string (unicode): The string to segment.
        RETURNS (int): The length of the prefix if present, otherwise `None`.
        """
        if self.prefix_search is None:
            return 0
@@ -306,13 +291,11 @@
        return (match.end() - match.start()) if match is not None else 0

    def find_suffix(self, unicode string):
-        """
-        Find the length of a suffix that should be segmented from the string,
-        or None if no suffix rules match.
-        Arguments:
-            string (unicode): The string to segment.
-        Returns (int or None): The length of the suffix if present, otherwise None.
-        """
        """Find the length of a suffix that should be segmented from the string,
        or None if no suffix rules match.

        string (unicode): The string to segment.
        RETURNS (int): The length of the suffix if present, otherwise `None`.
        """
        if self.suffix_search is None:
            return 0
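
As a rough sketch (not from the commit) of the return conventions documented above, a tokenizer constructed with explicit patterns makes the find_* behaviour easy to check:

    import re
    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    # Explicit, simplified patterns so the results follow directly from the code.
    tokenizer = Tokenizer(nlp.vocab, rules={},
                          prefix_search=re.compile(r'^\(').search,
                          suffix_search=re.compile(r'\)$').search,
                          infix_finditer=re.compile(r'-').finditer)

    assert tokenizer.find_prefix(u'(hello') == 1      # length of the matched prefix
    assert tokenizer.find_suffix(u'hello)') == 1      # length of the matched suffix
    matches = tokenizer.find_infix(u'hello-world')    # list of re.MatchObject
    assert [m.group() for m in matches] == [u'-']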
@@ -320,23 +303,17 @@
        return (match.end() - match.start()) if match is not None else 0

    def _load_special_tokenization(self, special_cases):
-        """
-        Add special-case tokenization rules.
-        """
        """Add special-case tokenization rules."""
        for chunk, substrings in sorted(special_cases.items()):
            self.add_special_case(chunk, substrings)

    def add_special_case(self, unicode string, substrings):
-        """
-        Add a special-case tokenization rule.
-        Arguments:
-            string (unicode): The string to specially tokenize.
-            token_attrs:
-                A sequence of dicts, where each dict describes a token and its
-                attributes. The ORTH fields of the attributes must exactly match
-                the string when they are concatenated.
-        Returns None
-        """
        """Add a special-case tokenization rule.

        string (unicode): The string to specially tokenize.
        token_attrs (iterable): A sequence of dicts, where each dict describes
            a token and its attributes. The `ORTH` fields of the attributes must
            exactly match the string when they are concatenated.
        """
        substrings = list(substrings)
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@@ -347,3 +324,38 @@
        self._specials.set(key, cached)
        self._cache.set(key, cached)
        self._rules[string] = substrings

    def to_disk(self, path):
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or `Path`-like objects.
        """
        raise NotImplementedError()

    def from_disk(self, path):
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode or Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        RETURNS (Tokenizer): The modified `Tokenizer` object.
        """
        raise NotImplementedError()

    def to_bytes(self, **exclude):
        """Serialize the current state to a binary string.

        **exclude: Named attributes to prevent from being serialized.
        RETURNS (bytes): The serialized form of the `Tokenizer` object.
        """
        raise NotImplementedError()

    def from_bytes(self, bytes_data, **exclude):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        **exclude: Named attributes to prevent from being loaded.
        RETURNS (Tokenizer): The `Tokenizer` object.
        """
        raise NotImplementedError()
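
The add_special_case signature above takes the string and the attribute dicts as two separate arguments. A minimal illustrative usage (not part of the diff, and assuming the Language object exposes its tokenizer as nlp.tokenizer) could look like this:

    from spacy.attrs import ORTH, LEMMA
    from spacy.lang.en import English

    nlp = English()
    # The ORTH values must concatenate back to exactly the original string.
    nlp.tokenizer.add_special_case(u"don't",
                                   [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}])
    doc = nlp.tokenizer(u"don't")
    assert [t.text for t in doc] == [u"do", u"n't"]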

website/docs/api/tokenizer.jade

@@ -11,6 +11,15 @@ p
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.

+aside-code("Example").
    # Construction 1
    from spacy.tokenizer import Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    # Construction 2
    from spacy.lang.en import English
    tokenizer = English().Defaults.create_tokenizer(nlp)

+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]

@@ -43,6 +52,11 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+row
+cell #[code token_match]
+cell callable
+cell A boolean function matching strings to be recognised as tokens.
+footrow
+cell returns
+cell #[code Tokenizer]

@@ -53,6 +67,10 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
p Tokenize a string.

+aside-code("Example").
    tokens = tokenizer(u'This is a sentence')
    assert len(tokens) == 4

+table(["Name", "Type", "Description"])
+row
+cell #[code string]
@@ -69,6 +87,11 @@ p Tokenize a string.
p Tokenize a stream of texts.

+aside-code("Example").
    texts = [u'One document.', u'...', u'Lots of documents']
    for doc in tokenizer.pipe(texts, batch_size=50):
        pass

+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
@@ -105,11 +128,11 @@ p Find internal split points of the string.
+footrow
+cell returns
-+cell #[code List[re.MatchObject]]
+cell list
+cell
-| A list of objects that have #[code .start()] and #[code .end()]
-| methods, denoting the placement of internal segment separators,
-| e.g. hyphens.
| A list of #[code re.MatchObject] objects that have #[code .start()]
| and #[code .end()] methods, denoting the placement of internal
| segment separators, e.g. hyphens.

+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
@@ -126,7 +149,7 @@ p
+footrow
+cell returns
-+cell int / #[code None]
+cell int
+cell The length of the prefix if present, otherwise #[code None].

+h(2, "find_suffix") Tokenizer.find_suffix

@@ -150,7 +173,16 @@ p
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
-p Add a special-case tokenization rule.
p
| Add a special-case tokenization rule. This mechanism is also used to add
| custom tokenizer exceptions to the language data. See the usage workflow
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
| for more details and examples.

+aside-code("Example").
    from spacy.attrs import ORTH, LEMMA
    case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
    tokenizer.add_special_case("don't", case)

+table(["Name", "Type", "Description"])
+row
@@ -160,16 +192,98 @@ p Add a special-case tokenization rule.
+row
+cell #[code token_attrs]
-+cell -
+cell iterable
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
    tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
    from spacy.tokenizer import Tokenizer
    tokenizer = Tokenizer(nlp.vocab)
    tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
-+cell #[code None]
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
    tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
    from spacy.tokenizer import Tokenizer
    tokenizer_bytes = tokenizer.to_bytes()
    new_tokenizer = Tokenizer(nlp.vocab)
    new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes