diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 398e9ba7a..9aa897444 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -2,8 +2,6 @@ # coding: utf8 from __future__ import unicode_literals -import ujson - from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc from cymem.cymem cimport Pool @@ -12,32 +10,31 @@ from preshed.maps cimport PreshMap from .strings cimport hash_string cimport cython -from . import util from .tokens.doc cimport Doc cdef class Tokenizer: + """Segment text, and create Doc objects with the discovered segment + boundaries. """ def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): - """ - Create a Tokenizer, to create Doc objects given unicode text. + """Create a `Tokenizer`, to create `Doc` objects given unicode text. - Arguments: - vocab (Vocab): - A storage container for lexical types. - rules (dict): - Exceptions and special-cases for the tokenizer. - prefix_search: - A function matching the signature of re.compile(string).search - to match prefixes. - suffix_search: - A function matching the signature of re.compile(string).search - to match suffixes. - infix_finditer: - A function matching the signature of re.compile(string).finditer - to find infixes. - token_match: - A boolean function matching strings that becomes tokens. + vocab (Vocab): A storage container for lexical types. + rules (dict): Exceptions and special-cases for the tokenizer. + prefix_search (callable): A function matching the signature of + `re.compile(string).search` to match prefixes. + suffix_search (callable): A function matching the signature of + `re.compile(string).search` to match suffixes. + infix_finditer (callable): A function matching the signature of + `re.compile(string).finditer` to find infixes. + token_match (callable): A boolean function matching strings to be + recognised as tokens. + RETURNS (Tokenizer): The newly constructed object. 
+ + EXAMPLE: + >>> tokenizer = Tokenizer(nlp.vocab) + >>> tokenizer = English().Defaults.create_tokenizer(nlp) """ self.mem = Pool() self._cache = PreshMap() @@ -69,13 +66,10 @@ cdef class Tokenizer: @cython.boundscheck(False) def __call__(self, unicode string): - """ - Tokenize a string. + """Tokenize a string. - Arguments: - string (unicode): The string to tokenize. - Returns: - Doc A container for linguistic annotations. + string (unicode): The string to tokenize. + RETURNS (Doc): A container for linguistic annotations. """ if len(string) >= (2 ** 30): raise ValueError( @@ -123,18 +117,13 @@ cdef class Tokenizer: return tokens def pipe(self, texts, batch_size=1000, n_threads=2): - """ - Tokenize a stream of texts. + """Tokenize a stream of texts. - Arguments: - texts: A sequence of unicode texts. - batch_size (int): - The number of texts to accumulate in an internal buffer. - n_threads (int): - The number of threads to use, if the implementation supports - multi-threading. The default tokenizer is single-threaded. - Yields: - Doc A sequence of Doc objects, in order. + texts: A sequence of unicode texts. + batch_size (int): The number of texts to accumulate in an internal buffer. + n_threads (int): The number of threads to use, if the implementation + supports multi-threading. The default tokenizer is single-threaded. + YIELDS (Doc): A sequence of Doc objects, in order. """ for text in texts: yield self(text) @@ -278,27 +267,23 @@ cdef class Tokenizer: self._cache.set(key, cached) def find_infix(self, unicode string): - """ - Find internal split points of the string, such as hyphens. + """Find internal split points of the string, such as hyphens. string (unicode): The string to segment. - - Returns List[re.MatchObject] - A list of objects that have .start() and .end() methods, denoting the - placement of internal segment separators, e.g. hyphens. 
+ RETURNS (list): A list of `re.MatchObject` objects that have `.start()` + and `.end()` methods, denoting the placement of internal segment + separators, e.g. hyphens. """ if self.infix_finditer is None: return 0 return list(self.infix_finditer(string)) def find_prefix(self, unicode string): - """ - Find the length of a prefix that should be segmented from the string, + """Find the length of a prefix that should be segmented from the string, or None if no prefix rules match. - Arguments: - string (unicode): The string to segment. - Returns (int or None): The length of the prefix if present, otherwise None. + string (unicode): The string to segment. + RETURNS (int): The length of the prefix if present, otherwise `None`. """ if self.prefix_search is None: return 0 @@ -306,13 +291,11 @@ cdef class Tokenizer: return (match.end() - match.start()) if match is not None else 0 def find_suffix(self, unicode string): - """ - Find the length of a suffix that should be segmented from the string, + """Find the length of a suffix that should be segmented from the string, or None if no suffix rules match. - Arguments: - string (unicode): The string to segment. - Returns (int or None): The length of the suffix if present, otherwise None. + string (unicode): The string to segment. + RETURNS (int): The length of the suffix if present, otherwise `None`. """ if self.suffix_search is None: return 0 @@ -320,23 +303,17 @@ cdef class Tokenizer: return (match.end() - match.start()) if match is not None else 0 def _load_special_tokenization(self, special_cases): - """ - Add special-case tokenization rules. - """ + """Add special-case tokenization rules.""" for chunk, substrings in sorted(special_cases.items()): self.add_special_case(chunk, substrings) def add_special_case(self, unicode string, substrings): - """ - Add a special-case tokenization rule. + """Add a special-case tokenization rule. - Arguments: - string (unicode): The string to specially tokenize. 
- token_attrs: - A sequence of dicts, where each dict describes a token and its - attributes. The ORTH fields of the attributes must exactly match - the string when they are concatenated. - Returns None + string (unicode): The string to specially tokenize. + substrings (iterable): A sequence of dicts, where each dict describes + a token and its attributes. The `ORTH` fields of the attributes must + exactly match the string when they are concatenated. """ substrings = list(substrings) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) @@ -347,3 +324,38 @@ cdef class Tokenizer: self._specials.set(key, cached) self._cache.set(key, cached) self._rules[string] = substrings + + def to_disk(self, path): + """Save the current state to a directory. + + path (unicode or Path): A path to a directory, which will be created if + it doesn't exist. Paths may be either strings or `Path`-like objects. + """ + raise NotImplementedError() + + def from_disk(self, path): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode or Path): A path to a directory. Paths may be either + strings or `Path`-like objects. + RETURNS (Tokenizer): The modified `Tokenizer` object. + """ + raise NotImplementedError() + + def to_bytes(self, **exclude): + """Serialize the current state to a binary string. + + **exclude: Named attributes to prevent from being serialized. + RETURNS (bytes): The serialized form of the `Tokenizer` object. + """ + raise NotImplementedError() + + def from_bytes(self, bytes_data, **exclude): + """Load state from a binary string. + + bytes_data (bytes): The data to load from. + **exclude: Named attributes to prevent from being loaded. + RETURNS (Tokenizer): The `Tokenizer` object. 
+ """ + raise NotImplementedError() diff --git a/website/docs/api/tokenizer.jade b/website/docs/api/tokenizer.jade index 9f0cdb14c..5c0f69854 100644 --- a/website/docs/api/tokenizer.jade +++ b/website/docs/api/tokenizer.jade @@ -11,6 +11,15 @@ p p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text. ++aside-code("Example"). + # Construction 1 + from spacy.tokenizer import Tokenizer + tokenizer = Tokenizer(nlp.vocab) + + # Construction 2 + from spacy.lang.en import English + tokenizer = English().Defaults.create_tokenizer(nlp) + +table(["Name", "Type", "Description"]) +row +cell #[code vocab] @@ -43,6 +52,11 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text. | A function matching the signature of | #[code re.compile(string).finditer] to find infixes. + +row + +cell #[code token_match] + +cell callable + +cell A boolean function matching strings to be recognised as tokens. + +footrow +cell returns +cell #[code Tokenizer] @@ -53,6 +67,10 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text. p Tokenize a string. ++aside-code("Example"). + tokens = tokenizer(u'This is a sentence') + assert len(tokens) == 4 + +table(["Name", "Type", "Description"]) +row +cell #[code string] @@ -69,6 +87,11 @@ p Tokenize a string. p Tokenize a stream of texts. ++aside-code("Example"). + texts = [u'One document.', u'...', u'Lots of documents'] + for doc in tokenizer.pipe(texts, batch_size=50): + pass + +table(["Name", "Type", "Description"]) +row +cell #[code texts] @@ -105,11 +128,11 @@ p Find internal split points of the string. +footrow +cell returns - +cell #[code List[re.MatchObject]] + +cell list +cell - | A list of objects that have #[code .start()] and #[code .end()] - | methods, denoting the placement of internal segment separators, - | e.g. hyphens. 
+ | A list of #[code re.MatchObject] objects that have #[code .start()] + | and #[code .end()] methods, denoting the placement of internal + | segment separators, e.g. hyphens. +h(2, "find_prefix") Tokenizer.find_prefix +tag method @@ -126,7 +149,7 @@ p +footrow +cell returns - +cell int / #[code None] + +cell int +cell The length of the prefix if present, otherwise #[code None]. +h(2, "find_suffix") Tokenizer.find_suffix @@ -150,7 +173,16 @@ p +h(2, "add_special_case") Tokenizer.add_special_case +tag method -p Add a special-case tokenization rule. +p + | Add a special-case tokenization rule. This mechanism is also used to add + | custom tokenizer exceptions to the language data. See the usage workflow + | on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages] + | for more details and examples. + ++aside-code("Example"). + from spacy.attrs import ORTH, LEMMA + case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}] + tokenizer.add_special_case("don't", case) +table(["Name", "Type", "Description"]) +row @@ -160,16 +192,98 @@ p Add a special-case tokenization rule. +row +cell #[code token_attrs] - +cell - + +cell iterable +cell | A sequence of dicts, where each dict describes a token and its | attributes. The #[code ORTH] fields of the attributes must | exactly match the string when they are concatenated. + ++h(2, "to_disk") Tokenizer.to_disk + +tag method + +p Save the current state to a directory. + ++aside-code("Example"). + tokenizer.to_disk('/path/to/tokenizer') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory, which will be created if it doesn't exist. + | Paths may be either strings or #[code Path]-like objects. + ++h(2, "from_disk") Tokenizer.from_disk + +tag method + +p Loads state from a directory. Modifies the object in place and returns it. + ++aside-code("Example"). 
+ from spacy.tokenizer import Tokenizer + tokenizer = Tokenizer(nlp.vocab) + tokenizer = tokenizer.from_disk('/path/to/tokenizer') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory. Paths may be either strings or + | #[code Path]-like objects. + +footrow +cell returns - +cell #[code None] + +cell #[code Tokenizer] + +cell The modified #[code Tokenizer] object. + ++h(2, "to_bytes") Tokenizer.to_bytes + +tag method + +p Serialize the current state to a binary string. + ++aside-code("Example"). + tokenizer_bytes = tokenizer.to_bytes() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code **exclude] +cell - + +cell Named attributes to prevent from being serialized. + + +footrow + +cell returns + +cell bytes + +cell The serialized form of the #[code Tokenizer] object. + ++h(2, "from_bytes") Tokenizer.from_bytes + +tag method + +p Load state from a binary string. + ++aside-code("Example"). + from spacy.tokenizer import Tokenizer + tokenizer_bytes = tokenizer.to_bytes() + new_tokenizer = Tokenizer(nlp.vocab) + new_tokenizer.from_bytes(tokenizer_bytes) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code bytes_data] + +cell bytes + +cell The data to load from. + + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being loaded. + + +footrow + +cell returns + +cell #[code Tokenizer] + +cell The #[code Tokenizer] object. + +h(2, "attributes") Attributes