From c5a653fa48524251221b4dea85914c2d39b2db1f Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 21 May 2017 13:18:14 +0200
Subject: [PATCH] Update docstrings and API docs for Tokenizer

---
 spacy/tokenizer.pyx             | 144 +++++++++++++++++---------------
 website/docs/api/tokenizer.jade | 130 ++++++++++++++++++++++++++--
 2 files changed, 200 insertions(+), 74 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 398e9ba7a..9aa897444 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -2,8 +2,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import ujson
-
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from cymem.cymem cimport Pool
@@ -12,32 +10,31 @@ from preshed.maps cimport PreshMap
 from .strings cimport hash_string
 cimport cython
 
-from . import util
 from .tokens.doc cimport Doc
 
 
 cdef class Tokenizer:
+    """Segment text, and create Doc objects with the discovered segment
+    boundaries.
+    """
     def __init__(self, Vocab vocab, rules, prefix_search, suffix_search,
                  infix_finditer, token_match=None):
-        """
-        Create a Tokenizer, to create Doc objects given unicode text.
+        """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
-        Arguments:
-            vocab (Vocab):
-                A storage container for lexical types.
-            rules (dict):
-                Exceptions and special-cases for the tokenizer.
-            prefix_search:
-                A function matching the signature of re.compile(string).search
-                to match prefixes.
-            suffix_search:
-                A function matching the signature of re.compile(string).search
-                to match suffixes.
-            infix_finditer:
-                A function matching the signature of re.compile(string).finditer
-                to find infixes.
-            token_match:
-                A boolean function matching strings that becomes tokens.
+        vocab (Vocab): A storage container for lexical types.
+        rules (dict): Exceptions and special-cases for the tokenizer.
+        prefix_search (callable): A function matching the signature of
+            `re.compile(string).search` to match prefixes.
+        suffix_search (callable): A function matching the signature of
+            `re.compile(string).search` to match suffixes.
+        `infix_finditer` (callable): A function matching the signature of
+            `re.compile(string).finditer` to find infixes.
+        token_match (callable): A boolean function matching strings to be
+            recognised as tokens.
+        RETURNS (Tokenizer): The newly constructed object.
+
+        EXAMPLE:
+            >>> tokenizer = Tokenizer(nlp.vocab)
+            >>> tokenizer = English().Defaults.create_tokenizer(nlp)
         """
         self.mem = Pool()
         self._cache = PreshMap()
@@ -69,13 +66,10 @@ cdef class Tokenizer:
 
     @cython.boundscheck(False)
     def __call__(self, unicode string):
-        """
-        Tokenize a string.
+        """Tokenize a string.
 
-        Arguments:
-            string (unicode): The string to tokenize.
-        Returns:
-            Doc A container for linguistic annotations.
+        string (unicode): The string to tokenize.
+        RETURNS (Doc): A container for linguistic annotations.
         """
         if len(string) >= (2 ** 30):
             raise ValueError(
@@ -123,18 +117,13 @@ cdef class Tokenizer:
         return tokens
 
     def pipe(self, texts, batch_size=1000, n_threads=2):
-        """
-        Tokenize a stream of texts.
+        """Tokenize a stream of texts.
 
-        Arguments:
-            texts: A sequence of unicode texts.
-            batch_size (int):
-                The number of texts to accumulate in an internal buffer.
-            n_threads (int):
-                The number of threads to use, if the implementation supports
-                multi-threading. The default tokenizer is single-threaded.
-        Yields:
-            Doc A sequence of Doc objects, in order.
+        texts: A sequence of unicode texts.
+        batch_size (int): The number of texts to accumulate in an internal buffer.
+        n_threads (int): The number of threads to use, if the implementation
+            supports multi-threading. The default tokenizer is single-threaded.
+        YIELDS (Doc): A sequence of Doc objects, in order.
         """
         for text in texts:
             yield self(text)
@@ -278,27 +267,23 @@ cdef class Tokenizer:
         self._cache.set(key, cached)
 
     def find_infix(self, unicode string):
-        """
-        Find internal split points of the string, such as hyphens.
+        """Find internal split points of the string, such as hyphens.
 
         string (unicode): The string to segment.
-
-        Returns List[re.MatchObject]
-            A list of objects that have .start() and .end() methods, denoting the
-            placement of internal segment separators, e.g. hyphens.
+        RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
+            and `.end()` methods, denoting the placement of internal segment
+            separators, e.g. hyphens.
         """
         if self.infix_finditer is None:
             return 0
         return list(self.infix_finditer(string))
 
     def find_prefix(self, unicode string):
-        """
-        Find the length of a prefix that should be segmented from the string,
+        """Find the length of a prefix that should be segmented from the string,
         or None if no prefix rules match.
 
-        Arguments:
-            string (unicode): The string to segment.
-        Returns (int or None): The length of the prefix if present, otherwise None.
+        string (unicode): The string to segment.
+        RETURNS (int): The length of the prefix if present, otherwise `None`.
         """
         if self.prefix_search is None:
            return 0
@@ -306,13 +291,11 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0
 
     def find_suffix(self, unicode string):
-        """
-        Find the length of a suffix that should be segmented from the string,
+        """Find the length of a suffix that should be segmented from the string,
         or None if no suffix rules match.
 
-        Arguments:
-            string (unicode): The string to segment.
-        Returns (int or None): The length of the suffix if present, otherwise None.
+        string (unicode): The string to segment.
+        RETURNS (int): The length of the suffix if present, otherwise `None`.
         """
         if self.suffix_search is None:
             return 0
@@ -320,23 +303,17 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0
 
     def _load_special_tokenization(self, special_cases):
-        """
-        Add special-case tokenization rules.
-        """
+        """Add special-case tokenization rules."""
        for chunk, substrings in sorted(special_cases.items()):
            self.add_special_case(chunk, substrings)
 
     def add_special_case(self, unicode string, substrings):
-        """
-        Add a special-case tokenization rule.
+        """Add a special-case tokenization rule.
 
-        Arguments:
-            string (unicode): The string to specially tokenize.
-            token_attrs:
-                A sequence of dicts, where each dict describes a token and its
-                attributes. The ORTH fields of the attributes must exactly match
-                the string when they are concatenated.
-        Returns None
+        string (unicode): The string to specially tokenize.
+        token_attrs (iterable): A sequence of dicts, where each dict describes
+            a token and its attributes. The `ORTH` fields of the attributes must
+            exactly match the string when they are concatenated.
         """
         substrings = list(substrings)
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@@ -347,3 +324,38 @@ cdef class Tokenizer:
         self._specials.set(key, cached)
         self._cache.set(key, cached)
         self._rules[string] = substrings
+
+    def to_disk(self, path):
+        """Save the current state to a directory.
+
+        path (unicode or Path): A path to a directory, which will be created if
+            it doesn't exist. Paths may be either strings or `Path`-like objects.
+        """
+        raise NotImplementedError()
+
+    def from_disk(self, path):
+        """Loads state from a directory. Modifies the object in place and
+        returns it.
+
+        path (unicode or Path): A path to a directory. Paths may be either
+            strings or `Path`-like objects.
+        RETURNS (Tokenizer): The modified `Tokenizer` object.
+        """
+        raise NotImplementedError()
+
+    def to_bytes(self, **exclude):
+        """Serialize the current state to a binary string.
+
+        **exclude: Named attributes to prevent from being serialized.
+        RETURNS (bytes): The serialized form of the `Tokenizer` object.
+        """
+        raise NotImplementedError()
+
+    def from_bytes(self, bytes_data, **exclude):
+        """Load state from a binary string.
+
+        bytes_data (bytes): The data to load from.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (Tokenizer): The `Tokenizer` object.
+        """
+        raise NotImplementedError()
diff --git a/website/docs/api/tokenizer.jade b/website/docs/api/tokenizer.jade
index 9f0cdb14c..5c0f69854 100644
--- a/website/docs/api/tokenizer.jade
+++ b/website/docs/api/tokenizer.jade
@@ -11,6 +11,15 @@ p
 
 p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
 
++aside-code("Example").
+    # Construction 1
+    from spacy.tokenizer import Tokenizer
+    tokenizer = Tokenizer(nlp.vocab)
+
+    # Construction 2
+    from spacy.lang.en import English
+    tokenizer = English().Defaults.create_tokenizer(nlp)
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code vocab]
@@ -43,6 +52,11 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
             | A function matching the signature of
             | #[code re.compile(string).finditer] to find infixes.
 
+    +row
+        +cell #[code token_match]
+        +cell callable
+        +cell A boolean function matching strings to be recognised as tokens.
+
     +footrow
         +cell returns
         +cell #[code Tokenizer]
@@ -53,6 +67,10 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
 
 p Tokenize a string.
 
++aside-code("Example").
+    tokens = tokenizer(u'This is a sentence')
+    assert len(tokens) == 4
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code string]
@@ -69,6 +87,11 @@ p Tokenize a string.
 
 p Tokenize a stream of texts.
 
++aside-code("Example").
+    texts = [u'One document.', u'...', u'Lots of documents']
+    for doc in tokenizer.pipe(texts, batch_size=50):
+        pass
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code texts]
@@ -105,11 +128,11 @@ p Find internal split points of the string.
 
     +footrow
         +cell returns
-        +cell #[code List[re.MatchObject]]
+        +cell list
         +cell
-            | A list of objects that have #[code .start()] and #[code .end()]
-            | methods, denoting the placement of internal segment separators,
-            | e.g. hyphens.
+            | A list of #[code re.MatchObject] objects that have #[code .start()]
+            | and #[code .end()] methods, denoting the placement of internal
+            | segment separators, e.g. hyphens.
 
 +h(2, "find_prefix") Tokenizer.find_prefix
     +tag method
@@ -126,7 +149,7 @@ p
     +footrow
         +cell returns
-        +cell int / #[code None]
+        +cell int
         +cell The length of the prefix if present, otherwise #[code None].
 
 +h(2, "find_suffix") Tokenizer.find_suffix
     +tag method
@@ -150,7 +173,16 @@ p
 +h(2, "add_special_case") Tokenizer.add_special_case
     +tag method
 
-p Add a special-case tokenization rule.
+p
+    | Add a special-case tokenization rule. This mechanism is also used to add
+    | custom tokenizer exceptions to the language data. See the usage workflow
+    | on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
+    | for more details and examples.
+
++aside-code("Example").
+    from spacy.attrs import ORTH, LEMMA
+    case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
+    tokenizer.add_special_case(u"don't", case)
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code string]
         +cell unicode
         +cell The string to specially tokenize.
@@ -160,16 +192,98 @@ p Add a special-case tokenization rule.
 
     +row
         +cell #[code token_attrs]
-        +cell -
+        +cell iterable
         +cell
             | A sequence of dicts, where each dict describes a token and its
             | attributes. The #[code ORTH] fields of the attributes must
             | exactly match the string when they are concatenated.
+
++h(2, "to_disk") Tokenizer.to_disk
+    +tag method
+
+p Save the current state to a directory.
+
++aside-code("Example").
+    tokenizer.to_disk('/path/to/tokenizer')
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code path]
+        +cell unicode or #[code Path]
+        +cell
+            | A path to a directory, which will be created if it doesn't exist.
+            | Paths may be either strings or #[code Path]-like objects.
+
++h(2, "from_disk") Tokenizer.from_disk
+    +tag method
+
+p Loads state from a directory. Modifies the object in place and returns it.
+
++aside-code("Example").
+    from spacy.tokenizer import Tokenizer
+    tokenizer = Tokenizer(nlp.vocab)
+    tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code path]
+        +cell unicode or #[code Path]
+        +cell
+            | A path to a directory. Paths may be either strings or
+            | #[code Path]-like objects.
 
     +footrow
         +cell returns
-        +cell #[code None]
+        +cell #[code Tokenizer]
+        +cell The modified #[code Tokenizer] object.
+
++h(2, "to_bytes") Tokenizer.to_bytes
+    +tag method
+
+p Serialize the current state to a binary string.
+
++aside-code("Example").
+    tokenizer_bytes = tokenizer.to_bytes()
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code **exclude]
+        +cell -
+        +cell Named attributes to prevent from being serialized.
+
+    +footrow
+        +cell returns
+        +cell bytes
+        +cell The serialized form of the #[code Tokenizer] object.
+
++h(2, "from_bytes") Tokenizer.from_bytes
+    +tag method
+
+p Load state from a binary string.
+
++aside-code("Example").
+    from spacy.tokenizer import Tokenizer
+    tokenizer_bytes = tokenizer.to_bytes()
+    new_tokenizer = Tokenizer(nlp.vocab)
+    new_tokenizer.from_bytes(tokenizer_bytes)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code bytes_data]
+        +cell bytes
+        +cell The data to load from.
+
+    +row
+        +cell #[code **exclude]
+        +cell -
+        +cell Named attributes to prevent from being loaded.
+
+    +footrow
+        +cell returns
+        +cell #[code Tokenizer]
+        +cell The #[code Tokenizer] object.
+
 +h(2, "attributes") Attributes
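The serialization methods added above (`to_disk`, `from_disk`, `to_bytes`, `from_bytes`) are still stubs that raise `NotImplementedError`, but the constructor and `add_special_case` APIs documented in this patch can already be exercised. The snippet below is a minimal, illustrative sketch and is not part of the patch: the regular expressions are toy stand-ins for a language's real prefix/suffix/infix rules, and an empty `Vocab` is used so the example is self-contained.

```python
import re

from spacy.attrs import ORTH, LEMMA
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

# Toy rules -- real pipelines pass in the language's punctuation definitions.
prefix_re = re.compile(r'''^[\["']''')
suffix_re = re.compile(r'''[\]"'.,!?]$''')
infix_re = re.compile(r'''[-~]''')

tokenizer = Tokenizer(Vocab(), rules={},
                      prefix_search=prefix_re.search,
                      suffix_search=suffix_re.search,
                      infix_finditer=infix_re.finditer)

# Special case, as documented for Tokenizer.add_special_case: the ORTH values
# concatenate back to exactly the chunk being overridden ("do" + "n't").
tokenizer.add_special_case(u"don't", [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}])

doc = tokenizer(u"I don't like ~markup~ in strings.")
print([t.text for t in doc])
```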