Update docstrings and API docs for Tokenizer

ines 2017-05-21 13:18:14 +02:00
parent f216422ac5
commit c5a653fa48
2 changed files with 200 additions and 74 deletions

spacy/tokenizer.pyx

@@ -2,8 +2,6 @@
# coding: utf8
from __future__ import unicode_literals

-import ujson
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
@@ -12,32 +10,31 @@
from preshed.maps cimport PreshMap
from .strings cimport hash_string
cimport cython

-from . import util
from .tokens.doc cimport Doc


cdef class Tokenizer:
    """Segment text, and create Doc objects with the discovered segment
    boundaries.
    """
    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
-        """
-        Create a Tokenizer, to create Doc objects given unicode text.
-        Arguments:
-            vocab (Vocab):
-                A storage container for lexical types.
-            rules (dict):
-                Exceptions and special-cases for the tokenizer.
-            prefix_search:
-                A function matching the signature of re.compile(string).search
-                to match prefixes.
-            suffix_search:
-                A function matching the signature of re.compile(string).search
-                to match suffixes.
-            infix_finditer:
-                A function matching the signature of re.compile(string).finditer
-                to find infixes.
-            token_match:
-                A boolean function matching strings that becomes tokens.
-        """
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.
        rules (dict): Exceptions and special-cases for the tokenizer.
        prefix_search (callable): A function matching the signature of
            `re.compile(string).search` to match prefixes.
        suffix_search (callable): A function matching the signature of
            `re.compile(string).search` to match suffixes.
        infix_finditer (callable): A function matching the signature of
            `re.compile(string).finditer` to find infixes.
        token_match (callable): A boolean function matching strings to be
            recognised as tokens.
        RETURNS (Tokenizer): The newly constructed object.

        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
            >>> tokenizer = English().Defaults.create_tokenizer(nlp)
        """
        self.mem = Pool()
        self._cache = PreshMap()
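
For illustration only (not part of this commit): a tokenizer wired up with the callables documented above might look roughly like this. The regular expressions are deliberately minimal stand-ins, not spaCy's real prefix, suffix and infix language data.

    import re
    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    # Simplified, illustrative patterns -- not the real language data.
    prefix_re = re.compile(r'''^[\["']''')
    suffix_re = re.compile(r'''[\]"'.]$''')
    infix_re = re.compile(r'[-~]')

    tokenizer = Tokenizer(nlp.vocab, rules={},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer,
                          token_match=None)
    doc = tokenizer(u'"hello-world."')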
@@ -69,13 +66,10 @@
    @cython.boundscheck(False)
    def __call__(self, unicode string):
-        """
-        Tokenize a string.
-        Arguments:
-            string (unicode): The string to tokenize.
-        Returns:
-            Doc A container for linguistic annotations.
-        """
        """Tokenize a string.

        string (unicode): The string to tokenize.
        RETURNS (Doc): A container for linguistic annotations.
        """
        if len(string) >= (2 ** 30):
            raise ValueError(
@@ -123,18 +117,13 @@
        return tokens

    def pipe(self, texts, batch_size=1000, n_threads=2):
-        """
-        Tokenize a stream of texts.
-        Arguments:
-            texts: A sequence of unicode texts.
-            batch_size (int):
-                The number of texts to accumulate in an internal buffer.
-            n_threads (int):
-                The number of threads to use, if the implementation supports
-                multi-threading. The default tokenizer is single-threaded.
-        Yields:
-            Doc A sequence of Doc objects, in order.
-        """
        """Tokenize a stream of texts.

        texts: A sequence of unicode texts.
        batch_size (int): The number of texts to accumulate in an internal buffer.
        n_threads (int): The number of threads to use, if the implementation
            supports multi-threading. The default tokenizer is single-threaded.
        YIELDS (Doc): A sequence of Doc objects, in order.
        """
        for text in texts:
            yield self(text)
@@ -278,27 +267,23 @@
        self._cache.set(key, cached)

    def find_infix(self, unicode string):
-        """
-        Find internal split points of the string, such as hyphens.
-        string (unicode): The string to segment.
-        Returns List[re.MatchObject]
-            A list of objects that have .start() and .end() methods, denoting the
-            placement of internal segment separators, e.g. hyphens.
-        """
        """Find internal split points of the string, such as hyphens.

        string (unicode): The string to segment.
        RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
            and `.end()` methods, denoting the placement of internal segment
            separators, e.g. hyphens.
        """
        if self.infix_finditer is None:
            return 0
        return list(self.infix_finditer(string))

    def find_prefix(self, unicode string):
-        """
-        Find the length of a prefix that should be segmented from the string,
-        or None if no prefix rules match.
-        Arguments:
-            string (unicode): The string to segment.
-        Returns (int or None): The length of the prefix if present, otherwise None.
-        """
        """Find the length of a prefix that should be segmented from the string,
        or None if no prefix rules match.

        string (unicode): The string to segment.
        RETURNS (int): The length of the prefix if present, otherwise `None`.
        """
        if self.prefix_search is None:
            return 0
@@ -306,13 +291,11 @@
        return (match.end() - match.start()) if match is not None else 0

    def find_suffix(self, unicode string):
-        """
-        Find the length of a suffix that should be segmented from the string,
-        or None if no suffix rules match.
-        Arguments:
-            string (unicode): The string to segment.
-        Returns (int or None): The length of the suffix if present, otherwise None.
-        """
        """Find the length of a suffix that should be segmented from the string,
        or None if no suffix rules match.

        string (unicode): The string to segment.
        RETURNS (int): The length of the suffix if present, otherwise `None`.
        """
        if self.suffix_search is None:
            return 0
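
As a rough sketch (not from the commit) of the return conventions documented above, a tokenizer constructed with explicit patterns makes the find_* behaviour easy to check:

    import re
    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    # Explicit, simplified patterns so the results follow directly from the code.
    tokenizer = Tokenizer(nlp.vocab, rules={},
                          prefix_search=re.compile(r'^\(').search,
                          suffix_search=re.compile(r'\)$').search,
                          infix_finditer=re.compile(r'-').finditer)

    assert tokenizer.find_prefix(u'(hello') == 1      # length of the matched prefix
    assert tokenizer.find_suffix(u'hello)') == 1      # length of the matched suffix
    matches = tokenizer.find_infix(u'hello-world')    # list of re.MatchObject
    assert [m.group() for m in matches] == [u'-']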
@@ -320,23 +303,17 @@
        return (match.end() - match.start()) if match is not None else 0

    def _load_special_tokenization(self, special_cases):
-        """
-        Add special-case tokenization rules.
-        """
        """Add special-case tokenization rules."""
        for chunk, substrings in sorted(special_cases.items()):
            self.add_special_case(chunk, substrings)

    def add_special_case(self, unicode string, substrings):
-        """
-        Add a special-case tokenization rule.
-        Arguments:
-            string (unicode): The string to specially tokenize.
-            token_attrs:
-                A sequence of dicts, where each dict describes a token and its
-                attributes. The ORTH fields of the attributes must exactly match
-                the string when they are concatenated.
-        Returns None
-        """
        """Add a special-case tokenization rule.

        string (unicode): The string to specially tokenize.
        token_attrs (iterable): A sequence of dicts, where each dict describes
            a token and its attributes. The `ORTH` fields of the attributes must
            exactly match the string when they are concatenated.
        """
        substrings = list(substrings)
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@@ -347,3 +324,38 @@
        self._specials.set(key, cached)
        self._cache.set(key, cached)
        self._rules[string] = substrings

    def to_disk(self, path):
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or `Path`-like objects.
        """
        raise NotImplementedError()

    def from_disk(self, path):
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode or Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        RETURNS (Tokenizer): The modified `Tokenizer` object.
        """
        raise NotImplementedError()

    def to_bytes(self, **exclude):
        """Serialize the current state to a binary string.

        **exclude: Named attributes to prevent from being serialized.
        RETURNS (bytes): The serialized form of the `Tokenizer` object.
        """
        raise NotImplementedError()

    def from_bytes(self, bytes_data, **exclude):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        **exclude: Named attributes to prevent from being loaded.
        RETURNS (Tokenizer): The `Tokenizer` object.
        """
        raise NotImplementedError()
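
The add_special_case signature above takes the string and the attribute dicts as two separate arguments. A minimal illustrative usage (not part of the diff, and assuming the Language object exposes its tokenizer as nlp.tokenizer) could look like this:

    from spacy.attrs import ORTH, LEMMA
    from spacy.lang.en import English

    nlp = English()
    # The ORTH values must concatenate back to exactly the original string.
    nlp.tokenizer.add_special_case(u"don't",
                                   [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}])
    doc = nlp.tokenizer(u"don't")
    assert [t.text for t in doc] == [u"do", u"n't"]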

website/docs/api/tokenizer.jade

@@ -11,6 +11,15 @@ p
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.

+aside-code("Example").
    # Construction 1
    from spacy.tokenizer import Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    # Construction 2
    from spacy.lang.en import English
    tokenizer = English().Defaults.create_tokenizer(nlp)

+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]

@@ -43,6 +52,11 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+row
+cell #[code token_match]
+cell callable
+cell A boolean function matching strings to be recognised as tokens.
+footrow
+cell returns
+cell #[code Tokenizer]

@@ -53,6 +67,10 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
p Tokenize a string.

+aside-code("Example").
    tokens = tokenizer(u'This is a sentence')
    assert len(tokens) == 4

+table(["Name", "Type", "Description"])
+row
+cell #[code string]
@@ -69,6 +87,11 @@ p Tokenize a string.
p Tokenize a stream of texts.

+aside-code("Example").
    texts = [u'One document.', u'...', u'Lots of documents']
    for doc in tokenizer.pipe(texts, batch_size=50):
        pass

+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
@@ -105,11 +128,11 @@ p Find internal split points of the string.
+footrow
+cell returns
-+cell #[code List[re.MatchObject]]
+cell list
+cell
-| A list of objects that have #[code .start()] and #[code .end()]
-| methods, denoting the placement of internal segment separators,
-| e.g. hyphens.
| A list of #[code re.MatchObject] objects that have #[code .start()]
| and #[code .end()] methods, denoting the placement of internal
| segment separators, e.g. hyphens.

+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
@@ -126,7 +149,7 @@ p
+footrow
+cell returns
-+cell int / #[code None]
+cell int
+cell The length of the prefix if present, otherwise #[code None].

+h(2, "find_suffix") Tokenizer.find_suffix

@@ -150,7 +173,16 @@ p
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
-p Add a special-case tokenization rule.
p
| Add a special-case tokenization rule. This mechanism is also used to add
| custom tokenizer exceptions to the language data. See the usage workflow
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
| for more details and examples.

+aside-code("Example").
    from spacy.attrs import ORTH, LEMMA
    case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
    tokenizer.add_special_case("don't", case)

+table(["Name", "Type", "Description"])
+row
@@ -160,16 +192,98 @@ p Add a special-case tokenization rule.
+row
+cell #[code token_attrs]
-+cell -
+cell iterable
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
    tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
    from spacy.tokenizer import Tokenizer
    tokenizer = Tokenizer(nlp.vocab)
    tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
-+cell #[code None]
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
    tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
    from spacy.tokenizer import Tokenizer
    tokenizer_bytes = tokenizer.to_bytes()
    new_tokenizer = Tokenizer(nlp.vocab)
    new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes