Add documentation for Tokenizer API (see #600)
parent 309a8c2c0f
commit 2515b32a74
@@ -10,6 +10,7 @@
     "Token": "token",
     "Span": "span",
     "Language": "language",
+    "Tokenizer": "tokenizer",
     "Tagger": "tagger",
     "DependencyParser": "dependencyparser",
     "EntityRecognizer": "entityrecognizer",
@@ -93,6 +94,11 @@
         "tag": "class"
     },
+
+    "tokenizer": {
+        "title": "Tokenizer",
+        "tag": "class"
+    },
 
     "tagger": {
         "title": "Tagger",
         "tag": "class"
website/docs/api/tokenizer.jade (new file, 249 lines)

@@ -0,0 +1,249 @@
//- 💫 DOCS > API > TOKENIZER

include ../../_includes/_mixins

p
    | Segment text, and create #[code Doc] objects with the discovered segment
    | boundaries.

+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell The vocab object used to create #[code Doc] objects.

    +row
        +cell #[code prefix_search]
        +cell -
        +cell
            | A function to find segment boundaries from the start of a
            | string, matching the signature of
            | #[code re.compile(string).search].

    +row
        +cell #[code suffix_search]
        +cell -
        +cell
            | A function to find segment boundaries from the end of a string,
            | matching the signature of #[code re.compile(string).search].

    +row
        +cell #[code infix_finditer]
        +cell -
        +cell
            | A function to find internal segment separators, e.g. hyphens,
            | matching the signature of #[code re.compile(string).finditer].
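
p
    | The boundary callables behave like the methods of a compiled regular
    | expression. A minimal sketch of inspecting them on a loaded pipeline
    | (assuming the English model data is installed):

+code.
    import spacy

    nlp = spacy.load('en')
    # prefix_search has the signature of re.compile(string).search, so it
    # returns a match object (or None) describing the leading segment.
    match = nlp.tokenizer.prefix_search(u'"Hello')
    if match is not None:
        print(match.end() - match.start())   # length of the prefix segment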

+h(2, "load") Tokenizer.load
    +tag classmethod

p Load a #[code Tokenizer], reading unsupplied components from the path.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell #[code Path]
        +cell The path to load from.

    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell A storage container for lexical types.

    +row
        +cell #[code rules]
        +cell dict
        +cell Exceptions and special-cases for the tokenizer.

    +row
        +cell #[code prefix_search]
        +cell callable
        +cell
            | A function matching the signature of
            | #[code re.compile(string).search] to match prefixes.

    +row
        +cell #[code suffix_search]
        +cell callable
        +cell
            | A function matching the signature of
            | #[code re.compile(string).search] to match suffixes.

    +row
        +cell #[code infix_finditer]
        +cell callable
        +cell
            | A function matching the signature of
            | #[code re.compile(string).finditer] to find infixes.

    +footrow
        +cell return
        +cell #[code Tokenizer]
        +cell The newly constructed object.
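
p
    | A minimal loading sketch, letting the unsupplied components come from
    | the path (the directory below is a placeholder, not a real model path):

+code.
    from pathlib import Path
    from spacy.tokenizer import Tokenizer
    from spacy.vocab import Vocab

    # '/path/to/tokenizer' is hypothetical; substitute a real data directory.
    tokenizer = Tokenizer.load(Path('/path/to/tokenizer'), Vocab())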

+h(2, "init") Tokenizer.__init__
    +tag method

p Create a #[code Tokenizer] to create #[code Doc] objects given unicode text.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell A storage container for lexical types.

    +row
        +cell #[code rules]
        +cell dict
        +cell Exceptions and special-cases for the tokenizer.

    +row
        +cell #[code prefix_search]
        +cell callable
        +cell
            | A function matching the signature of
            | #[code re.compile(string).search] to match prefixes.

    +row
        +cell #[code suffix_search]
        +cell callable
        +cell
            | A function matching the signature of
            | #[code re.compile(string).search] to match suffixes.

    +row
        +cell #[code infix_finditer]
        +cell callable
        +cell
            | A function matching the signature of
            | #[code re.compile(string).finditer] to find infixes.

    +footrow
        +cell return
        +cell #[code Tokenizer]
        +cell The newly constructed object.
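
p
    | A minimal construction sketch, using ad-hoc regular expressions rather
    | than the language-specific defaults:

+code.
    import re
    from spacy.tokenizer import Tokenizer
    from spacy.vocab import Vocab

    # Illustrative patterns only; real pipelines use much larger rule sets.
    prefix_re = re.compile(r'''^[\["']''')
    suffix_re = re.compile(r'''[\]"']$''')
    infix_re = re.compile(r'''[-~]''')

    tokenizer = Tokenizer(Vocab(), {}, prefix_re.search, suffix_re.search,
                          infix_re.finditer)
    doc = tokenizer(u'"hello-world"')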

+h(2, "call") Tokenizer.__call__
    +tag method

p Tokenize a string.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to tokenize.

    +footrow
        +cell return
        +cell #[code Doc]
        +cell A container for linguistic annotations.
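
p
    | Calling the tokenizer directly produces a #[code Doc] without running
    | the rest of the pipeline. A short sketch, reusing the #[code nlp]
    | pipeline from the earlier example:

+code.
    doc = nlp.tokenizer(u'A sentence to tokenize.')
    print([token.text for token in doc])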

+h(2, "pipe") Tokenizer.pipe
    +tag method

p Tokenize a stream of texts.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code texts]
        +cell -
        +cell A sequence of unicode texts.

    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of texts to accumulate in an internal buffer.

    +row
        +cell #[code n_threads]
        +cell int
        +cell
            | The number of threads to use, if the implementation supports
            | multi-threading. The default tokenizer is single-threaded.

    +footrow
        +cell yield
        +cell #[code Doc]
        +cell A sequence of #[code Doc] objects, in order.
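
p
    | A sketch of streaming several texts through the tokenizer, again
    | reusing the #[code nlp] pipeline from above:

+code.
    texts = [u'One document.', u'Another document.']
    for doc in nlp.tokenizer.pipe(texts, batch_size=50):
        print(len(doc))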

+h(2, "find_infix") Tokenizer.find_infix
    +tag method

p Find internal split points of the string.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to split.

    +footrow
        +cell return
        +cell #[code List[re.MatchObject]]
        +cell
            | A list of objects that have #[code .start()] and #[code .end()]
            | methods, denoting the placement of internal segment separators,
            | e.g. hyphens.
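
p
    | For example, with the default English rules a hyphenated word should
    | yield one internal match:

+code.
    matches = nlp.tokenizer.find_infix(u'well-known')
    print([(m.start(), m.end()) for m in matches])   # e.g. [(4, 5)]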

+h(2, "find_prefix") Tokenizer.find_prefix
    +tag method

p
    | Find the length of a prefix that should be segmented from the string, or
    | #[code None] if no prefix rules match.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to segment.

    +footrow
        +cell return
        +cell int / #[code None]
        +cell The length of the prefix if present, otherwise #[code None].
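
p
    | For example, with the default English rules an opening quote should be
    | reported as a one-character prefix:

+code.
    print(nlp.tokenizer.find_prefix(u'"Hello'))   # 1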

+h(2, "find_suffix") Tokenizer.find_suffix
    +tag method

p
    | Find the length of a suffix that should be segmented from the string, or
    | #[code None] if no suffix rules match.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to segment.

    +footrow
        +cell return
        +cell int / #[code None]
        +cell The length of the suffix if present, otherwise #[code None].
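
p
    | Likewise, a closing quote should be reported as a one-character suffix:

+code.
    print(nlp.tokenizer.find_suffix(u'Hello"'))   # 1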

+h(2, "add_special_case") Tokenizer.add_special_case
    +tag method

p Add a special-case tokenization rule.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to specially tokenize.

    +row
        +cell #[code token_attrs]
        +cell -
        +cell
            | A sequence of dicts, where each dict describes a token and its
            | attributes. The #[code ORTH] fields of the attributes must
            | exactly match the string when they are concatenated.

    +footrow
        +cell return
        +cell #[code None]
        +cell -
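
p
    | A sketch of adding a contraction rule; note that the #[code ORTH]
    | values concatenate back to the original string:

+code.
    from spacy.attrs import ORTH, LEMMA

    nlp.tokenizer.add_special_case(u"don't",
        [{ORTH: u'do'}, {ORTH: u"n't", LEMMA: u'not'}])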