From 2515b32a7498c37d1b0f47fdd84f8e5e031e7f06 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 2 Nov 2016 23:17:42 +0100
Subject: [PATCH] Add documentation for Tokenizer API (see #600)

---
 website/docs/api/_data.json     |   6 +
 website/docs/api/tokenizer.jade | 285 ++++++++++++++++++++++++++++++++
 2 files changed, 291 insertions(+)
 create mode 100644 website/docs/api/tokenizer.jade

diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json
index df602d376..b41f97b38 100644
--- a/website/docs/api/_data.json
+++ b/website/docs/api/_data.json
@@ -10,6 +10,7 @@
         "Token": "token",
         "Span": "span",
         "Language": "language",
+        "Tokenizer": "tokenizer",
         "Tagger": "tagger",
         "DependencyParser": "dependencyparser",
         "EntityRecognizer": "entityrecognizer",
@@ -93,6 +94,11 @@
         "tag": "class"
     },
 
+    "tokenizer": {
+        "title": "Tokenizer",
+        "tag": "class"
+    },
+
     "tagger": {
         "title": "Tagger",
         "tag": "class"
diff --git a/website/docs/api/tokenizer.jade b/website/docs/api/tokenizer.jade
new file mode 100644
index 000000000..44ba0fc69
--- /dev/null
+++ b/website/docs/api/tokenizer.jade
@@ -0,0 +1,285 @@
+//- 💫 DOCS > API > TOKENIZER
+
+include ../../_includes/_mixins
+
+p
+    | Segment text, and create #[code Doc] objects with the discovered segment
+    | boundaries.
+
++h(2, "attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell The vocab object of the parent #[code Doc].
+
+    +row
+        +cell #[code prefix_search]
+        +cell callable
+        +cell
+            | A function to find segment boundaries from the start of a
+            | string. Returns the length of the segment, or #[code None].
+
+    +row
+        +cell #[code suffix_search]
+        +cell callable
+        +cell
+            | A function to find segment boundaries from the end of a string.
+            | Returns the length of the segment, or #[code None].
+
+    +row
+        +cell #[code infix_finditer]
+        +cell callable
+        +cell
+            | A function to find internal segment separators, e.g. hyphens.
+            | Returns a (possibly empty) list of #[code re.MatchObject]
+            | objects.
+
++h(2, "load") Tokenizer.load
+    +tag classmethod
+
+p Load a #[code Tokenizer], reading unsupplied components from the path.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code path]
+        +cell #[code Path]
+        +cell The path to load from.
+
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell A storage container for lexical types.
+
+    +row
+        +cell #[code rules]
+        +cell dict
+        +cell Exceptions and special-cases for the tokenizer.
+
+    +row
+        +cell #[code prefix_search]
+        +cell callable
+        +cell
+            | A function matching the signature of
+            | #[code re.compile(string).search] to match prefixes.
+
+    +row
+        +cell #[code suffix_search]
+        +cell callable
+        +cell
+            | A function matching the signature of
+            | #[code re.compile(string).search] to match suffixes.
+
+    +row
+        +cell #[code infix_finditer]
+        +cell callable
+        +cell
+            | A function matching the signature of
+            | #[code re.compile(string).finditer] to find infixes.
+
+    +footrow
+        +cell return
+        +cell #[code Tokenizer]
+        +cell The newly constructed object.
+
++h(2, "init") Tokenizer.__init__
+    +tag method
+
+p Create a #[code Tokenizer], to produce #[code Doc] objects given unicode text.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell A storage container for lexical types.
+
+    +row
+        +cell #[code rules]
+        +cell dict
+        +cell Exceptions and special-cases for the tokenizer.
+
+    +row
+        +cell #[code prefix_search]
+        +cell callable
+        +cell
+            | A function matching the signature of
+            | #[code re.compile(string).search] to match prefixes.
+
+    +row
+        +cell #[code suffix_search]
+        +cell callable
+        +cell
+            | A function matching the signature of
+            | #[code re.compile(string).search] to match suffixes.
+
+    +row
+        +cell #[code infix_finditer]
+        +cell callable
+        +cell
+            | A function matching the signature of
+            | #[code re.compile(string).finditer] to find infixes.
+
+    +footrow
+        +cell return
+        +cell #[code Tokenizer]
+        +cell The newly constructed object.
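+
+p
+    | The example below is a minimal sketch rather than spaCy's bundled
+    | setup: the regular expressions are hypothetical placeholders, and an
+    | empty #[code Vocab] stands in for a real vocabulary.
+
++code.
+    import re
+    from spacy.vocab import Vocab
+    from spacy.tokenizer import Tokenizer
+
+    # Hypothetical patterns: strip a leading bracket or quote, strip a
+    # trailing bracket or quote, and split on internal hyphens or tildes.
+    prefix_re = re.compile(r'''^[\[\("']''')
+    suffix_re = re.compile(r'''[\]\)"']$''')
+    infix_re = re.compile(r'''[-~]''')
+
+    tokenizer = Tokenizer(Vocab(), rules={},
+                          prefix_search=prefix_re.search,
+                          suffix_search=suffix_re.search,
+                          infix_finditer=infix_re.finditer)
+    doc = tokenizer(u'"Hello-world!"')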
+
++h(2, "call") Tokenizer.__call__
+    +tag method
+
+p Tokenize a string.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to tokenize.
+
+    +footrow
+        +cell return
+        +cell #[code Doc]
+        +cell A container for linguistic annotations.
+
++h(2, "pipe") Tokenizer.pipe
+    +tag method
+
+p Tokenize a stream of texts.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code texts]
+        +cell -
+        +cell A sequence of unicode texts.
+
+    +row
+        +cell #[code batch_size]
+        +cell int
+        +cell The number of texts to accumulate in an internal buffer.
+
+    +row
+        +cell #[code n_threads]
+        +cell int
+        +cell
+            | The number of threads to use, if the implementation supports
+            | multi-threading. The default tokenizer is single-threaded.
+
+    +footrow
+        +cell yield
+        +cell #[code Doc]
+        +cell A sequence of #[code Doc] objects, in order.
+
++h(2, "find_infix") Tokenizer.find_infix
+    +tag method
+
+p Find internal split points of the string.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to split.
+
+    +footrow
+        +cell return
+        +cell #[code List[re.MatchObject]]
+        +cell
+            | A list of objects that have #[code .start()] and #[code .end()]
+            | methods, denoting the placement of internal segment separators,
+            | e.g. hyphens.
+
++h(2, "find_prefix") Tokenizer.find_prefix
+    +tag method
+
+p
+    | Find the length of a prefix that should be segmented from the string, or
+    | #[code None] if no prefix rules match.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to segment.
+
+    +footrow
+        +cell return
+        +cell int / #[code None]
+        +cell The length of the prefix if present, otherwise #[code None].
+
++h(2, "find_suffix") Tokenizer.find_suffix
+    +tag method
+
+p
+    | Find the length of a suffix that should be segmented from the string, or
+    | #[code None] if no suffix rules match.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to segment.
+
+    +footrow
+        +cell return
+        +cell int / #[code None]
+        +cell The length of the suffix if present, otherwise #[code None].
+
++h(2, "add_special_case") Tokenizer.add_special_case
+    +tag method
+
+p Add a special-case tokenization rule.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to specially tokenize.
+
+    +row
+        +cell #[code token_attrs]
+        +cell -
+        +cell
+            | A sequence of dicts, where each dict describes a token and its
+            | attributes. The #[code ORTH] fields of the attributes must
+            | exactly match the string when they are concatenated.
+
+    +footrow
+        +cell return
+        +cell #[code None]
+        +cell -
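+
+p
+    | Below is a short sketch of a special-case rule, assuming an existing
+    | #[code tokenizer] instance such as the one constructed under
+    | #[code Tokenizer.__init__] above. #[code ORTH] and #[code LEMMA] are
+    | attribute IDs from #[code spacy.attrs].
+
++code.
+    from spacy.attrs import ORTH, LEMMA
+
+    # "gimme" is split into two tokens whose ORTH values concatenate
+    # exactly to the original string, as required.
+    tokenizer.add_special_case(u'gimme',
+        [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])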