From 205c73a58914b3fd9aebdd0708582fb7a80fd625 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20B=C3=B6ing?=
Date: Wed, 10 Jul 2019 10:16:48 +0200
Subject: [PATCH] Update tokenizer and doc init example (#3939)

* Fix Doc.to_json hyperlink

* Update tokenizer and doc init examples

* Change "matchin rules" to "punctuation rules"

* Auto-format
---
 spacy/tokens/doc.pyx          | 5 +++--
 website/docs/api/doc.md       | 2 +-
 website/docs/api/tokenizer.md | 8 +++++++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 131c43d37..373771247 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -85,13 +85,14 @@ cdef class Doc:
     Python-level `Token` and `Span` objects are views of this array, i.e.
     they don't own the data themselves.
 
-    EXAMPLE: Construction 1
+    EXAMPLE:
+        Construction 1
         >>> doc = nlp(u'Some text')
 
         Construction 2
         >>> from spacy.tokens import Doc
         >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
-                      spaces=[True, False, False])
+        >>>           spaces=[True, False, False])
 
     DOCS: https://spacy.io/api/doc
     """
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index f5a94335f..bf9801564 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -264,7 +264,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
 | ----------- | -------------------------------------- | ----------------------------------------------- |
 | **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Doc`. |
 
-## Doc.to_json {#to_json, tag="method" new="2.1"}
+## Doc.to_json {#to_json tag="method" new="2.1"}
 
 Convert a Doc to JSON. The format it produces will be the new format for the
 [`spacy train`](/api/cli#train) command (not implemented yet). If custom
diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md
index 5bc0df625..67e67f5c9 100644
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -9,7 +9,10 @@ Segment text, and create `Doc` objects with the discovered segment boundaries.
 
 ## Tokenizer.\_\_init\_\_ {#init tag="method"}
 
-Create a `Tokenizer`, to create `Doc` objects given unicode text.
+Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples
+of how to construct a custom tokenizer with different tokenization rules, see
+the
+[usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
 
 > #### Example
 >
@@ -18,11 +21,14 @@ Create a `Tokenizer`, to create `Doc` objects given unicode text.
 > from spacy.tokenizer import Tokenizer
 > from spacy.lang.en import English
 > nlp = English()
+> # Create a blank Tokenizer with just the English vocab
 > tokenizer = Tokenizer(nlp.vocab)
 >
 > # Construction 2
 > from spacy.lang.en import English
 > nlp = English()
+> # Create a Tokenizer with the default settings for English
+> # including punctuation rules and exceptions
 > tokenizer = nlp.Defaults.create_tokenizer(nlp)
 > ```
 
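The construction patterns this patch documents can be combined into one runnable script. A minimal sketch, assuming spaCy 2.x (the API targeted by this patch) with no trained models installed:

```python
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc

nlp = English()

# Construction 1: a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)
doc = tokenizer(u"Some text")

# Construction 2: the default English tokenizer, including
# punctuation rules and exceptions
tokenizer = nlp.Defaults.create_tokenizer(nlp)

# A Doc built directly from words and per-token trailing-space flags,
# as in the updated doc.pyx docstring
doc = Doc(nlp.vocab, words=[u"hello", u"world", u"!"],
          spaces=[True, False, False])
assert doc.text == u"hello world!"
```

Each entry in `spaces` records whether the corresponding token is followed by whitespace, which is why the `Doc` round-trips to exactly `"hello world!"`.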