From 1d6aec805d5c03ad8a039466e98ed3a619e650c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:17:20 +0100 Subject: [PATCH] Fix formatting and update docs for v2.2.4 --- spacy/cli/debug_data.py | 25 ++++++++++++++++--------- website/docs/api/cli.md | 30 ++++++++++++++++++++---------- website/docs/api/doc.md | 22 ++++++++++++---------- website/docs/api/span.md | 30 ++++++++++++++++++++++++++---- website/docs/api/top-level.md | 32 ++++++++++++++++---------------- website/meta/languages.json | 2 ++ 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 0e12a594c..c5e1ff6cf 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000 @plac.annotations( + # fmt: off lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), - pipeline=( - "Comma-separated names of pipeline components to train", - "option", - "p", - str, - ), + pipeline=("Comma-separated names of pipeline components to train", "option", "p", str), ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), verbose=("Print additional information and explanations", "flag", "V", bool), no_format=("Don't pretty-print the results", "flag", "NF", bool), + # fmt: on ) def debug_data( lang, @@ -235,13 +232,17 @@ def debug_data( if gold_train_data["ws_ents"]: msg.fail( - "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"]) + "{} invalid whitespace entity span(s)".format( + gold_train_data["ws_ents"] + ) ) has_ws_ents_error = True if gold_train_data["punct_ents"]: msg.warn( - "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"]) + "{} entity span(s) with punctuation".format( + gold_train_data["punct_ents"] + ) ) has_punct_ents_warning = True @@ -592,7 +593,13 @@ def _compile_gold(train_docs, pipeline): if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 - if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]: + if label.startswith(("B-", "U-", "L-")) and doc[i].text in [ + ".", + "'", + "!", + "?", + ",", + ]: # punctuation entity: could be replaced by whitespace when training with noise, # so add a warning to alert the user to this unexpected side effect. data["punct_ents"] += 1 diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 2f7346491..e47695efb 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -184,16 +184,17 @@ low data labels and more. $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] ``` -| Argument | Type | Description | -| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--base-model`, `-b` | option | Optional name of base model to update. 
Can be any loadable spaCy model. | -| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | -| `--verbose`, `-V` | flag | Print additional information and explanations. | -| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | +| Argument | Type | Description | +| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | +| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | +| `--tag-map-path`, `-tm` 2.2.3 | option | Location of JSON-formatted tag map. | +| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | @@ -368,6 +369,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | | `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. | | `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--replace-components`, `-R` | flag | Replace components from the base model. | | `--vectors`, `-v` | option | Model to load vectors from. | | `--n-iter`, `-n` | option | Number of iterations (default: `30`). | | `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | @@ -378,6 +380,13 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | | `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | +| `--width`, `-cw` 2.2.4 | option | Width of CNN layers of `Tok2Vec` component. | +| `--conv-depth`, `-cd` 2.2.4 | option | Depth of CNN layers of `Tok2Vec` component. | +| `--cnn-window`, `-cW` 2.2.4 | option | Window size for CNN layers of `Tok2Vec` component. | +| `--cnn-pieces`, `-cP` 2.2.4 | option | Maxout size for CNN layers of `Tok2Vec` component. | +| `--use-chars`, `-chr` 2.2.4 | flag | Whether to use character-based embedding of `Tok2Vec` component. | +| `--bilstm-depth`, `-lstm` 2.2.4 | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). | +| `--embed-rows`, `-er` 2.2.4 | option | Number of embedding rows of `Tok2Vec` component. | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. 
| | `--orth-variant-level`, `-ovl` 2.2 | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | @@ -385,6 +394,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--textcat-multilabel`, `-TML` 2.2 | flag | Text classification classes aren't mutually exclusive (multilabel). | | `--textcat-arch`, `-ta` 2.2 | option | Text classification model architecture. Defaults to `"bow"`. | | `--textcat-positive-label`, `-tpl` 2.2 | option | Text classification positive label for binary classes with two labels. | +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | | `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 4f948e425..87b854a8c 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -7,9 +7,10 @@ source: spacy/tokens/doc.pyx A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to -compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs. -The Python-level `Token` and [`Span`](/api/span) objects are views of this -array, i.e. they don't own the data themselves. +compressed binary strings. The `Doc` object holds an array of +[`TokenC`](/api/cython-structs#tokenc) structs. The Python-level `Token` and +[`Span`](/api/span) objects are views of this array, i.e. they don't own the +data themselves. ## Doc.\_\_init\_\_ {#init tag="method"} @@ -197,13 +198,14 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------- | -| `start` | int | The index of the first character of the span. | -| `end` | int | The index of the last character after the span. | -| `label` | uint64 / unicode | A label to attach to the Span, e.g. for named entities. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object or `None`. | +| Name | Type | Description | +| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | ## Doc.similarity {#similarity tag="method" model="vectors"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 64b77b89d..3833bbca9 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -172,6 +172,28 @@ Remove a previously registered extension. | `name` | unicode | Name of the extension. 
| | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +## Span.char_span {#char_span tag="method" new="2.2.4"} + +Create a `Span` object from the slice `span.text[start:end]`. Returns `None` if +the character indices don't map to a valid span. + +> #### Example +> +> ```python +> doc = nlp("I like New York") +> span = doc[1:4].char_span(5, 13, label="GPE") +> assert span.text == "New York" +> ``` + +| Name | Type | Description | +| ----------- | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | + ## Span.similarity {#similarity tag="method" model="vectors"} Make a semantic similarity estimate. The default estimate is cosine similarity @@ -293,10 +315,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. > assert doc2.text == "New York" > ``` -| Name | Type | Description | -| ----------------- | ----- | ---------------------------------------------------- | -| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | -| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | +| Name | Type | Description | +| ---------------- | ----- | ---------------------------------------------------- | +| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | +| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | ## Span.root {#root tag="property" model="parser"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 266df87f0..217c51794 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` | bool | Print the lemma's in a separate row below the token texts in the `dep` visualisation. | `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | unicode | Font name or font family for all text. | `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. 
| `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Type | Description | Default | +| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | +| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | +| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | +| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | +| `font` | unicode | Font name or font family for all text. | `'Arial'` | +| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | +| `arrow_stroke` | int | Width of arrow path in px. | `2` | +| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | +| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | +| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | +| `distance` | int | Distance between words in px. | `175` / `150` (compact) | #### Named Entity Visualizer options {#displacy_options-ent} diff --git a/website/meta/languages.json b/website/meta/languages.json index c22ddad69..8834aaddc 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -95,6 +95,8 @@ "has_examples": true }, { "code": "hr", "name": "Croatian", "has_examples": true }, + { "code": "eu", "name": "Basque", "has_examples": true }, + { "code": "yo", "name": "Yoruba", "has_examples": true }, { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
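
---

A minimal sketch (not part of the patch) of the `char_span` behaviour documented in the `doc.md` and `span.md` hunks above: the `kb_id` argument on `Doc.char_span` and the new `Span.char_span` method added in v2.2.4. It assumes spaCy >= 2.2.4 and a blank English pipeline; the knowledge-base ID `"Q60"` is an arbitrary placeholder, not something the patch prescribes.

```python
import spacy

# Sketch only: assumes spaCy >= 2.2.4 with a blank English pipeline.
nlp = spacy.blank("en")
doc = nlp("I like New York")

# Doc.char_span with the kb_id argument documented above; "Q60" is a
# placeholder knowledge-base ID used purely for illustration.
span = doc.char_span(7, 15, label="GPE", kb_id="Q60")
assert span.text == "New York"
assert span.kb_id_ == "Q60"

# Span.char_span (new in v2.2.4): character offsets are relative to the
# parent span's text, mirroring the example in the docs hunk above.
inner = doc[1:4].char_span(5, 13, label="GPE")
assert inner.text == "New York"
```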
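Likewise, a hedged sketch of the other user-facing additions touched by this diff: the `add_lemma` displaCy option (new in v2.2.4) and the Basque/Yoruba entries added to `languages.json`. It assumes `en_core_web_sm` is installed (any pipeline with a parser and lemmas would do) and that the `eu`/`yo` language classes ship with this spaCy version, as the metadata entries suggest.

```python
import spacy
from spacy import displacy

# add_lemma (see the options table above): render lemmas in a separate row
# below the token texts in the dependency visualisation. Assumes the
# en_core_web_sm model is installed; this patch does not require it.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
html = displacy.render(doc, style="dep", options={"add_lemma": True})

# The new languages.json entries correspond to language codes that can be
# used to create blank pipelines, assuming the eu/yo language classes are
# available in the installed spaCy version.
nlp_eu = spacy.blank("eu")
nlp_yo = spacy.blank("yo")
```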