Fix formatting and update docs for v2.2.4

2025-10-21 11:14:32 +03:00 · 2020-03-09 11:17:20 +01:00 · 2020-03-09 11:17:20 +01:00 · 1d6aec805d
commit 1d6aec805d
parent 5f68004264
6 changed files with 92 additions and 49 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000


@plac.annotations(
+    # fmt: off
    lang=("model language", "positional", None, str),
    train_path=("location of JSON-formatted training data", "positional", None, Path),
    dev_path=("location of JSON-formatted development data", "positional", None, Path),
    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
    base_model=("name of model to update (optional)", "option", "b", str),
-    pipeline=(
-        "Comma-separated names of pipeline components to train",
-        "option",
-        "p",
-        str,
-    ),
+    pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
    ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
    verbose=("Print additional information and explanations", "flag", "V", bool),
    no_format=("Don't pretty-print the results", "flag", "NF", bool),
+    # fmt: on
 )
 def debug_data(
    lang,
@ -235,13 +232,17 @@ def debug_data(

        if gold_train_data["ws_ents"]:
            msg.fail(
-                "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
+                "{} invalid whitespace entity span(s)".format(
+                    gold_train_data["ws_ents"]
+                )
            )
            has_ws_ents_error = True

        if gold_train_data["punct_ents"]:
            msg.warn(
-                "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
+                "{} entity span(s) with punctuation".format(
+                    gold_train_data["punct_ents"]
+                )
            )
            has_punct_ents_warning = True

@ -592,7 +593,13 @@ def _compile_gold(train_docs, pipeline):
                if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
                    # "Illegal" whitespace entity
                    data["ws_ents"] += 1
-                if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
+                if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
+                    ".",
+                    "'",
+                    "!",
+                    "?",
+                    ",",
+                ]:
                    # punctuation entity: could be replaced by whitespace when training with noise,
                    # so add a warning to alert the user to this unexpected side effect.
                    data["punct_ents"] += 1
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -184,16 +184,17 @@ low data labels and more.
 $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format]
 ```

-| Argument                   | Type       | Description                                                                                        |
-| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- |
-| `lang`                     | positional | Model language.                                                                                    |
-| `train_path`               | positional | Location of JSON-formatted training data. Can be a file or a directory of files.                   |
-| `dev_path`                 | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
-| `--base-model`, `-b`       | option     | Optional name of base model to update. Can be any loadable spaCy model.                            |
-| `--pipeline`, `-p`         | option     | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`.          |
-| `--ignore-warnings`, `-IW` | flag       | Ignore warnings, only show stats and errors.                                                       |
-| `--verbose`, `-V`          | flag       | Print additional information and explanations.                                                     |
-| --no-format, `-NF`         | flag       | Don't pretty-print the results. Use this if you want to write to a file.                           |
+| Argument                                               | Type       | Description                                                                                        |
+| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- |
+| `lang`                                                 | positional | Model language.                                                                                    |
+| `train_path`                                           | positional | Location of JSON-formatted training data. Can be a file or a directory of files.                   |
+| `dev_path`                                             | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
+| `--tag-map-path`, `-tm` <Tag variant="new">2.2.3</Tag> | option     | Location of JSON-formatted tag map.                                                                |
+| `--base-model`, `-b`                                   | option     | Optional name of base model to update. Can be any loadable spaCy model.                            |
+| `--pipeline`, `-p`                                     | option     | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`.          |
+| `--ignore-warnings`, `-IW`                             | flag       | Ignore warnings, only show stats and errors.                                                       |
+| `--verbose`, `-V`                                      | flag       | Print additional information and explanations.                                                     |
+| `--no-format`, `-NF`                                   | flag       | Don't pretty-print the results. Use this if you want to write to a file.                           |

 <Accordion title="Example output">

@ -368,6 +369,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
 | `dev_path`                                                      | positional    | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files.                                                                |
 | `--base-model`, `-b` <Tag variant="new">2.1</Tag>               | option        | Optional name of base model to update. Can be any loadable spaCy model.                                                                                           |
 | `--pipeline`, `-p` <Tag variant="new">2.1</Tag>                 | option        | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`.                                                                         |
+| `--replace-components`, `-R`                                    | flag          | Replace components from the base model.                                                                                                                           |
 | `--vectors`, `-v`                                               | option        | Model to load vectors from.                                                                                                                                       |
 | `--n-iter`, `-n`                                                | option        | Number of iterations (default: `30`).                                                                                                                             |
 | `--n-early-stopping`, `-ne`                                     | option        | Maximum number of training epochs without dev accuracy improvement.                                                                                               |
@ -378,6 +380,13 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
 | `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag>           | option        | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.                                                       |
 | `--parser-multitasks`, `-pt`                                    | option        | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'`                                                                                                       |
 | `--entity-multitasks`, `-et`                                    | option        | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'`                                                                                                          |
+| `--width`, `-cw` <Tag variant="new">2.2.4</Tag>                 | option        | Width of CNN layers of `Tok2Vec` component.                                                                                                                       |
+| `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag>            | option        | Depth of CNN layers of `Tok2Vec` component.                                                                                                                       |
+| `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag>            | option        | Window size for CNN layers of `Tok2Vec` component.                                                                                                                |
+| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag>            | option        | Maxout size for CNN layers of `Tok2Vec` component.                                                                                                                |
+| `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag>            | flag          | Whether to use character-based embedding of `Tok2Vec` component.                                                                                                  |
+| `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag>        | option        | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch).                                                                                                 |
+| `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag>            | option        | Number of embedding rows of `Tok2Vec` component.                                                                                                                  |
 | `--noise-level`, `-nl`                                          | option        | Float indicating the amount of corruption for data augmentation.                                                                                                  |
 | `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag>     | option        | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement).                |
 | `--gold-preproc`, `-G`                                          | flag          | Use gold preprocessing.                                                                                                                                           |
@ -385,6 +394,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
 | `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag>     | flag          | Text classification classes aren't mutually exclusive (multilabel).                                                                                               |
 | `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag>            | option        | Text classification model architecture. Defaults to `"bow"`.                                                                                                      |
 | `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option        | Text classification positive label for binary classes with two labels.                                                                                            |
+| `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag>          | option        | Location of JSON-formatted tag map.                                                                                                                               |
 | `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag>              | flag          | Show more detailed messages during training.                                                                                                                      |
 | `--help`, `-h`                                                  | flag          | Show help message and available arguments.                                                                                                                        |
 | **CREATES**                                                     | model, pickle | A spaCy model on each epoch.                                                                                                                                      |
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -7,9 +7,10 @@ source: spacy/tokens/doc.pyx

 A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and
 named entities, export annotations to numpy arrays, losslessly serialize to
-compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs.
-The Python-level `Token` and [`Span`](/api/span) objects are views of this
-array, i.e. they don't own the data themselves.
+compressed binary strings. The `Doc` object holds an array of
+[`TokenC`](/api/cython-structs#tokenc) structs. The Python-level `Token` and
+[`Span`](/api/span) objects are views of this array, i.e. they don't own the
+data themselves.

 ## Doc.\_\_init\_\_ {#init tag="method"}

@ -197,13 +198,14 @@ the character indices don't map to a valid span.
 > assert span.text == "New York"
 > ```

-| Name        | Type                                     | Description                                             |
-| ----------- | ---------------------------------------- | ------------------------------------------------------- |
-| `start`     | int                                      | The index of the first character of the span.           |
-| `end`       | int                                      | The index of the last character after the span.         |
-| `label`     | uint64 / unicode                         | A label to attach to the Span, e.g. for named entities. |
-| `vector`    | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                   |
-| **RETURNS** | `Span`                                   | The newly constructed object or `None`.                 |
+| Name                                 | Type                                     | Description                                                           |
+| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
+| `start`                              | int                                      | The index of the first character of the span.                         |
+| `end`                                | int                                      | The index of the first character after the span.                      |
+| `label`                              | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.               |
+| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity. |
+| `vector`                             | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                 |
+| **RETURNS**                          | `Span`                                   | The newly constructed object or `None`.                               |

 ## Doc.similarity {#similarity tag="method" model="vectors"}

--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@ -172,6 +172,28 @@ Remove a previously registered extension.
 | `name`      | unicode | Name of the extension.                                                |
 | **RETURNS** | tuple   | A `(default, method, getter, setter)` tuple of the removed extension. |

+## Span.char_span {#char_span tag="method" new="2.2.4"}
+
+Create a `Span` object from the slice `span.text[start:end]`. Returns `None` if
+the character indices don't map to a valid span.
+
+> #### Example
+>
+> ```python
+> doc = nlp("I like New York")
+> span = doc[1:4].char_span(5, 13, label="GPE")
+> assert span.text == "New York"
+> ```
+
+| Name        | Type                                     | Description                                                           |
+| ----------- | ---------------------------------------- | --------------------------------------------------------------------- |
+| `start`     | int                                      | The index of the first character of the span.                         |
+| `end`       | int                                      | The index of the first character after the span.                      |
+| `label`     | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.               |
+| `kb_id`     | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity. |
+| `vector`    | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                 |
+| **RETURNS** | `Span`                                   | The newly constructed object or `None`.                               |
+
 ## Span.similarity {#similarity tag="method" model="vectors"}

 Make a semantic similarity estimate. The default estimate is cosine similarity
@ -293,10 +315,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
 > assert doc2.text == "New York"
 > ```

-| Name              | Type  | Description                                          |
-| ----------------- | ----- | ---------------------------------------------------- |
-| `copy_user_data`  | bool  | Whether or not to copy the original doc's user data. |
-| **RETURNS**       | `Doc` | A `Doc` object of the `Span`'s content.              |
+| Name             | Type  | Description                                          |
+| ---------------- | ----- | ---------------------------------------------------- |
+| `copy_user_data` | bool  | Whether or not to copy the original doc's user data. |
+| **RETURNS**      | `Doc` | A `Doc` object of the `Span`'s content.              |

 ## Span.root {#root tag="property" model="parser"}

--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used.
 > displacy.serve(doc, style="dep", options=options)
 > ```

-| Name               | Type    | Description                                                                                                     | Default                 |
-| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
-| `fine_grained`     | bool    | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`).              | `False`                 |
-| `add_lemma`        | bool    | Print the lemma's in a separate row below the token texts in the `dep` visualisation.                           | `False`                 |
-| `collapse_punct`   | bool    | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True`                  |
-| `collapse_phrases` | bool    | Merge noun phrases into one token.                                                                              | `False`                 |
-| `compact`          | bool    | "Compact mode" with square arrows that takes up less space.                                                     | `False`                 |
-| `color`            | unicode | Text color (HEX, RGB or color names).                                                                           | `'#000000'`             |
-| `bg`               | unicode | Background color (HEX, RGB or color names).                                                                     | `'#ffffff'`             |
-| `font`             | unicode | Font name or font family for all text.                                                                          | `'Arial'`               |
-| `offset_x`         | int     | Spacing on left side of the SVG in px.                                                                          | `50`                    |
-| `arrow_stroke`     | int     | Width of arrow path in px.                                                                                      | `2`                     |
-| `arrow_width`      | int     | Width of arrow head in px.                                                                                      | `10` / `8` (compact)    |
-| `arrow_spacing`    | int     | Spacing between arrows in px to avoid overlaps.                                                                 | `20` / `12` (compact)   |
-| `word_spacing`     | int     | Vertical spacing between words and arcs in px.                                                                  | `45`                    |
-| `distance`         | int     | Distance between words in px.                                                                                   | `175` / `150` (compact) |
+| Name                                       | Type    | Description                                                                                                     | Default                 |
+| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
+| `fine_grained`                             | bool    | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`).              | `False`                 |
+| `add_lemma` <Tag variant="new">2.2.4</Tag> | bool    | Print the lemmas in a separate row below the token texts.                                                       | `False`                 |
+| `collapse_punct`                           | bool    | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True`                  |
+| `collapse_phrases`                         | bool    | Merge noun phrases into one token.                                                                              | `False`                 |
+| `compact`                                  | bool    | "Compact mode" with square arrows that takes up less space.                                                     | `False`                 |
+| `color`                                    | unicode | Text color (HEX, RGB or color names).                                                                           | `'#000000'`             |
+| `bg`                                       | unicode | Background color (HEX, RGB or color names).                                                                     | `'#ffffff'`             |
+| `font`                                     | unicode | Font name or font family for all text.                                                                          | `'Arial'`               |
+| `offset_x`                                 | int     | Spacing on left side of the SVG in px.                                                                          | `50`                    |
+| `arrow_stroke`                             | int     | Width of arrow path in px.                                                                                      | `2`                     |
+| `arrow_width`                              | int     | Width of arrow head in px.                                                                                      | `10` / `8` (compact)    |
+| `arrow_spacing`                            | int     | Spacing between arrows in px to avoid overlaps.                                                                 | `20` / `12` (compact)   |
+| `word_spacing`                             | int     | Vertical spacing between words and arcs in px.                                                                  | `45`                    |
+| `distance`                                 | int     | Distance between words in px.                                                                                   | `175` / `150` (compact) |

 #### Named Entity Visualizer options {#displacy_options-ent}

--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@ -95,6 +95,8 @@
            "has_examples": true
        },
        { "code": "hr", "name": "Croatian", "has_examples": true },
+        { "code": "eu", "name": "Basque", "has_examples": true },
+        { "code": "yo", "name": "Yoruba", "has_examples": true },
        { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
        { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
        { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },