Fix formatting and update docs for v2.2.4

This commit is contained in:
Ines Montani 2020-03-09 11:17:20 +01:00
parent 5f68004264
commit 1d6aec805d
6 changed files with 92 additions and 49 deletions

View File

@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000
@plac.annotations(
# fmt: off
lang=("model language", "positional", None, str),
train_path=("location of JSON-formatted training data", "positional", None, Path),
dev_path=("location of JSON-formatted development data", "positional", None, Path),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
base_model=("name of model to update (optional)", "option", "b", str),
pipeline=(
"Comma-separated names of pipeline components to train",
"option",
"p",
str,
),
pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
verbose=("Print additional information and explanations", "flag", "V", bool),
no_format=("Don't pretty-print the results", "flag", "NF", bool),
# fmt: on
)
def debug_data(
lang,
@ -235,13 +232,17 @@ def debug_data(
if gold_train_data["ws_ents"]:
msg.fail(
"{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
"{} invalid whitespace entity span(s)".format(
gold_train_data["ws_ents"]
)
)
has_ws_ents_error = True
if gold_train_data["punct_ents"]:
msg.warn(
"{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
"{} entity span(s) with punctuation".format(
gold_train_data["punct_ents"]
)
)
has_punct_ents_warning = True
@ -592,7 +593,13 @@ def _compile_gold(train_docs, pipeline):
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
# "Illegal" whitespace entity
data["ws_ents"] += 1
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
".",
"'",
"!",
"?",
",",
]:
# punctuation entity: could be replaced by whitespace when training with noise,
# so add a warning to alert the user to this unexpected side effect.
data["punct_ents"] += 1

View File

@ -185,10 +185,11 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi
```
| Argument | Type | Description |
| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- |
| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- |
| `lang` | positional | Model language. |
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
| `--tag-map-path`, `-tm` <Tag variant="new">2.2.3</Tag> | option | Location of JSON-formatted tag map. |
| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. |
| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
@ -368,6 +369,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. |
| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--replace-components`, `-R` | flag | Replace components from the base model. |
| `--vectors`, `-v` | option | Model to load vectors from. |
| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
@ -378,6 +380,13 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
| `--width`, `-cw` <Tag variant="new">2.2.4</Tag> | option | Width of CNN layers of `Tok2Vec` component. |
| `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag> | option | Depth of CNN layers of `Tok2Vec` component. |
| `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag> | option | Window size for CNN layers of `Tok2Vec` component. |
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag> | option | Maxout size for CNN layers of `Tok2Vec` component. |
| `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag> | flag | Whether to use character-based embedding of `Tok2Vec` component. |
| `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag> | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). |
| `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag> | option | Number of embedding rows of `Tok2Vec` component. |
| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
| `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag> | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
@ -385,6 +394,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). |
| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. |
| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. |
| `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option | Location of JSON-formatted tag map. |
| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | model, pickle | A spaCy model on each epoch. |

View File

@ -7,9 +7,10 @@ source: spacy/tokens/doc.pyx
A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and
named entities, export annotations to numpy arrays, losslessly serialize to
compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs.
The Python-level `Token` and [`Span`](/api/span) objects are views of this
array, i.e. they don't own the data themselves.
compressed binary strings. The `Doc` object holds an array of
[`TokenC`](/api/cython-structs#tokenc) structs. The Python-level `Token` and
[`Span`](/api/span) objects are views of this array, i.e. they don't own the
data themselves.
## Doc.\_\_init\_\_ {#init tag="method"}
@ -198,10 +199,11 @@ the character indices don't map to a valid span.
> ```
| Name | Type | Description |
| ----------- | ---------------------------------------- | ------------------------------------------------------- |
| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
| `start` | int | The index of the first character of the span. |
| `end` | int | The index of the last character after the span. |
| `label` | uint64 / unicode | A label to attach to the Span, e.g. for named entities. |
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
| **RETURNS** | `Span` | The newly constructed object or `None`. |

View File

@ -172,6 +172,28 @@ Remove a previously registered extension.
| `name` | unicode | Name of the extension. |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
## Span.char_span {#char_span tag="method" new="2.2.4"}
Create a `Span` object from the slice `span.text[start:end]`. Returns `None` if
the character indices don't map to a valid span.
> #### Example
>
> ```python
> doc = nlp("I like New York")
> span = doc[1:4].char_span(5, 13, label="GPE")
> assert span.text == "New York"
> ```
| Name | Type | Description |
| ----------- | ---------------------------------------- | --------------------------------------------------------------------- |
| `start` | int | The index of the first character of the span. |
| `end` | int | The index of the last character after the span. |
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
| `kb_id` | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
| **RETURNS** | `Span` | The newly constructed object or `None`. |
## Span.similarity {#similarity tag="method" model="vectors"}
Make a semantic similarity estimate. The default estimate is cosine similarity
@ -294,7 +316,7 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
> ```
| Name | Type | Description |
| ----------------- | ----- | ---------------------------------------------------- |
| ---------------- | ----- | ---------------------------------------------------- |
| `copy_user_data` | bool | Whether or not to copy the original doc's user data. |
| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. |

View File

@ -237,9 +237,9 @@ If a setting is not present in the options, the default value will be used.
> ```
| Name | Type | Description | Default |
| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
| `add_lemma` | bool | Print the lemma's in a separate row below the token texts in the `dep` visualisation. | `False` |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemma's in a separate row below the token texts. | `False` |
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |

View File

@ -95,6 +95,8 @@
"has_examples": true
},
{ "code": "hr", "name": "Croatian", "has_examples": true },
{ "code": "eu", "name": "Basque", "has_examples": true },
{ "code": "yo", "name": "Yoruba", "has_examples": true },
{ "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
{ "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
{ "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },