mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Fix formatting and update docs for v2.2.4
This commit is contained in:
parent
5f68004264
commit
1d6aec805d
|
@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
# fmt: off
|
||||
lang=("model language", "positional", None, str),
|
||||
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
||||
base_model=("name of model to update (optional)", "option", "b", str),
|
||||
pipeline=(
|
||||
"Comma-separated names of pipeline components to train",
|
||||
"option",
|
||||
"p",
|
||||
str,
|
||||
),
|
||||
pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
|
||||
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
||||
verbose=("Print additional information and explanations", "flag", "V", bool),
|
||||
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
||||
# fmt: on
|
||||
)
|
||||
def debug_data(
|
||||
lang,
|
||||
|
@ -235,13 +232,17 @@ def debug_data(
|
|||
|
||||
if gold_train_data["ws_ents"]:
|
||||
msg.fail(
|
||||
"{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
|
||||
"{} invalid whitespace entity span(s)".format(
|
||||
gold_train_data["ws_ents"]
|
||||
)
|
||||
)
|
||||
has_ws_ents_error = True
|
||||
|
||||
if gold_train_data["punct_ents"]:
|
||||
msg.warn(
|
||||
"{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
|
||||
"{} entity span(s) with punctuation".format(
|
||||
gold_train_data["punct_ents"]
|
||||
)
|
||||
)
|
||||
has_punct_ents_warning = True
|
||||
|
||||
|
@ -592,7 +593,13 @@ def _compile_gold(train_docs, pipeline):
|
|||
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
||||
# "Illegal" whitespace entity
|
||||
data["ws_ents"] += 1
|
||||
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
|
||||
if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
|
||||
".",
|
||||
"'",
|
||||
"!",
|
||||
"?",
|
||||
",",
|
||||
]:
|
||||
# punctuation entity: could be replaced by whitespace when training with noise,
|
||||
# so add a warning to alert the user to this unexpected side effect.
|
||||
data["punct_ents"] += 1
|
||||
|
|
|
@ -184,16 +184,17 @@ low data labels and more.
|
|||
$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | positional | Model language. |
|
||||
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
|
||||
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
|
||||
| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. |
|
||||
| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
|
||||
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
|
||||
| `--verbose`, `-V` | flag | Print additional information and explanations. |
|
||||
| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
|
||||
| Argument | Type | Description |
|
||||
| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | positional | Model language. |
|
||||
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
|
||||
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
|
||||
| `--tag-map-path`, `-tm` <Tag variant="new">2.2.3</Tag> | option | Location of JSON-formatted tag map. |
|
||||
| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. |
|
||||
| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
|
||||
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
|
||||
| `--verbose`, `-V` | flag | Print additional information and explanations. |
|
||||
| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
|
||||
|
||||
<Accordion title="Example output">
|
||||
|
||||
|
@ -368,6 +369,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
|
|||
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
|
||||
| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. |
|
||||
| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
|
||||
| `--replace-components`, `-R` | flag | Replace components from the base model. |
|
||||
| `--vectors`, `-v` | option | Model to load vectors from. |
|
||||
| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
|
||||
| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
|
||||
|
@ -378,6 +380,13 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
|
|||
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
|
||||
| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
|
||||
| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
|
||||
| `--width`, `-cw` <Tag variant="new">2.2.4</Tag> | option | Width of CNN layers of `Tok2Vec` component. |
|
||||
| `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag> | option | Depth of CNN layers of `Tok2Vec` component. |
|
||||
| `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag> | option | Window size for CNN layers of `Tok2Vec` component. |
|
||||
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag> | option | Maxout size for CNN layers of `Tok2Vec` component. |
|
||||
| `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag> | flag | Whether to use character-based embedding of `Tok2Vec` component. |
|
||||
| `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag> | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). |
|
||||
| `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag> | option | Number of embedding rows of `Tok2Vec` component. |
|
||||
| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
|
||||
| `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag> | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). |
|
||||
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
|
||||
|
@ -385,6 +394,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
|
|||
| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). |
|
||||
| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. |
|
||||
| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. |
|
||||
| `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option | Location of JSON-formatted tag map. |
|
||||
| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | model, pickle | A spaCy model on each epoch. |
|
||||
|
|
|
@ -7,9 +7,10 @@ source: spacy/tokens/doc.pyx
|
|||
|
||||
A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and
|
||||
named entities, export annotations to numpy arrays, losslessly serialize to
|
||||
compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs.
|
||||
The Python-level `Token` and [`Span`](/api/span) objects are views of this
|
||||
array, i.e. they don't own the data themselves.
|
||||
compressed binary strings. The `Doc` object holds an array of
|
||||
[`TokenC`](/api/cython-structs#tokenc) structs. The Python-level `Token` and
|
||||
[`Span`](/api/span) objects are views of this array, i.e. they don't own the
|
||||
data themselves.
|
||||
|
||||
## Doc.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
@ -197,13 +198,14 @@ the character indices don't map to a valid span.
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------------- | ------------------------------------------------------- |
|
||||
| `start` | int | The index of the first character of the span. |
|
||||
| `end` | int | The index of the last character after the span. |
|
||||
| `label` | uint64 / unicode | A label to attach to the Span, e.g. for named entities. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
|
||||
| `start` | int | The index of the first character of the span. |
|
||||
| `end` | int | The index of the first character after the span. |
|
||||
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
|
||||
## Doc.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
|
|
|
@ -172,6 +172,28 @@ Remove a previously registered extension.
|
|||
| `name` | unicode | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
|
||||
|
||||
## Span.char_span {#char_span tag="method" new="2.2.4"}
|
||||
|
||||
Create a `Span` object from the slice `span.text[start:end]`. Returns `None` if
|
||||
the character indices don't map to a valid span.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("I like New York")
|
||||
> span = doc[1:4].char_span(5, 13, label="GPE")
|
||||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------------- | --------------------------------------------------------------------- |
|
||||
| `start` | int | The index of the first character of the span. |
|
||||
| `end` | int | The index of the first character after the span. |
|
||||
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
|
||||
| `kb_id` | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
|
||||
## Span.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
Make a semantic similarity estimate. The default estimate is cosine similarity
|
||||
|
@ -293,10 +315,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
|
|||
> assert doc2.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | ----- | ---------------------------------------------------- |
|
||||
| `copy_user_data` | bool | Whether or not to copy the original doc's user data. |
|
||||
| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. |
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ----- | ---------------------------------------------------- |
|
||||
| `copy_user_data` | bool | Whether or not to copy the original doc's user data. |
|
||||
| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. |
|
||||
|
||||
## Span.root {#root tag="property" model="parser"}
|
||||
|
||||
|
|
|
@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used.
|
|||
> displacy.serve(doc, style="dep", options=options)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
|
||||
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
|
||||
| `add_lemma` | bool | Print the lemma's in a separate row below the token texts in the `dep` visualisation. | `False` |
|
||||
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
|
||||
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` |
|
||||
| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` |
|
||||
| `font` | unicode | Font name or font family for all text. | `'Arial'` |
|
||||
| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
|
||||
| `arrow_stroke` | int | Width of arrow path in px. | `2` |
|
||||
| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
|
||||
| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
|
||||
| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
|
||||
| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
|
||||
| Name | Type | Description | Default |
|
||||
| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
|
||||
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
|
||||
| `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemmas in a separate row below the token texts. | `False` |
|
||||
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
|
||||
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` |
|
||||
| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` |
|
||||
| `font` | unicode | Font name or font family for all text. | `'Arial'` |
|
||||
| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
|
||||
| `arrow_stroke` | int | Width of arrow path in px. | `2` |
|
||||
| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
|
||||
| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
|
||||
| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
|
||||
| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
|
||||
|
||||
#### Named Entity Visualizer options {#displacy_options-ent}
|
||||
|
||||
|
|
|
@ -95,6 +95,8 @@
|
|||
"has_examples": true
|
||||
},
|
||||
{ "code": "hr", "name": "Croatian", "has_examples": true },
|
||||
{ "code": "eu", "name": "Basque", "has_examples": true },
|
||||
{ "code": "yo", "name": "Yoruba", "has_examples": true },
|
||||
{ "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
|
||||
{ "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
|
||||
{ "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
|
||||
|
|
Loading…
Reference in New Issue
Block a user