From 3dd5f409ec874fbb57cf020577eeff03b5c98bc6 Mon Sep 17 00:00:00 2001 From: walterhenry <55140654+walterhenry@users.noreply.github.com> Date: Thu, 24 Sep 2020 13:15:28 +0200 Subject: [PATCH 1/7] Proofreading Proofread some API docs --- website/docs/api/architectures.md | 14 +++++++------- website/docs/api/attributeruler.md | 4 ++-- website/docs/api/cli.md | 4 ++-- website/docs/api/data-formats.md | 4 ++-- website/docs/api/dependencyparser.md | 3 +-- website/docs/api/doc.md | 2 +- website/docs/api/entitylinker.md | 2 +- website/docs/api/entityrecognizer.md | 4 ++-- website/docs/api/entityruler.md | 2 +- website/docs/api/example.md | 8 ++++---- website/docs/api/language.md | 16 +++++++--------- website/docs/api/lemmatizer.md | 2 +- website/docs/api/matcher.md | 2 +- website/docs/api/morphology.md | 4 ++-- website/docs/api/pipeline-functions.md | 2 +- website/docs/api/sentencerecognizer.md | 2 +- website/docs/api/sentencizer.md | 4 ++-- website/docs/api/span.md | 2 +- 18 files changed, 39 insertions(+), 42 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 30d863b17..698e1ee56 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline. Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through -a feed-forward subnetwork to build a mixed representations. The features used +a feed-forward subnetwork to build mixed representations. The features used are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained static vectors can also be incorporated into the concatenated @@ -170,7 +170,7 @@ representation. > nC = 8 > ``` -Construct an embedded representations based on character embeddings, using a +Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. Padding is used in the center for words that are too short. @@ -392,7 +392,7 @@ a single token vector given zero or more wordpiece vectors. > ``` Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does -**not** allow multiple components to share the transformer weights, and does +**not** allow multiple components to share the transformer weights and does **not** allow the transformer to set annotations into the [`Doc`](/api/doc) object, but it's a **simpler solution** if you only need the transformer within one component. @@ -436,7 +436,7 @@ might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python) helpful for background information. The neural network state prediction model consists of either two or three subnetworks: -- **tok2vec**: Map each token into a vector representations. This subnetwork is +- **tok2vec**: Map each token into a vector representation. This subnetwork is run once for each batch. - **lower**: Construct a feature-specific vector for each `(token, feature)` pair. This is also run once for each batch. Constructing the state @@ -573,14 +573,14 @@ architecture is usually less accurate than the ensemble, but runs faster. > nO = null > ``` -An ngram "bag-of-words" model. This architecture should run much faster than the +An n-gram "bag-of-words" model. 
This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. | Name | Description | | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | -| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ | +| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | @@ -594,7 +594,7 @@ into the "real world". This requires 3 main components: synonyms and prior probabilities. - A candidate generation step to produce a set of likely identifiers, given a certain textual mention. -- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the +- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the most plausible ID from the set of candidates. ### spacy.EntityLinker.v1 {#EntityLinker} diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index 53c8c46cf..60fda6bda 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -71,7 +71,7 @@ pattern_dicts = [ ## AttributeRuler.\_\_call\_\_ {#call tag="method"} -Apply the attribute ruler to a Doc, setting token attributes for tokens matched +Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched by the provided patterns. | Name | Description | @@ -256,6 +256,6 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | ---------- | -------------------------------------------------------------- | | `vocab` | The shared [`Vocab`](/api/vocab). | -| `patterns` | The Matcher patterns. You usually don't want to exclude this. | +| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. | | `attrs` | The attributes to set. You usually don't want to exclude this. | | `indices` | The token indices. You usually don't want to exclude this. | diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8449d23e1..2a216f5f8 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -81,7 +81,7 @@ $ python -m spacy info [model] [--markdown] [--silent] Find all trained pipeline packages installed in the current environment and check whether they are compatible with the currently installed version of spaCy. Should be run after upgrading spaCy via `pip install -U spacy` to ensure that -all installed packages are can be used with the new version. It will show a list +all installed packages can be used with the new version. It will show a list of packages and their installed versions. If any package is out of date, the latest compatible versions and command for updating are shown. 
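A typical upgrade-and-check sequence might look like the following (a sketch only; the list of packages and versions reported will depend on your environment):

```cli
$ pip install -U spacy
$ python -m spacy validate
```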
@@ -406,7 +406,7 @@ File /path/to/spacy/training/corpus.py (line 18) ### debug data {#debug-data tag="command"} -Analyze, debug, and validate your training and development data. Get useful +Analyze, debug and validate your training and development data. Get useful stats, and find problems like invalid entity annotations, cyclic dependencies, low data labels and more. diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 79ecb08b3..7c7b58a15 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -188,7 +188,7 @@ Typically, the extension for these binary files is `.spacy`, and they are used as input format for specifying a [training corpus](/api/corpus) and for spaCy's CLI [`train`](/api/cli#train) command. The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's previous -[JSON format](#json-input) to the new binary format format. It also supports +[JSON format](#json-input) to the new binary format. It also supports conversion of the `.conllu` format used by the [Universal Dependencies corpora](https://github.com/UniversalDependencies). @@ -252,7 +252,7 @@ $ python -m spacy convert ./data.json ./output.spacy -Here's an example of dependencies, part-of-speech tags and names entities, taken +Here's an example of dependencies, part-of-speech tags and named entities, taken from the English Wall Street Journal portion of the Penn Treebank: ```json diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 8af4455d3..7e809c642 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -21,8 +21,7 @@ non-projective parses. The parser is trained using an **imitation learning objective**. It follows the actions predicted by the current weights, and at each state, determines which actions are compatible with the optimal parse that could be reached from the -current state. The weights such that the scores assigned to the set of optimal -actions is increased, while scores assigned to other actions are decreased. Note +current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note that more than one action may be optimal for a given state. ## Config and implementation {#config} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 88dc62c2a..b4097ddb7 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -445,7 +445,7 @@ Mark a span for merging. The `attrs` will be applied to the resulting token (if they're context-dependent token attributes like `LEMMA` or `DEP`) or to the underlying lexeme (if they're context-independent lexical attributes like `LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a -dictionary mapping attribute names to values as the `"_"` key. +dictionary mapping attribute name to values as the `"_"` key. > #### Example > diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 9cb35b487..890548f0e 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -94,7 +94,7 @@ providing custom registered functions. ## EntityLinker.\_\_call\_\_ {#call tag="method"} -Apply the pipe to one document. The document is modified in place, and returned. +Apply the pipe to one document. The document is modified in place and returned. 
This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe) diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 8af73f44b..d22dae12c 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]] | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | @@ -83,7 +83,7 @@ shortcut for this and instantiate the component using its string name and ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} -Apply the pipe to one document. The document is modified in place, and returned. +Apply the pipe to one document. The document is modified in place and returned. This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 7be44bc95..7b7e5b635 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -256,6 +256,6 @@ Get all patterns that were added to the entity ruler. | Name | Description | | ----------------- | --------------------------------------------------------------------------------------------------------------------- | | `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ | -| `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ | +| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ | | `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ | | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ | diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 668c8028f..2811f4d91 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -33,8 +33,8 @@ both documents. | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------ | -| `predicted` | The document containing (partial) predictions. Can not be `None`. 
~~Doc~~ | -| `reference` | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~ | +| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ | +| `reference` | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~ | | _keyword-only_ | | | `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ | @@ -58,8 +58,8 @@ see the [training format documentation](/api/data-formats#dict-input). | Name | Description | | -------------- | ------------------------------------------------------------------------- | -| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ | -| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ | +| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ | +| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ | | **RETURNS** | The newly constructed object. ~~Example~~ | ## Example.text {#text tag="property"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index ffdae9ec6..92663c44a 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -46,9 +46,7 @@ information in [`Language.meta`](/api/language#meta) and not to configure the ## Language.from_config {#from_config tag="classmethod" new="3"} Create a `Language` object from a loaded config. Will set up the tokenizer and -language data, add pipeline components based on the pipeline and components -define in the config and validate the results. If no config is provided, the -default config of the given language is used. This is also how spaCy loads a +language data, add pipeline components based on the pipeline and add pipeline components based on the definitions specified in the config. If no config is provided, the default config of the given language is used. This is also how spaCy loads a model under the hood based on its [`config.cfg`](/api/data-formats#config). > #### Example @@ -107,7 +105,7 @@ decorator. For more details and examples, see the | `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | -| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[Doc], Doc]]~~ | +| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ | ## Language.factory {#factory tag="classmethod"} @@ -155,7 +153,7 @@ examples, see the | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | | `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | | `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. 
Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | -| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | +| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | ## Language.\_\_call\_\_ {#call tag="method"} @@ -602,7 +600,7 @@ does nothing. ## Language.enable_pipe {#enable_pipe tag="method" new="3"} -Enable a previously disable component (e.g. via +Enable a previously disabled component (e.g. via [`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is already enabled, this method does nothing. @@ -629,7 +627,7 @@ pipeline will be restored to the initial state at the end of the block. Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method you can use to undo your changes. You can specify either `disable` (as a list or string), or `enable`. In the latter case, all components not in the `enable` -list, will be disabled. Under the hood, this method calls into +list will be disabled. Under the hood, this method calls into [`disable_pipe`](/api/language#disable_pipe) and [`enable_pipe`](/api/language#enable_pipe). @@ -662,7 +660,7 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: | -------------- | ------------------------------------------------------------------------------------------------------ | | _keyword-only_ | | | `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ | -| `enable` | Names(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ | +| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ | | **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ | ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"} @@ -874,7 +872,7 @@ Loads state from a directory, including all data that was saved with the -Keep in mind that this method **only loads serialized state** and doesn't set up +Keep in mind that this method **only loads the serialized state** and doesn't set up the `nlp` object. This means that it requires the correct language class to be initialized and all pipeline components to be added to the pipeline. If you want to load a serialized pipeline from a directory, you should use diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index f9978dcf9..3693429c4 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -38,7 +38,7 @@ The default config is defined by the pipeline component factory and describes how the component should be configured. You can override its settings via the `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your [`config.cfg` for training](/usage/training#config). For examples of the lookups -data formats used by the lookup and rule-based lemmatizers, see +data format used by the lookup and rule-based lemmatizers, see [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). 
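As a rough sketch of the `config` override described above (it assumes the optional [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package is installed, so the required tables can be loaded):

```python
import spacy

# Add a lemmatizer to a blank pipeline and override its default settings.
# The "lookup" mode relies on tables provided by spacy-lookups-data.
nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
```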
> #### Example diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 1f1946be5..3b885727b 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -61,7 +61,7 @@ matched: | `!` | Negate the pattern, by requiring it to match exactly 0 times. | | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | | `+` | Require the pattern to match 1 or more times. | -| `*` | Allow the pattern to match zero or more times. | +| `*` | Allow the pattern to match 0 or more times. | Token patterns can also map to a **dictionary of properties** instead of a single value to indicate whether the expected value is a member of a list or how diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index 5d5324061..e64f26bdd 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -12,7 +12,7 @@ container storing a single morphological analysis. ## Morphology.\_\_init\_\_ {#init tag="method"} -Create a Morphology object. +Create a `Morphology` object. > #### Example > @@ -101,7 +101,7 @@ representation. | Name | Description | | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- | | `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ | -| **RETURNS** | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | +| **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | ## Attributes {#attributes} diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 0dc03a16a..8bb52d0f9 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -26,7 +26,7 @@ Merge noun chunks into a single token. Also available via the string name -Since noun chunks require part-of-speech tags and the dependency parse, make +Since noun chunks require part-of-speech tags and the dependency parser, make sure to add this component _after_ the `"tagger"` and `"parser"` components. By default, `nlp.add_pipe` will add components to the end of the pipeline and after all other components. diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index acf94fb8e..131ef26ce 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -202,7 +202,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"} Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the -current model to make predictions similar to an initial model, to try to address +current model to make predictions similar to an initial model to try to address the "catastrophic forgetting" problem. This feature is experimental. 
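A rough sketch of how a rehearsal update could be wired up, assuming `nlp` is an already trained pipeline with a `senter` component and `examples` is a batch of [`Example`](/api/example) objects:

```python
# Sketch only: rehearse() is experimental and may change.
senter = nlp.get_pipe("senter")
optimizer = nlp.resume_training()
losses = {}
senter.rehearse(examples, sgd=optimizer, losses=losses)
print(losses)
```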
> #### Example diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index ae31e4ddf..594a85f74 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -8,7 +8,7 @@ api_string_name: sentencizer api_trainable: false --- -A simple pipeline component, to allow custom sentence boundary detection logic +A simple pipeline component to allow custom sentence boundary detection logic that doesn't require the dependency parse. By default, sentence segmentation is performed by the [`DependencyParser`](/api/dependencyparser), so the `Sentencizer` lets you implement a simpler, rule-based strategy that doesn't @@ -130,7 +130,7 @@ Score a batch of examples. ## Sentencizer.to_disk {#to_disk tag="method"} -Save the sentencizer settings (punctuation characters) a directory. Will create +Save the sentencizer settings (punctuation characters) to a directory. Will create a file `sentencizer.json`. This also happens automatically when you save an `nlp` object with a sentencizer added to its pipeline. diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 1c7bc9592..242ceaed0 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -8,7 +8,7 @@ A slice from a [`Doc`](/api/doc) object. ## Span.\_\_init\_\_ {#init tag="method"} -Create a Span object from the slice `doc[start : end]`. +Create a `Span` object from the slice `doc[start : end]`. > #### Example > From a976da168c74227281bbdc7b2aa4ab93a0f2afba Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 28 Sep 2020 03:03:27 +0200 Subject: [PATCH 2/7] Support data augmentation in Corpus (#6155) * Support data augmentation in Corpus * Note initial docs for data augmentation * Add augmenter to quickstart * Fix flake8 * Format * Fix test * Update spacy/tests/training/test_training.py * Improve data augmentation arguments * Update templates * Move randomization out into caller * Refactor * Update spacy/training/augment.py * Update spacy/tests/training/test_training.py * Fix augment * Fix test --- spacy/cli/templates/quickstart_training.jinja | 1 + spacy/default_config.cfg | 5 ++ spacy/tests/training/test_training.py | 7 +- spacy/training/__init__.py | 1 + spacy/training/augment.py | 64 ++++++++++++------- spacy/training/corpus.py | 24 ++++++- spacy/util.py | 1 + website/docs/api/corpus.md | 1 + website/docs/usage/training.md | 11 ++++ 9 files changed, 86 insertions(+), 29 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 9a8b9d1d7..56faeebfa 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -270,6 +270,7 @@ factory = "{{ pipe }}" @readers = "spacy.Corpus.v1" path = ${paths.train} max_length = {{ 500 if hardware == "gpu" else 2000 }} +augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5} [corpora.dev] @readers = "spacy.Corpus.v1" diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 6f8c0aa00..63a0742e3 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -35,6 +35,11 @@ gold_preproc = false max_length = 0 # Limitation on number of training examples limit = 0 +# Apply some simply data augmentation, where we replace tokens with variations. +# This is especially useful for punctuation and case replacement, to help +# generalize beyond corpora that don't have smart-quotes, or only have smart +# quotes, etc. 
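# The function is resolved via the "augmenters" registry. Removing this line
# (or setting the value to null) falls back to no augmentation.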
+augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5} [corpora.dev] @readers = "spacy.Corpus.v1" diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index a04e6aadd..5311fae1e 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -4,7 +4,7 @@ from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json from spacy.training.example import Example from spacy.training.converters import json_to_docs -from spacy.training.augment import make_orth_variants_example +from spacy.training.augment import create_orth_variants_augmenter from spacy.lang.en import English from spacy.tokens import Doc, DocBin from spacy.util import get_words_and_spaces, minibatch @@ -496,9 +496,8 @@ def test_make_orth_variants(doc): output_file = tmpdir / "roundtrip.spacy" DocBin(docs=[doc]).to_disk(output_file) # due to randomness, test only that this runs with no errors for now - reader = Corpus(output_file) - train_example = next(reader(nlp)) - make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) + reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)) + train_examples = list(reader(nlp)) @pytest.mark.skip("Outdated") diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 9172dde25..f71a5f521 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,6 +1,7 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples # noqa: F401 from .align import Alignment # noqa: F401 +from .augment import dont_augment, orth_variants_augmenter # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 4a01c8589..4d487ce93 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -1,30 +1,50 @@ +from typing import Callable import random import itertools +import copy +from functools import partial +from ..util import registry -def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming - raw_text = example.text - orig_dict = example.to_dict() - variant_text, variant_token_annot = make_orth_variants( - nlp, raw_text, orig_dict["token_annotation"], orth_variant_level - ) - doc = nlp.make_doc(variant_text) - orig_dict["token_annotation"] = variant_token_annot - return example.from_dict(doc, orig_dict) +@registry.augmenters("spacy.dont_augment.v1") +def create_null_augmenter(): + return dont_augment -def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): - if random.random() >= orth_variant_level: - return raw_text, orig_token_dict - if not orig_token_dict: - return raw_text, orig_token_dict - raw = raw_text - token_dict = orig_token_dict - lower = False - if random.random() >= 0.5: - lower = True - if raw is not None: - raw = raw.lower() +@registry.augmenters("spacy.orth_variants.v1") +def create_orth_variants_augmenter(level: float, lower: float) -> Callable: + """Create a data augmentation callback that uses orth-variant replacement. + The callback can be added to a corpus or other data iterator during training. 
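    level (float): The probability that a given example is augmented at all;
        examples that fail this check are yielded unchanged.
    lower (float): The probability that an augmented example is also lowercased.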
+ """ + return partial(orth_variants_augmenter, level=level, lower=lower) + + +def dont_augment(nlp, example): + yield example + + +def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0): + if random.random() >= level: + yield example + else: + raw_text = example.text + orig_dict = example.to_dict() + if not orig_dict["token_annotation"]: + yield example + else: + variant_text, variant_token_annot = make_orth_variants( + nlp, + raw_text, + orig_dict["token_annotation"], + lower=raw_text is not None and random.random() < lower + ) + doc = nlp.make_doc(variant_text) + orig_dict["token_annotation"] = variant_token_annot + yield example.from_dict(doc, orig_dict) + + +def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False): + orig_token_dict = copy.deepcopy(token_dict) orth_variants = nlp.vocab.lookups.get_table("orth_variants", {}) ndsv = orth_variants.get("single", []) ndpv = orth_variants.get("paired", []) @@ -103,7 +123,7 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): # something went wrong, abort # (add a warning message?) if not match_found: - return raw_text, orig_token_dict + return raw, orig_token_dict # add following whitespace while raw_idx < len(raw) and raw[raw_idx].isspace(): variant_raw += raw[raw_idx] diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 12bda486e..90eb62474 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -1,9 +1,11 @@ import warnings from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable +from typing import Optional from pathlib import Path import srsly from .. import util +from .augment import dont_augment from .example import Example from ..errors import Warnings from ..tokens import DocBin, Doc @@ -18,9 +20,19 @@ FILE_TYPE = ".spacy" @util.registry.readers("spacy.Corpus.v1") def create_docbin_reader( - path: Path, gold_preproc: bool, max_length: int = 0, limit: int = 0 + path: Path, + gold_preproc: bool, + max_length: int = 0, + limit: int = 0, + augmenter: Optional[Callable] = None, ) -> Callable[["Language"], Iterable[Example]]: - return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit) + return Corpus( + path, + gold_preproc=gold_preproc, + max_length=max_length, + limit=limit, + augmenter=augmenter, + ) @util.registry.readers("spacy.JsonlReader.v1") @@ -70,6 +82,8 @@ class Corpus: 0, which indicates no limit. limit (int): Limit corpus to a subset of examples, e.g. for debugging. Defaults to 0, which indicates no limit. + augment (Callable[Example, Iterable[Example]]): Optional data augmentation + function, to extrapolate additional examples from your annotations. DOCS: https://nightly.spacy.io/api/corpus """ @@ -81,11 +95,13 @@ class Corpus: limit: int = 0, gold_preproc: bool = False, max_length: int = 0, + augmenter: Optional[Callable] = None, ) -> None: self.path = util.ensure_path(path) self.gold_preproc = gold_preproc self.max_length = max_length self.limit = limit + self.augmenter = augmenter if augmenter is not None else dont_augment def __call__(self, nlp: "Language") -> Iterator[Example]: """Yield examples from the data. 
@@ -100,7 +116,9 @@ class Corpus: examples = self.make_examples_gold_preproc(nlp, ref_docs) else: examples = self.make_examples(nlp, ref_docs) - yield from examples + for real_eg in examples: + for augmented_eg in self.augmenter(nlp, real_eg): + yield augmented_eg def _make_example( self, nlp: "Language", reference: Doc, gold_preproc: bool diff --git a/spacy/util.py b/spacy/util.py index 01232f5c5..1cc7abf57 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -81,6 +81,7 @@ class registry(thinc.registry): callbacks = catalogue.create("spacy", "callbacks") batchers = catalogue.create("spacy", "batchers", entry_points=True) readers = catalogue.create("spacy", "readers", entry_points=True) + augmenters = catalogue.create("spacy", "augmenters", entry_points=True) loggers = catalogue.create("spacy", "loggers", entry_points=True) # These are factories registered via third-party packages and the # spacy_factories entry point. This registry only exists so we can easily diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 2b308d618..e7d6773e6 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -74,6 +74,7 @@ train/test skew. |  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ | | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | +| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ ## Corpus.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 54be6b367..eb02b135a 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -6,6 +6,7 @@ menu: - ['Introduction', 'basics'] - ['Quickstart', 'quickstart'] - ['Config System', 'config'] + - ['Custom Functions', 'custom-functions'] - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] @@ -505,6 +506,16 @@ still look good. + + ## Custom Functions {#custom-functions} Registered functions in the training config files can refer to built-in From cd21eb24851fde435d8bd3f2c8d15c5f82d66813 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 28 Sep 2020 16:45:48 +0200 Subject: [PATCH 3/7] upgrade pydantic pin for thinc's field.default_factory --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index d696cd44b..3ff8bea3d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ pathy numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.3.0,<2.0.0 +pydantic>=1.5.0,<2.0.0 pytokenizations # Official Python utilities setuptools diff --git a/setup.cfg b/setup.cfg index b55c0d376..92732dc33 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ install_requires = tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 - pydantic>=1.3.0,<2.0.0 + pydantic>=1.5.0,<2.0.0 pytokenizations # Official Python utilities setuptools From 3360825e0042a535e0da08d045f6147425edb00a Mon Sep 17 00:00:00 2001 From: walterhenry <55140654+walterhenry@users.noreply.github.com> Date: Mon, 28 Sep 2020 16:50:15 +0200 Subject: [PATCH 4/7] Proofreading Another round of proofreading. All the API docs have been read through and I've grazed the Usage docs. 
--- website/docs/api/doc.md | 3 +-- website/docs/api/pipeline-functions.md | 2 +- website/docs/api/span.md | 2 +- website/docs/api/textcategorizer.md | 8 ++++---- website/docs/api/tok2vec.md | 4 ++-- website/docs/api/token.md | 14 +++++++------- website/docs/api/tokenizer.md | 8 ++++---- website/docs/api/top-level.md | 16 ++++++++-------- website/docs/api/transformer.md | 18 +++++++++--------- website/docs/api/vectors.md | 6 +++--- website/docs/api/vocab.md | 12 ++++++------ website/docs/usage/embeddings-transformers.md | 2 +- 12 files changed, 47 insertions(+), 48 deletions(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index b4097ddb7..151b00a0a 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -444,8 +444,7 @@ invalidated, although they may accidentally continue to work. Mark a span for merging. The `attrs` will be applied to the resulting token (if they're context-dependent token attributes like `LEMMA` or `DEP`) or to the underlying lexeme (if they're context-independent lexical attributes like -`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a -dictionary mapping attribute name to values as the `"_"` key. +`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute name to values. > #### Example > diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 8bb52d0f9..0dc03a16a 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -26,7 +26,7 @@ Merge noun chunks into a single token. Also available via the string name -Since noun chunks require part-of-speech tags and the dependency parser, make +Since noun chunks require part-of-speech tags and the dependency parse, make sure to add this component _after_ the `"tagger"` and `"parser"` components. By default, `nlp.add_pipe` will add components to the end of the pipeline and after all other components. diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 242ceaed0..7fa1aaa38 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -187,7 +187,7 @@ the character indices don't map to a valid span. | Name | Description | | ------------------------------------ | ----------------------------------------------------------------------------------------- | | `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index b68039094..be4052f46 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -153,7 +153,7 @@ setting up the label scheme based on the data. ## TextCategorizer.predict {#predict tag="method"} -Apply the component's model to a batch of [`Doc`](/api/doc) objects, without +Apply the component's model to a batch of [`Doc`](/api/doc) objects without modifying them. > #### Example @@ -170,7 +170,7 @@ modifying them. 
## TextCategorizer.set_annotations {#set_annotations tag="method"} -Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. +Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores. > #### Example > @@ -213,7 +213,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"} Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the -current model to make predictions similar to an initial model, to try to address +current model to make predictions similar to an initial model to try to address the "catastrophic forgetting" problem. This feature is experimental. > #### Example @@ -286,7 +286,7 @@ Create an optimizer for the pipeline component. ## TextCategorizer.use_params {#use_params tag="method, contextmanager"} -Modify the pipe's model, to use the given parameter values. +Modify the pipe's model to use the given parameter values. > #### Example > diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 5c7214edc..2633a7a1a 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -151,7 +151,7 @@ setting up the label scheme based on the data. ## Tok2Vec.predict {#predict tag="method"} -Apply the component's model to a batch of [`Doc`](/api/doc) objects, without +Apply the component's model to a batch of [`Doc`](/api/doc) objects without modifying them. > #### Example @@ -224,7 +224,7 @@ Create an optimizer for the pipeline component. ## Tok2Vec.use_params {#use_params tag="method, contextmanager"} -Modify the pipe's model, to use the given parameter values. At the end of the +Modify the pipe's model to use the given parameter values. At the end of the context, the original parameters are restored. > #### Example diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 0860797aa..068a1d2d2 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -243,7 +243,7 @@ A sequence of the token's immediate syntactic children. ## Token.lefts {#lefts tag="property" model="parser"} -The leftward immediate children of the word, in the syntactic dependency parse. +The leftward immediate children of the word in the syntactic dependency parse. > #### Example > @@ -259,7 +259,7 @@ The leftward immediate children of the word, in the syntactic dependency parse. ## Token.rights {#rights tag="property" model="parser"} -The rightward immediate children of the word, in the syntactic dependency parse. +The rightward immediate children of the word in the syntactic dependency parse. > #### Example > @@ -275,7 +275,7 @@ The rightward immediate children of the word, in the syntactic dependency parse. ## Token.n_lefts {#n_lefts tag="property" model="parser"} -The number of leftward immediate children of the word, in the syntactic +The number of leftward immediate children of the word in the syntactic dependency parse. > #### Example @@ -291,7 +291,7 @@ dependency parse. ## Token.n_rights {#n_rights tag="property" model="parser"} -The number of rightward immediate children of the word, in the syntactic +The number of rightward immediate children of the word in the syntactic dependency parse. > #### Example @@ -422,8 +422,8 @@ The L2 norm of the token's vector representation. | `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ | | `lower` | Lowercase form of the token. 
~~int~~ | | `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ | -| `shape` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | -| `shape_` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | +| `shape` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | +| `shape_` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | | `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ | | `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ | | `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ | @@ -451,7 +451,7 @@ The L2 norm of the token's vector representation. | `tag` | Fine-grained part-of-speech. ~~int~~ | | `tag_` | Fine-grained part-of-speech. ~~str~~ | | `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | -| `morph_` 3 | Morphological analysis in the Universal Dependencies [FEATS]https://universaldependencies.org/format.html#morphological-annotation format. ~~str~~ | +| `morph_` 3 | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | | `dep` | Syntactic dependency relation. ~~int~~ | | `dep_` | Syntactic dependency relation. ~~str~~ | | `lang` | Language of the parent document's vocabulary. ~~int~~ | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 0158c5589..8ea5a1f65 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -1,6 +1,6 @@ --- title: Tokenizer -teaser: Segment text into words, punctuations marks etc. +teaser: Segment text into words, punctuations marks, etc. tag: class source: spacy/tokenizer.pyx --- @@ -15,14 +15,14 @@ source: spacy/tokenizer.pyx Segment text, and create `Doc` objects with the discovered segment boundaries. For a deeper understanding, see the docs on [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works). -The tokenizer is typically created automatically when the a +The tokenizer is typically created automatically when a [`Language`](/api/language) subclass is initialized and it reads its settings like punctuation and special case rules from the [`Language.Defaults`](/api/language#defaults) provided by the language subclass. ## Tokenizer.\_\_init\_\_ {#init tag="method"} -Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples +Create a `Tokenizer` to create `Doc` objects given unicode text. 
For examples of how to construct a custom tokenizer with different tokenization rules, see the [usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers). @@ -87,7 +87,7 @@ Tokenize a stream of texts. | ------------ | ------------------------------------------------------------------------------------ | | `texts` | A sequence of unicode texts. ~~Iterable[str]~~ | | `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ | -| **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ | +| **YIELDS** | The tokenized `Doc` objects, in order. ~~Doc~~ | ## Tokenizer.find_infix {#find_infix tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f52c63f18..94260cacb 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -196,7 +196,7 @@ browser. Will run a simple web server. | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | -| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | +| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | | `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ | | `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ | @@ -221,7 +221,7 @@ Render a dependency parse tree or named entity visualization. | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | -| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | +| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | | `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ | | **RETURNS** | The rendered HTML markup. ~~str~~ | @@ -242,7 +242,7 @@ If a setting is not present in the options, the default value will be used. | Name | Description | | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | | `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | -| `add_lemma` 2.2.4 | Print the lemma's in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `add_lemma` 2.2.4 | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | | `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. 
~~bool~~ | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | @@ -611,7 +611,7 @@ sequences in the batch. Encode labelled spans into per-token tags, using the [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, -Out). Returns a list of strings, describing the tags. Each tag string will be of +Out). Returns a list of strings, describing the tags. Each tag string will be in the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets don't align with the tokenization in the `Doc` object. The training algorithm @@ -716,7 +716,7 @@ decorator. ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} Check whether a `Language` subclass is already loaded. `Language` subclasses are -loaded lazily, to avoid expensive setup code associated with the language data. +loaded lazily to avoid expensive setup code associated with the language data. > #### Example > @@ -904,7 +904,7 @@ Compile a sequence of prefix rules into a regex object. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} @@ -921,7 +921,7 @@ Compile a sequence of suffix rules into a regex object. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_infix_regex {#util.compile_infix_regex tag="function"} @@ -938,7 +938,7 @@ Compile a sequence of infix rules into a regex object. | Name | Description | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- | | `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.minibatch {#util.minibatch tag="function" new="2"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index d5bcef229..957ce69a4 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -186,7 +186,7 @@ setting up the label scheme based on the data. 
## Transformer.predict {#predict tag="method"} -Apply the component's model to a batch of [`Doc`](/api/doc) objects, without +Apply the component's model to a batch of [`Doc`](/api/doc) objects without modifying them. > #### Example @@ -203,7 +203,7 @@ modifying them. ## Transformer.set_annotations {#set_annotations tag="method"} -Assign the extracted features to the Doc objects. By default, the +Assign the extracted features to the `Doc` objects. By default, the [`TransformerData`](/api/transformer#transformerdata) object is written to the [`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations` callback is then called, if provided. @@ -272,7 +272,7 @@ Create an optimizer for the pipeline component. ## Transformer.use_params {#use_params tag="method, contextmanager"} -Modify the pipe's model, to use the given parameter values. At the end of the +Modify the pipe's model to use the given parameter values. At the end of the context, the original parameters are restored. > #### Example @@ -388,8 +388,8 @@ by this class. Instances of this class are typically assigned to the | Name | Description | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | -| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ | +| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | +| `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ | | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `width` | The width of the last hidden layer. ~~int~~ | @@ -409,7 +409,7 @@ objects to associate the outputs to each [`Doc`](/api/doc) in the batch. 
| Name | Description | | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | +| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | | `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ | | `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ | | `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | @@ -439,10 +439,10 @@ Split a `TransformerData` object that represents a batch into a list with one ## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} Span getters are functions that take a batch of [`Doc`](/api/doc) objects and -return a lists of [`Span`](/api/span) objects for each doc, to be processed by -the transformer. This is used to manage long documents, by cutting them into +return a lists of [`Span`](/api/span) objects for each doc to be processed by +the transformer. This is used to manage long documents by cutting them into smaller sequences before running the transformer. The spans are allowed to -overlap, and you can also omit sections of the Doc if they are not relevant. +overlap, and you can also omit sections of the `Doc` if they are not relevant. Span getters can be referenced in the `[components.transformer.model.get_spans]` block of the config to customize the sequences processed by the transformer. You diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 7e97b4ca3..ba2d5ab42 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -290,7 +290,7 @@ If a table is full, it can be resized using ## Vectors.n_keys {#n_keys tag="property"} Get the number of keys in the table. Note that this is the number of _all_ keys, -not just unique vectors. If several keys are mapped are mapped to the same +not just unique vectors. If several keys are mapped to the same vectors, they will be counted individually. > #### Example @@ -307,10 +307,10 @@ vectors, they will be counted individually. 
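To make the span getter contract described above concrete, a sketch of a custom getter that cuts each `Doc` at sentence boundaries before it is passed to the transformer. The registry name `"custom_sent_spans"` is illustrative, and sentence boundaries are assumed to be set by an earlier component such as a parser or senter.

```python
# Sketch: a registered span getter that can be referenced from the
# [components.transformer.model.get_spans] block of the config.
from typing import List
from spacy.tokens import Doc, Span
import spacy_transformers

@spacy_transformers.registry.span_getters("custom_sent_spans")
def configure_custom_sent_spans():
    def get_sent_spans(docs: List[Doc]) -> List[List[Span]]:
        # One span per sentence; spans may also overlap or skip tokens entirely.
        return [list(doc.sents) for doc in docs]
    return get_sent_spans
```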
## Vectors.most_similar {#most_similar tag="method"} -For each of the given vectors, find the `n` most similar entries to it, by +For each of the given vectors, find the `n` most similar entries to it by cosine. Queries are by vector. Results are returned as a `(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are -performed in chunks, to avoid consuming too much memory. You can set the +performed in chunks to avoid consuming too much memory. You can set the `batch_size` to control the size/space trade-off during the calculations. > #### Example diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 71a678cb3..a2ca63002 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -29,7 +29,7 @@ Create the vocabulary. | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | -| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ | +| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -150,7 +150,7 @@ rows, we would discard the vectors for "feline" and "reclined". These words would then be remapped to the closest remaining vector – so "feline" would have the same vector as "cat", and "reclined" would have the same vector as "sat". The similarities are judged by cosine. The original vectors may be large, so the -cosines are calculated in minibatches, to reduce memory usage. +cosines are calculated in minibatches to reduce memory usage. > #### Example > @@ -170,7 +170,7 @@ cosines are calculated in minibatches, to reduce memory usage. Retrieve a vector for a word in the vocabulary. Words can be looked up by string or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn` is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s -subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). +subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`). > #### Example > @@ -182,13 +182,13 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). | Name | Description | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | -| `minn` 2.1 | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | -| `maxn` 2.1 | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | +| `minn` 2.1 | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | +| `maxn` 2.1 | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | | **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. 
~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vocab.set_vector {#set_vector tag="method" new="2"} -Set a vector for a word in the vocabulary. Words can be referenced by by string +Set a vector for a word in the vocabulary. Words can be referenced by string or hash value. > #### Example diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 8dd104ead..c61d7e144 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -36,7 +36,7 @@ models such as [transformers](#transformers) is that word vectors model context around them, a transformer model like BERT can't really help you. BERT is designed to understand language **in context**, which isn't what you have. A word vectors table will be a much better fit for your task. However, if you do -have words in context — whole sentences or paragraphs of running text — word +have words in context – whole sentences or paragraphs of running text – word vectors will only provide a very rough approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a From 1d80b3dc1b23ffb2e2659d637fa073f7aebb9012 Mon Sep 17 00:00:00 2001 From: walterhenry <55140654+walterhenry@users.noreply.github.com> Date: Tue, 29 Sep 2020 12:39:10 +0200 Subject: [PATCH 5/7] Proofreading Finished with the API docs and started on the Usage, but Embedding & Transformers --- website/docs/usage/embeddings-transformers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index b00760e62..e3a8ae448 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -41,8 +41,8 @@ transformers is that word vectors model **lexical types**, rather than _tokens_. If you have a list of terms with no context around them, a transformer model like BERT can't really help you. BERT is designed to understand language **in context**, which isn't what you have. A word vectors table will be a much better -fit for your task. However, if you do have words in context — whole sentences or -paragraphs of running text — word vectors will only provide a very rough +fit for your task. However, if you do have words in context – whole sentences or +paragraphs of running text – word vectors will only provide a very rough approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a @@ -256,7 +256,7 @@ for doc in nlp.pipe(["some text", "some other text"]): ``` You can also customize how the [`Transformer`](/api/transformer) component sets -annotations onto the [`Doc`](/api/doc), by specifying a custom +annotations onto the [`Doc`](/api/doc) by specifying a custom `set_extra_annotations` function. This callback will be called with the raw input and output data for the whole batch, along with the batch of `Doc` objects, allowing you to implement whatever you need. 
The annotation setter is @@ -675,7 +675,7 @@ given you a 10% error reduction, pretraining with spaCy might give you another The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific subnetwork** within one of your components, and add additional layers to build a -network for a temporary task, that forces the model to learn something about +network for a temporary task that forces the model to learn something about sentence structure and word cooccurrence statistics. Pretraining produces a **binary weights file** that can be loaded back in at the start of training. The weights file specifies an initial set of weights. Training then proceeds as From 6a04e5adeae6387074d890988c957e7e2c4f9a34 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 29 Sep 2020 14:49:55 +0200 Subject: [PATCH 6/7] encoding UTF8 (#6161) --- spacy/cli/project/document.py | 2 +- website/docs/usage/training.md | 2 +- website/setup/jinja_to_js.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py index d0265029a..811b7c746 100644 --- a/spacy/cli/project/document.py +++ b/spacy/cli/project/document.py @@ -114,6 +114,6 @@ def project_document( content = f"{before}{content}{after}" else: msg.warn("Replacing existing file") - with output_file.open("w") as f: + with output_file.open("w", encoding="utf8") as f: f.write(content) msg.good("Saved project documentation", output_file) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index eb02b135a..97992287b 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -700,7 +700,7 @@ from pathlib import Path @spacy.registry.loggers("my_custom_logger.v1") def custom_logger(log_path): def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]: - with Path(log_path).open("w") as file_: + with Path(log_path).open("w", encoding="utf8") as file_: file_.write("step\\t") file_.write("score\\t") for pipe in nlp.pipe_names: diff --git a/website/setup/jinja_to_js.py b/website/setup/jinja_to_js.py index 114d0e172..e2eca7ffb 100644 --- a/website/setup/jinja_to_js.py +++ b/website/setup/jinja_to_js.py @@ -1256,7 +1256,7 @@ def main(template_path, output=None, data_path=None): data_str = f"export const DATA = {data}" result = compiler.get_output() if output is not None: - with output.open("w") as f: + with output.open("w", encoding="utf8") as f: f.write(f"{header}\n{result}\n{data_str}") print(f"Updated {output.parts[-1]}") else: From b486389eece1984d932472353e650e14ef1849d4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 20:48:43 +0200 Subject: [PATCH 7/7] Update website/docs/api/doc.md --- website/docs/api/doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 40fd8d531..45ecd4d8c 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -479,7 +479,7 @@ invalidated, although they may accidentally continue to work. Mark a span for merging. The `attrs` will be applied to the resulting token (if they're context-dependent token attributes like `LEMMA` or `DEP`) or to the underlying lexeme (if they're context-independent lexical attributes like -`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute name to values. +`LOWER` or `IS_STOP`). 
Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values. > #### Example >