From 1c65b3b2c0aca584c1c6d38358877695cf90453a Mon Sep 17 00:00:00 2001
From: walterhenry <55140654+walterhenry@users.noreply.github.com>
Date: Wed, 30 Sep 2020 11:33:40 +0200
Subject: [PATCH 01/10] Proofreading
A few more small things in Usage.
---
website/docs/usage/index.md | 10 +++++-----
website/docs/usage/layers-architectures.md | 2 +-
website/docs/usage/linguistic-features.md | 6 +++---
3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index ad2614175..65f5fed95 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -17,7 +17,7 @@ spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
>
> To help you make the transition from v2.x to v3.0, we've uploaded the old
> website to [**v2.spacy.io**](https://v2.spacy.io/docs). To see what's changed
-> and how to migrate, see the guide on [v3.0 guide](/usage/v3).
+> and how to migrate, see the [v3.0 guide](/usage/v3).
## Quickstart {hidden="true"}
@@ -176,7 +176,7 @@ to get the right commands for your platform and Python version.
`sudo apt-get install build-essential python-dev git`
- **macOS / OS X:** Install a recent version of
[XCode](https://developer.apple.com/xcode/), including the so-called "Command
- Line Tools". macOS and OS X ship with Python and git preinstalled.
+ Line Tools". macOS and OS X ship with Python and Git preinstalled.
- **Windows:** Install a version of the
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
or
@@ -369,7 +369,7 @@ This error may occur when running the `spacy` command from the command line.
spaCy does not currently add an entry to your `PATH` environment variable, as
this can lead to unexpected results, especially when using a virtual
environment. Instead, spaCy adds an auto-alias that maps `spacy` to
-`python -m spacy]`. If this is not working as expected, run the command with
+`python -m spacy`. If this is not working as expected, run the command with
`python -m`, yourself – for example `python -m spacy download en_core_web_sm`.
For more info on this, see the [`download`](/api/cli#download) command.
@@ -416,8 +416,8 @@ disk has some binary files that should not go through this conversion. When they
do, you get the error above. You can fix it by either changing your
[`core.autocrlf`](https://git-scm.com/book/en/v2/Customizing-Git-Git-Configuration)
setting to `"false"`, or by committing a
-[`.gitattributes`](https://git-scm.com/docs/gitattributes) file] to your
-repository to tell git on which files or folders it shouldn't do LF-to-CRLF
+[`.gitattributes`](https://git-scm.com/docs/gitattributes) file to your
+repository to tell Git on which files or folders it shouldn't do LF-to-CRLF
conversion, with an entry like `path/to/spacy/model/** -text`. After you've done
either of these, clone your repository again.
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index aefc64ece..826f9450f 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -351,7 +351,7 @@ dropout = 0.2
-Remember that it is best not to rely on any (hidden) default values, to ensure
+Remember that it is best not to rely on any (hidden) default values to ensure
that training configs are complete and experiments fully reproducible.
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index a229c18e9..9b22d63f6 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -44,7 +44,7 @@ in the [models directory](/models).
Inflectional morphology is the process by which a root form of a word is
modified by adding prefixes or suffixes that specify its grammatical function
-but do not changes its part-of-speech. We say that a **lemma** (root form) is
+but do not change its part-of-speech. We say that a **lemma** (root form) is
**inflected** (modified/combined) with one or more **morphological features** to
create a surface form. Here are some examples:
@@ -290,7 +290,7 @@ import DisplaCyLong2Html from 'images/displacy-long2.html'
Because the syntactic relations form a tree, every word has **exactly one
head**. You can therefore iterate over the arcs in the tree by iterating over
the words in the sentence. This is usually the best way to match an arc of
-interest — from below:
+interest – from below:
```python
### {executable="true"}
@@ -399,7 +399,7 @@ for descendant in subject.subtree:
Finally, the `.left_edge` and `.right_edge` attributes can be especially useful,
because they give you the first and last token of the subtree. This is the
easiest way to create a `Span` object for a syntactic phrase. Note that
-`.right_edge` gives a token **within** the subtree — so if you use it as the
+`.right_edge` gives a token **within** the subtree – so if you use it as the
end-point of a range, don't forget to `+1`!
```python
From 03e3bab64b96beb563e1d5bb8071c4f2b0fd43f3 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 14 Oct 2020 14:58:15 +0200
Subject: [PATCH 02/10] Update README.md [ci skip]
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 5d310492d..55e4c6512 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ state-of-the-art speed, convolutional **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
spaCy is commercial open-source software, released under the MIT license.
-💫 **Version 3.0 out now!**
+💫 **Version 3.0 (nightly) out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
From 1aa8e8f2af7c180294bb47047e913fa655f278a4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 14 Oct 2020 14:58:45 +0200
Subject: [PATCH 03/10] Update docs [ci skip]
---
website/docs/usage/_benchmarks-models.md | 12 ++++++------
website/docs/usage/facts-figures.md | 4 ++--
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index a604c4b57..becd313f4 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -1,14 +1,14 @@
import { Help } from 'components/typography'; import Link from 'components/link'
-
+
| Pipeline | Parser | Tagger | NER | WPS CPU words per second on CPU, higher is better | WPS GPU words per second on GPU, higher is better |
| ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
-| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
+| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | 95.5 | 98.3 | 89.7 | 1k | 8k |
| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | 92.2 | 97.4 | 85.8 | 7k | |
-| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
+| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | | 10k | |
@@ -23,9 +23,9 @@ import { Help } from 'components/typography'; import Link from 'components/link'
| Named Entity Recognition System | OntoNotes | CoNLL '03 |
| ------------------------------------------------------------------------------ | --------: | --------: |
-| spaCy RoBERTa (2020) | | 92.2 |
-| spaCy CNN (2020) | 85.3 | 88.4 |
-| spaCy CNN (2017) | 86.4 | |
+| spaCy RoBERTa (2020) | 89.7 | 91.6 |
+| spaCy CNN (2020) | 84.5 | |
+| spaCy CNN (2017) | | |
| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | 92.1 |
| Flair2 | 89.7 | 93.1 |
| BERT Base3 | - | 92.4 |
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index a31559b04..2707f68fa 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -65,8 +65,8 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
| Dependency Parsing System | UAS | LAS |
| ------------------------------------------------------------------------------ | ---: | ---: |
-| spaCy RoBERTa (2020)1 | 96.8 | 95.0 |
-| spaCy CNN (2020)1 | 93.7 | 91.8 |
+| spaCy RoBERTa (2020)1 | 95.5 | 94.3 |
+| spaCy CNN (2020)1 | | |
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 |
From 2e8dcba37947b5fc99ba5d9d581b549da0698a1a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 14 Oct 2020 14:59:09 +0200
Subject: [PATCH 04/10] Update version pins
---
pyproject.toml | 2 +-
requirements.txt | 2 +-
setup.cfg | 8 ++++----
3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index c175ded66..14a2d7690 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc>=8.0.0a44,<8.0.0a50",
+ "thinc>=8.0.0rc0,<8.1.0",
"blis>=0.4.0,<0.8.0",
"pytokenizations",
"pathy"
diff --git a/requirements.txt b/requirements.txt
index d6b6267a9..36f0d1e92 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a44,<8.0.0a50
+thinc>=8.0.0rc0,<8.1.0
blis>=0.4.0,<0.8.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index d9414a4f4..adf0c0e20 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc>=8.0.0a44,<8.0.0a50
+ thinc>=8.0.0rc0,<8.1.0
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc>=8.0.0a44,<8.0.0a50
+ thinc>=8.0.0rc0,<8.1.0
blis>=0.4.0,<0.8.0
wasabi>=0.8.0,<1.1.0
srsly>=2.3.0,<3.0.0
@@ -65,9 +65,9 @@ console_scripts =
[options.extras_require]
lookups =
- spacy_lookups_data>=1.0.0rc0,<1.0.0
+ spacy_lookups_data>=1.0.0rc0,<1.1.0
transformers =
- spacy_transformers>=1.0.0a22,<1.0.0
+ spacy_transformers>=1.0.0rc0,<1.1.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
From 0aa88518786ca95f5750e3a79a87967bd3558a94 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 14 Oct 2020 15:00:49 +0200
Subject: [PATCH 05/10] always return losses
---
spacy/pipeline/tagger.pyx | 5 +++--
spacy/pipeline/trainable_pipe.pyx | 2 +-
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 1b0f79cea..3be93c32c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -195,7 +195,7 @@ class Tagger(TrainablePipe):
validate_examples(examples, "Tagger.update")
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
- return
+ return losses
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
for sc in tag_scores:
@@ -233,7 +233,7 @@ class Tagger(TrainablePipe):
return
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
- return
+ return losses
set_dropout_rate(self.model, drop)
guesses, backprop = self.model.begin_update(docs)
target = self._rehearsal_model(examples)
@@ -243,6 +243,7 @@ class Tagger(TrainablePipe):
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
+ return losses
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 07cb01059..6cd73d256 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -116,7 +116,7 @@ cdef class TrainablePipe(Pipe):
validate_examples(examples, "TrainablePipe.update")
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
- return
+ return losses
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
loss, d_scores = self.get_loss(examples, scores)
From 478a14a61934e617988f70aaf692fcd6d7b1e226 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 14 Oct 2020 15:01:19 +0200
Subject: [PATCH 06/10] fix a few typos
---
website/docs/usage/layers-architectures.md | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index e348c4389..9677398cf 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -503,7 +503,7 @@ overview of the `TrainablePipe` methods used by
-### Example: Entity elation extraction component {#component-rel}
+### Example: Entity relation extraction component {#component-rel}
This section outlines an example use-case of implementing a **novel relation
extraction component** from scratch. We'll implement a binary relation
@@ -618,7 +618,7 @@ we can define our relation model in a config file as such:
# ...
[model.get_candidates]
-@misc = "rel_cand_generator.v2"
+@misc = "rel_cand_generator.v1"
max_length = 20
[model.create_candidate_tensor]
@@ -687,8 +687,8 @@ Before the model can be used, it needs to be
[initialized](/usage/training#initialization). This function receives a callback
to access the full **training data set**, or a representative sample. This data
set can be used to deduce all **relevant labels**. Alternatively, a list of
-labels can be provided to `initialize`, or you can call the
-`RelationExtractoradd_label` directly. The number of labels defines the output
+labels can be provided to `initialize`, or you can call
+`RelationExtractor.add_label` directly. The number of labels defines the output
dimensionality of the network, and will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
layers of the neural network. This is triggered by calling
@@ -729,7 +729,7 @@ and its internal model can be trained and used to make predictions.
During training, the function [`update`](/api/pipe#update) is invoked which
delegates to
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
-[`get_loss`](/api/pipe#get_loss) function that **calculate the loss** for a
+[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
batch of examples, as well as the **gradient** of loss that will be used to
update the weights of the model layers. Thinc provides several
[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
From 44e14ccae87d4077cfc3b730e76ab32bbb15cafb Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 14 Oct 2020 15:11:34 +0200
Subject: [PATCH 07/10] one more losses fix
---
spacy/pipeline/tagger.pyx | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 3be93c32c..16633a7b8 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -227,10 +227,13 @@ class Tagger(TrainablePipe):
DOCS: https://nightly.spacy.io/api/tagger#rehearse
"""
+ if losses is None:
+ losses = {}
+ losses.setdefault(self.name, 0.0)
validate_examples(examples, "Tagger.rehearse")
docs = [eg.predicted for eg in examples]
if self._rehearsal_model is None:
- return
+ return losses
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return losses
@@ -240,9 +243,7 @@ class Tagger(TrainablePipe):
gradient = guesses - target
backprop(gradient)
self.finish_update(sgd)
- if losses is not None:
- losses.setdefault(self.name, 0.0)
- losses[self.name] += (gradient**2).sum()
+ losses[self.name] += (gradient**2).sum()
return losses
def get_loss(self, examples, scores):
From 6af585dba565c05fa287fc579b93bacc58ec721e Mon Sep 17 00:00:00 2001
From: walterhenry <55140654+walterhenry@users.noreply.github.com>
Date: Wed, 14 Oct 2020 16:37:57 +0200
Subject: [PATCH 08/10] New batch of proofs
Just a few tiny proofreading fixes to the docs
---
website/docs/usage/101/_named-entities.md | 2 +-
website/docs/usage/101/_tokenization.md | 2 +-
website/docs/usage/linguistic-features.md | 26 +++++++++++-----------
website/docs/usage/processing-pipelines.md | 22 +++++++++---------
website/docs/usage/projects.md | 13 +++++------
website/docs/usage/rule-based-matching.md | 10 ++++-----
website/docs/usage/saving-loading.md | 4 ++--
website/docs/usage/training.md | 2 +-
8 files changed, 40 insertions(+), 41 deletions(-)
diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md
index 36ef07111..2abc45cbd 100644
--- a/website/docs/usage/101/_named-entities.md
+++ b/website/docs/usage/101/_named-entities.md
@@ -1,7 +1,7 @@
A named entity is a "real-world object" that's assigned a name – for example, a
person, a country, a product or a book title. spaCy can **recognize various
types of named entities in a document, by asking the model for a
-**prediction\*\*. Because models are statistical and strongly depend on the
+prediction**. Because models are statistical and strongly depend on the
examples they were trained on, this doesn't always work _perfectly_ and might
need some tuning later, depending on your use case.
diff --git a/website/docs/usage/101/_tokenization.md b/website/docs/usage/101/_tokenization.md
index 764f1e62a..b82150f1a 100644
--- a/website/docs/usage/101/_tokenization.md
+++ b/website/docs/usage/101/_tokenization.md
@@ -45,6 +45,6 @@ marks.
While punctuation rules are usually pretty general, tokenizer exceptions
strongly depend on the specifics of the individual language. This is why each
-[available language](/usage/models#languages) has its own subclass like
+[available language](/usage/models#languages) has its own subclass, like
`English` or `German`, that loads in lists of hard-coded data and exception
rules.
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 9b22d63f6..c4384b98b 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -641,7 +641,7 @@ print("After", doc.ents) # [London]
#### Setting entity annotations in Cython {#setting-cython}
-Finally, you can always write to the underlying struct, if you compile a
+Finally, you can always write to the underlying struct if you compile a
[Cython](http://cython.org/) function. This is easy to do, and allows you to
write efficient native code.
@@ -765,15 +765,15 @@ import Tokenization101 from 'usage/101/\_tokenization.md'
-spaCy introduces a novel tokenization algorithm, that gives a better balance
-between performance, ease of definition, and ease of alignment into the original
+spaCy introduces a novel tokenization algorithm that gives a better balance
+between performance, ease of definition and ease of alignment into the original
string.
After consuming a prefix or suffix, we consult the special cases again. We want
the special cases to handle things like "don't" in English, and we want the same
rule to work for "(don't)!". We do this by splitting off the open bracket, then
-the exclamation, then the close bracket, and finally matching the special case.
-Here's an implementation of the algorithm in Python, optimized for readability
+the exclamation, then the closed bracket, and finally matching the special case.
+Here's an implementation of the algorithm in Python optimized for readability
rather than performance:
```python
@@ -847,7 +847,7 @@ The algorithm can be summarized as follows:
#2.
6. If we can't consume a prefix or a suffix, look for a URL match.
7. If there's no URL match, then look for a special case.
-8. Look for "infixes" — stuff like hyphens etc. and split the substring into
+8. Look for "infixes" – stuff like hyphens etc. and split the substring into
tokens on all infixes.
9. Once we can't consume any more of the string, handle it as a single token.
@@ -864,10 +864,10 @@ intact (abbreviations like "U.S.").
Tokenization rules that are specific to one language, but can be **generalized
-across that language** should ideally live in the language data in
+across that language**, should ideally live in the language data in
[`spacy/lang`](%%GITHUB_SPACY/spacy/lang) – we always appreciate pull requests!
Anything that's specific to a domain or text type – like financial trading
-abbreviations, or Bavarian youth slang – should be added as a special case rule
+abbreviations or Bavarian youth slang – should be added as a special case rule
to your tokenizer instance. If you're dealing with a lot of customizations, it
might make sense to create an entirely custom subclass.
@@ -1110,7 +1110,7 @@ tokenized `Doc`.

To overwrite the existing tokenizer, you need to replace `nlp.tokenizer` with a
-custom function that takes a text, and returns a [`Doc`](/api/doc).
+custom function that takes a text and returns a [`Doc`](/api/doc).
> #### Creating a Doc
>
@@ -1229,7 +1229,7 @@ tokenizer** it will be using at runtime. See the docs on
#### Training with custom tokenization {#custom-tokenizer-training new="3"}
-spaCy's [training config](/usage/training#config) describe the settings,
+spaCy's [training config](/usage/training#config) describes the settings,
hyperparameters, pipeline and tokenizer used for constructing and training the
pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
takes the `nlp` object and returns a tokenizer. Here, we're registering a
@@ -1465,7 +1465,7 @@ filtered_spans = filter_spans(spans)
The [`retokenizer.split`](/api/doc#retokenizer.split) method allows splitting
one token into two or more tokens. This can be useful for cases where
tokenization rules alone aren't sufficient. For example, you might want to split
-"its" into the tokens "it" and "is" — but not the possessive pronoun "its". You
+"its" into the tokens "it" and "is" – but not the possessive pronoun "its". You
can write rule-based logic that can find only the correct "its" to split, but by
that time, the `Doc` will already be tokenized.
@@ -1513,7 +1513,7 @@ the token indices after splitting.
| `"York"` | `doc[2]` | Attach this token to `doc[1]` in the original `Doc`, i.e. "in". |
If you don't care about the heads (for example, if you're only running the
-tokenizer and not the parser), you can each subtoken to itself:
+tokenizer and not the parser), you can attach each subtoken to itself:
```python
### {highlight="3"}
@@ -1879,7 +1879,7 @@ assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector
table to a given number of unique entries, and returns a dictionary containing
the removed words, mapped to `(string, score)` tuples, where `string` is the
-entry the removed word was mapped to, and `score` the similarity score between
+entry the removed word was mapped to and `score` the similarity score between
the two words.
```python
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 3d756215f..dd180f45a 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -128,7 +128,7 @@ should be created. spaCy will then do the following:
2. Iterate over the **pipeline names** and look up each component name in the
`[components]` block. The `factory` tells spaCy which
[component factory](#custom-components-factories) to use for adding the
- component with with [`add_pipe`](/api/language#add_pipe). The settings are
+ component with [`add_pipe`](/api/language#add_pipe). The settings are
passed into the factory.
3. Make the **model data** available to the `Language` class by calling
[`from_disk`](/api/language#from_disk) with the path to the data directory.
@@ -325,7 +325,7 @@ to remove pipeline components from an existing pipeline, the
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
[`replace_pipe`](/api/language#replace_pipe) method to replace them with a
custom component entirely (more details on this in the section on
-[custom components](#custom-components).
+[custom components](#custom-components)).
```python
nlp.remove_pipe("parser")
@@ -384,7 +384,7 @@ vectors available – otherwise, it won't be able to make the same predictions.
>
> Instead of providing a `factory`, component blocks in the training
> [config](/usage/training#config) can also define a `source`. The string needs
-> to be a loadable spaCy pipeline package or path. The
+> to be a loadable spaCy pipeline package or path.
>
> ```ini
> [components.ner]
@@ -417,7 +417,7 @@ print(nlp.pipe_names)
### Analyzing pipeline components {#analysis new="3"}
The [`nlp.analyze_pipes`](/api/language#analyze_pipes) method analyzes the
-components in the current pipeline and outputs information about them, like the
+components in the current pipeline and outputs information about them like the
attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
they retokenize the `Doc` and which scores they produce during training. It will
also show warnings if components require values that aren't set by previous
@@ -511,7 +511,7 @@ doesn't, the pipeline analysis won't catch that.
## Creating custom pipeline components {#custom-components}
A pipeline component is a function that receives a `Doc` object, modifies it and
-returns it – – for example, by using the current weights to make a prediction
+returns it – for example, by using the current weights to make a prediction
and set some annotation on the document. By adding a component to the pipeline,
you'll get access to the `Doc` at any point **during processing** – instead of
only being able to modify it afterwards.
@@ -702,7 +702,7 @@ nlp.add_pipe("my_component", config={"some_setting": False})
The [`@Language.component`](/api/language#component) decorator is essentially a
-**shortcut** for stateless pipeline component that don't need any settings. This
+**shortcut** for stateless pipeline components that don't need any settings. This
means you don't have to always write a function that returns your function if
there's no state to be passed through – spaCy can just take care of this for
you. The following two code examples are equivalent:
@@ -888,7 +888,7 @@ components in pipelines that you [train](/usage/training). To make sure spaCy
knows where to find your custom `@misc` function, you can pass in a Python file
via the argument `--code`. If someone else is using your component, all they
have to do to customize the data is to register their own function and swap out
-the name. Registered functions can also take **arguments** by the way that can
+the name. Registered functions can also take **arguments**, by the way, that can
be defined in the config as well – you can read more about this in the docs on
[training with custom code](/usage/training#custom-code).
@@ -963,7 +963,7 @@ doc = nlp("This is a text...")
### Language-specific factories {#factories-language new="3"}
-There are many use case where you might want your pipeline components to be
+There are many use cases where you might want your pipeline components to be
language-specific. Sometimes this requires entirely different implementation per
language, sometimes the only difference is in the settings or data. spaCy allows
you to register factories of the **same name** on both the `Language` base
@@ -1028,8 +1028,8 @@ plug fully custom machine learning components into your pipeline. You'll need
the following:
1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
- can be a model using implemented in
- [Thinc](/usage/layers-architectures#thinc), or a
+ can be a model implemented in
+ [Thinc](/usage/layers-architectures#thinc) or a
[wrapped model](/usage/layers-architectures#frameworks) implemented in
PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a
list of [`Doc`](/api/doc) objects as input and can have any type of output.
@@ -1354,7 +1354,7 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
>
> The hooks live on the `Doc` object because the `Span` and `Token` objects are
> created lazily, and don't own any data. They just proxy to their parent `Doc`.
-> This turns out to be convenient here — we only have to worry about installing
+> This turns out to be convenient here – we only have to worry about installing
> hooks in one place.
| Name | Customizes |
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 9776dab1b..62d4d5eb3 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -73,7 +73,7 @@ python -m spacy project clone some_example_project
By default, the project will be cloned into the current working directory. You
can specify an optional second argument to define the output directory. The
-`--repo` option lets you define a custom repo to clone from, if you don't want
+`--repo` option lets you define a custom repo to clone from if you don't want
to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
can also use any private repo you have access to with Git.
@@ -109,7 +109,7 @@ $ python -m spacy project assets
Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and
even cloud storage such as GCS and S3. You can also fetch assets using git, by
replacing the `url` string with a `git` block. spaCy will use Git's "sparse
-checkout" feature, to avoid download the whole repository.
+checkout" feature to avoid downloading the whole repository.
### 3. Run a command {#run}
@@ -201,7 +201,7 @@ $ python -m spacy project push
```
The `remotes` section in your `project.yml` lets you assign names to the
-different storages. To download state from a remote storage, you can use the
+different storages. To download a state from a remote storage, you can use the
[`spacy project pull`](/api/cli#project-pull) command. For more details, see the
docs on [remote storage](#remote).
@@ -315,7 +315,7 @@ company-internal and not available over the internet. In that case, you can
specify the destination paths and a checksum, and leave out the URL. When your
teammates clone and run your project, they can place the files in the respective
directory themselves. The [`project assets`](/api/cli#project-assets) command
-will alert about missing files and mismatched checksums, so you can ensure that
+will alert you about missing files and mismatched checksums, so you can ensure that
others are running your project with the same data.
### Dependencies and outputs {#deps-outputs}
@@ -363,8 +363,7 @@ graphs based on the dependencies and outputs, and won't re-run previous steps
automatically. For instance, if you only run the command `train` that depends on
data created by `preprocess` and those files are missing, spaCy will show an
error – it won't just re-run `preprocess`. If you're looking for more advanced
-data management, check out the [Data Version Control (DVC) integration](#dvc)
-integration. If you're planning on integrating your spaCy project with DVC, you
+data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you
can also use `outputs_no_cache` instead of `outputs` to define outputs that
won't be cached or tracked.
@@ -508,7 +507,7 @@ commands:
When your custom project is ready and you want to share it with others, you can
use the [`spacy project document`](/api/cli#project-document) command to
-**auto-generate** a pretty, Markdown-formatted `README` file based on your
+**auto-generate** a pretty, markdown-formatted `README` file based on your
project's `project.yml`. It will list all commands, workflows and assets defined
in the project and include details on how to run the project, as well as links
to the relevant spaCy documentation to make it easy for others to get started
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 7e979b32e..27b9d79ab 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -55,7 +55,7 @@ abstract representations of the tokens you're looking for, using lexical
attributes, linguistic features predicted by the model, operators, set
membership and rich comparison. For example, you can find a noun, followed by a
verb with the lemma "love" or "like", followed by an optional determiner and
-another token that's at least ten characters long.
+another token that's at least 10 characters long.
@@ -491,7 +491,7 @@ you prefer.
| `matcher` | The matcher instance. ~~Matcher~~ |
| `doc` | The document the matcher was used on. ~~Doc~~ |
| `i` | Index of the current match (`matches[i`]). ~~int~~ |
-| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. ~~ List[Tuple[int, int int]]~~ |
+| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |
### Creating spans from matches {#matcher-spans}
@@ -628,7 +628,7 @@ To get a quick overview of the results, you could collect all sentences
containing a match and render them with the
[displaCy visualizer](/usage/visualizers). In the callback function, you'll have
access to the `start` and `end` of each match, as well as the parent `Doc`. This
-lets you determine the sentence containing the match, `doc[start : end`.sent],
+lets you determine the sentence containing the match, `doc[start:end].sent`,
and calculate the start and end of the matched span within the sentence. Using
displaCy in ["manual" mode](/usage/visualizers#manual-usage) lets you pass in a
list of dictionaries containing the text and entities to render.
@@ -1451,7 +1451,7 @@ When using a trained
extract information from your texts, you may find that the predicted span only
includes parts of the entity you're looking for. Sometimes, this happens if
statistical model predicts entities incorrectly. Other times, it happens if the
-way the entity type way defined in the original training corpus doesn't match
+way the entity type was defined in the original training corpus doesn't match
what you need for your application.
> #### Where corpora come from
@@ -1642,7 +1642,7 @@ affiliation is current, we can check the head's part-of-speech tag.
```python
person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
for ent in person_entities:
- # Because the entity is a spans, we need to use its root token. The head
+ # Because the entity is a span, we need to use its root token. The head
# is the syntactic governor of the person, e.g. the verb
head = ent.root.head
if head.lemma_ == "work":
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index c0fe1323c..e43889df9 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -448,7 +448,7 @@ entry_points={
}
```
-The factory can also implement other pipeline component like `to_disk` and
+The factory can also implement other pipeline components like `to_disk` and
`from_disk` for serialization, or even `update` to make the component trainable.
If a component exposes a `from_disk` method and is included in a pipeline, spaCy
will call it on load. This lets you ship custom data with your pipeline package.
@@ -666,7 +666,7 @@ care of putting all this together and returning a `Language` object with the
loaded pipeline and data. If your pipeline requires
[custom components](/usage/processing-pipelines#custom-components) or a custom
language class, you can also **ship the code with your package** and include it
-in the `__init__.py` – for example, to register component before the `nlp`
+in the `__init__.py` – for example, to register a component before the `nlp`
object is created.
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 65cfb563b..4e989c377 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -489,7 +489,7 @@ or TensorFlow, make **custom modifications** to the `nlp` object, create custom
optimizers or schedules, or **stream in data** and preprocesses it on the fly
while training.
-Each custom function can have any numbers of arguments that are passed in via
+Each custom function can have any number of arguments that are passed in via
the [config](#config), just the built-in functions. If your function defines
**default argument values**, spaCy is able to auto-fill your config when you run
[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
From a2d4aaee70947219f5968f039400c63da759b41f Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 14 Oct 2020 19:51:36 +0200
Subject: [PATCH 09/10] Apply suggestions from code review
---
website/docs/usage/projects.md | 4 ++--
website/docs/usage/saving-loading.md | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 6396e9eba..492345f2f 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -197,7 +197,7 @@ $ python -m spacy project push
```
The `remotes` section in your `project.yml` lets you assign names to the
-different storages. To download a state from a remote storage, you can use the
+different storages. To download state from a remote storage, you can use the
[`spacy project pull`](/api/cli#project-pull) command. For more details, see the
docs on [remote storage](#remote).
@@ -502,7 +502,7 @@ commands:
When your custom project is ready and you want to share it with others, you can
use the [`spacy project document`](/api/cli#project-document) command to
-**auto-generate** a pretty, markdown-formatted `README` file based on your
+**auto-generate** a pretty, Markdown-formatted `README` file based on your
project's `project.yml`. It will list all commands, workflows and assets defined
in the project and include details on how to run the project, as well as links
to the relevant spaCy documentation to make it easy for others to get started
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index 4e8280baf..c4957763e 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -463,7 +463,7 @@ entry_points={
}
```
-The factory can also implement other pipeline components like `to_disk` and
+The factory can also implement other pipeline component methods like `to_disk` and
`from_disk` for serialization, or even `update` to make the component trainable.
If a component exposes a `from_disk` method and is included in a pipeline, spaCy
will call it on load. This lets you ship custom data with your pipeline package.
From a966c271f76338bbb0769639dcc99b189b37da4f Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 14 Oct 2020 20:50:23 +0200
Subject: [PATCH 10/10] Update models docs [ci skip]
---
website/docs/models/index.md | 6 +++---
website/gatsby-config.js | 4 ----
website/gatsby-node.js | 1 -
website/meta/languages.json | 1 -
website/meta/sidebars.json | 4 ----
website/src/templates/docs.js | 25 +++++++++++++------------
website/src/widgets/landing.js | 2 --
7 files changed, 16 insertions(+), 27 deletions(-)
diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 5b17d7f83..30b4f11d9 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -22,12 +22,12 @@ import QuickstartModels from 'widgets/quickstart-models.js'
## Package naming conventions {#conventions}
In general, spaCy expects all pipeline packages to follow the naming convention
-of `[lang`\_[name]]. For spaCy's pipelines, we also chose to divide the name
+of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name
into three components:
1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
- vocabulary, syntax, entities and word vectors, or `depent` for only vocab,
- syntax and entities).
+ vocabulary, syntax, entities and word vectors, or `dep` for only vocab and
+ syntax).
2. **Genre:** Type of text the pipeline is trained on, e.g. `web` or `news`.
3. **Size:** Package size indicator, `sm`, `md` or `lg`.
diff --git a/website/gatsby-config.js b/website/gatsby-config.js
index 5b11f56bc..c4047e9ac 100644
--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@@ -41,11 +41,7 @@ function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
- starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
- starters: langs
- .map(({ starters }) => (starters ? starters.length : 0))
- .reduce((a, b) => a + b, 0),
}
}
diff --git a/website/gatsby-node.js b/website/gatsby-node.js
index 1f6fcf1f9..56a65aeae 100644
--- a/website/gatsby-node.js
+++ b/website/gatsby-node.js
@@ -37,7 +37,6 @@ exports.createPages = ({ graphql, actions }) => {
code
name
models
- starters
example
has_examples
}
diff --git a/website/meta/languages.json b/website/meta/languages.json
index a7ab28f03..681e778cc 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -31,7 +31,6 @@
"code": "en",
"name": "English",
"models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
- "starters": ["en_vectors_web_lg"],
"example": "This is a sentence.",
"has_examples": true
},
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 660309a20..3799f399b 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -52,10 +52,6 @@
{
"label": "Trained Pipelines",
"items": []
- },
- {
- "label": "Starter Packages",
- "items": []
}
]
},
diff --git a/website/src/templates/docs.js b/website/src/templates/docs.js
index cc78f339c..182490370 100644
--- a/website/src/templates/docs.js
+++ b/website/src/templates/docs.js
@@ -51,17 +51,19 @@ const Docs = ({ pageContext, children }) => (
id: model,
})),
}))
- sidebar.items[2].items = languages
- .filter(({ starters }) => starters && starters.length)
- .map(lang => ({
- text: lang.name,
- url: `/models/${lang.code}-starters`,
- isActive: id === `${lang.code}-starters`,
- menu: lang.starters.map(model => ({
- text: model,
- id: model,
- })),
- }))
+ if (sidebar.items.length > 2) {
+ sidebar.items[2].items = languages
+ .filter(({ starters }) => starters && starters.length)
+ .map(lang => ({
+ text: lang.name,
+ url: `/models/${lang.code}-starters`,
+ isActive: id === `${lang.code}-starters`,
+ menu: lang.starters.map(model => ({
+ text: model,
+ id: model,
+ })),
+ }))
+ }
}
const sourcePath = source ? github(source) : null
const currentSource = getCurrentSource(slug, isIndex)
@@ -146,7 +148,6 @@ const query = graphql`
code
name
models
- starters
}
nightly
sidebars {
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index ac1d7c5c7..46be93ab5 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -336,9 +336,7 @@ const landingQuery = graphql`
counts {
langs
modelLangs
- starterLangs
models
- starters
}
}
}