Mirror of https://github.com/explosion/spaCy.git, synced 2025-04-20 09:01:58 +03:00

Merge branch 'develop' into nightly.spacy.io

Commit dba40b1392
@@ -13,7 +13,7 @@ state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
 spaCy is commercial open-source software, released under the MIT license.
 
-💫 **Version 3.0 out now!**
+💫 **Version 3.0 (nightly) out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a44,<8.0.0a50",
+    "thinc>=8.0.0rc0,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a44,<8.0.0a50
+thinc>=8.0.0rc0,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a44,<8.0.0a50
+    thinc>=8.0.0rc0,<8.1.0
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a44,<8.0.0a50
+    thinc>=8.0.0rc0,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0

@@ -65,9 +65,9 @@ console_scripts =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.0rc0,<1.0.0
+    spacy_lookups_data>=1.0.0rc0,<1.1.0
 transformers =
-    spacy_transformers>=1.0.0a22,<1.0.0
+    spacy_transformers>=1.0.0rc0,<1.1.0
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
@@ -195,7 +195,7 @@ class Tagger(TrainablePipe):
         validate_examples(examples, "Tagger.update")
         if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
-            return
+            return losses
         set_dropout_rate(self.model, drop)
         tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
         for sc in tag_scores:

@@ -227,22 +227,24 @@ class Tagger(TrainablePipe):
 
         DOCS: https://nightly.spacy.io/api/tagger#rehearse
         """
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
         validate_examples(examples, "Tagger.rehearse")
         docs = [eg.predicted for eg in examples]
         if self._rehearsal_model is None:
-            return
+            return losses
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
-            return
+            return losses
         set_dropout_rate(self.model, drop)
         guesses, backprop = self.model.begin_update(docs)
         target = self._rehearsal_model(examples)
         gradient = guesses - target
         backprop(gradient)
         self.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += (gradient**2).sum()
+        losses[self.name] += (gradient**2).sum()
+        return losses
 
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and

@@ -116,7 +116,7 @@ cdef class TrainablePipe(Pipe):
         validate_examples(examples, "TrainablePipe.update")
         if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
-            return
+            return losses
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
         loss, d_scores = self.get_loss(examples, scores)
@@ -22,12 +22,12 @@ import QuickstartModels from 'widgets/quickstart-models.js'
 ## Package naming conventions {#conventions}
 
 In general, spaCy expects all pipeline packages to follow the naming convention
-of `[lang`\_[name]]. For spaCy's pipelines, we also chose to divide the name
+of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name
 into three components:
 
 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
-   vocabulary, syntax, entities and word vectors, or `depent` for only vocab,
-   syntax and entities).
+   vocabulary, syntax, entities and word vectors, or `dep` for only vocab and
+   syntax).
 2. **Genre:** Type of text the pipeline is trained on, e.g. `web` or `news`.
 3. **Size:** Package size indicator, `sm`, `md` or `lg`.
 
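For context, a minimal sketch of the convention in use when loading a package by that name (assumes the `en_core_web_sm` package is installed):

```python
import spacy

# "en" = language, "core" = type, "web" = genre, "sm" = size
nlp = spacy.load("en_core_web_sm")
doc = nlp("spaCy resolves pipelines by their package name.")
print([token.text for token in doc])
```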
@@ -1,7 +1,7 @@
 A named entity is a "real-world object" that's assigned a name – for example, a
 person, a country, a product or a book title. spaCy can **recognize various
 types of named entities in a document, by asking the model for a
-**prediction\*\*. Because models are statistical and strongly depend on the
+prediction**. Because models are statistical and strongly depend on the
 examples they were trained on, this doesn't always work _perfectly_ and might
 need some tuning later, depending on your use case.
 
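To make the 101 passage concrete, asking the model for that prediction is a matter of reading `doc.ents` (a minimal sketch, assuming `en_core_web_sm` is installed):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    # the spans the model predicts as entities, and their labels
    print(ent.text, ent.label_)
```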
@@ -45,6 +45,6 @@ marks.
 
 While punctuation rules are usually pretty general, tokenizer exceptions
 strongly depend on the specifics of the individual language. This is why each
-[available language](/usage/models#languages) has its own subclass like
+[available language](/usage/models#languages) has its own subclass, like
 `English` or `German`, that loads in lists of hard-coded data and exception
 rules.

@@ -1,14 +1,14 @@
 import { Help } from 'components/typography'; import Link from 'components/link'
 
-<!-- TODO: update numbers, add note on previous NER evaluation issues -->
+<!-- TODO: update speed and v2 NER numbers -->
 
 <figure>
 
 | Pipeline | Parser | Tagger | NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
 | ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
-| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
+| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | 95.5 | 98.3 | 89.7 | 1k | 8k |
 | [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | 92.2 | 97.4 | 85.8 | 7k | |
-| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
+| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | | 10k | |
 
 <figcaption class="caption">
 
@@ -23,9 +23,9 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 
 | Named Entity Recognition System | OntoNotes | CoNLL '03 |
 | ------------------------------------------------------------------------------ | --------: | --------: |
-| spaCy RoBERTa (2020) | | 92.2 |
-| spaCy CNN (2020) | 85.3 | 88.4 |
-| spaCy CNN (2017) | 86.4 | |
+| spaCy RoBERTa (2020) | 89.7 | 91.6 |
+| spaCy CNN (2020) | 84.5 | |
+| spaCy CNN (2017) | | |
 | [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup> | 88.8 | 92.1 |
 | <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> | 89.7 | 93.1 |
 | BERT Base<sup>3</sup> | - | 92.4 |

@@ -65,8 +65,8 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 
 | Dependency Parsing System | UAS | LAS |
 | ------------------------------------------------------------------------------ | ---: | ---: |
-| spaCy RoBERTa (2020)<sup>1</sup> | 96.8 | 95.0 |
-| spaCy CNN (2020)<sup>1</sup> | 93.7 | 91.8 |
+| spaCy RoBERTa (2020)<sup>1</sup> | 95.5 | 94.3 |
+| spaCy CNN (2020)<sup>1</sup> | | |
 | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
 | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 |
 

@@ -14,7 +14,7 @@ menu:
 >
 > To help you make the transition from v2.x to v3.0, we've uploaded the old
 > website to [**v2.spacy.io**](https://v2.spacy.io/docs). To see what's changed
-> and how to migrate, see the guide on [v3.0 guide](/usage/v3).
+> and how to migrate, see the [v3.0 guide](/usage/v3).
 
 import QuickstartInstall from 'widgets/quickstart-install.js'
 

@@ -187,7 +187,7 @@ to get the right commands for your platform and Python version.
   `sudo apt-get install build-essential python-dev git`
 - **macOS / OS X:** Install a recent version of
   [XCode](https://developer.apple.com/xcode/), including the so-called "Command
-  Line Tools". macOS and OS X ship with Python and git preinstalled.
+  Line Tools". macOS and OS X ship with Python and Git preinstalled.
 - **Windows:** Install a version of the
   [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
   or

@@ -380,7 +380,7 @@ This error may occur when running the `spacy` command from the command line.
 spaCy does not currently add an entry to your `PATH` environment variable, as
 this can lead to unexpected results, especially when using a virtual
 environment. Instead, spaCy adds an auto-alias that maps `spacy` to
-`python -m spacy]`. If this is not working as expected, run the command with
+`python -m spacy`. If this is not working as expected, run the command with
 `python -m`, yourself – for example `python -m spacy download en_core_web_sm`.
 For more info on this, see the [`download`](/api/cli#download) command.
 

@@ -427,8 +427,8 @@ disk has some binary files that should not go through this conversion. When they
 do, you get the error above. You can fix it by either changing your
 [`core.autocrlf`](https://git-scm.com/book/en/v2/Customizing-Git-Git-Configuration)
 setting to `"false"`, or by committing a
-[`.gitattributes`](https://git-scm.com/docs/gitattributes) file] to your
-repository to tell git on which files or folders it shouldn't do LF-to-CRLF
+[`.gitattributes`](https://git-scm.com/docs/gitattributes) file to your
+repository to tell Git on which files or folders it shouldn't do LF-to-CRLF
 conversion, with an entry like `path/to/spacy/model/** -text`. After you've done
 either of these, clone your repository again.
 

@@ -352,7 +352,7 @@ dropout = 0.2
 
 <Infobox variant="warning">
 
-Remember that it is best not to rely on any (hidden) default values, to ensure
+Remember that it is best not to rely on any (hidden) default values to ensure
 that training configs are complete and experiments fully reproducible.
 
 </Infobox>

@@ -503,7 +503,7 @@ overview of the `TrainablePipe` methods used by
 
 </Infobox>
 
-### Example: Entity elation extraction component {#component-rel}
+### Example: Entity relation extraction component {#component-rel}
 
 This section outlines an example use-case of implementing a **novel relation
 extraction component** from scratch. We'll implement a binary relation

@@ -618,7 +618,7 @@ we can define our relation model in a config file as such:
 # ...
 
 [model.get_candidates]
-@misc = "rel_cand_generator.v2"
+@misc = "rel_cand_generator.v1"
 max_length = 20
 
 [model.create_candidate_tensor]

@@ -687,8 +687,8 @@ Before the model can be used, it needs to be
 [initialized](/usage/training#initialization). This function receives a callback
 to access the full **training data set**, or a representative sample. This data
 set can be used to deduce all **relevant labels**. Alternatively, a list of
-labels can be provided to `initialize`, or you can call the
-`RelationExtractoradd_label` directly. The number of labels defines the output
+labels can be provided to `initialize`, or you can call
+`RelationExtractor.add_label` directly. The number of labels defines the output
 dimensionality of the network, and will be used to do
 [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
 layers of the neural network. This is triggered by calling

@@ -729,7 +729,7 @@ and its internal model can be trained and used to make predictions.
 During training, the function [`update`](/api/pipe#update) is invoked which
 delegates to
 [`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
-[`get_loss`](/api/pipe#get_loss) function that **calculate the loss** for a
+[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
 batch of examples, as well as the **gradient** of loss that will be used to
 update the weights of the model layers. Thinc provides several
 [loss functions](https://thinc.ai/docs/api-loss) that can be used for the

@@ -44,7 +44,7 @@ in the [models directory](/models).
 
 Inflectional morphology is the process by which a root form of a word is
 modified by adding prefixes or suffixes that specify its grammatical function
-but do not changes its part-of-speech. We say that a **lemma** (root form) is
+but do not change its part-of-speech. We say that a **lemma** (root form) is
 **inflected** (modified/combined) with one or more **morphological features** to
 create a surface form. Here are some examples:
 
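A short sketch of a lemma plus morphological features producing a surface form, as described in the paragraph above (assumes `en_core_web_sm`; the exact feature values depend on the model's predictions):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I was reading the paper")
token = doc[2]  # "reading"
print(token.lemma_)  # the lemma, e.g. "read"
print(token.morph)   # features such as Aspect=Prog|Tense=Pres|VerbForm=Part
```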
@@ -288,7 +288,7 @@ import DisplaCyLong2Html from 'images/displacy-long2.html'
 Because the syntactic relations form a tree, every word has **exactly one
 head**. You can therefore iterate over the arcs in the tree by iterating over
 the words in the sentence. This is usually the best way to match an arc of
-interest — from below:
+interest – from below:
 
 ```python
 ### {executable="true"}

@@ -397,7 +397,7 @@ for descendant in subject.subtree:
 Finally, the `.left_edge` and `.right_edge` attributes can be especially useful,
 because they give you the first and last token of the subtree. This is the
 easiest way to create a `Span` object for a syntactic phrase. Note that
-`.right_edge` gives a token **within** the subtree — so if you use it as the
+`.right_edge` gives a token **within** the subtree – so if you use it as the
 end-point of a range, don't forget to `+1`!
 
 ```python

@@ -639,7 +639,7 @@ print("After", doc.ents) # [London]
 
 #### Setting entity annotations in Cython {#setting-cython}
 
-Finally, you can always write to the underlying struct, if you compile a
+Finally, you can always write to the underlying struct if you compile a
 [Cython](http://cython.org/) function. This is easy to do, and allows you to
 write efficient native code.
 

@@ -763,15 +763,15 @@ import Tokenization101 from 'usage/101/\_tokenization.md'
 
 <Accordion title="Algorithm details: How spaCy's tokenizer works" id="how-tokenizer-works" spaced>
 
-spaCy introduces a novel tokenization algorithm, that gives a better balance
-between performance, ease of definition, and ease of alignment into the original
+spaCy introduces a novel tokenization algorithm that gives a better balance
+between performance, ease of definition and ease of alignment into the original
 string.
 
 After consuming a prefix or suffix, we consult the special cases again. We want
 the special cases to handle things like "don't" in English, and we want the same
 rule to work for "(don't)!". We do this by splitting off the open bracket, then
-the exclamation, then the close bracket, and finally matching the special case.
-Here's an implementation of the algorithm in Python, optimized for readability
+the exclamation, then the closed bracket, and finally matching the special case.
+Here's an implementation of the algorithm in Python optimized for readability
 rather than performance:
 
 ```python

@@ -845,7 +845,7 @@ The algorithm can be summarized as follows:
    #2.
 6. If we can't consume a prefix or a suffix, look for a URL match.
 7. If there's no URL match, then look for a special case.
-8. Look for "infixes" — stuff like hyphens etc. and split the substring into
+8. Look for "infixes" – stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 9. Once we can't consume any more of the string, handle it as a single token.
 

@@ -862,10 +862,10 @@ intact (abbreviations like "U.S.").
 <Accordion title="Should I change the language data or add custom tokenizer rules?" id="lang-data-vs-tokenizer">
 
 Tokenization rules that are specific to one language, but can be **generalized
-across that language** should ideally live in the language data in
+across that language**, should ideally live in the language data in
 [`spacy/lang`](%%GITHUB_SPACY/spacy/lang) – we always appreciate pull requests!
 Anything that's specific to a domain or text type – like financial trading
-abbreviations, or Bavarian youth slang – should be added as a special case rule
+abbreviations or Bavarian youth slang – should be added as a special case rule
 to your tokenizer instance. If you're dealing with a lot of customizations, it
 might make sense to create an entirely custom subclass.
 
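A minimal sketch of the kind of special case rule the accordion above recommends adding to the tokenizer instance (the `gimme` example and its split are purely illustrative):

```python
import spacy
from spacy.symbols import ORTH

nlp = spacy.blank("en")
# domain-specific exception added on the tokenizer instance, not in spacy/lang
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])  # ['gim', 'me', 'that']
```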
@@ -1108,7 +1108,7 @@ tokenized `Doc`.
 
 
 To overwrite the existing tokenizer, you need to replace `nlp.tokenizer` with a
-custom function that takes a text, and returns a [`Doc`](/api/doc).
+custom function that takes a text and returns a [`Doc`](/api/doc).
 
 > #### Creating a Doc
 >
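A minimal sketch of such a replacement, a callable that takes the raw text and returns a `Doc` (whitespace splitting is only for illustration):

```python
import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # construct the Doc directly from the word list
        return Doc(self.vocab, words=words)

nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought.")
print([token.text for token in doc])
```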
@@ -1227,7 +1227,7 @@ tokenizer** it will be using at runtime. See the docs on
 
 #### Training with custom tokenization {#custom-tokenizer-training new="3"}
 
-spaCy's [training config](/usage/training#config) describe the settings,
+spaCy's [training config](/usage/training#config) describes the settings,
 hyperparameters, pipeline and tokenizer used for constructing and training the
 pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
 takes the `nlp` object and returns a tokenizer. Here, we're registering a

@@ -1463,7 +1463,7 @@ filtered_spans = filter_spans(spans)
 The [`retokenizer.split`](/api/doc#retokenizer.split) method allows splitting
 one token into two or more tokens. This can be useful for cases where
 tokenization rules alone aren't sufficient. For example, you might want to split
-"its" into the tokens "it" and "is" — but not the possessive pronoun "its". You
+"its" into the tokens "it" and "is" – but not the possessive pronoun "its". You
 can write rule-based logic that can find only the correct "its" to split, but by
 that time, the `Doc` will already be tokenized.
 

@@ -1511,7 +1511,7 @@ the token indices after splitting.
 | `"York"` | `doc[2]` | Attach this token to `doc[1]` in the original `Doc`, i.e. "in". |
 
 If you don't care about the heads (for example, if you're only running the
-tokenizer and not the parser), you can each subtoken to itself:
+tokenizer and not the parser), you can attach each subtoken to itself:
 
 ```python
 ### {highlight="3"}

@@ -1880,7 +1880,7 @@ assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
 [`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector
 table to a given number of unique entries, and returns a dictionary containing
 the removed words, mapped to `(string, score)` tuples, where `string` is the
-entry the removed word was mapped to, and `score` the similarity score between
+entry the removed word was mapped to and `score` the similarity score between
 the two words.
 
 ```python
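For context, a sketch of calling `prune_vectors` and inspecting the mapping it returns (assumes a pipeline with a large vector table such as `en_core_web_lg`; the example entries are illustrative and depend on the vectors):

```python
import spacy

nlp = spacy.load("en_core_web_lg")
n_vectors = 20000
removed_words = nlp.vocab.prune_vectors(n_vectors)
# each removed word maps to (kept entry, similarity score), e.g.
# {"Shore": ("coast", 0.732), "Precautionary": ("caution", 0.490), ...}
print(len(removed_words))
```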
@@ -132,8 +132,8 @@ should be created. spaCy will then do the following:
 2. Iterate over the **pipeline names** and look up each component name in the
    `[components]` block. The `factory` tells spaCy which
    [component factory](#custom-components-factories) to use for adding the
-   component with with [`add_pipe`](/api/language#add_pipe). The settings are
-   passed into the factory.
+   component with [`add_pipe`](/api/language#add_pipe). The settings are passed
+   into the factory.
 3. Make the **model data** available to the `Language` class by calling
    [`from_disk`](/api/language#from_disk) with the path to the data directory.
 

@@ -332,7 +332,7 @@ to remove pipeline components from an existing pipeline, the
 [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
 [`replace_pipe`](/api/language#replace_pipe) method to replace them with a
 custom component entirely (more details on this in the section on
-[custom components](#custom-components).
+[custom components](#custom-components)).
 
 ```python
 nlp.remove_pipe("parser")

@@ -391,7 +391,7 @@ vectors available – otherwise, it won't be able to make the same predictions.
 >
 > Instead of providing a `factory`, component blocks in the training
 > [config](/usage/training#config) can also define a `source`. The string needs
-> to be a loadable spaCy pipeline package or path. The
+> to be a loadable spaCy pipeline package or path.
 >
 > ```ini
 > [components.ner]

@@ -424,7 +424,7 @@ print(nlp.pipe_names)
 ### Analyzing pipeline components {#analysis new="3"}
 
 The [`nlp.analyze_pipes`](/api/language#analyze_pipes) method analyzes the
-components in the current pipeline and outputs information about them, like the
+components in the current pipeline and outputs information about them like the
 attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
 they retokenize the `Doc` and which scores they produce during training. It will
 also show warnings if components require values that aren't set by previous
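A quick sketch of calling the method described above, here on a blank pipeline with two built-in components so the analysis also demonstrates the missing-requirement warning:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
# the entity linker needs doc.ents, which nothing before it sets,
# so the analysis will report a problem for this pipeline
nlp.add_pipe("entity_linker")
analysis = nlp.analyze_pipes(pretty=True)
```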
@@ -518,8 +518,8 @@ doesn't, the pipeline analysis won't catch that.
 ## Creating custom pipeline components {#custom-components}
 
 A pipeline component is a function that receives a `Doc` object, modifies it and
-returns it – – for example, by using the current weights to make a prediction
-and set some annotation on the document. By adding a component to the pipeline,
+returns it – for example, by using the current weights to make a prediction and
+set some annotation on the document. By adding a component to the pipeline,
 you'll get access to the `Doc` at any point **during processing** – instead of
 only being able to modify it afterwards.
 
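A minimal sketch of such a component, a function that receives the `Doc`, does something with it and returns it (the component name here is arbitrary):

```python
import spacy
from spacy.language import Language

@Language.component("info_component")
def info_component(doc):
    # runs during processing, with full access to the Doc
    print(f"This doc has {len(doc)} tokens.")
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("info_component", first=True)
doc = nlp("This is a sentence.")
```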
@@ -709,9 +709,9 @@ nlp.add_pipe("my_component", config={"some_setting": False})
 <Accordion title="How is @Language.factory different from @Language.component?" id="factories-decorator-component">
 
 The [`@Language.component`](/api/language#component) decorator is essentially a
-**shortcut** for stateless pipeline component that don't need any settings. This
-means you don't have to always write a function that returns your function if
-there's no state to be passed through – spaCy can just take care of this for
+**shortcut** for stateless pipeline components that don't need any settings.
+This means you don't have to always write a function that returns your function
+if there's no state to be passed through – spaCy can just take care of this for
 you. The following two code examples are equivalent:
 
 ```python

@@ -745,7 +745,7 @@ make your factory a separate function. That's also how spaCy does it internally.
 
 ### Language-specific factories {#factories-language new="3"}
 
-There are many use case where you might want your pipeline components to be
+There are many use cases where you might want your pipeline components to be
 language-specific. Sometimes this requires entirely different implementation per
 language, sometimes the only difference is in the settings or data. spaCy allows
 you to register factories of the **same name** on both the `Language` base

@@ -966,7 +966,7 @@ components in pipelines that you [train](/usage/training). To make sure spaCy
 knows where to find your custom `@misc` function, you can pass in a Python file
 via the argument `--code`. If someone else is using your component, all they
 have to do to customize the data is to register their own function and swap out
-the name. Registered functions can also take **arguments** by the way that can
+the name. Registered functions can also take **arguments**, by the way, that can
 be defined in the config as well – you can read more about this in the docs on
 [training with custom code](/usage/training#custom-code).
 
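A sketch of the kind of registered `@misc` function that passage refers to; someone customizing the data only needs to register their own function under a different name and reference it in the config (the function name and data below are illustrative, following the pattern used in the docs):

```python
import spacy

@spacy.registry.misc("animal_patterns.v1")
def create_animal_patterns():
    # data the component looks up at runtime; referenced from the config
    # as @misc = "animal_patterns.v1" and loadable via --code
    return ["Golden Retriever", "cat", "turtle", "rattlesnake"]
```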
@@ -1497,7 +1497,7 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
 >
 > The hooks live on the `Doc` object because the `Span` and `Token` objects are
 > created lazily, and don't own any data. They just proxy to their parent `Doc`.
-> This turns out to be convenient here — we only have to worry about installing
+> This turns out to be convenient here – we only have to worry about installing
 > hooks in one place.
 
 | Name | Customizes |

@@ -69,7 +69,7 @@ python -m spacy project clone pipelines/tagger_parser_ud
 
 By default, the project will be cloned into the current working directory. You
 can specify an optional second argument to define the output directory. The
-`--repo` option lets you define a custom repo to clone from, if you don't want
+`--repo` option lets you define a custom repo to clone from if you don't want
 to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
 can also use any private repo you have access to with Git.
 

@@ -105,7 +105,7 @@ $ python -m spacy project assets
 Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and
 even cloud storage such as GCS and S3. You can also fetch assets using git, by
 replacing the `url` string with a `git` block. spaCy will use Git's "sparse
-checkout" feature, to avoid download the whole repository.
+checkout" feature to avoid downloading the whole repository.
 
 ### 3. Run a command {#run}
 

@@ -310,7 +310,7 @@ company-internal and not available over the internet. In that case, you can
 specify the destination paths and a checksum, and leave out the URL. When your
 teammates clone and run your project, they can place the files in the respective
 directory themselves. The [`project assets`](/api/cli#project-assets) command
-will alert about missing files and mismatched checksums, so you can ensure that
+will alert you about missing files and mismatched checksums, so you can ensure that
 others are running your project with the same data.
 
 ### Dependencies and outputs {#deps-outputs}

@@ -358,8 +358,7 @@ graphs based on the dependencies and outputs, and won't re-run previous steps
 automatically. For instance, if you only run the command `train` that depends on
 data created by `preprocess` and those files are missing, spaCy will show an
 error – it won't just re-run `preprocess`. If you're looking for more advanced
-data management, check out the [Data Version Control (DVC) integration](#dvc)
-integration. If you're planning on integrating your spaCy project with DVC, you
+data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you
 can also use `outputs_no_cache` instead of `outputs` to define outputs that
 won't be cached or tracked.
 

@@ -55,7 +55,7 @@ abstract representations of the tokens you're looking for, using lexical
 attributes, linguistic features predicted by the model, operators, set
 membership and rich comparison. For example, you can find a noun, followed by a
 verb with the lemma "love" or "like", followed by an optional determiner and
-another token that's at least ten characters long.
+another token that's at least 10 characters long.
 
 </Accordion>
 
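The pattern described in that sentence, written out as a `Matcher` sketch (whether it matches depends on the model's predictions; `en_core_web_sm` assumed):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [
    {"POS": "NOUN"},                                     # a noun
    {"LEMMA": {"IN": ["love", "like"]}, "POS": "VERB"},  # verb with lemma "love" or "like"
    {"POS": "DET", "OP": "?"},                           # optional determiner
    {"LENGTH": {">=": 10}},                              # token at least 10 characters long
]
matcher.add("LOVE_PATTERN", [pattern])
doc = nlp("People love complicated tokenization algorithms.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```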
@@ -494,7 +494,7 @@ you prefer.
 | `matcher` | The matcher instance. ~~Matcher~~ |
 | `doc` | The document the matcher was used on. ~~Doc~~ |
 | `i` | Index of the current match (`matches[i`]). ~~int~~ |
-| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. ~~ List[Tuple[int, int int]]~~ |
+| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. ~~List[Tuple[int, int int]]~~ |
 
 ### Creating spans from matches {#matcher-spans}
 

@@ -631,7 +631,7 @@ To get a quick overview of the results, you could collect all sentences
 containing a match and render them with the
 [displaCy visualizer](/usage/visualizers). In the callback function, you'll have
 access to the `start` and `end` of each match, as well as the parent `Doc`. This
-lets you determine the sentence containing the match, `doc[start : end`.sent],
+lets you determine the sentence containing the match, `doc[start:end].sent`,
 and calculate the start and end of the matched span within the sentence. Using
 displaCy in ["manual" mode](/usage/visualizers#manual-usage) lets you pass in a
 list of dictionaries containing the text and entities to render.
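A condensed sketch of that workflow, collecting the sentence around each match and handing the results to displaCy's manual mode (done here after matching rather than in an `on_match` callback; `en_core_web_sm` assumed):

```python
import spacy
from spacy import displacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matcher.add("FB", [[{"LOWER": "facebook"}]])
doc = nlp("I'd say that Facebook is a website. Facebook is also a company.")
matched_sents = []
for match_id, start, end in matcher(doc):
    span = doc[start:end]
    sent = span.sent  # the sentence containing the match
    # offsets of the match relative to its sentence, for manual rendering
    ent = {"start": span.start_char - sent.start_char,
           "end": span.end_char - sent.start_char,
           "label": "MATCH"}
    matched_sents.append({"text": sent.text, "ents": [ent]})
displacy.render(matched_sents, style="ent", manual=True)
```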
@@ -1454,7 +1454,7 @@ When using a trained
 extract information from your texts, you may find that the predicted span only
 includes parts of the entity you're looking for. Sometimes, this happens if
 statistical model predicts entities incorrectly. Other times, it happens if the
-way the entity type way defined in the original training corpus doesn't match
+way the entity type was defined in the original training corpus doesn't match
 what you need for your application.
 
 > #### Where corpora come from

@@ -1645,7 +1645,7 @@ affiliation is current, we can check the head's part-of-speech tag.
 ```python
 person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
 for ent in person_entities:
-    # Because the entity is a spans, we need to use its root token. The head
+    # Because the entity is a span, we need to use its root token. The head
     # is the syntactic governor of the person, e.g. the verb
     head = ent.root.head
     if head.lemma_ == "work":

@@ -463,7 +463,7 @@ entry_points={
 }
 ```
 
-The factory can also implement other pipeline component like `to_disk` and
+The factory can also implement other pipeline component methods like `to_disk` and
 `from_disk` for serialization, or even `update` to make the component trainable.
 If a component exposes a `from_disk` method and is included in a pipeline, spaCy
 will call it on load. This lets you ship custom data with your pipeline package.

@@ -690,7 +690,7 @@ care of putting all this together and returning a `Language` object with the
 loaded pipeline and data. If your pipeline requires
 [custom components](/usage/processing-pipelines#custom-components) or a custom
 language class, you can also **ship the code with your package** and include it
-in the `__init__.py` – for example, to register component before the `nlp`
+in the `__init__.py` – for example, to register a component before the `nlp`
 object is created.
 
 <Infobox variant="warning" title="Important note on making manual edits">

@@ -551,7 +551,7 @@ or TensorFlow, make **custom modifications** to the `nlp` object, create custom
 optimizers or schedules, or **stream in data** and preprocesses it on the fly
 while training.
 
-Each custom function can have any numbers of arguments that are passed in via
+Each custom function can have any number of arguments that are passed in via
 the [config](#config), just the built-in functions. If your function defines
 **default argument values**, spaCy is able to auto-fill your config when you run
 [`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
@@ -41,11 +41,7 @@ function getCounts(langs = []) {
     return {
         langs: langs.length,
         modelLangs: langs.filter(({ models }) => models && !!models.length).length,
-        starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
         models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
-        starters: langs
-            .map(({ starters }) => (starters ? starters.length : 0))
-            .reduce((a, b) => a + b, 0),
     }
 }
 

@@ -37,7 +37,6 @@ exports.createPages = ({ graphql, actions }) => {
         code
         name
         models
-        starters
         example
         has_examples
     }

@@ -31,7 +31,6 @@
         "code": "en",
         "name": "English",
         "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
-        "starters": ["en_vectors_web_lg"],
         "example": "This is a sentence.",
         "has_examples": true
     },

@@ -52,10 +52,6 @@
         {
             "label": "Trained Pipelines",
             "items": []
-        },
-        {
-            "label": "Starter Packages",
-            "items": []
         }
     ]
 },
@@ -51,17 +51,19 @@ const Docs = ({ pageContext, children }) => (
                 id: model,
             })),
         }))
-    sidebar.items[2].items = languages
-        .filter(({ starters }) => starters && starters.length)
-        .map(lang => ({
-            text: lang.name,
-            url: `/models/${lang.code}-starters`,
-            isActive: id === `${lang.code}-starters`,
-            menu: lang.starters.map(model => ({
-                text: model,
-                id: model,
-            })),
-        }))
+    if (sidebar.items.length > 2) {
+        sidebar.items[2].items = languages
+            .filter(({ starters }) => starters && starters.length)
+            .map(lang => ({
+                text: lang.name,
+                url: `/models/${lang.code}-starters`,
+                isActive: id === `${lang.code}-starters`,
+                menu: lang.starters.map(model => ({
+                    text: model,
+                    id: model,
+                })),
+            }))
+    }
 }
 const sourcePath = source ? github(source) : null
 const currentSource = getCurrentSource(slug, isIndex)
@@ -146,7 +148,6 @@ const query = graphql`
             code
             name
             models
-            starters
         }
         nightly
         sidebars {

@@ -336,9 +336,7 @@ const landingQuery = graphql`
         counts {
             langs
             modelLangs
-            starterLangs
             models
-            starters
         }
     }
 }