Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-08-11 01:21:47 +02:00
commit a77713947d
21 changed files with 389 additions and 252 deletions

View File

@@ -18,11 +18,12 @@ cdef class Lexeme:
cdef readonly attr_t orth
@staticmethod
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab):
cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
self.c = lex
self.vocab = vocab
self.orth = lex.orth
return self
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
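
For context on this hunk: `from_ptr` now sets `vocab` and `orth` itself instead of receiving a `vector_length`. As a minimal sketch (not part of the diff), this is how a `Lexeme` is typically reached from Python, assuming only a blank pipeline:

```python
import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["hello"]   # Vocab lookup returns a Lexeme backed by a LexemeC struct
print(lex.text, lex.orth)  # the verbatim text and its hash ID
```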

View File

@@ -1,4 +1,4 @@
# cython: infer_types=True, profile=True, binding=True
# cython: infer_types=True, profile=True
import srsly
from ..tokens.doc cimport Doc

View File

@@ -43,7 +43,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]]):
def make_tagger(nlp: Language, name: str, model: Model):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
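
The factory now takes a plain `Model` and documents the expected signature in the docstring instead. A hedged sketch (not part of the diff) of building the component through the registered factory, assuming the v3 nightly API:

```python
import spacy

nlp = spacy.blank("en")
# The string name resolves to the registered "tagger" factory, which
# constructs the component with its default model config.
tagger = nlp.add_pipe("tagger")
print(nlp.pipe_names)  # ["tagger"]
```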

View File

@@ -172,7 +172,7 @@ class TextCategorizer(Pipe):
return scores
def set_annotations(self, docs: Iterable[Doc], scores) -> None:
"""Modify a batch of documents, using pre-computed scores.
"""Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
scores: The scores to set, produced by TextCategorizer.predict.
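
A hedged sketch (not part of the diff) of the predict/set_annotations split this docstring describes, assuming the v3 nightly API:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
nlp.begin_training()
doc = nlp.make_doc("This is great")
scores = textcat.predict([doc])         # compute scores; the doc is untouched
textcat.set_annotations([doc], scores)  # write them back, setting doc.cats
print(doc.cats)
```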

View File

@@ -2,6 +2,7 @@ import pytest
import numpy
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.lexeme import Lexeme
from spacy.lang.en import English
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
@@ -389,3 +390,11 @@ def test_doc_lang(en_vocab):
assert doc.lang == en_vocab.strings["en"]
assert doc[0].lang_ == "en"
assert doc[0].lang == en_vocab.strings["en"]
def test_token_lexeme(en_vocab):
"""Test that tokens expose their lexeme."""
token = Doc(en_vocab, words=["Hello", "world"])[0]
assert isinstance(token.lex, Lexeme)
assert token.lex.text == token.text
assert en_vocab[token.orth] == token.lex

View File

@@ -226,6 +226,11 @@ cdef class Token:
cdef hash_t key = self.vocab.morphology.add(features)
self.c.morph = key
@property
def lex(self):
"""RETURNS (Lexeme): The underlying lexeme."""
return self.vocab[self.c.lex.orth]
@property
def lex_id(self):
"""RETURNS (int): Sequential ID of the token's lexical type."""

View File

@@ -162,7 +162,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## DependencyParser.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -178,7 +179,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## DependencyParser.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
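
A hedged sketch (not part of the diff) of the `predict` behavior described above, assuming an installed trained pipeline such as `en_core_web_sm`:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
parser = nlp.get_pipe("parser")
docs = [nlp.make_doc("This is a sentence.")]
scores = parser.predict(docs)  # returns scores; the docs themselves stay unmodified
```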

View File

@@ -162,9 +162,9 @@ Initialize the pipe for training, using data examples if available. Returns an
## EntityLinker.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them. Returns
the KB IDs for each entity in each doc, including `NIL` if there is no
prediction.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them. Returns the KB IDs for each entity in each doc, including `NIL`
if there is no prediction.
> #### Example
>
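
A sketch (not part of the diff) of the described return value, assuming a hypothetical pipeline that ships a trained `entity_linker` plus an `ner` component:

```python
import spacy

nlp = spacy.load("my_el_pipeline")  # hypothetical pipeline name
entity_linker = nlp.get_pipe("entity_linker")
doc = nlp("Ada Lovelace was born in London")
kb_ids = entity_linker.predict([doc])  # one KB ID per entity, "NIL" if none predicted
```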

View File

@@ -151,7 +151,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## EntityRecognizer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -167,7 +168,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## EntityRecognizer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
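
A hedged sketch (not part of the diff) combining `predict` and `set_annotations` as described above, assuming an installed trained pipeline:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")
docs = [nlp.make_doc("Apple is looking at buying a U.K. startup.")]
scores = ner.predict(docs)         # pre-computed scores; docs untouched so far
ner.set_annotations(docs, scores)  # writes doc.ents
print([(ent.text, ent.label_) for ent in docs[0].ents])
```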

View File

@@ -142,7 +142,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Morphologizer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -158,7 +159,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Morphologizer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -175,8 +176,9 @@ Modify a batch of documents, using pre-computed scores.
## Morphologizer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/morphologizer#predict) and
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/morphologizer#predict) and
[`get_loss`](/api/morphologizer#get_loss).
> #### Example
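
A hedged sketch (not part of the diff) of `update` with `Example` objects, assuming the v3 `Example.from_dict` API and import path:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
morphologizer = nlp.add_pipe("morphologizer")
doc = nlp.make_doc("I like cats")
# Pair the predicted doc with gold-standard annotations
example = Example.from_dict(doc, {"pos": ["PRON", "VERB", "NOUN"]})
optimizer = nlp.begin_training(lambda: [example])
losses = morphologizer.update([example], sgd=optimizer)
```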

View File

@@ -8,7 +8,18 @@ This class is a base class and **not instantiated directly**. Trainable pipeline
components like the [`EntityRecognizer`](/api/entityrecognizer) or
[`TextCategorizer`](/api/textcategorizer) inherit from it and it defines the
interface that components should follow to function as trainable components in a
spaCy pipeline.
spaCy pipeline. See the docs on
[writing trainable components](/usage/processing-pipelines#trainable) for how to
use the `Pipe` base class to implement custom components.
> #### Why is Pipe implemented in Cython?
>
> The `Pipe` class is implemented in a `.pyx` module, the extension used by
> [Cython](/api/cython). This is needed so that **other** Cython classes, like
> the [`EntityRecognizer`](/api/entityrecognizer), can inherit from it. But it
> doesn't mean you have to implement trainable components in Cython: pure
> Python components like the [`TextCategorizer`](/api/textcategorizer) can also
> inherit from `Pipe`.
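
A hedged sketch (not part of the diff) of a custom trainable component inheriting from `Pipe`; the method bodies are illustrative placeholders, not spaCy's implementation:

```python
from spacy.pipeline import Pipe

class MyComponent(Pipe):
    # Subclasses overwrite predict and set_annotations (see the danger
    # infoboxes below).
    def predict(self, docs):
        # Apply self.model to the docs and return the raw scores, without
        # modifying the docs themselves.
        return self.model.predict(docs)

    def set_annotations(self, docs, scores):
        # Write the pre-computed scores back onto the docs, e.g. as custom
        # attributes or token-level annotations.
        ...
```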
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/pipe.pyx
```
@@ -115,7 +126,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Pipe.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
<Infobox variant="danger">
@@ -137,7 +149,7 @@ This method needs to be overwritten with your own custom `predict` method.
## Pipe.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
<Infobox variant="danger">
@@ -161,8 +173,8 @@ method.
## Pipe.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/pipe#predict).
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
<Infobox variant="danger">

View File

@@ -136,7 +136,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## SentenceRecognizer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -152,7 +153,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## SentenceRecognizer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -169,8 +170,9 @@ Modify a batch of documents, using pre-computed scores.
## SentenceRecognizer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/sentencerecognizer#predict) and
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/sentencerecognizer#predict) and
[`get_loss`](/api/sentencerecognizer#get_loss).
> #### Example
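
A hedged sketch (not part of the diff) of the updated `update` signature for the sentence recognizer, assuming the v3 `Example` API:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
senter = nlp.add_pipe("senter")
doc = nlp.make_doc("Hello there. How are you?")
# 1 marks a sentence start, 0 a non-start; one value per token
example = Example.from_dict(doc, {"sent_starts": [1, 0, 0, 1, 0, 0, 0]})
optimizer = nlp.begin_training(lambda: [example])
losses = senter.update([example], sgd=optimizer)
```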

View File

@@ -134,7 +134,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Tagger.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -150,7 +151,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Tagger.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -167,8 +168,9 @@ Modify a batch of documents, using pre-computed scores.
## Tagger.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/tagger#predict) and
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/tagger#predict) and
[`get_loss`](/api/tagger#get_loss).
> #### Example
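
The tagger's `update` follows the same `Example`-based pattern; a hedged sketch (not part of the diff), assuming the v3 API:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
doc = nlp.make_doc("I like cats")
example = Example.from_dict(doc, {"tags": ["PRP", "VBP", "NNS"]})
optimizer = nlp.begin_training(lambda: [example])
losses = tagger.update([example], sgd=optimizer)
print(losses)  # loss keyed by component name
```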

View File

@@ -142,7 +142,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## TextCategorizer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -158,7 +159,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## TextCategorizer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -175,8 +176,9 @@ Modify a batch of documents, using pre-computed scores.
## TextCategorizer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/textcategorizer#predict) and
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/textcategorizer#predict) and
[`get_loss`](/api/textcategorizer#get_loss).
> #### Example
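
For the text categorizer, gold-standard category labels go in under `"cats"`; a hedged sketch (not part of the diff), assuming the v3 API:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
doc = nlp.make_doc("This is great")
example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
optimizer = nlp.begin_training(lambda: [example])
losses = textcat.update([example], sgd=optimizer)
```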

View File

@@ -145,7 +145,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Tok2Vec.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -161,7 +162,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Tok2Vec.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -178,8 +179,9 @@ Modify a batch of documents, using pre-computed scores.
## Tok2Vec.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/tok2vec#predict).
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/tok2vec#predict).
> #### Example
>
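
`Tok2Vec.update` delegates to `predict` only (no `get_loss` link above); a hedged sketch (not part of the diff), assuming the v3 API:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
tok2vec = nlp.add_pipe("tok2vec")
doc = nlp.make_doc("I like cats")
example = Example.from_dict(doc, {})  # tok2vec itself carries no gold annotations
optimizer = nlp.begin_training(lambda: [example])
losses = tok2vec.update([example], sgd=optimizer)
```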

View File

@@ -393,9 +393,10 @@ The L2 norm of the token's vector representation.
## Attributes {#attributes}
| Name | Type | Description |
| -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| -------------------------------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. |
| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. |
| `lex` <Tag variant="new">3</Tag> | [`Lexeme`](/api/lexeme) | The underlying lexeme. |
| `sent` <Tag variant="new">2.0.12</Tag> | [`Span`](/api/span) | The sentence span that this token is a part of. |
| `text` | str | Verbatim text content. |
| `text_with_ws` | str | Text content, with trailing space character if present. |
| `whitespace_` | str | Trailing space character if present. |

View File

@@ -179,7 +179,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Transformer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
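
A sketch (not part of the diff) of the transformer's `predict`, assuming spacy-transformers is installed and a transformer pipeline (here a hypothetical `en_core_web_trf`) is available:

```python
import spacy

nlp = spacy.load("en_core_web_trf")  # hypothetical transformer pipeline
trf = nlp.get_pipe("transformer")
docs = [nlp.make_doc("This is a sentence.")]
activations = trf.predict(docs)  # raw model output; the docs stay unmodified
```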

File diff suppressed because one or more lines are too long

Image changed (before: 15 KiB, after: 88 KiB)

View File

@@ -14,8 +14,6 @@ of the pipeline. The `Language` object coordinates these components. It takes
raw text and sends it through the pipeline, returning an **annotated document**.
It also orchestrates training and serialization.
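
A minimal sketch (not part of the diff) of the flow this paragraph describes:

```python
import spacy

nlp = spacy.blank("en")          # a Language object with only a tokenizer
doc = nlp("Raw text goes in.")   # the pipeline returns an annotated Doc
print([token.text for token in doc])
```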
<!-- TODO: update graphic -->
![Library architecture](../../images/architecture.svg)
### Container objects {#architecture-containers}
@@ -85,4 +83,4 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
| [`Scorer`](/api/scorer) | Compute evaluation scores. |
| [`Corpus`](/api/corpis) | Class for managing annotated corpora for training and evaluation data. |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |

View File

@@ -5,6 +5,7 @@ menu:
- ['Processing Text', 'processing']
- ['How Pipelines Work', 'pipelines']
- ['Custom Components', 'custom-components']
# - ['Trainable Components', 'trainable-components']
- ['Extension Attributes', 'custom-components-attributes']
- ['Plugins & Wrappers', 'plugins']
---
@@ -885,10 +886,14 @@ available, falls back to looking up the regular factory name.
</Infobox>
<!-- TODO:
## Trainable components {#trainable-components new="3"}
### Trainable components {#trainable new="3"}
spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
components that have their own model instance, make predictions over `Doc`
objects and can be updated using [`spacy train`](/api/cli#train). This lets you
plug fully custom machine learning components into your pipeline.
-->
--->
## Extension attributes {#custom-components-attributes new="2"}

View File

@@ -6,11 +6,11 @@ menu:
- ['Features', 'features']
- ['Linguistic Annotations', 'annotations']
- ['Pipelines', 'pipelines']
- ['Architecture', 'architecture']
- ['Vocab', 'vocab']
- ['Serialization', 'serialization']
- ['Training', 'training']
- ['Language Data', 'language-data']
- ['Architecture', 'architecture']
- ['Community & FAQ', 'community-faq']
---
@@ -71,12 +71,11 @@ systems, or to pre-process text for **deep learning**.
- [Named entities](#annotations-ner)
- [Word vectors and similarity](#vectors-similarity)
- [Pipelines](#pipelines)
- [Library architecture](#architecture)
- [Vocab, hashes and lexemes](#vocab)
- [Serialization](#serialization)
- [Training](#training)
- [Language data](#language-data)
- [Lightning tour](#lightning-tour)
- [Architecture](#architecture)
- [Community & FAQ](#community)
</Infobox>
@@ -266,6 +265,12 @@ guide on [language processing pipelines](/usage/processing-pipelines).
</Infobox>
## Architecture {#architecture}
import Architecture101 from 'usage/101/\_architecture.md'
<Architecture101 />
## Vocab, hashes and lexemes {#vocab}
Whenever possible, spaCy tries to store data in a vocabulary, the
@@ -411,12 +416,6 @@ import LanguageData101 from 'usage/101/\_language-data.md'
<LanguageData101 />
## Architecture {#architecture}
import Architecture101 from 'usage/101/\_architecture.md'
<Architecture101 />
## Community & FAQ {#community-faq}
We're very happy to see the spaCy community grow and include a mix of people