Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2020-10-05 14:17:41 +02:00
commit 6a9d14e35a
14 changed files with 530 additions and 176 deletions

View File

@ -456,10 +456,10 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues") "issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
"Try checking whitespace and delimiters. See " "Try checking whitespace and delimiters. See "
"https://nightly.spacy.io/api/cli#convert") "https://nightly.spacy.io/api/cli#convert")
E093 = ("The token-per-line NER file is not formatted correctly. Try checking " E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
"whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert") "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This " E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
"dimension refers to the output width, after the linear projection " "dimension refers to the output width, after the linear projection "

View File

@ -25,8 +25,14 @@ class Russian(Language):
default_config={"model": None, "mode": "pymorphy2"}, default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): def make_lemmatizer(
return RussianLemmatizer(nlp.vocab, model, name, mode=mode) nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool = False,
):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Russian"] __all__ = ["Russian"]

View File

@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple
from thinc.api import Model from thinc.api import Model
from ...lookups import Lookups
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...symbols import POS from ...symbols import POS
from ...tokens import Token from ...tokens import Token
@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer):
name: str = "lemmatizer", name: str = "lemmatizer",
*, *,
mode: str = "pymorphy2", mode: str = "pymorphy2",
lookups: Optional[Lookups] = None, overwrite: bool = False,
) -> None: ) -> None:
super().__init__(vocab, model, name, mode=mode, lookups=lookups) super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
try: try:
from pymorphy2 import MorphAnalyzer from pymorphy2 import MorphAnalyzer

View File

@ -26,8 +26,8 @@ class Ukrainian(Language):
default_config={"model": None, "mode": "pymorphy2"}, default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False,):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode) return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Ukrainian"] __all__ = ["Ukrainian"]

View File

@ -3,7 +3,6 @@ from typing import Optional
from thinc.api import Model from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer from ..ru.lemmatizer import RussianLemmatizer
from ...lookups import Lookups
from ...vocab import Vocab from ...vocab import Vocab
@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer):
name: str = "lemmatizer", name: str = "lemmatizer",
*, *,
mode: str = "pymorphy2", mode: str = "pymorphy2",
lookups: Optional[Lookups] = None, overwrite: bool = False,
) -> None: ) -> None:
super().__init__(vocab, model, name, mode=mode, lookups=lookups) super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
try: try:
from pymorphy2 import MorphAnalyzer from pymorphy2 import MorphAnalyzer
except ImportError: except ImportError:

View File

@ -248,7 +248,6 @@ def tt_tokenizer():
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def uk_tokenizer(): def uk_tokenizer():
pytest.importorskip("pymorphy2") pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2.lang")
return get_lang_class("uk")().tokenizer return get_lang_class("uk")().tokenizer

View File

@ -1,6 +1,5 @@
from spacy.lang.en import English from spacy.lang.en import English
from spacy.pipeline import merge_entities from spacy.pipeline import merge_entities
import pytest
def test_issue5918(): def test_issue5918():

View File

@ -7,6 +7,15 @@ from spacy import util
from spacy import prefer_gpu, require_gpu from spacy import prefer_gpu, require_gpu
from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer, ConfigValidationError
from spacy.training.batchers import minibatch_by_words
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import DEFAULT_CONFIG_PATH
from spacy.schemas import ConfigSchemaTraining
from .util import get_random_doc
@pytest.fixture @pytest.fixture
@ -157,3 +166,128 @@ def test_dot_to_dict(dot_notation, expected):
result = util.dot_to_dict(dot_notation) result = util.dot_to_dict(dot_notation)
assert result == expected assert result == expected
assert util.dict_to_dot(result) == dot_notation assert util.dict_to_dot(result) == dot_notation
@pytest.mark.parametrize(
"doc_sizes, expected_batches",
[
([400, 400, 199], [3]),
([400, 400, 199, 3], [4]),
([400, 400, 199, 3, 200], [3, 2]),
([400, 400, 199, 3, 1], [5]),
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
([400, 400, 199, 3, 1, 200], [3, 3]),
([400, 400, 199, 3, 1, 999], [3, 3]),
([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
([1, 2, 999], [3]),
([1, 2, 999, 1], [4]),
([1, 200, 999, 1], [2, 2]),
([1, 999, 200, 1], [2, 2]),
],
)
def test_util_minibatch(doc_sizes, expected_batches):
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
)
assert [len(batch) for batch in batches] == expected_batches
max_size = batch_size + batch_size * tol
for batch in batches:
assert sum([len(doc) for doc in batch]) < max_size
@pytest.mark.parametrize(
"doc_sizes, expected_batches",
[
([400, 4000, 199], [1, 2]),
([400, 400, 199, 3000, 200], [1, 4]),
([400, 400, 199, 3, 1, 1500], [1, 5]),
([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
([1, 2, 9999], [1, 2]),
([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
""" Test that oversized documents are returned in their own batch"""
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
)
assert [len(batch) for batch in batches] == expected_batches
def test_util_dot_section():
cfg_string = """
[nlp]
lang = "en"
pipeline = ["textcat"]
[components]
[components.textcat]
factory = "textcat"
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
"""
nlp_config = Config().from_str(cfg_string)
en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
default_config["nlp"]["lang"] = "nl"
nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
# Test that creation went OK
assert isinstance(en_nlp, English)
assert isinstance(nl_nlp, Dutch)
assert nl_nlp.pipe_names == []
assert en_nlp.pipe_names == ["textcat"]
# not exclusive_classes
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
# Test that default values got overwritten
assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
assert nl_nlp.config["nlp"]["pipeline"] == [] # default value []
# Test proper functioning of 'dot_to_object'
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.unknownattribute")
T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
def test_simple_frozen_list():
t = SimpleFrozenList(["foo", "bar"])
assert t == ["foo", "bar"]
assert t.index("bar") == 1 # okay method
with pytest.raises(NotImplementedError):
t.append("baz")
with pytest.raises(NotImplementedError):
t.sort()
with pytest.raises(NotImplementedError):
t.extend(["baz"])
with pytest.raises(NotImplementedError):
t.pop()
t = SimpleFrozenList(["foo", "bar"], error="Error!")
with pytest.raises(NotImplementedError):
t.append("baz")
def test_resolve_dot_names():
config = {
"training": {"optimizer": {"@optimizers": "Adam.v1"}},
"foo": {"bar": "training.optimizer", "baz": "training.xyz"},
}
result = util.resolve_dot_names(config, ["training.optimizer"])
assert isinstance(result[0], Optimizer)
with pytest.raises(ConfigValidationError) as e:
util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
errors = e.value.errors
assert len(errors) == 1
assert errors[0]["loc"] == ["training", "xyz"]

View File

@ -1,137 +0,0 @@
import pytest
from spacy import util
from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer, ConfigValidationError
from spacy.training.batchers import minibatch_by_words
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import DEFAULT_CONFIG_PATH
from spacy.schemas import ConfigSchemaTraining
from .util import get_random_doc
@pytest.mark.parametrize(
"doc_sizes, expected_batches",
[
([400, 400, 199], [3]),
([400, 400, 199, 3], [4]),
([400, 400, 199, 3, 200], [3, 2]),
([400, 400, 199, 3, 1], [5]),
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
([400, 400, 199, 3, 1, 200], [3, 3]),
([400, 400, 199, 3, 1, 999], [3, 3]),
([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
([1, 2, 999], [3]),
([1, 2, 999, 1], [4]),
([1, 200, 999, 1], [2, 2]),
([1, 999, 200, 1], [2, 2]),
],
)
def test_util_minibatch(doc_sizes, expected_batches):
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
)
assert [len(batch) for batch in batches] == expected_batches
max_size = batch_size + batch_size * tol
for batch in batches:
assert sum([len(doc) for doc in batch]) < max_size
@pytest.mark.parametrize(
"doc_sizes, expected_batches",
[
([400, 4000, 199], [1, 2]),
([400, 400, 199, 3000, 200], [1, 4]),
([400, 400, 199, 3, 1, 1500], [1, 5]),
([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
([1, 2, 9999], [1, 2]),
([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
""" Test that oversized documents are returned in their own batch"""
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
)
assert [len(batch) for batch in batches] == expected_batches
def test_util_dot_section():
cfg_string = """
[nlp]
lang = "en"
pipeline = ["textcat"]
[components]
[components.textcat]
factory = "textcat"
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
"""
nlp_config = Config().from_str(cfg_string)
en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
default_config["nlp"]["lang"] = "nl"
nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
# Test that creation went OK
assert isinstance(en_nlp, English)
assert isinstance(nl_nlp, Dutch)
assert nl_nlp.pipe_names == []
assert en_nlp.pipe_names == ["textcat"]
# not exclusive_classes
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
# Test that default values got overwritten
assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
assert nl_nlp.config["nlp"]["pipeline"] == [] # default value []
# Test proper functioning of 'dot_to_object'
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.unknownattribute")
T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
def test_simple_frozen_list():
t = SimpleFrozenList(["foo", "bar"])
assert t == ["foo", "bar"]
assert t.index("bar") == 1 # okay method
with pytest.raises(NotImplementedError):
t.append("baz")
with pytest.raises(NotImplementedError):
t.sort()
with pytest.raises(NotImplementedError):
t.extend(["baz"])
with pytest.raises(NotImplementedError):
t.pop()
t = SimpleFrozenList(["foo", "bar"], error="Error!")
with pytest.raises(NotImplementedError):
t.append("baz")
def test_resolve_dot_names():
config = {
"training": {"optimizer": {"@optimizers": "Adam.v1"}},
"foo": {"bar": "training.optimizer", "baz": "training.xyz"},
}
result = util.resolve_dot_names(config, ["training.optimizer"])
assert isinstance(result[0], Optimizer)
with pytest.raises(ConfigValidationError) as e:
util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
errors = e.value.errors
assert len(errors) == 1
assert errors[0]["loc"] == ["training", "xyz"]

View File

@ -103,7 +103,7 @@ def conll_ner_to_docs(
lines = [line.strip() for line in conll_sent.split("\n") if line.strip()] lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
cols = list(zip(*[line.split() for line in lines])) cols = list(zip(*[line.split() for line in lines]))
if len(cols) < 2: if len(cols) < 2:
raise ValueError(Errors.E093) raise ValueError(Errors.E903)
length = len(cols[0]) length = len(cols[0])
words.extend(cols[0]) words.extend(cols[0])
sent_starts.extend([True] + [False] * (length - 1)) sent_starts.extend([True] + [False] * (length - 1))

View File

@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
sent_words, sent_iob = zip(*sent_tokens) sent_words, sent_iob = zip(*sent_tokens)
sent_tags = ["-"] * len(sent_words) sent_tags = ["-"] * len(sent_words)
else: else:
raise ValueError(Errors.E092) raise ValueError(Errors.E902)
words.extend(sent_words) words.extend(sent_words)
tags.extend(sent_tags) tags.extend(sent_tags)
iob.extend(sent_iob) iob.extend(sent_iob)

View File

@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental.
Find the loss and gradient of loss for the batch of documents and their Find the loss and gradient of loss for the batch of documents and their
predicted scores. predicted scores.
<Infobox variant="danger">
This method needs to be overwritten with your own custom `get_loss` method.
</Infobox>
> #### Example > #### Example
> >
> ```python > ```python

View File

@ -86,7 +86,8 @@ see are:
| ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. | | ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. |
| ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. | | ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. |
The model type signatures help you figure out which model architectures and See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The
model type signatures help you figure out which model architectures and
components can **fit together**. For instance, the components can **fit together**. For instance, the
[`TextCategorizer`](/api/textcategorizer) class expects a model typed [`TextCategorizer`](/api/textcategorizer) class expects a model typed
~~Model[List[Doc], Floats2d]~~, because the model will predict one row of ~~Model[List[Doc], Floats2d]~~, because the model will predict one row of
@ -288,7 +289,7 @@ those parts of the network.
To use our custom model including the PyTorch subnetwork, all we need to do is To use our custom model including the PyTorch subnetwork, all we need to do is
register the architecture using the register the architecture using the
[`architectures` registry](/api/top-level#registry). This will assign the [`architectures` registry](/api/top-level#registry). This assigns the
architecture a name so spaCy knows how to find it, and allows passing in architecture a name so spaCy knows how to find it, and allows passing in
arguments like hyperparameters via the [config](/usage/training#config). The arguments like hyperparameters via the [config](/usage/training#config). The
full example then becomes: full example then becomes:
@ -373,7 +374,7 @@ gpu_allocator = "pytorch"
Of course it's also possible to define the `Model` from the previous section Of course it's also possible to define the `Model` from the previous section
entirely in Thinc. The Thinc documentation provides details on the entirely in Thinc. The Thinc documentation provides details on the
[various layers](https://thinc.ai/docs/api-layers) and helper functions [various layers](https://thinc.ai/docs/api-layers) and helper functions
available. Combinators can also be used to available. Combinators can be used to
[overload operators](https://thinc.ai/docs/usage-models#operators) and a common [overload operators](https://thinc.ai/docs/usage-models#operators) and a common
usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our
simple neural network would then become: simple neural network would then become:
@ -486,28 +487,376 @@ with Model.define_operators({">>": chain}):
## Create new trainable components {#components} ## Create new trainable components {#components}
<Infobox title="This section is still under construction" emoji="🚧" variant="warning"> In addition to [swapping out](#swap-architectures) default models in built-in
components, you can also implement an entirely new,
[trainable](/usage/processing-pipelines#trainable-components) pipeline component
from scratch. This can be done by creating a new class inheriting from
[`Pipe`](/api/pipe), and linking it up to your custom model implementation.
<Infobox title="Trainable component API" emoji="💡">
For details on how to implement pipeline components, check out the usage guide
on [custom components](/usage/processing-pipelines#custom-component) and the
overview of the `Pipe` methods used by
[trainable components](/usage/processing-pipelines#trainable-components).
</Infobox> </Infobox>
<!-- TODO: write trainable component section ### Example: Entity relation extraction component {#component-rel}
- Interaction with `predict`, `get_loss` and `set_annotations`
- Initialization life-cycle with `initialize`, correlation with add_label
Example: relation extraction component (implemented as project template)
Avoid duplication with usage/processing-pipelines#trainable-components ?
-->
<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg) This section outlines an example use-case of implementing a **novel relation
extraction component** from scratch. We'll implement a binary relation
extraction method that determines whether or not **two entities** in a document
are related, and if so, what type of relation. We'll allow multiple types of
relations between two such entities (multi-label setting). There are two major
steps required:
1. Implement a [machine learning model](#component-rel-model) specific to this
task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
a relation for the available candidate pairs.
2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
machine learning model that sets annotations on the [`Doc`](/api/doc) passing
through the pipeline.
<!-- TODO: <Project id="tutorials/ner-relations">
</Project> -->
#### Step 1: Implementing the Model {#component-rel-model}
We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional
matrix** (~~Floats2d~~) of predictions:
> #### Model type annotations
>
> The `Model` class is a generic type that can specify its input and output
> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static
> type checks and validation. See the section on [type signatures](#type-sigs)
> for details.
```python ```python
def update(self, examples): ### Register the model architecture
docs = [ex.predicted for ex in examples] @registry.architectures.register("rel_model.v1")
refs = [ex.reference for ex in examples] def create_relation_model(...) -> Model[List[Doc], Floats2d]:
predictions, backprop = self.model.begin_update(docs) model = ... # 👈 model will go here
gradient = self.get_loss(predictions, refs) return model
backprop(gradient)
def __call__(self, doc):
predictions = self.model([doc])
self.set_annotations(predictions)
``` ```
-->
The first layer in this model will typically be an
[embedding layer](/usage/embeddings-transformers) such as a
[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
transforms each **document into a list of tokens**, with each token being
represented by its embedding in the vector space.
Next, we need a method that **generates pairs of entities** that we want to
classify as being related or not. As these candidate pairs are typically formed
within one document, this function takes a [`Doc`](/api/doc) as input and
outputs a `List` of `Span` tuples. For instance, a very straightforward
implementation would be to just take any two entities from the same document:
```python
### Simple candidate generation
def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
candidates = []
for ent1 in doc.ents:
for ent2 in doc.ents:
candidates.append((ent1, ent2))
return candidates
```
But we could also refine this further by **excluding relations** of an entity
with itself, and posing a **maximum distance** (in number of tokens) between two
entities. We register this function in the
[`@misc` registry](/api/top-level#registry) so we can refer to it from the
config, and easily swap it out for any other candidate generation function.
> #### config.cfg (excerpt)
>
> ```ini
> [model]
> @architectures = "rel_model.v1"
>
> [model.tok2vec]
> # ...
>
> [model.get_candidates]
> @misc = "rel_cand_generator.v1"
> max_length = 20
> ```
```python
### Extended candidate generation {highlight="1,2,7,8"}
@registry.misc.register("rel_cand_generator.v1")
def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
candidates = []
for ent1 in doc.ents:
for ent2 in doc.ents:
if ent1 != ent2:
if max_length and abs(ent2.start - ent1.start) <= max_length:
candidates.append((ent1, ent2))
return candidates
return get_candidates
```
Finally, we require a method that transforms the candidate entity pairs into a
2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or
[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be
processed by a final `output_layer` of the network. Putting all this together,
we can define our relation model in a config file as such:
```ini
### config.cfg
[model]
@architectures = "rel_model.v1"
# ...
[model.tok2vec]
# ...
[model.get_candidates]
@misc = "rel_cand_generator.v1"
max_length = 20
[model.create_candidate_tensor]
@misc = "rel_cand_tensor.v1"
[model.output_layer]
@architectures = "rel_output_layer.v1"
# ...
```
<!-- TODO: link to project for implementation details -->
<!-- TODO: maybe embed files from project that show the architectures? -->
When creating this model, we store the custom functions as
[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
references, so we can access them easily:
```python
tok2vec_layer = model.get_ref("tok2vec")
output_layer = model.get_ref("output_layer")
create_candidate_tensor = model.attrs["create_candidate_tensor"]
get_candidates = model.attrs["get_candidates"]
```
#### Step 2: Implementing the pipeline component {#component-rel-pipe}
To use our new relation extraction model as part of a custom
[trainable component](/usage/processing-pipelines#trainable-components), we
create a subclass of [`Pipe`](/api/pipe) that holds the model:
```python
### Pipeline component skeleton
from spacy.pipeline import Pipe
class RelationExtractor(Pipe):
def __init__(self, vocab, model, name="rel"):
"""Create a component instance."""
self.model = model
self.vocab = vocab
self.name = name
def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None):
"""Learn from a batch of Example objects."""
...
def predict(self, docs):
"""Apply the model to a batch of Doc objects."""
...
def set_annotations(self, docs, predictions):
"""Modify a batch of Doc objects using the predictions."""
...
def initialize(self, get_examples, nlp=None, labels=None):
"""Initialize the model before training."""
...
def add_label(self, label):
"""Add a label to the component."""
...
```
Before the model can be used, it needs to be
[initialized](/usage/training#initialization). This function receives a callback
to access the full **training data set**, or a representative sample. This data
set can be used to deduce all **relevant labels**. Alternatively, a list of
labels can be provided to `initialize`, or you can call the
`RelationExtractor.add_label` method directly. The number of labels defines the output
dimensionality of the network, and will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
layers of the neural network. This is triggered by calling
[`Model.initialize`](https://thinc.ai/docs/api-model#initialize).
```python
### The initialize method {highlight="12,18,22"}
from itertools import islice
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Language = None,
labels: Optional[List[str]] = None,
):
if labels is not None:
for label in labels:
self.add_label(label)
else:
for example in get_examples():
relations = example.reference._.rel
for indices, label_dict in relations.items():
for label in label_dict.keys():
self.add_label(label)
subbatch = list(islice(get_examples(), 10))
doc_sample = [eg.reference for eg in subbatch]
label_sample = self._examples_to_truth(subbatch)
self.model.initialize(X=doc_sample, Y=label_sample)
```
The `initialize` method is triggered whenever this component is part of an `nlp`
pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked.
Typically, this happens when the pipeline is set up before training in
[`spacy train`](/api/cli#training). After initialization, the pipeline component
and its internal model can be trained and used to make predictions.
During training, the function [`update`](/api/pipe#update) is invoked which
delegates to
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
[`get_loss`](/api/pipe#get_loss) function that **calculate the loss** for a
batch of examples, as well as the **gradient** of loss that will be used to
update the weights of the model layers. Thinc provides several
[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
implementation of the `get_loss` function.
```python
### The update method {highlight="12-14"}
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
set_annotations: bool = False,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
...
docs = [ex.predicted for ex in examples]
predictions, backprop = self.model.begin_update(docs)
loss, gradient = self.get_loss(examples, predictions)
backprop(gradient)
losses[self.name] += loss
...
return losses
```
When the internal model is trained, the component can be used to make novel
**predictions**. The [`predict`](/api/pipe#predict) function needs to be
implemented for each subclass of `Pipe`. In our case, we can simply delegate to
the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:
```python
### The predict method
def predict(self, docs: Iterable[Doc]) -> Floats2d:
predictions = self.model.predict(docs)
return self.model.ops.asarray(predictions)
```
The final method that needs to be implemented is
[`set_annotations`](/api/pipe#set_annotations). This function takes the
predictions, and modifies the given `Doc` object in place to store them. For our
relation extraction component, we store the data as a dictionary in a custom
[extension attribute](/usage/processing-pipelines#custom-components-attributes)
`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of
each entity**, as this defines an entity pair uniquely within one document.
To interpret the scores predicted by the relation extraction model correctly, we
need to refer to the model's `get_candidates` function that defined which pairs
of entities were relevant candidates, so that the predictions can be linked to
those exact entities:
> #### Example output
>
> ```python
> doc = nlp("Amsterdam is the capital of the Netherlands.")
> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
> for value, rel_dict in doc._.rel.items():
> print(f"{value}: {rel_dict}")
>
> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
> ```
```python
### Registering the extension attribute
from spacy.tokens import Doc
Doc.set_extension("rel", default={})
```
```python
### The set_annotations method {highlight="5-6,10"}
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
c = 0
get_candidates = self.model.attrs["get_candidates"]
for doc in docs:
for (e1, e2) in get_candidates(doc):
offset = (e1.start, e2.start)
if offset not in doc._.rel:
doc._.rel[offset] = {}
for j, label in enumerate(self.labels):
doc._.rel[offset][label] = predictions[c, j]
c += 1
```
Under the hood, when the pipe is applied to a document, it delegates to the
`predict` and `set_annotations` methods:
```python
### The __call__ method
def __call__(self, doc: Doc):
predictions = self.predict([doc])
self.set_annotations([doc], predictions)
return doc
```
Once our `Pipe` subclass is fully implemented, we can
[register](/usage/processing-pipelines#custom-components-factories) the
component with the [`@Language.factory`](/api/language#factory) decorator. This
assigns it a name and lets you create the component with
[`nlp.add_pipe`](/api/language#add_pipe) and via the
[config](/usage/training#config).
> #### config.cfg (excerpt)
>
> ```ini
> [components.relation_extractor]
> factory = "relation_extractor"
>
> [components.relation_extractor.model]
> @architectures = "rel_model.v1"
>
> [components.relation_extractor.model.tok2vec]
> # ...
>
> [components.relation_extractor.model.get_candidates]
> @misc = "rel_cand_generator.v1"
> max_length = 20
> ```
```python
### Registering the pipeline component
from spacy.language import Language
@Language.factory("relation_extractor")
def make_relation_extractor(nlp, name, model):
return RelationExtractor(nlp.vocab, model, name)
```
<!-- TODO: <Project id="tutorials/ner-relations">
</Project> -->

View File

@ -1176,7 +1176,7 @@ plug fully custom machine learning components into your pipeline. You'll need
the following: the following:
1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
can be a model using implemented in can be a model implemented in
[Thinc](/usage/layers-architectures#thinc), or a [Thinc](/usage/layers-architectures#thinc), or a
[wrapped model](/usage/layers-architectures#frameworks) implemented in [wrapped model](/usage/layers-architectures#frameworks) implemented in
PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a