Set annotations in update (#6767)

* bump to 3.0.0rc4

* do set_annotations in component update calls

* update docs and remove set_annotations flag

* fix EntityLinker (EL) test
This commit is contained in:
Sofie Van Landeghem 2021-01-20 01:49:25 +01:00 committed by GitHub
parent 57640aa838
commit e680efc7cc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 57 additions and 77 deletions

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0rc3"
__version__ = "3.0.0rc4"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -193,18 +193,16 @@ class EntityLinker(TrainablePipe):
self,
examples: Iterable[Example],
*,
set_annotations: bool = False,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
updating the pipe's model. Delegates to predict, get_loss and
set_annotations.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
set_annotations (bool): Whether or not to update the Example objects
with the predictions.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
@ -220,11 +218,13 @@ class EntityLinker(TrainablePipe):
return losses
validate_examples(examples, "EntityLinker.update")
sentence_docs = []
docs = [eg.predicted for eg in examples]
if set_annotations:
# This seems simpler than other ways to get that exact output -- but
# it does run the model twice :(
predictions = self.model.predict(docs)
docs = []
for eg in examples:
eg.predicted.ents = eg.reference.ents
docs.append(eg.predicted)
# This seems simpler than other ways to get that exact output -- but
# it does run the model twice :(
predictions = self.predict(docs)
for eg in examples:
sentences = [s for s in eg.reference.sents]
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
@ -260,8 +260,7 @@ class EntityLinker(TrainablePipe):
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
if set_annotations:
self.set_annotations(docs, predictions)
self.set_annotations(docs, predictions)
return losses
def get_loss(self, examples: Iterable[Example], sentence_encodings):

View File

@ -199,7 +199,7 @@ class ClozeMultitask(TrainablePipe):
loss = self.distance.get_loss(prediction, target)
return loss, gradient
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
def update(self, examples, *, drop=0., sgd=None, losses=None):
pass
def rehearse(self, examples, drop=0., sgd=None, losses=None):

View File

@ -173,14 +173,13 @@ class Tagger(TrainablePipe):
if doc.c[j].tag == 0:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
def update(self, examples, *, drop=0., sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
updating the pipe's model. Delegates to predict, get_loss and
set_annotations.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
set_annotations (bool): Whether or not to update the Example objects
with the predictions.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
@ -206,9 +205,8 @@ class Tagger(TrainablePipe):
self.finish_update(sgd)
losses[self.name] += loss
if set_annotations:
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, self._scores2guesses(tag_scores))
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, self._scores2guesses(tag_scores))
return losses
def rehearse(self, examples, *, drop=0., sgd=None, losses=None):

View File

@ -195,17 +195,15 @@ class TextCategorizer(TrainablePipe):
examples: Iterable[Example],
*,
drop: float = 0.0,
set_annotations: bool = False,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
updating the pipe's model. Delegates to predict, get_loss and
set_annotations.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
set_annotations (bool): Whether or not to update the Example objects
with the predictions.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
@ -228,9 +226,8 @@ class TextCategorizer(TrainablePipe):
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
if set_annotations:
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, scores=scores)
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, scores=scores)
return losses
def rehearse(

View File

@ -163,15 +163,12 @@ class Tok2Vec(TrainablePipe):
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
set_annotations: bool = False,
):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
set_annotations (bool): Whether or not to update the Example objects
with the predictions.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
@ -210,8 +207,7 @@ class Tok2Vec(TrainablePipe):
listener.receive(batch_id, tokvecs, accumulate_gradient)
if self.listeners:
self.listeners[-1].receive(batch_id, tokvecs, backprop)
if set_annotations:
self.set_annotations(docs, tokvecs)
self.set_annotations(docs, tokvecs)
return losses
def get_loss(self, examples, scores) -> None:

View File

@ -91,16 +91,14 @@ cdef class TrainablePipe(Pipe):
def update(self,
examples: Iterable["Example"],
*, drop: float=0.0,
set_annotations: bool=False,
sgd: Optimizer=None,
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
updating the pipe's model. Delegates to predict, get_loss and
set_annotations.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
set_annotations (bool): Whether or not to update the Example objects
with the predictions.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
@ -124,9 +122,8 @@ cdef class TrainablePipe(Pipe):
if sgd not in (None, False):
self.finish_update(sgd)
losses[self.name] += loss
if set_annotations:
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, scores=scores)
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, scores=scores)
return losses
def rehearse(self,

View File

@ -308,7 +308,7 @@ cdef class Parser(TrainablePipe):
action.do(states[i], action.label)
free(is_valid)
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
def update(self, examples, *, drop=0., sgd=None, losses=None):
cdef StateClass state
if losses is None:
losses = {}
@ -328,7 +328,6 @@ cdef class Parser(TrainablePipe):
return self.update_beam(
examples,
beam_width=self.cfg["beam_width"],
set_annotations=set_annotations,
sgd=sgd,
losses=losses,
beam_density=self.cfg["beam_density"]
@ -370,9 +369,8 @@ cdef class Parser(TrainablePipe):
backprop_tok2vec(golds)
if sgd not in (None, False):
self.finish_update(sgd)
if set_annotations:
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, all_states)
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, all_states)
# Ugh, this is annoying. If we're working on GPU, we want to free the
# memory ASAP. It seems that Python doesn't necessarily get around to
# removing these in time if we don't explicitly delete? It's confusing.
@ -432,7 +430,7 @@ cdef class Parser(TrainablePipe):
return losses
def update_beam(self, examples, *, beam_width,
drop=0., sgd=None, losses=None, set_annotations=False, beam_density=0.0):
drop=0., sgd=None, losses=None, beam_density=0.0):
states, golds, _ = self.moves.init_gold_batch(examples)
if not states:
return losses

View File

@ -425,6 +425,7 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English()
nlp.add_pipe("sentencizer", first=True)
vector_length = 3
assert "Q2146908" not in nlp.vocab.strings
@ -464,9 +465,6 @@ def test_overfitting_IO():
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["entity_linker"] < 0.001
# adding additional components that are required for the entity_linker
nlp.add_pipe("sentencizer", first=True)
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}

View File

@ -220,8 +220,9 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
## DependencyParser.update {#update tag="method"}
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/dependencyparser#predict) and
[`get_loss`](/api/dependencyparser#get_loss).
model. Delegates to [`predict`](/api/dependencyparser#predict),
[`get_loss`](/api/dependencyparser#get_loss) and
[`set_annotations`](/api/dependencyparser#set_annotations).
> #### Example
>
@ -236,7 +237,6 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -237,7 +237,8 @@ entities.
Learn from a batch of [`Example`](/api/example) objects, updating both the
pipe's entity linking model and context encoder. Delegates to
[`predict`](/api/entitylinker#predict).
[`predict`](/api/entitylinker#predict) and
[`set_annotations`](/api/entitylinker#set_annotations).
> #### Example
>
@ -252,7 +253,6 @@ pipe's entity linking model and context encoder. Delegates to
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -209,8 +209,9 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
## EntityRecognizer.update {#update tag="method"}
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/entityrecognizer#predict) and
[`get_loss`](/api/entityrecognizer#get_loss).
model. Delegates to [`predict`](/api/entityrecognizer#predict),
[`get_loss`](/api/entityrecognizer#get_loss) and
[`set_annotations`](/api/entityrecognizer#set_annotations).
> #### Example
>
@ -225,7 +226,6 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -189,8 +189,9 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/morphologizer#predict) and
[`get_loss`](/api/morphologizer#get_loss).
Delegates to [`predict`](/api/morphologizer#predict),
[`get_loss`](/api/morphologizer#get_loss) and
[`set_annotations`](/api/morphologizer#set_annotations).
> #### Example
>
@ -205,7 +206,6 @@ Delegates to [`predict`](/api/morphologizer#predict) and
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -199,8 +199,9 @@ Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/multilabel_textcategorizer#predict) and
[`get_loss`](/api/multilabel_textcategorizer#get_loss).
Delegates to [`predict`](/api/multilabel_textcategorizer#predict),
[`get_loss`](/api/multilabel_textcategorizer#get_loss) and
[`set_annotations`](/api/multilabel_textcategorizer#set_annotations).
> #### Example
>
@ -215,7 +216,6 @@ Delegates to [`predict`](/api/multilabel_textcategorizer#predict) and
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -195,7 +195,6 @@ predictions and gold-standard annotations, and update the component's model.
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -176,8 +176,9 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/sentencerecognizer#predict) and
[`get_loss`](/api/sentencerecognizer#get_loss).
Delegates to [`predict`](/api/sentencerecognizer#predict),
[`get_loss`](/api/sentencerecognizer#get_loss) and
[`set_annotations`](/api/sentencerecognizer#set_annotations).
> #### Example
>
@ -192,7 +193,6 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -187,8 +187,9 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/tagger#predict) and
[`get_loss`](/api/tagger#get_loss).
Delegates to [`predict`](/api/tagger#predict),
[`get_loss`](/api/tagger#get_loss) and
[`set_annotations`](/api/tagger#set_annotations).
> #### Example
>
@ -203,7 +204,6 @@ Delegates to [`predict`](/api/tagger#predict) and
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -201,8 +201,9 @@ Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/textcategorizer#predict) and
[`get_loss`](/api/textcategorizer#get_loss).
Delegates to [`predict`](/api/textcategorizer#predict),
[`get_loss`](/api/textcategorizer#get_loss) and
[`set_annotations`](/api/textcategorizer#set_annotations).
> #### Example
>
@ -217,7 +218,6 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -186,7 +186,8 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/tok2vec#predict).
Delegates to [`predict`](/api/tok2vec#predict) and
[`set_annotations`](/api/tok2vec#set_annotations).
> #### Example
>
@ -201,7 +202,6 @@ Delegates to [`predict`](/api/tok2vec#predict).
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -249,7 +249,6 @@ and call the optimizer, while the others simply increment the gradients.
| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

View File

@ -796,7 +796,7 @@ class RelationExtractor(TrainablePipe):
self.vocab = vocab
self.name = name
def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None):
def update(self, examples, drop=0.0, sgd=None, losses=None):
"""Learn from a batch of Example objects."""
...
@ -901,7 +901,6 @@ def update(
examples: Iterable[Example],
*,
drop: float = 0.0,
set_annotations: bool = False,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]: