mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-04 06:16:33 +03:00
Support store_activations in spancat and morphologizer
This commit is contained in:
parent
789a44755e
commit
acf47e8e48
|
@ -1,5 +1,5 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Optional, Union, Dict, Callable
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
import srsly
|
||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||
from itertools import islice
|
||||
|
@ -52,7 +52,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"morphologizer",
|
||||
assigns=["token.morph", "token.pos"],
|
||||
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
|
||||
default_config={
|
||||
"model": DEFAULT_MORPH_MODEL,
|
||||
"overwrite": True,
|
||||
"extend": False,
|
||||
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
|
||||
"store_activations": False
|
||||
},
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||
)
|
||||
def make_morphologizer(
|
||||
|
@ -62,8 +68,10 @@ def make_morphologizer(
|
|||
overwrite: bool,
|
||||
extend: bool,
|
||||
scorer: Optional[Callable],
|
||||
store_activations: Union[bool, List[str]],
|
||||
):
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
|
||||
store_activations=store_activations)
|
||||
|
||||
|
||||
def morphologizer_score(examples, **kwargs):
|
||||
|
@ -95,6 +103,7 @@ class Morphologizer(Tagger):
|
|||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
extend: bool = BACKWARD_EXTEND,
|
||||
scorer: Optional[Callable] = morphologizer_score,
|
||||
store_activations=False,
|
||||
):
|
||||
"""Initialize a morphologizer.
|
||||
|
||||
|
@ -124,6 +133,7 @@ class Morphologizer(Tagger):
|
|||
}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
self.store_activations = store_activations
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -234,6 +244,9 @@ class Morphologizer(Tagger):
|
|||
cdef bint extend = self.cfg["extend"]
|
||||
labels = self.labels
|
||||
for i, doc in enumerate(docs):
|
||||
doc.activations[self.name] = {}
|
||||
for activation in self.store_activations:
|
||||
doc.activations[self.name][activation] = activations[activation][i]
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
|
||||
from typing import Union
|
||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||
from thinc.api import Optimizer
|
||||
from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
|
||||
|
@ -106,6 +107,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
|
|||
"model": DEFAULT_SPANCAT_MODEL,
|
||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||
"store_activations": False,
|
||||
},
|
||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||
)
|
||||
|
@ -118,6 +120,7 @@ def make_spancat(
|
|||
scorer: Optional[Callable],
|
||||
threshold: float,
|
||||
max_positive: Optional[int],
|
||||
store_activations: Union[bool, List[str]],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
|
@ -148,6 +151,7 @@ def make_spancat(
|
|||
max_positive=max_positive,
|
||||
name=name,
|
||||
scorer=scorer,
|
||||
store_activations=store_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -186,6 +190,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
threshold: float = 0.5,
|
||||
max_positive: Optional[int] = None,
|
||||
scorer: Optional[Callable] = spancat_score,
|
||||
store_activations=False,
|
||||
) -> None:
|
||||
"""Initialize the span categorizer.
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
|
@ -218,6 +223,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self.scorer = scorer
|
||||
self.store_activations = store_activations
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
|
@ -270,7 +276,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
indices = self.suggester(docs, ops=self.model.ops)
|
||||
scores = self.model.predict((docs, indices)) # type: ignore
|
||||
return indices, scores
|
||||
return {"indices": indices, "scores": scores}
|
||||
|
||||
def set_candidates(
|
||||
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
|
||||
|
@ -290,7 +296,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
for index in candidates.dataXd:
|
||||
doc.spans[candidates_key].append(doc[index[0] : index[1]])
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
|
||||
def set_annotations(self, docs: Iterable[Doc], activations) -> None:
|
||||
"""Modify a batch of Doc objects, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
|
@ -299,10 +305,19 @@ class SpanCategorizer(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
||||
"""
|
||||
labels = self.labels
|
||||
indices, scores = indices_scores
|
||||
|
||||
indices = activations["indices"]
|
||||
scores = activations["scores"]
|
||||
offset = 0
|
||||
for i, doc in enumerate(docs):
|
||||
indices_i = indices[i].dataXd
|
||||
doc.activations[self.name] = {}
|
||||
if "indices" in self.store_activations:
|
||||
doc.activations[self.name]["indices"] = indices_i
|
||||
if "scores" in self.store_activations:
|
||||
doc.activations[self.name]["scores"] = scores[
|
||||
offset : offset + indices.lengths[i]
|
||||
]
|
||||
doc.spans[self.key] = self._make_span_group(
|
||||
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
|
||||
)
|
||||
|
@ -474,3 +489,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
spans.append(Span(doc, start, end, label=labels[j]))
|
||||
|
||||
return spans
|
||||
|
||||
@property
def activations(self) -> List[str]:
    """Return the names of the activations this component can store.

    RETURNS (List[str]): The activation names, "indices" and "scores".
    """
    return ["indices", "scores"]
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from typing import cast
|
||||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
|
||||
|
@ -7,6 +8,7 @@ from spacy.lang.en import English
|
|||
from spacy.language import Language
|
||||
from spacy.tests.util import make_tempdir
|
||||
from spacy.morphology import Morphology
|
||||
from spacy.pipeline import TrainablePipe
|
||||
from spacy.attrs import MORPH
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
@ -197,3 +199,29 @@ def test_overfitting_IO():
|
|||
gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
|
||||
assert [str(t.morph) for t in doc] == gold_morphs
|
||||
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||
|
||||
|
||||
def test_store_activations():
    """Check which morphologizer activations end up in ``doc.activations``.

    Exercises three settings of ``store_activations``: the value the
    pipeline is created with (nothing stored), ``True`` (both "guesses" and
    "probs" stored) and an explicit list of names (only those stored).
    """
    nlp = English()
    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
    train_examples = [
        Example.from_dict(nlp.make_doc(inst[0]), inst[1]) for inst in TRAIN_DATA
    ]
    nlp.initialize(get_examples=lambda: train_examples)

    # Before store_activations is switched on, the component's entry exists
    # but holds no activations.
    doc = nlp("This is a test.")
    assert len(doc.activations["morphologizer"]) == 0

    # True stores every activation the component exposes.
    morphologizer.store_activations = True
    doc = nlp("This is a test.")
    assert "morphologizer" in doc.activations
    assert set(doc.activations["morphologizer"].keys()) == {"guesses", "probs"}
    # Presumably (n_tokens, n_labels) and (n_tokens,) -- TODO confirm against
    # the tokenization of the sentence and the labels in TRAIN_DATA.
    assert doc.activations["morphologizer"]["probs"].shape == (5, 6)
    assert doc.activations["morphologizer"]["guesses"].shape == (5,)

    # An explicit list restricts storage to the named activations.
    morphologizer.store_activations = ["probs"]
    doc = nlp("This is a test.")
    assert "morphologizer" in doc.activations
    assert set(doc.activations["morphologizer"].keys()) == {"probs"}
    assert doc.activations["morphologizer"]["probs"].shape == (5, 6)
|
||||
|
|
|
@ -419,3 +419,29 @@ def test_set_candidates():
|
|||
assert len(docs[0].spans["candidates"]) == 9
|
||||
assert docs[0].spans["candidates"][0].text == "Just"
|
||||
assert docs[0].spans["candidates"][4].text == "Just a"
|
||||
|
||||
|
||||
def test_store_activations():
    """Check which spancat activations end up in ``doc.activations``.

    Exercises three settings of ``store_activations``: the value the
    pipeline is created with (nothing stored), ``True`` (both "indices" and
    "scores" stored) and an explicit list of names (only those stored).
    """
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    # The optimizer returned by initialize() is not needed: nothing is trained.
    nlp.initialize(get_examples=lambda: train_examples)
    nO = spancat.model.get_dim("nO")
    assert nO == 2
    assert set(spancat.labels) == {"LOC", "PERSON"}

    # Before store_activations is switched on, the component's entry exists
    # but holds no activations.
    doc = nlp("This is a test.")
    assert len(doc.activations["spancat"]) == 0

    # True stores every activation the component exposes.
    spancat.store_activations = True
    doc = nlp("This is a test.")
    assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
    # Presumably 12 candidate spans (1-, 2- and 3-grams over 5 tokens), each
    # with a (start, end) pair and one score per label -- TODO confirm.
    assert doc.activations["spancat"]["indices"].shape == (12, 2)
    assert doc.activations["spancat"]["scores"].shape == (12, nO)

    # An explicit list restricts storage to the named activations.
    spancat.store_activations = ["scores"]
    doc = nlp("This is a test.")
    assert set(doc.activations["spancat"].keys()) == {"scores"}
    assert doc.activations["spancat"]["scores"].shape == (12, nO)
|
||||
|
|
Loading…
Reference in New Issue
Block a user