mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-04 06:16:33 +03:00
Support store_activations in spancat and morphologizer
This commit is contained in:
parent
789a44755e
commit
acf47e8e48
|
@ -1,5 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from typing import Optional, Union, Dict, Callable
|
from typing import Callable, Dict, List, Optional, Union
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
@ -52,7 +52,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"morphologizer",
|
"morphologizer",
|
||||||
assigns=["token.morph", "token.pos"],
|
assigns=["token.morph", "token.pos"],
|
||||||
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
|
default_config={
|
||||||
|
"model": DEFAULT_MORPH_MODEL,
|
||||||
|
"overwrite": True,
|
||||||
|
"extend": False,
|
||||||
|
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
|
||||||
|
"store_activations": False
|
||||||
|
},
|
||||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||||
)
|
)
|
||||||
def make_morphologizer(
|
def make_morphologizer(
|
||||||
|
@ -62,8 +68,10 @@ def make_morphologizer(
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
extend: bool,
|
extend: bool,
|
||||||
scorer: Optional[Callable],
|
scorer: Optional[Callable],
|
||||||
|
store_activations: Union[bool, List[str]],
|
||||||
):
|
):
|
||||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
|
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
|
||||||
|
store_activations=store_activations)
|
||||||
|
|
||||||
|
|
||||||
def morphologizer_score(examples, **kwargs):
|
def morphologizer_score(examples, **kwargs):
|
||||||
|
@ -95,6 +103,7 @@ class Morphologizer(Tagger):
|
||||||
overwrite: bool = BACKWARD_OVERWRITE,
|
overwrite: bool = BACKWARD_OVERWRITE,
|
||||||
extend: bool = BACKWARD_EXTEND,
|
extend: bool = BACKWARD_EXTEND,
|
||||||
scorer: Optional[Callable] = morphologizer_score,
|
scorer: Optional[Callable] = morphologizer_score,
|
||||||
|
store_activations=False,
|
||||||
):
|
):
|
||||||
"""Initialize a morphologizer.
|
"""Initialize a morphologizer.
|
||||||
|
|
||||||
|
@ -124,6 +133,7 @@ class Morphologizer(Tagger):
|
||||||
}
|
}
|
||||||
self.cfg = dict(sorted(cfg.items()))
|
self.cfg = dict(sorted(cfg.items()))
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
|
self.store_activations = store_activations
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -234,6 +244,9 @@ class Morphologizer(Tagger):
|
||||||
cdef bint extend = self.cfg["extend"]
|
cdef bint extend = self.cfg["extend"]
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
|
doc.activations[self.name] = {}
|
||||||
|
for activation in self.store_activations:
|
||||||
|
doc.activations[self.name][activation] = activations[activation][i]
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
if hasattr(doc_tag_ids, "get"):
|
if hasattr(doc_tag_ids, "get"):
|
||||||
doc_tag_ids = doc_tag_ids.get()
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
|
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
|
||||||
|
from typing import Union
|
||||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||||
from thinc.api import Optimizer
|
from thinc.api import Optimizer
|
||||||
from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
|
from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
|
||||||
|
@ -106,6 +107,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
|
||||||
"model": DEFAULT_SPANCAT_MODEL,
|
"model": DEFAULT_SPANCAT_MODEL,
|
||||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||||
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||||
|
"store_activations": False,
|
||||||
},
|
},
|
||||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||||
)
|
)
|
||||||
|
@ -118,6 +120,7 @@ def make_spancat(
|
||||||
scorer: Optional[Callable],
|
scorer: Optional[Callable],
|
||||||
threshold: float,
|
threshold: float,
|
||||||
max_positive: Optional[int],
|
max_positive: Optional[int],
|
||||||
|
store_activations: Union[bool, List[str]],
|
||||||
) -> "SpanCategorizer":
|
) -> "SpanCategorizer":
|
||||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
"""Create a SpanCategorizer component. The span categorizer consists of two
|
||||||
parts: a suggester function that proposes candidate spans, and a labeller
|
parts: a suggester function that proposes candidate spans, and a labeller
|
||||||
|
@ -148,6 +151,7 @@ def make_spancat(
|
||||||
max_positive=max_positive,
|
max_positive=max_positive,
|
||||||
name=name,
|
name=name,
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
|
store_activations=store_activations,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -186,6 +190,7 @@ class SpanCategorizer(TrainablePipe):
|
||||||
threshold: float = 0.5,
|
threshold: float = 0.5,
|
||||||
max_positive: Optional[int] = None,
|
max_positive: Optional[int] = None,
|
||||||
scorer: Optional[Callable] = spancat_score,
|
scorer: Optional[Callable] = spancat_score,
|
||||||
|
store_activations=False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the span categorizer.
|
"""Initialize the span categorizer.
|
||||||
vocab (Vocab): The shared vocabulary.
|
vocab (Vocab): The shared vocabulary.
|
||||||
|
@ -218,6 +223,7 @@ class SpanCategorizer(TrainablePipe):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
|
self.store_activations = store_activations
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def key(self) -> str:
|
def key(self) -> str:
|
||||||
|
@ -270,7 +276,7 @@ class SpanCategorizer(TrainablePipe):
|
||||||
"""
|
"""
|
||||||
indices = self.suggester(docs, ops=self.model.ops)
|
indices = self.suggester(docs, ops=self.model.ops)
|
||||||
scores = self.model.predict((docs, indices)) # type: ignore
|
scores = self.model.predict((docs, indices)) # type: ignore
|
||||||
return indices, scores
|
return {"indices": indices, "scores": scores}
|
||||||
|
|
||||||
def set_candidates(
|
def set_candidates(
|
||||||
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
|
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
|
||||||
|
@ -290,7 +296,7 @@ class SpanCategorizer(TrainablePipe):
|
||||||
for index in candidates.dataXd:
|
for index in candidates.dataXd:
|
||||||
doc.spans[candidates_key].append(doc[index[0] : index[1]])
|
doc.spans[candidates_key].append(doc[index[0] : index[1]])
|
||||||
|
|
||||||
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
|
def set_annotations(self, docs: Iterable[Doc], activations) -> None:
|
||||||
"""Modify a batch of Doc objects, using pre-computed scores.
|
"""Modify a batch of Doc objects, using pre-computed scores.
|
||||||
|
|
||||||
docs (Iterable[Doc]): The documents to modify.
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
|
@ -299,10 +305,19 @@ class SpanCategorizer(TrainablePipe):
|
||||||
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
||||||
"""
|
"""
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
indices, scores = indices_scores
|
|
||||||
|
indices = activations["indices"]
|
||||||
|
scores = activations["scores"]
|
||||||
offset = 0
|
offset = 0
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
indices_i = indices[i].dataXd
|
indices_i = indices[i].dataXd
|
||||||
|
doc.activations[self.name] = {}
|
||||||
|
if "indices" in self.store_activations:
|
||||||
|
doc.activations[self.name]["indices"] = indices_i
|
||||||
|
if "scores" in self.store_activations:
|
||||||
|
doc.activations[self.name]["scores"] = scores[
|
||||||
|
offset : offset + indices.lengths[i]
|
||||||
|
]
|
||||||
doc.spans[self.key] = self._make_span_group(
|
doc.spans[self.key] = self._make_span_group(
|
||||||
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
|
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
|
||||||
)
|
)
|
||||||
|
@ -474,3 +489,7 @@ class SpanCategorizer(TrainablePipe):
|
||||||
spans.append(Span(doc, start, end, label=labels[j]))
|
spans.append(Span(doc, start, end, label=labels[j]))
|
||||||
|
|
||||||
return spans
|
return spans
|
||||||
|
|
||||||
|
@property
def activations(self):
    """Names of the activations this component produces.

    These are the keys that set_annotations may record under
    doc.activations[self.name].
    """
    activation_names = ["indices", "scores"]
    return activation_names
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from typing import cast
|
||||||
import pytest
|
import pytest
|
||||||
from numpy.testing import assert_equal
|
from numpy.testing import assert_equal
|
||||||
|
|
||||||
|
@ -7,6 +8,7 @@ from spacy.lang.en import English
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.tests.util import make_tempdir
|
from spacy.tests.util import make_tempdir
|
||||||
from spacy.morphology import Morphology
|
from spacy.morphology import Morphology
|
||||||
|
from spacy.pipeline import TrainablePipe
|
||||||
from spacy.attrs import MORPH
|
from spacy.attrs import MORPH
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
@ -197,3 +199,29 @@ def test_overfitting_IO():
|
||||||
gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
|
gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
|
||||||
assert [str(t.morph) for t in doc] == gold_morphs
|
assert [str(t.morph) for t in doc] == gold_morphs
|
||||||
assert [t.pos_ for t in doc] == gold_pos_tags
|
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||||
|
|
||||||
|
|
||||||
|
def test_store_activations():
    """Check which morphologizer activations end up in doc.activations.

    The pipeline is only initialized, not trained: the test inspects the
    keys and shapes of the stored activations, not the predictions.
    """
    nlp = English()
    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
    train_examples = [
        Example.from_dict(nlp.make_doc(inst[0]), inst[1]) for inst in TRAIN_DATA
    ]
    nlp.initialize(get_examples=lambda: train_examples)

    # With the component's initial setting, an entry for the component is
    # created but no activations are stored in it.
    doc = nlp("This is a test.")
    assert len(doc.activations["morphologizer"]) == 0

    # True is expected to store every activation the component offers.
    morphologizer.store_activations = True
    doc = nlp("This is a test.")
    assert "morphologizer" in doc.activations
    assert set(doc.activations["morphologizer"]) == {"guesses", "probs"}
    # NOTE(review): the leading 5 presumably is the token count of the
    # input and 6 the number of labels from TRAIN_DATA - confirm.
    assert doc.activations["morphologizer"]["probs"].shape == (5, 6)
    assert doc.activations["morphologizer"]["guesses"].shape == (5,)

    # An explicit list restricts storage to the named activations.
    morphologizer.store_activations = ["probs"]
    doc = nlp("This is a test.")
    assert "morphologizer" in doc.activations
    assert set(doc.activations["morphologizer"]) == {"probs"}
    assert doc.activations["morphologizer"]["probs"].shape == (5, 6)
|
||||||
|
|
|
@ -419,3 +419,29 @@ def test_set_candidates():
|
||||||
assert len(docs[0].spans["candidates"]) == 9
|
assert len(docs[0].spans["candidates"]) == 9
|
||||||
assert docs[0].spans["candidates"][0].text == "Just"
|
assert docs[0].spans["candidates"][0].text == "Just"
|
||||||
assert docs[0].spans["candidates"][4].text == "Just a"
|
assert docs[0].spans["candidates"][4].text == "Just a"
|
||||||
|
|
||||||
|
|
||||||
|
def test_store_activations():
    """Check which spancat activations end up in doc.activations.

    The pipeline is only initialized, not trained: the test inspects the
    keys and shapes of the stored activations, not the predictions.
    """
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    nlp.initialize(get_examples=lambda: train_examples)
    nO = spancat.model.get_dim("nO")
    assert nO == 2
    assert set(spancat.labels) == {"LOC", "PERSON"}

    # With the component's initial setting, an entry for the component is
    # created but no activations are stored in it.
    doc = nlp("This is a test.")
    assert len(doc.activations["spancat"]) == 0

    # True is expected to store every activation the component offers.
    spancat.store_activations = True
    doc = nlp("This is a test.")
    assert set(doc.activations["spancat"]) == {"indices", "scores"}
    # NOTE(review): 12 presumably is the number of candidate spans the
    # suggester proposes for this input (5 + 4 + 3 n-grams of sizes 1-3
    # over 5 tokens) - confirm against the suggester configuration.
    assert doc.activations["spancat"]["indices"].shape == (12, 2)
    assert doc.activations["spancat"]["scores"].shape == (12, nO)

    # An explicit list restricts storage to the named activations.
    spancat.store_activations = ["scores"]
    doc = nlp("This is a test.")
    assert set(doc.activations["spancat"]) == {"scores"}
    assert doc.activations["spancat"]["scores"].shape == (12, nO)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user