mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Remove simple_ner code (#6041)
* remove simple_ner code * remove unused _biluo and _iob files
This commit is contained in:
parent
24053d83ec
commit
cb66ea7400
|
@ -31,10 +31,13 @@ lang = "en"
|
||||||
vectors = null
|
vectors = null
|
||||||
|
|
||||||
[nlp.pipeline.ner]
|
[nlp.pipeline.ner]
|
||||||
factory = "simple_ner"
|
factory = "ner"
|
||||||
|
|
||||||
[nlp.pipeline.ner.model]
|
[nlp.pipeline.ner.model]
|
||||||
@architectures = "spacy.BiluoTagger.v1"
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 6
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
|
||||||
[nlp.pipeline.ner.model.tok2vec]
|
[nlp.pipeline.ner.model.tok2vec]
|
||||||
@architectures = "spacy.HashEmbedCNN.v1"
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
|
|
@ -1,105 +0,0 @@
|
||||||
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
|
|
||||||
from typing import Dict, Optional
|
|
||||||
import numpy
|
|
||||||
from thinc.api import Model
|
|
||||||
from thinc.types import Padded, Floats3d
|
|
||||||
|
|
||||||
|
|
||||||
def BILUO() -> Model[Padded, Padded]:
    """Create the layer that applies BILUO transition constraints to scores.

    The returned thinc model is named "biluo", uses this module's `forward`
    and `init` functions, declares an initially unset output dimension `nO`
    and exposes `get_num_actions` through its attrs.
    """
    layer_dims = {"nO": None}
    layer_attrs = {"get_num_actions": get_num_actions}
    layer = Model("biluo", forward, init=init, dims=layer_dims, attrs=layer_attrs)
    return layer
|
|
||||||
|
|
||||||
|
|
||||||
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
    """Infer the layer's `nO` dimension (number of actions) from sample data.

    model: The BILUO layer being initialized.
    X (Optional[Padded]): Sample input scores; the size of the third axis of
        `X.data` is used as `nO`.
    Y (Optional[Padded]): Sample output; used for `nO` when `X` is not given.

    RAISES (ValueError): If `X` and `Y` are both given with different shapes,
        or if neither is given and `nO` has not been set yet.
    """
    if X is not None and Y is not None and X.data.shape != Y.data.shape:
        raise ValueError(
            f"Mismatched shapes for BILUO: X has shape {X.data.shape}, "
            f"Y has shape {Y.data.shape}"
        )
    # The layer does not change the width, so either sample can provide nO.
    sample = X if X is not None else Y
    if sample is not None:
        model.set_dim("nO", sample.data.shape[2])
    elif not model.has_dim("nO"):
        # Ask has_dim rather than get_dim so that an unset dimension reaches
        # this explicit error message.
        raise ValueError("Dimension unset for BILUO: nO")
|
|
||||||
|
|
||||||
|
|
||||||
def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
    """Greedily decode BILUO actions, masking out invalid transitions.

    model: The BILUO layer; its `nO` dimension gives the number of actions.
    Xp (Padded): Action scores; `Xp.data` is unpacked as
        (n_tokens, n_docs, n_actions).
    is_train (bool): Unused; the same computation runs in both modes.

    RETURNS: A tuple of the masked scores (as a `Padded` sharing the size,
        length and index information of `Xp`) and a backprop callback that
        multiplies the incoming gradient by the validity masks.
    """
    n_labels = (model.get_dim("nO") - 1) // 4
    _, n_docs, n_actions = Xp.data.shape
    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
    # to indicate which actions are valid next for each sequence. The mask is
    # looked up in a validity table of shape (2, n_actions, n_actions): the
    # first axis says whether this is the last token, the second is the
    # previous action, the third is the candidate next action.
    valid_transitions = model.ops.asarray(_get_transition_table(n_labels))
    prev_actions = model.ops.alloc1i(n_docs)
    # Initialize as though prev action was O
    prev_actions.fill(n_actions - 1)
    Y = model.ops.alloc3f(*Xp.data.shape)
    masks = model.ops.alloc3f(*Y.shape)
    # The penalty has to exceed the spread of the scores whatever their sign.
    # Scaling by the raw maximum flips the penalty's sign when every score is
    # negative and disables it when the maximum is zero, which lets the
    # argmax pick invalid actions.
    penalty = (abs(Xp.data).max() + 1) * 10
    for t in range(Xp.data.shape[0]):
        is_last = (Xp.lengths < (t + 2)).astype("i")
        masks[t] = valid_transitions[is_last, prev_actions]
        # Don't train the out-of-bounds sequences.
        masks[t, Xp.size_at_t[t] :] = 0
        # Valid actions keep their score, invalid ones get a large negative
        # offset.
        Y[t] = Xp.data[t] + ((masks[t] - 1) * penalty)
        prev_actions = Y[t].argmax(axis=-1)

    def backprop_biluo(dY: Padded) -> Padded:
        # Only valid actions receive a gradient.
        dY.data *= masks
        return dY

    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
|
|
||||||
|
|
||||||
|
|
||||||
def get_num_actions(n_labels: int) -> int:
    """Return the number of BILUO actions for `n_labels` entity labels."""
    # BEGIN, IN, LAST and UNIT each exist once per label; OUT exists once.
    per_label_kinds = 4
    return per_label_kinds * n_labels + 1
|
|
||||||
|
|
||||||
|
|
||||||
def _get_transition_table(
    n_labels: int, *, _cache: Dict[int, Floats3d] = {}
) -> Floats3d:
    """Build (or fetch from cache) the BILUO validity table.

    The table has shape (2, n_actions, n_actions) and is indexed as
    [is_last_token, previous_action, next_action]; a cell is 1 when the
    transition is allowed and 0 otherwise. Actions are laid out as all B
    actions, then all I, then all L, then all U, then O.

    The mutable default `_cache` is intentional: it memoizes tables by their
    number of actions across calls, and the cached array itself is returned.
    """
    n_actions = get_num_actions(n_labels)
    try:
        return _cache[n_actions]
    except KeyError:
        pass
    transitions = numpy.zeros((2, n_actions, n_actions), dtype="f")
    first_I = n_labels
    first_L = first_I + n_labels
    first_U = first_L + n_labels
    # Index arrays (rather than slices) pair each action with the action of
    # the same label, which is necessary to express that only actions of the
    # same label are valid continuations.
    begin = numpy.arange(0, first_I)
    inside = numpy.arange(first_I, first_L)
    last = numpy.arange(first_L, first_U)
    for open_prev in (begin, inside):
        # On the last token an open entity (previous B or I) can only be
        # closed with L of that label.
        transitions[1, open_prev, last] = 1
        # Before the last token it may continue with I or close with L.
        transitions[0, open_prev, inside] = 1
        transitions[0, open_prev, last] = 1
    # If this isn't the last token and the previous was L, U or O, B is valid
    transitions[0, first_L:, :first_I] = 1
    # Regardless of whether this is the last token, if the previous action was
    # {L, U, O}, U and O are valid.
    transitions[:, first_L:, first_U:] = 1
    _cache[n_actions] = transitions
    return transitions
|
|
|
@ -1,88 +0,0 @@
|
||||||
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
|
|
||||||
from typing import Dict, Optional
|
|
||||||
from thinc.api import Ops, Model
|
|
||||||
from thinc.types import Padded, Floats3d
|
|
||||||
|
|
||||||
|
|
||||||
def IOB() -> Model[Padded, Padded]:
    """Create the layer that applies IOB transition constraints to scores.

    The returned thinc model uses this module's `forward` and `init`
    functions, declares an initially unset output dimension `nO` and exposes
    `get_num_actions` through its attrs.
    """
    layer_dims = {"nO": None}
    layer_attrs = {"get_num_actions": get_num_actions}
    # NOTE(review): the layer is named "biluo" although it implements IOB;
    # this looks like a copy-paste leftover. It is kept unchanged here.
    layer = Model("biluo", forward, init=init, dims=layer_dims, attrs=layer_attrs)
    return layer
|
|
||||||
|
|
||||||
|
|
||||||
def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
    """Infer the layer's `nO` dimension (number of actions) from sample data.

    model (Model): The IOB layer being initialized.
    X (Optional[Padded]): Sample input scores; the size of the third axis of
        `X.data` is used as `nO`.
    Y (Optional[Padded]): Sample output; used for `nO` when `X` is not given.

    RAISES (ValueError): If `X` and `Y` are both given with different shapes,
        or if neither is given and `nO` has not been set yet.
    """
    if X is not None and Y is not None and X.data.shape != Y.data.shape:
        raise ValueError(
            f"Mismatched shapes for IOB: X has shape {X.data.shape}, "
            f"Y has shape {Y.data.shape}"
        )
    # The layer does not change the width, so either sample can provide nO.
    sample = X if X is not None else Y
    if sample is not None:
        model.set_dim("nO", sample.data.shape[2])
    elif not model.has_dim("nO"):
        # Ask has_dim rather than get_dim so that an unset dimension reaches
        # this explicit error message.
        raise ValueError("Dimension unset for IOB: nO")
|
|
||||||
|
|
||||||
|
|
||||||
def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
    """Greedily decode IOB actions, masking out invalid transitions.

    model: The IOB layer; its `nO` dimension gives the number of actions.
    Xp (Padded): Action scores; `Xp.data` is unpacked as
        (n_tokens, n_docs, n_actions).
    is_train (bool): Unused; the same computation runs in both modes.

    RETURNS: A tuple of the masked scores (as a `Padded` sharing the size,
        length and index information of `Xp`) and a backprop callback that
        passes the gradient through unchanged.
    """
    n_labels = (model.get_dim("nO") - 1) // 2
    n_tokens, n_docs, n_actions = Xp.data.shape
    # The validity table is indexed by the previous action and gives, for
    # each candidate next action, 1 if it is allowed and 0 otherwise.
    allowed_after = _get_transition_table(model.ops, n_labels)
    previous = model.ops.alloc1i(n_docs)
    # Initialize as though prev action was O
    previous.fill(n_actions - 1)
    Y = model.ops.alloc3f(*Xp.data.shape)
    masks = model.ops.alloc3f(*Y.shape)
    penalty = 10e8
    for t in range(Xp.data.shape[0]):
        step_mask = allowed_after[previous]
        masks[t] = step_mask
        # Don't train the out-of-bounds sequences.
        masks[t, Xp.size_at_t[t] :] = 0
        # Valid actions get 0*10e8, invalid get -1*10e8
        Y[t] = Xp.data[t] + ((masks[t] - 1) * penalty)
        previous = Y[t].argmax(axis=-1)

    def backprop_biluo(dY: Padded) -> Padded:
        # The gradient is deliberately left unmasked: the original author
        # noted that multiplying by `masks` seemed to do poorly here.
        return dY

    decoded = Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices)
    return decoded, backprop_biluo
|
|
||||||
|
|
||||||
|
|
||||||
def get_num_actions(n_labels: int) -> int:
    """Return the number of IOB actions for `n_labels` entity labels."""
    # BEGIN and IN each exist once per label; OUT exists once.
    per_label_kinds = 2
    return per_label_kinds * n_labels + 1
|
|
||||||
|
|
||||||
|
|
||||||
def _get_transition_table(
    ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {}
) -> Floats3d:
    """Build (or fetch from cache) the IOB validity table.

    The table is allocated with `ops.alloc2f(n_actions, n_actions)` and is
    indexed as [previous_action, next_action]; a cell is 1 when the
    transition is allowed and 0 otherwise. Actions are laid out as all B
    actions, then all I actions, then O.

    ops (Ops): Backend used to allocate the table.
    n_labels (int): Number of entity labels.
    _cache: Memo of tables keyed by number of actions. The mutable default is
        intentional so that tables are shared across calls.
    RETURNS: The validity table.

    NOTE(review): the annotations say Floats3d although the table is
    allocated with alloc2f; they are left as-is to keep the signature stable.
    """
    n_actions = get_num_actions(n_labels)
    if n_actions in _cache:
        return ops.asarray(_cache[n_actions])
    table = ops.alloc2f(n_actions, n_actions)
    B_start, B_end = (0, n_labels)
    I_start, I_end = (B_end, B_end + n_labels)
    O_action = I_end
    # Index arrays pair each action with the action of the same label.
    B_range = ops.xp.arange(B_start, B_end)
    I_range = ops.xp.arange(I_start, I_end)
    # B and O are always valid
    table[:, B_start:B_end] = 1
    table[:, O_action] = 1
    # I can only follow a matching B or a matching I. Without the I -> I
    # transition, no entity longer than two tokens could be decoded.
    table[B_range, I_range] = 1
    table[I_range, I_range] = 1

    _cache[n_actions] = table
    return table
|
|
|
@ -1,6 +1,5 @@
|
||||||
from .entity_linker import * # noqa
|
from .entity_linker import * # noqa
|
||||||
from .parser import * # noqa
|
from .parser import * # noqa
|
||||||
from .simple_ner import * # noqa
|
|
||||||
from .tagger import * # noqa
|
from .tagger import * # noqa
|
||||||
from .textcat import * # noqa
|
from .textcat import * # noqa
|
||||||
from .tok2vec import * # noqa
|
from .tok2vec import * # noqa
|
||||||
|
|
|
@ -1,104 +0,0 @@
|
||||||
from typing import List
|
|
||||||
from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
|
|
||||||
from thinc.api import chain, list2padded, configure_normal_init
|
|
||||||
from thinc.api import Dropout
|
|
||||||
from thinc.types import Floats2d
|
|
||||||
|
|
||||||
from ...tokens import Doc
|
|
||||||
from .._biluo import BILUO
|
|
||||||
from .._iob import IOB
|
|
||||||
from ...util import registry
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.BILUOTagger.v1")
def BiluoTagger(
    tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
    """Construct a simple NER tagger that scores BILUO tags for every token
    and decodes them greedily under transition constraints, so that the
    returned tag sequence is always valid BILUO.

    BILUO encodes non-overlapping labelled spans as per-token tags: B-LABEL
    on the first token of a span, I-LABEL on tokens inside it, L-LABEL on its
    last token, U-LABEL on single-token spans and O everywhere else.

    Because it has more distinct classes for the different situations, the
    BILUO scheme generally gives better linear separation between classes,
    especially for non-CRF models (Ratinov et al., 2009).

    tok2vec: Subnetwork mapping docs to per-token vectors; its `nO` dimension
        is used as the input width of the output layer.
    """
    constraints = BILUO()
    output_layer = Linear(
        nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
    )
    scorer = with_array(chain(Dropout(0.1), output_layer))
    network = chain(
        tok2vec,
        list2padded(),
        scorer,
        constraints,
        with_array(softmax_activation()),
        padded2list(),
    )
    # The wrapper keeps named references to the sublayers so that `init` can
    # set their output width once the number of tags is known.
    sublayer_refs = {"tok2vec": tok2vec, "linear": output_layer, "biluo": constraints}
    return Model(
        "biluo-tagger",
        forward,
        init=init,
        layers=[network, output_layer],
        refs=sublayer_refs,
        dims={"nO": None},
        attrs={"get_num_actions": constraints.attrs["get_num_actions"]},
    )
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.IOBTagger.v1")
def IOBTagger(
    tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
    """Construct a simple NER tagger that scores IOB tags for every token and
    decodes them greedily under transition constraints, so that the returned
    tag sequence is always valid IOB.

    IOB encodes non-overlapping labelled spans as per-token tags: B-LABEL on
    the first token of a span, I-LABEL on the following tokens of the span
    and O everywhere else.

    tok2vec: Subnetwork mapping docs to per-token vectors; its `nO` dimension
        is used as the input width of the output layer.
    """
    constraints = IOB()
    output_layer = Linear(nO=None, nI=tok2vec.get_dim("nO"))
    network = chain(
        tok2vec,
        list2padded(),
        with_array(output_layer),
        constraints,
        with_array(softmax_activation()),
        padded2list(),
    )
    # The constraint layer is registered under the ref name "biluo" because
    # the shared `init` function looks it up by that name.
    sublayer_refs = {"tok2vec": tok2vec, "linear": output_layer, "biluo": constraints}
    return Model(
        "iob-tagger",
        forward,
        init=init,
        layers=[network],
        refs=sublayer_refs,
        dims={"nO": None},
        attrs={"get_num_actions": constraints.attrs["get_num_actions"]},
    )
|
|
||||||
|
|
||||||
|
|
||||||
def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
    """Propagate the output width to the sublayers and initialize the network.

    If the wrapper's `nO` is still unset and sample outputs `Y` are given, the
    width is taken from the second axis of the first sample. The width is then
    set on the "biluo" ref and, if still unset there, on the "linear" ref,
    before the wrapped network (`model.layers[0]`) is initialized with the
    samples.
    """
    width_unset = model.has_dim("nO") is None
    if width_unset and Y:
        model.set_dim("nO", Y[0].shape[1])
    width = model.get_dim("nO")
    constraints = model.get_ref("biluo")
    output_layer = model.get_ref("linear")
    constraints.set_dim("nO", width)
    if output_layer.has_dim("nO") is None:
        output_layer.set_dim("nO", width)
    network = model.layers[0]
    network.initialize(X=X, Y=Y)
|
|
||||||
|
|
||||||
|
|
||||||
def forward(model: Model, X: List[Doc], is_train: bool):
    """Run the wrapped network (`model.layers[0]`) and return its result."""
    network = model.layers[0]
    return network(X, is_train)
|
|
||||||
|
|
||||||
|
|
||||||
# Both registered tagger architectures are part of the public API.
__all__ = ["BiluoTagger", "IOBTagger"]
|
|
|
@ -8,7 +8,6 @@ from .morphologizer import Morphologizer
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from .senter import SentenceRecognizer
|
from .senter import SentenceRecognizer
|
||||||
from .sentencizer import Sentencizer
|
from .sentencizer import Sentencizer
|
||||||
from .simple_ner import SimpleNER
|
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
from .textcat import TextCategorizer
|
from .textcat import TextCategorizer
|
||||||
from .tok2vec import Tok2Vec
|
from .tok2vec import Tok2Vec
|
||||||
|
@ -25,7 +24,6 @@ __all__ = [
|
||||||
"Pipe",
|
"Pipe",
|
||||||
"SentenceRecognizer",
|
"SentenceRecognizer",
|
||||||
"Sentencizer",
|
"Sentencizer",
|
||||||
"SimpleNER",
|
|
||||||
"Tagger",
|
"Tagger",
|
||||||
"TextCategorizer",
|
"TextCategorizer",
|
||||||
"Tok2Vec",
|
"Tok2Vec",
|
||||||
|
|
|
@ -1,223 +0,0 @@
|
||||||
from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
|
|
||||||
from thinc.types import Floats2d
|
|
||||||
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
|
|
||||||
from thinc.api import Optimizer, Config
|
|
||||||
from thinc.util import to_numpy
|
|
||||||
from itertools import islice
|
|
||||||
|
|
||||||
from ..errors import Errors
|
|
||||||
from ..training import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
|
|
||||||
from ..training import validate_examples
|
|
||||||
from ..tokens import Doc
|
|
||||||
from ..language import Language
|
|
||||||
from ..vocab import Vocab
|
|
||||||
from ..scorer import Scorer
|
|
||||||
from .pipe import Pipe
|
|
||||||
|
|
||||||
|
|
||||||
# Default model settings for the "simple_ner" factory: a BILUO tagger on top
# of a HashEmbedCNN tok2vec layer. The string is parsed once at import time
# and its [model] section is used as the factory's default model config.
default_model_config = """
[model]
@architectures = "spacy.BILUOTagger.v1"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 128
depth = 4
embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]
|
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
    "simple_ner",
    assigns=["doc.ents"],
    default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL},
    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
)
def make_simple_ner(
    nlp: Language, name: str, model: Model, labels: Iterable[str]
) -> "SimpleNER":
    """Factory for the "simple_ner" component.

    nlp (Language): The pipeline; its vocab is shared with the component.
    name (str): The component instance name.
    model (Model): The tagging model to use.
    labels (Iterable[str]): Entity labels to add to the component.
    RETURNS (SimpleNER): The constructed component.
    """
    component = SimpleNER(nlp.vocab, model, name, labels=labels)
    return component
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleNER(Pipe):
    """Named entity recognition with a tagging model. The model should include
    validity constraints to ensure that only valid tag sequences are returned."""

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "simple_ner",
        *,
        labels: Iterable[str],
    ) -> None:
        """Create the component.

        vocab (Vocab): The shared vocabulary.
        model (Model): The tagging model.
        name (str): The component instance name.
        labels (Iterable[str]): Entity labels to add.
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        self.cfg = {"labels": []}
        for label in labels:
            self.add_label(label)
        self.loss_func = SequenceCategoricalCrossentropy(
            names=self.get_tag_names(), normalize=True, missing_value=None
        )
        assert self.model is not None

    @property
    def is_biluo(self) -> bool:
        """Whether the model's name starts with "biluo"; otherwise the
        component treats the model's tags as IOB."""
        return self.model.name.startswith("biluo")

    @property
    def labels(self) -> Tuple[str, ...]:
        """The entity labels currently stored in the component's config."""
        return tuple(self.cfg["labels"])

    def add_label(self, label: str) -> None:
        """Add a new label to the pipe.
        label (str): The label to add.
        DOCS: https://nightly.spacy.io/api/simplener#add_label
        """
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label not in self.labels:
            self.cfg["labels"].append(label)
            self.vocab.strings.add(label)

    def get_tag_names(self) -> List[str]:
        """Return the tag names in the order of the model's output columns:
        one group of per-label tags for each prefix (B, I, L, U for BILUO
        models; B, I otherwise), followed by "O"."""
        prefixes = "BILU" if self.is_biluo else "BI"
        names = [f"{prefix}-{label}" for prefix in prefixes for label in self.labels]
        names.append("O")
        return names

    def predict(self, docs: List[Doc]) -> List[Floats2d]:
        """Return the model's tag scores for a batch of documents."""
        scores = self.model.predict(docs)
        return scores

    def set_annotations(self, docs: List[Doc], scores: List[Floats2d]) -> None:
        """Set entities on a batch of documents from a batch of scores."""
        tag_names = self.get_tag_names()
        for i, doc in enumerate(docs):
            actions = to_numpy(scores[i].argmax(axis=1))
            tags = [tag_names[actions[j]] for j in range(len(doc))]
            if not self.is_biluo:
                # Spans are extracted from BILUO tags, so convert first.
                tags = iob_to_biluo(tags)
            doc.ents = spans_from_biluo_tags(doc, tags)

    def update(
        self,
        examples: List[Example],
        *,
        set_annotations: bool = False,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Run one training step on a batch of examples.

        examples (List[Example]): The batch to learn from.
        set_annotations (bool): Whether to also write the predictions to the
            predicted docs.
        drop (float): The dropout rate.
        sgd (Optional[Optimizer]): Optimizer used to apply the update, if any.
        losses (Optional[Dict[str, float]]): Dict the loss is added to under
            the key "ner"; created if not given.
        RETURNS (Dict[str, float]): The losses dict.
        """
        if losses is None:
            losses = {}
        losses.setdefault("ner", 0.0)
        validate_examples(examples, "SimpleNER.update")
        if not any(_has_ner(eg) for eg in examples):
            # Nothing to learn from in this batch.
            return losses
        docs = [eg.predicted for eg in examples]
        set_dropout_rate(self.model, drop)
        scores, bp_scores = self.model.begin_update(docs)
        loss, d_scores = self.get_loss(examples, scores)
        bp_scores(d_scores)
        if set_annotations:
            self.set_annotations(docs, scores)
        if sgd is not None:
            self.model.finish_update(sgd)
        losses["ner"] += loss
        return losses

    def get_loss(self, examples: List[Example], scores) -> Tuple[float, List[Floats2d]]:
        """Compute the loss and the gradient of the scores.

        examples (List[Example]): The batch of examples.
        scores: The model's scores for the predicted docs of the examples.
        RETURNS (Tuple[float, List[Floats2d]]): The loss, then the gradient.
        RAISES (ValueError): If a doc's scores and gold tags differ in length.
        """
        validate_examples(examples, "SimpleNER.get_loss")
        truths = []
        for eg in examples:
            tags = eg.get_aligned_ner()
            # "-" marks a missing annotation; the loss expects None for those.
            gold_tags = [(tag if tag != "-" else None) for tag in tags]
            if not self.is_biluo:
                gold_tags = biluo_to_iob(gold_tags)
            truths.append(gold_tags)
        for i, doc_scores in enumerate(scores):
            if len(doc_scores) != len(truths[i]):
                raise ValueError(
                    f"Mismatched output and gold sizes.\n"
                    f"Output: {len(doc_scores)}, gold: {len(truths[i])}. "
                    f"Input: {len(examples[i].predicted)}"
                )
        # The loss function returns the gradient first and the loss second.
        d_scores, loss = self.loss_func(scores, truths)
        return loss, d_scores

    def begin_training(
        self,
        get_examples: Callable[[], Iterable[Example]],
        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
        sgd: Optional[Optimizer] = None,
    ):
        """Collect labels from the data and initialize the model.

        get_examples (Callable[[], Iterable[Example]]): Function returning the
            training examples.
        pipeline: Optional list of (name, component) tuples; when given, the
            multitask objectives hook is called with it.
        sgd (Optional[Optimizer]): Optimizer, returned unchanged.
        RETURNS: The `sgd` argument.
        """
        self._ensure_examples(get_examples)
        all_labels = set()
        for example in get_examples():
            all_labels.update(_get_labels(example))
        for label in sorted(all_labels):
            if label != "":
                self.add_label(label)
        doc_sample = []
        label_sample = []
        self._require_labels()
        # A small sample is used to initialize the model.
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_tags = example.get_aligned_ner()
            if not self.is_biluo:
                gold_tags = biluo_to_iob(gold_tags)
            # One-hot encode the gold tags in tag-name order.
            gold_array = [
                [1.0 if tag == gold_tag else 0.0 for tag in self.get_tag_names()]
                for gold_tag in gold_tags
            ]
            label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
        if pipeline is not None:
            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
        # Labels may have been added above, so rebuild the loss with the
        # current tag names.
        self.loss_func = SequenceCategoricalCrossentropy(
            names=self.get_tag_names(), normalize=True, missing_value=None
        )
        return sgd

    def init_multitask_objectives(self, *args, **kwargs):
        """Hook for multitask objectives; this component does nothing here."""
        pass

    def score(self, examples, **kwargs):
        """Score a batch of examples on their entity spans (`doc.ents`)."""
        validate_examples(examples, "SimpleNER.score")
        return Scorer.score_spans(examples, "ents", **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def _has_ner(example: Example) -> bool:
|
|
||||||
for ner_tag in example.get_aligned_ner():
|
|
||||||
if ner_tag != "-" and ner_tag is not None:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _get_labels(example: Example) -> Set[str]:
|
|
||||||
labels = set()
|
|
||||||
for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
|
|
||||||
if ner_tag != "O" and ner_tag != "-" and ner_tag != "":
|
|
||||||
labels.add(ner_tag)
|
|
||||||
return labels
|
|
|
@ -1,106 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.training import Example
|
|
||||||
from spacy import util
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
# Tiny training set shared by the tests below. Each item is a text plus its
# entity annotations as (start_char, end_char, label) tuples.
TRAIN_DATA = [
    ("Who is Shaka S Khan?", {"entities": [(7, 19, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_label():
    """Starting training without any label must raise a ValueError."""
    pipeline = English()
    pipeline.add_pipe("simple_ner")
    with pytest.raises(ValueError):
        pipeline.begin_training()
|
|
||||||
|
|
||||||
|
|
||||||
def test_implicit_label():
    """Training can start when only "ORG" was added explicitly while the
    examples contain other labels."""
    nlp = English()
    simple_ner = nlp.add_pipe("simple_ner")
    simple_ner.add_label("ORG")
    examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
    ]
    nlp.begin_training(get_examples=lambda: examples)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Should be fixed")
def test_untrained():
    """Running an untrained simple_ner on a text should not crash.

    Currently skipped: it does crash when the simple_ner produces an invalid
    sequence like ['L-PERSON', 'L-ORG'].
    """
    nlp = English()
    simple_ner = nlp.add_pipe("simple_ner")
    for label in ("PERSON", "LOC", "ORG"):
        simple_ner.add_label(label)
    nlp.begin_training()
    nlp("Example sentence")
|
|
||||||
|
|
||||||
|
|
||||||
def test_resize():
    """A label added after the first begin_training call shows up in
    `labels`, and begin_training can be called again afterwards."""
    nlp = English()
    simple_ner = nlp.add_pipe("simple_ner")
    for label in ("PERSON", "LOC"):
        simple_ner.add_label(label)
    nlp.begin_training()
    assert len(simple_ner.labels) == 2
    simple_ner.add_label("ORG")
    nlp.begin_training()
    assert len(simple_ner.labels) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_begin_training_examples():
    """begin_training accepts a callable returning examples and rejects
    malformed `get_examples` arguments with the expected error types."""
    nlp = English()
    simple_ner = nlp.add_pipe("simple_ner")
    examples = []
    for text, annotations in TRAIN_DATA:
        examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for _start, _end, label in annotations.get("entities"):
            simple_ner.add_label(label)
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.begin_training()
    nlp.begin_training(get_examples=lambda: examples)
    # Each invalid argument is paired with the error it has to trigger.
    invalid_cases = [
        (TypeError, lambda: None),
        (TypeError, lambda: examples[0]),
        (ValueError, lambda: []),
        (ValueError, examples),
    ]
    for expected_error, bad_get_examples in invalid_cases:
        with pytest.raises(expected_error):
            nlp.begin_training(get_examples=bad_get_examples)
|
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
    """Overfit the SimpleNER component on the tiny training set to make sure
    the ML models work, and check that the predictions are unchanged after
    saving the pipeline to disk and loading it again."""

    def check_single_london_entity(doc):
        entities = doc.ents
        assert len(entities) == 1
        assert entities[0].text == "London"
        assert entities[0].label_ == "LOC"

    nlp = English()
    nlp.add_pipe("simple_ner")
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA
    ]
    optimizer = nlp.begin_training(get_examples=lambda: examples)

    for _ in range(50):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
    # Only the loss of the final update is checked.
    assert losses["ner"] < 0.0001

    # test the trained model
    test_text = "I like London."
    check_single_london_entity(nlp(test_text))

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        reloaded = util.load_model_from_path(tmp_dir)
        check_single_london_entity(reloaded(test_text))
|
|
|
@ -456,62 +456,6 @@ consists of either two or three subnetworks:
|
||||||
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
|
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
|
||||||
|
|
||||||
### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}
|
|
||||||
|
|
||||||
> #### Example Config
|
|
||||||
>
|
|
||||||
> ```ini
|
|
||||||
> [model]
|
|
||||||
> @architectures = "spacy.BILUOTagger.v1"
|
|
||||||
>
|
|
||||||
> [model.tok2vec]
|
|
||||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
|
||||||
> # etc.
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Construct a simple NER tagger that predicts
|
|
||||||
[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and
|
|
||||||
uses greedy decoding with transition-constraints to return a valid BILUO tag
|
|
||||||
sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled
|
|
||||||
spans into tags assigned to each token. The first token of a span is given the
|
|
||||||
tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens
|
|
||||||
within the span are given the tag `I-LABEL`. Single-token spans are given the
|
|
||||||
tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme
|
|
||||||
generally results in better linear separation between classes, especially for
|
|
||||||
non-CRF models, because there are more distinct classes for the different
|
|
||||||
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ------------------------------------------------------------------------------------------ |
|
|
||||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
|
||||||
|
|
||||||
### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}
|
|
||||||
|
|
||||||
> #### Example Config
|
|
||||||
>
|
|
||||||
> ```ini
|
|
||||||
> [model]
|
|
||||||
> @architectures = "spacy.IOBTagger.v1"
|
|
||||||
>
|
|
||||||
> [model.tok2vec]
|
|
||||||
> @architectures = "spacy.HashEmbedCNN.v1"
|
|
||||||
> # etc.
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Construct a simple NER tagger that predicts
|
|
||||||
[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and
|
|
||||||
uses greedy decoding with transition-constraints to return a valid IOB tag
|
|
||||||
sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled
|
|
||||||
spans into tags assigned to each token. The first token of a span is given the
|
|
||||||
tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens
|
|
||||||
are assigned the tag O.
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ------------------------------------------------------------------------------------------ |
|
|
||||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
|
||||||
|
|
||||||
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
|
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
|
||||||
|
|
||||||
### spacy.Tagger.v1 {#Tagger}
|
### spacy.Tagger.v1 {#Tagger}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user