Mirror of https://github.com/explosion/spaCy.git
Remove simple_ner code (#6041)

* remove simple_ner code
* remove unused _biluo and _iob files

parent 24053d83ec, commit cb66ea7400
Training config change (the `simple_ner` component and its BILUO tagger are replaced by the transition-based `ner` component):

```diff
@@ -31,10 +31,13 @@ lang = "en"
 vectors = null
 
 [nlp.pipeline.ner]
-factory = "simple_ner"
+factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.BILUOTagger.v1"
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
 
 [nlp.pipeline.ner.model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
```
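For anyone migrating off `simple_ner`, the runtime equivalent is a minimal sketch like the following (hypothetical usage, not part of the diff; assumes the spaCy v3 nightly API where `add_pipe` takes a factory name):

```python
import spacy

nlp = spacy.blank("en")
# The transition-based "ner" factory is the supported replacement for the
# removed "simple_ner" factory; labels are added the same way.
ner = nlp.add_pipe("ner")
ner.add_label("PERSON")
```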
spacy/ml/_biluo.py (deleted file, @@ -1,105 +0,0 @@):

```python
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import Dict, Optional
import numpy
from thinc.api import Model
from thinc.types import Padded, Floats3d


def BILUO() -> Model[Padded, Padded]:
    return Model(
        "biluo",
        forward,
        init=init,
        dims={"nO": None},
        attrs={"get_num_actions": get_num_actions},
    )


def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
    if X is not None and Y is not None:
        if X.data.shape != Y.data.shape:
            # TODO: Fix error
            raise ValueError("Mismatched shapes (TODO: Fix message)")
        model.set_dim("nO", X.data.shape[2])
    elif X is not None:
        model.set_dim("nO", X.data.shape[2])
    elif Y is not None:
        model.set_dim("nO", Y.data.shape[2])
    elif model.get_dim("nO") is None:
        raise ValueError("Dimension unset for BILUO: nO")


def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
    n_labels = (model.get_dim("nO") - 1) // 4
    n_tokens, n_docs, n_actions = Xp.data.shape
    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
    # to indicate which actions are valid next for each sequence. The mask is
    # looked up from a validity table of shape (2, n_actions, n_actions): the
    # first dimension indicates whether this is the last token, the second
    # indicates the previous action. The first step is handled by initializing
    # the previous action to O.
    valid_transitions = model.ops.asarray(_get_transition_table(n_labels))
    prev_actions = model.ops.alloc1i(n_docs)
    # Initialize as though prev action was O
    prev_actions.fill(n_actions - 1)
    Y = model.ops.alloc3f(*Xp.data.shape)
    masks = model.ops.alloc3f(*Y.shape)
    max_value = Xp.data.max()
    for t in range(Xp.data.shape[0]):
        is_last = (Xp.lengths < (t + 2)).astype("i")
        masks[t] = valid_transitions[is_last, prev_actions]
        # Don't train the out-of-bounds sequences.
        masks[t, Xp.size_at_t[t] :] = 0
        # Valid actions keep their scores; invalid actions get a negative
        # offset scaled by the maximum score, so they can never win the argmax.
        Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10)
        prev_actions = Y[t].argmax(axis=-1)

    def backprop_biluo(dY: Padded) -> Padded:
        dY.data *= masks
        return dY

    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo


def get_num_actions(n_labels: int) -> int:
    # One BEGIN action per label
    # One IN action per label
    # One LAST action per label
    # One UNIT action per label
    # One OUT action
    return n_labels + n_labels + n_labels + n_labels + 1


def _get_transition_table(
    n_labels: int, *, _cache: Dict[int, Floats3d] = {}
) -> Floats3d:
    n_actions = get_num_actions(n_labels)
    if n_actions in _cache:
        return _cache[n_actions]
    table = numpy.zeros((2, n_actions, n_actions), dtype="f")
    B_start, B_end = (0, n_labels)
    I_start, I_end = (B_end, B_end + n_labels)
    L_start, L_end = (I_end, I_end + n_labels)
    U_start, _ = (L_end, L_end + n_labels)  # noqa: F841
    # Using ranges allows us to set specific cells, which is necessary to express
    # that only actions of the same label are valid continuations.
    B_range = numpy.arange(B_start, B_end)
    I_range = numpy.arange(I_start, I_end)
    L_range = numpy.arange(L_start, L_end)
    # If this is the last token and the previous action was B or I, only L
    # of that label is valid
    table[1, B_range, L_range] = 1
    table[1, I_range, L_range] = 1
    # If this isn't the last token and the previous action was B or I, only I or
    # L of that label are valid.
    table[0, B_range, I_range] = 1
    table[0, B_range, L_range] = 1
    table[0, I_range, I_range] = 1
    table[0, I_range, L_range] = 1
    # If this isn't the last token and the previous was L, U or O, B is valid
    table[0, L_start:, :B_end] = 1
    # Regardless of whether this is the last token, if the previous action was
    # {L, U, O}, U and O are valid.
    table[:, L_start:, U_start:] = 1
    _cache[n_actions] = table
    return table
```
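As a sanity check on the table construction above, here is a small standalone sketch (not part of the commit) that rebuilds the single-label case by hand; with one label, `get_num_actions(1)` is 5 and the action order is B, I, L, U, O:

```python
import numpy

# Single-label BILUO case: 4 * 1 + 1 == 5 actions, in the order B, I, L, U, O.
table = numpy.zeros((2, 5, 5), dtype="f")
B, I, L, U, O = range(5)
table[1, B, L] = 1                   # last token: an open B must close with L
table[1, I, L] = 1                   # last token: an open I must close with L
table[0, B, I] = table[0, B, L] = 1  # mid-sequence: B continues with I or L
table[0, I, I] = table[0, I, L] = 1  # mid-sequence: I continues with I or L
table[0, L:, B] = 1                  # mid-sequence: after L, U or O, a new B may start
table[:, L:, U:] = 1                 # anywhere: after L, U or O, both U and O are valid

# After O mid-sequence, only B, U or O are allowed:
assert list(table[0, O]) == [1, 0, 0, 1, 1]
# On the last token, an open B may only close with L:
assert list(table[1, B]) == [0, 0, 1, 0, 0]
```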
spacy/ml/_iob.py (deleted file, @@ -1,88 +0,0 @@):

```python
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import Dict, Optional
from thinc.api import Ops, Model
from thinc.types import Padded, Floats2d


def IOB() -> Model[Padded, Padded]:
    return Model(
        "iob",
        forward,
        init=init,
        dims={"nO": None},
        attrs={"get_num_actions": get_num_actions},
    )


def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
    if X is not None and Y is not None:
        if X.data.shape != Y.data.shape:
            # TODO: Fix error
            raise ValueError("Mismatched shapes (TODO: Fix message)")
        model.set_dim("nO", X.data.shape[2])
    elif X is not None:
        model.set_dim("nO", X.data.shape[2])
    elif Y is not None:
        model.set_dim("nO", Y.data.shape[2])
    elif model.get_dim("nO") is None:
        raise ValueError("Dimension unset for IOB: nO")


def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
    n_labels = (model.get_dim("nO") - 1) // 2
    n_tokens, n_docs, n_actions = Xp.data.shape
    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
    # to indicate which actions are valid next for each sequence. The mask is
    # looked up from a validity table of shape (n_actions, n_actions), indexed
    # by the previous action. The first step is handled by initializing the
    # previous action to O.
    valid_transitions = _get_transition_table(model.ops, n_labels)
    prev_actions = model.ops.alloc1i(n_docs)
    # Initialize as though prev action was O
    prev_actions.fill(n_actions - 1)
    Y = model.ops.alloc3f(*Xp.data.shape)
    masks = model.ops.alloc3f(*Y.shape)
    for t in range(Xp.data.shape[0]):
        masks[t] = valid_transitions[prev_actions]
        # Don't train the out-of-bounds sequences.
        masks[t, Xp.size_at_t[t] :] = 0
        # Valid actions keep their scores; invalid actions get a -10e8 penalty.
        Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8)
        prev_actions = Y[t].argmax(axis=-1)

    def backprop_iob(dY: Padded) -> Padded:
        # Masking the gradient seems to do poorly here. But why?
        # dY.data *= masks
        return dY

    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_iob


def get_num_actions(n_labels: int) -> int:
    # One BEGIN action per label
    # One IN action per label
    # One OUT action
    return n_labels * 2 + 1


def _get_transition_table(
    ops: Ops, n_labels: int, _cache: Dict[int, Floats2d] = {}
) -> Floats2d:
    n_actions = get_num_actions(n_labels)
    if n_actions in _cache:
        return ops.asarray(_cache[n_actions])
    table = ops.alloc2f(n_actions, n_actions)
    B_start, B_end = (0, n_labels)
    I_start, I_end = (B_end, B_end + n_labels)
    O_action = I_end
    B_range = ops.xp.arange(B_start, B_end)
    I_range = ops.xp.arange(I_start, I_end)
    # B and O are always valid
    table[:, B_start:B_end] = 1
    table[:, O_action] = 1
    # I can only follow a matching B or I
    table[B_range, I_range] = 1
    table[I_range, I_range] = 1

    _cache[n_actions] = table
    return table
```
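The same kind of standalone sketch for the single-label IOB table (action order B, I, O); note the I-after-I transition, which is needed so that entities of three or more tokens remain valid:

```python
import numpy

# Single-label IOB case: 2 * 1 + 1 == 3 actions, in the order B, I, O.
table = numpy.zeros((3, 3), dtype="f")
B, I, O = range(3)
table[:, B] = 1  # B is always a valid next action
table[:, O] = 1  # O is always a valid next action
table[B, I] = 1  # I-X may follow B-X
table[I, I] = 1  # ...and I-X, so longer entities stay valid

# After O, only B or O may come next; a bare I would start an entity mid-air:
assert list(table[O]) == [1, 0, 1]
```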
spacy/ml/models/__init__.py:

```diff
@@ -1,6 +1,5 @@
 from .entity_linker import *  # noqa
 from .parser import *  # noqa
-from .simple_ner import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
 from .tok2vec import *  # noqa
```
spacy/ml/models/simple_ner.py (deleted file, @@ -1,104 +0,0 @@):

```python
from typing import List
from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
from thinc.api import chain, list2padded, configure_normal_init
from thinc.api import Dropout
from thinc.types import Floats2d

from ...tokens import Doc
from .._biluo import BILUO
from .._iob import IOB
from ...util import registry


@registry.architectures.register("spacy.BILUOTagger.v1")
def BiluoTagger(
    tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
    """Construct a simple NER tagger that predicts BILUO tag scores for each
    token and uses greedy decoding with transition-constraints to return a valid
    BILUO tag sequence.

    A BILUO tag sequence encodes a sequence of non-overlapping labelled spans
    into tags assigned to each token. The first token of a span is given the
    tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens
    within the span are given the tag I-LABEL. Single-token spans are given
    the tag U-LABEL. All other tokens are assigned the tag O. For example, the
    span "New York City" with label LOC is tagged B-LOC I-LOC L-LOC, while
    "London" on its own is tagged U-LOC.

    The BILUO tag scheme generally results in better linear separation between
    classes, especially for non-CRF models, because there are more distinct classes
    for the different situations (Ratinov et al., 2009).
    """
    biluo = BILUO()
    linear = Linear(
        nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
    )
    model = chain(
        tok2vec,
        list2padded(),
        with_array(chain(Dropout(0.1), linear)),
        biluo,
        with_array(softmax_activation()),
        padded2list(),
    )
    return Model(
        "biluo-tagger",
        forward,
        init=init,
        layers=[model, linear],
        refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
        dims={"nO": None},
        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
    )


@registry.architectures.register("spacy.IOBTagger.v1")
def IOBTagger(
    tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
    """Construct a simple NER tagger that predicts IOB tag scores for each
    token and uses greedy decoding with transition-constraints to return a valid
    IOB tag sequence.

    An IOB tag sequence encodes a sequence of non-overlapping labelled spans
    into tags assigned to each token. The first token of a span is given the
    tag B-LABEL, and subsequent tokens are given the tag I-LABEL.
    All other tokens are assigned the tag O.
    """
    biluo = IOB()
    linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
    model = chain(
        tok2vec,
        list2padded(),
        with_array(linear),
        biluo,
        with_array(softmax_activation()),
        padded2list(),
    )
    return Model(
        "iob-tagger",
        forward,
        init=init,
        layers=[model],
        refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
        dims={"nO": None},
        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
    )


def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
    if model.has_dim("nO") is None and Y:
        model.set_dim("nO", Y[0].shape[1])
    nO = model.get_dim("nO")
    biluo = model.get_ref("biluo")
    linear = model.get_ref("linear")
    biluo.set_dim("nO", nO)
    if linear.has_dim("nO") is None:
        linear.set_dim("nO", nO)
    model.layers[0].initialize(X=X, Y=Y)


def forward(model: Model, X: List[Doc], is_train: bool):
    return model.layers[0](X, is_train)


__all__ = ["BiluoTagger", "IOBTagger"]
```
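The composition in `BiluoTagger` is ordinary Thinc plumbing around the constraint layer. A rough standalone sketch of the same shape-juggling, with the BILUO/IOB layer omitted and made-up sizes (all layer names come from `thinc.api`, as in the file above):

```python
from thinc.api import Linear, chain, list2padded, padded2list, softmax_activation, with_array

n_tags = 9  # e.g. two labels under BILUO: 4 * 2 + 1 actions
width = 96  # assumed tok2vec output width
# list2padded turns a ragged List[Floats2d] into a Padded batch; with_array
# applies a per-token layer to the underlying array; padded2list undoes it.
# In the real model, the BILUO()/IOB() constraint layer sits between the
# Linear scores and the softmax.
tagger = chain(
    list2padded(),
    with_array(Linear(n_tags, width)),
    with_array(softmax_activation()),
    padded2list(),
)
```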
spacy/pipeline/__init__.py:

```diff
@@ -8,7 +8,6 @@ from .morphologizer import Morphologizer
 from .pipe import Pipe
 from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
-from .simple_ner import SimpleNER
 from .tagger import Tagger
 from .textcat import TextCategorizer
 from .tok2vec import Tok2Vec
@@ -25,7 +24,6 @@ __all__ = [
     "Pipe",
     "SentenceRecognizer",
     "Sentencizer",
-    "SimpleNER",
     "Tagger",
     "TextCategorizer",
     "Tok2Vec",
```
spacy/pipeline/simple_ner.py (deleted file, @@ -1,223 +0,0 @@):

```python
from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
from thinc.types import Floats2d
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
from thinc.api import Optimizer, Config
from thinc.util import to_numpy
from itertools import islice

from ..errors import Errors
from ..training import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
from ..training import validate_examples
from ..tokens import Doc
from ..language import Language
from ..vocab import Vocab
from ..scorer import Scorer
from .pipe import Pipe


default_model_config = """
[model]
@architectures = "spacy.BILUOTagger.v1"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 128
depth = 4
embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "simple_ner",
    assigns=["doc.ents"],
    default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL},
    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
)
def make_simple_ner(
    nlp: Language, name: str, model: Model, labels: Iterable[str]
) -> "SimpleNER":
    return SimpleNER(nlp.vocab, model, name, labels=labels)


class SimpleNER(Pipe):
    """Named entity recognition with a tagging model. The model should include
    validity constraints to ensure that only valid tag sequences are returned."""

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "simple_ner",
        *,
        labels: Iterable[str],
    ) -> None:
        self.vocab = vocab
        self.model = model
        self.name = name
        self.cfg = {"labels": []}
        for label in labels:
            self.add_label(label)
        self.loss_func = SequenceCategoricalCrossentropy(
            names=self.get_tag_names(), normalize=True, missing_value=None
        )
        assert self.model is not None

    @property
    def is_biluo(self) -> bool:
        return self.model.name.startswith("biluo")

    @property
    def labels(self) -> Tuple[str, ...]:
        return tuple(self.cfg["labels"])

    def add_label(self, label: str) -> None:
        """Add a new label to the pipe.

        label (str): The label to add.

        DOCS: https://nightly.spacy.io/api/simplener#add_label
        """
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label not in self.labels:
            self.cfg["labels"].append(label)
            self.vocab.strings.add(label)

    def get_tag_names(self) -> List[str]:
        if self.is_biluo:
            return (
                [f"B-{label}" for label in self.labels]
                + [f"I-{label}" for label in self.labels]
                + [f"L-{label}" for label in self.labels]
                + [f"U-{label}" for label in self.labels]
                + ["O"]
            )
        else:
            return (
                [f"B-{label}" for label in self.labels]
                + [f"I-{label}" for label in self.labels]
                + ["O"]
            )

    def predict(self, docs: List[Doc]) -> List[Floats2d]:
        scores = self.model.predict(docs)
        return scores

    def set_annotations(self, docs: List[Doc], scores: List[Floats2d]) -> None:
        """Set entities on a batch of documents from a batch of scores."""
        tag_names = self.get_tag_names()
        for i, doc in enumerate(docs):
            actions = to_numpy(scores[i].argmax(axis=1))
            tags = [tag_names[actions[j]] for j in range(len(doc))]
            if not self.is_biluo:
                tags = iob_to_biluo(tags)
            doc.ents = spans_from_biluo_tags(doc, tags)

    def update(
        self,
        examples: List[Example],
        *,
        set_annotations: bool = False,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        if losses is None:
            losses = {}
        losses.setdefault("ner", 0.0)
        validate_examples(examples, "SimpleNER.update")
        if not any(_has_ner(eg) for eg in examples):
            return losses
        docs = [eg.predicted for eg in examples]
        set_dropout_rate(self.model, drop)
        scores, bp_scores = self.model.begin_update(docs)
        loss, d_scores = self.get_loss(examples, scores)
        bp_scores(d_scores)
        if set_annotations:
            self.set_annotations(docs, scores)
        if sgd is not None:
            self.model.finish_update(sgd)
        losses["ner"] += loss
        return losses

    def get_loss(self, examples: List[Example], scores) -> Tuple[float, List[Floats2d]]:
        validate_examples(examples, "SimpleNER.get_loss")
        truths = []
        for eg in examples:
            tags = eg.get_aligned_ner()
            gold_tags = [(tag if tag != "-" else None) for tag in tags]
            if not self.is_biluo:
                gold_tags = biluo_to_iob(gold_tags)
            truths.append(gold_tags)
        for i in range(len(scores)):
            if len(scores[i]) != len(truths[i]):
                raise ValueError(
                    f"Mismatched output and gold sizes.\n"
                    f"Output: {len(scores[i])}, gold: {len(truths[i])}. "
                    f"Input: {len(examples[i].x)}"
                )
        d_scores, loss = self.loss_func(scores, truths)
        return loss, d_scores

    def begin_training(
        self,
        get_examples: Callable[[], Iterable[Example]],
        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
        sgd: Optional[Optimizer] = None,
    ):
        self._ensure_examples(get_examples)
        all_labels = set()
        for example in get_examples():
            all_labels.update(_get_labels(example))
        for label in sorted(all_labels):
            if label != "":
                self.add_label(label)
        doc_sample = []
        label_sample = []
        self._require_labels()
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_tags = example.get_aligned_ner()
            if not self.is_biluo:
                gold_tags = biluo_to_iob(gold_tags)
            gold_array = [
                [1.0 if tag == gold_tag else 0.0 for tag in self.get_tag_names()]
                for gold_tag in gold_tags
            ]
            label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
        if pipeline is not None:
            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
        self.loss_func = SequenceCategoricalCrossentropy(
            names=self.get_tag_names(), normalize=True, missing_value=None
        )
        return sgd

    def init_multitask_objectives(self, *args, **kwargs):
        pass

    def score(self, examples, **kwargs):
        validate_examples(examples, "SimpleNER.score")
        return Scorer.score_spans(examples, "ents", **kwargs)


def _has_ner(example: Example) -> bool:
    for ner_tag in example.get_aligned_ner():
        if ner_tag != "-" and ner_tag is not None:
            return True
    else:
        return False


def _get_labels(example: Example) -> Set[str]:
    labels = set()
    for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
        if ner_tag != "O" and ner_tag != "-" and ner_tag != "":
            labels.add(ner_tag)
    return labels
```
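The column order of the model's per-token scores is fixed by `get_tag_names` above. A quick standalone illustration for two labels, mirroring the list construction in the class:

```python
labels = ("LOC", "PERSON")
tag_names = (
    [f"B-{label}" for label in labels]
    + [f"I-{label}" for label in labels]
    + [f"L-{label}" for label in labels]
    + [f"U-{label}" for label in labels]
    + ["O"]
)
# Nine columns for two labels (4 * 2 + 1), matching get_num_actions:
assert tag_names == [
    "B-LOC", "B-PERSON", "I-LOC", "I-PERSON",
    "L-LOC", "L-PERSON", "U-LOC", "U-PERSON", "O",
]
```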
spacy/tests/pipeline/test_simple_ner.py (deleted file, @@ -1,106 +0,0 @@):

```python
import pytest
from spacy.lang.en import English
from spacy.training import Example
from spacy import util
from ..util import make_tempdir


TRAIN_DATA = [
    ("Who is Shaka S Khan?", {"entities": [(7, 19, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]


def test_no_label():
    nlp = English()
    nlp.add_pipe("simple_ner")
    with pytest.raises(ValueError):
        nlp.begin_training()


def test_implicit_label():
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    train_examples = []
    ner.add_label("ORG")
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    nlp.begin_training(get_examples=lambda: train_examples)


@pytest.mark.skip(reason="Should be fixed")
def test_untrained():
    # This shouldn't crash, but it does when the simple_ner produces an
    # invalid sequence like ['L-PERSON', 'L-ORG']
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    ner.add_label("PERSON")
    ner.add_label("LOC")
    ner.add_label("ORG")
    nlp.begin_training()
    nlp("Example sentence")


def test_resize():
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    ner.add_label("PERSON")
    ner.add_label("LOC")
    nlp.begin_training()
    assert len(ner.labels) == 2
    ner.add_label("ORG")
    nlp.begin_training()
    assert len(ner.labels) == 3


def test_begin_training_examples():
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.begin_training()
    nlp.begin_training(get_examples=lambda: train_examples)
    with pytest.raises(TypeError):
        nlp.begin_training(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.begin_training(get_examples=lambda: train_examples[0])
    with pytest.raises(ValueError):
        nlp.begin_training(get_examples=lambda: [])
    with pytest.raises(ValueError):
        nlp.begin_training(get_examples=train_examples)


def test_overfitting_IO():
    # Simple test to try and quickly overfit the SimpleNER component,
    # ensuring the ML models work correctly
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.begin_training(get_examples=lambda: train_examples)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.0001

    # test the trained model
    test_text = "I like London."
    doc = nlp(test_text)
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        ents2 = doc2.ents
        assert len(ents2) == 1
        assert ents2[0].text == "London"
        assert ents2[0].label_ == "LOC"
```
website/docs/api/architectures.md, @@ -456,62 +456,6 @@ consists of either two or three subnetworks:

(kept context, tail of the TransitionBasedParser table)

| `nO`        | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |

(removed sections)

### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}

> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.BILUOTagger.v1"
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```

Construct a simple NER tagger that predicts
[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid BILUO tag
sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled
spans into tags assigned to each token. The first token of a span is given the
tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens
within the span are given the tag `I-LABEL`. Single-token spans are given the
tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme
generally results in better linear separation between classes, especially for
non-CRF models, because there are more distinct classes for the different
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).

| Name        | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
| `tok2vec`   | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}

> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.IOBTagger.v1"
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```

Construct a simple NER tagger that predicts
[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid IOB tag
sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled
spans into tags assigned to each token. The first token of a span is given the
tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens
are assigned the tag O.

| Name        | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
| `tok2vec`   | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

(kept context)

## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}

### spacy.Tagger.v1 {#Tagger}
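For a quick worked encoding of the two schemes described above (a standalone illustration, not part of the docs being removed):

```python
# "I like London and Berlin ." with single-token LOC spans over "London"
# and "Berlin", under the two schemes:
tokens = ["I", "like", "London", "and", "Berlin", "."]
biluo = ["O", "O", "U-LOC", "O", "U-LOC", "O"]
iob = ["O", "O", "B-LOC", "O", "B-LOC", "O"]
# A multi-token span like "New York City" would be B-LOC I-LOC L-LOC in
# BILUO, but B-LOC I-LOC I-LOC in IOB.
```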