From cb66ea740053e355e6303c75993fc44b5187a509 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 9 Sep 2020 16:11:27 +0200 Subject: [PATCH] Remove simple_ner code (#6041) * remove simple_ner code * remove unused _biluo and _iob files --- .../tok2vec-ner/multihashembed_tok2vec.cfg | 7 +- spacy/ml/_biluo.py | 105 --------- spacy/ml/_iob.py | 88 ------- spacy/ml/models/__init__.py | 1 - spacy/ml/models/simple_ner.py | 104 -------- spacy/pipeline/__init__.py | 2 - spacy/pipeline/simple_ner.py | 223 ------------------ spacy/tests/pipeline/test_simple_ner.py | 106 --------- website/docs/api/architectures.md | 56 ----- 9 files changed, 5 insertions(+), 687 deletions(-) delete mode 100644 spacy/ml/_biluo.py delete mode 100644 spacy/ml/_iob.py delete mode 100644 spacy/ml/models/simple_ner.py delete mode 100644 spacy/pipeline/simple_ner.py delete mode 100644 spacy/tests/pipeline/test_simple_ner.py diff --git a/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index a5fa32b18..e2ab148c6 100644 --- a/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -31,10 +31,13 @@ lang = "en" vectors = null [nlp.pipeline.ner] -factory = "simple_ner" +factory = "ner" [nlp.pipeline.ner.model] -@architectures = "spacy.BiluoTagger.v1" +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 6 +hidden_width = 64 +maxout_pieces = 2 [nlp.pipeline.ner.model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py deleted file mode 100644 index 5a66a35bd..000000000 --- a/spacy/ml/_biluo.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import Dict, Optional -import numpy -from thinc.api import Model -from thinc.types import Padded, Floats3d - - -def BILUO() -> Model[Padded, Padded]: - return Model( - "biluo", - forward, - init=init, - dims={"nO": None}, - attrs={"get_num_actions": get_num_actions}, - ) - - -def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): - if X is not None and Y is not None: - if X.data.shape != Y.data.shape: - # TODO: Fix error - raise ValueError("Mismatched shapes (TODO: Fix message)") - model.set_dim("nO", X.data.shape[2]) - elif X is not None: - model.set_dim("nO", X.data.shape[2]) - elif Y is not None: - model.set_dim("nO", Y.data.shape[2]) - elif model.get_dim("nO") is None: - raise ValueError("Dimension unset for BILUO: nO") - - -def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): - n_labels = (model.get_dim("nO") - 1) // 4 - n_tokens, n_docs, n_actions = Xp.data.shape - # At each timestep, we make a validity mask of shape (n_docs, n_actions) - # to indicate which actions are valid next for each sequence. To construct - # the mask, we have a state of shape (2, n_actions) and a validity table of - # shape (2, n_actions+1, n_actions). The first dimension of the state indicates - # whether it's the last token, the second dimension indicates the previous - # action, plus a special 'null action' for the first entry. - valid_transitions = model.ops.asarray(_get_transition_table(n_labels)) - prev_actions = model.ops.alloc1i(n_docs) - # Initialize as though prev action was O - prev_actions.fill(n_actions - 1) - Y = model.ops.alloc3f(*Xp.data.shape) - masks = model.ops.alloc3f(*Y.shape) - max_value = Xp.data.max() - for t in range(Xp.data.shape[0]): - is_last = (Xp.lengths < (t + 2)).astype("i") - masks[t] = valid_transitions[is_last, prev_actions] - # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t] :] = 0 - # Valid actions get 0*10e8, invalid get large negative value - Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10) - prev_actions = Y[t].argmax(axis=-1) - - def backprop_biluo(dY: Padded) -> Padded: - dY.data *= masks - return dY - - return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo - - -def get_num_actions(n_labels: int) -> int: - # One BEGIN action per label - # One IN action per label - # One LAST action per label - # One UNIT action per label - # One OUT action - return n_labels + n_labels + n_labels + n_labels + 1 - - -def _get_transition_table( - n_labels: int, *, _cache: Dict[int, Floats3d] = {} -) -> Floats3d: - n_actions = get_num_actions(n_labels) - if n_actions in _cache: - return _cache[n_actions] - table = numpy.zeros((2, n_actions, n_actions), dtype="f") - B_start, B_end = (0, n_labels) - I_start, I_end = (B_end, B_end + n_labels) - L_start, L_end = (I_end, I_end + n_labels) - U_start, _ = (L_end, L_end + n_labels) # noqa: F841 - # Using ranges allows us to set specific cells, which is necessary to express - # that only actions of the same label are valid continuations. - B_range = numpy.arange(B_start, B_end) - I_range = numpy.arange(I_start, I_end) - L_range = numpy.arange(L_start, L_end) - # If this is the last token and the previous action was B or I, only L - # of that label is valid - table[1, B_range, L_range] = 1 - table[1, I_range, L_range] = 1 - # If this isn't the last token and the previous action was B or I, only I or - # L of that label are valid. - table[0, B_range, I_range] = 1 - table[0, B_range, L_range] = 1 - table[0, I_range, I_range] = 1 - table[0, I_range, L_range] = 1 - # If this isn't the last token and the previous was L, U or O, B is valid - table[0, L_start:, :B_end] = 1 - # Regardless of whether this is the last token, if the previous action was - # {L, U, O}, U and O are valid. - table[:, L_start:, U_start:] = 1 - _cache[n_actions] = table - return table diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py deleted file mode 100644 index 2e6b2ffab..000000000 --- a/spacy/ml/_iob.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import Dict, Optional -from thinc.api import Ops, Model -from thinc.types import Padded, Floats3d - - -def IOB() -> Model[Padded, Padded]: - return Model( - "biluo", - forward, - init=init, - dims={"nO": None}, - attrs={"get_num_actions": get_num_actions}, - ) - - -def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None: - if X is not None and Y is not None: - if X.data.shape != Y.data.shape: - # TODO: Fix error - raise ValueError("Mismatched shapes (TODO: Fix message)") - model.set_dim("nO", X.data.shape[2]) - elif X is not None: - model.set_dim("nO", X.data.shape[2]) - elif Y is not None: - model.set_dim("nO", Y.data.shape[2]) - elif model.get_dim("nO") is None: - raise ValueError("Dimension unset for BILUO: nO") - - -def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): - n_labels = (model.get_dim("nO") - 1) // 2 - n_tokens, n_docs, n_actions = Xp.data.shape - # At each timestep, we make a validity mask of shape (n_docs, n_actions) - # to indicate which actions are valid next for each sequence. To construct - # the mask, we have a state of shape (2, n_actions) and a validity table of - # shape (2, n_actions+1, n_actions). The first dimension of the state indicates - # whether it's the last token, the second dimension indicates the previous - # action, plus a special 'null action' for the first entry. - valid_transitions = _get_transition_table(model.ops, n_labels) - prev_actions = model.ops.alloc1i(n_docs) - # Initialize as though prev action was O - prev_actions.fill(n_actions - 1) - Y = model.ops.alloc3f(*Xp.data.shape) - masks = model.ops.alloc3f(*Y.shape) - for t in range(Xp.data.shape[0]): - masks[t] = valid_transitions[prev_actions] - # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t] :] = 0 - # Valid actions get 0*10e8, invalid get -1*10e8 - Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8) - prev_actions = Y[t].argmax(axis=-1) - - def backprop_biluo(dY: Padded) -> Padded: - # Masking the gradient seems to do poorly here. But why? - # dY.data *= masks - return dY - - return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo - - -def get_num_actions(n_labels: int) -> int: - # One BEGIN action per label - # One IN action per label - # One OUT action - return n_labels * 2 + 1 - - -def _get_transition_table( - ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {} -) -> Floats3d: - n_actions = get_num_actions(n_labels) - if n_actions in _cache: - return ops.asarray(_cache[n_actions]) - table = ops.alloc2f(n_actions, n_actions) - B_start, B_end = (0, n_labels) - I_start, I_end = (B_end, B_end + n_labels) - O_action = I_end - B_range = ops.xp.arange(B_start, B_end) - I_range = ops.xp.arange(I_start, I_end) - # B and O are always valid - table[:, B_start:B_end] = 1 - table[:, O_action] = 1 - # I can only follow a matching B - table[B_range, I_range] = 1 - - _cache[n_actions] = table - return table diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index dd58dab00..67e70421f 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,6 +1,5 @@ from .entity_linker import * # noqa from .parser import * # noqa -from .simple_ner import * # noqa from .tagger import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py deleted file mode 100644 index b47e7f349..000000000 --- a/spacy/ml/models/simple_ner.py +++ /dev/null @@ -1,104 +0,0 @@ -from typing import List -from thinc.api import Model, Linear, with_array, softmax_activation, padded2list -from thinc.api import chain, list2padded, configure_normal_init -from thinc.api import Dropout -from thinc.types import Floats2d - -from ...tokens import Doc -from .._biluo import BILUO -from .._iob import IOB -from ...util import registry - - -@registry.architectures.register("spacy.BILUOTagger.v1") -def BiluoTagger( - tok2vec: Model[List[Doc], List[Floats2d]] -) -> Model[List[Doc], List[Floats2d]]: - """Construct a simple NER tagger, that predicts BILUO tag scores for each - token and uses greedy decoding with transition-constraints to return a valid - BILUO tag sequence. - - A BILUO tag sequence encodes a sequence of non-overlapping labelled spans - into tags assigned to each token. The first token of a span is given the - tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens - within the span are given the tag I-LABEL. Single-token spans are given - the tag U-LABEL. All other tokens are assigned the tag O. - - The BILUO tag scheme generally results in better linear separation between - classes, especially for non-CRF models, because there are more distinct classes - for the different situations (Ratinov et al., 2009). - """ - biluo = BILUO() - linear = Linear( - nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02) - ) - model = chain( - tok2vec, - list2padded(), - with_array(chain(Dropout(0.1), linear)), - biluo, - with_array(softmax_activation()), - padded2list(), - ) - return Model( - "biluo-tagger", - forward, - init=init, - layers=[model, linear], - refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, - dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, - ) - - -@registry.architectures.register("spacy.IOBTagger.v1") -def IOBTagger( - tok2vec: Model[List[Doc], List[Floats2d]] -) -> Model[List[Doc], List[Floats2d]]: - """Construct a simple NER tagger, that predicts IOB tag scores for each - token and uses greedy decoding with transition-constraints to return a valid - IOB tag sequence. - - An IOB tag sequence encodes a sequence of non-overlapping labelled spans - into tags assigned to each token. The first token of a span is given the - tag B-LABEL, and subsequent tokens are given the tag I-LABEL. - All other tokens are assigned the tag O. - """ - biluo = IOB() - linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) - model = chain( - tok2vec, - list2padded(), - with_array(linear), - biluo, - with_array(softmax_activation()), - padded2list(), - ) - return Model( - "iob-tagger", - forward, - init=init, - layers=[model], - refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, - dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, - ) - - -def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: - if model.has_dim("nO") is None and Y: - model.set_dim("nO", Y[0].shape[1]) - nO = model.get_dim("nO") - biluo = model.get_ref("biluo") - linear = model.get_ref("linear") - biluo.set_dim("nO", nO) - if linear.has_dim("nO") is None: - linear.set_dim("nO", nO) - model.layers[0].initialize(X=X, Y=Y) - - -def forward(model: Model, X: List[Doc], is_train: bool): - return model.layers[0](X, is_train) - - -__all__ = ["BiluoTagger"] diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 793aa83c3..656182088 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -8,7 +8,6 @@ from .morphologizer import Morphologizer from .pipe import Pipe from .senter import SentenceRecognizer from .sentencizer import Sentencizer -from .simple_ner import SimpleNER from .tagger import Tagger from .textcat import TextCategorizer from .tok2vec import Tok2Vec @@ -25,7 +24,6 @@ __all__ = [ "Pipe", "SentenceRecognizer", "Sentencizer", - "SimpleNER", "Tagger", "TextCategorizer", "Tok2Vec", diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py deleted file mode 100644 index 951d89931..000000000 --- a/spacy/pipeline/simple_ner.py +++ /dev/null @@ -1,223 +0,0 @@ -from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set -from thinc.types import Floats2d -from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model -from thinc.api import Optimizer, Config -from thinc.util import to_numpy -from itertools import islice - -from ..errors import Errors -from ..training import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob -from ..training import validate_examples -from ..tokens import Doc -from ..language import Language -from ..vocab import Vocab -from ..scorer import Scorer -from .pipe import Pipe - - -default_model_config = """ -[model] -@architectures = "spacy.BILUOTagger.v1" - -[model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = null -width = 128 -depth = 4 -embed_size = 7000 -window_size = 1 -maxout_pieces = 3 -subword_features = true -""" -DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"] - - -@Language.factory( - "simple_ner", - assigns=["doc.ents"], - default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL}, - scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, -) -def make_simple_ner( - nlp: Language, name: str, model: Model, labels: Iterable[str] -) -> "SimpleNER": - return SimpleNER(nlp.vocab, model, name, labels=labels) - - -class SimpleNER(Pipe): - """Named entity recognition with a tagging model. The model should include - validity constraints to ensure that only valid tag sequences are returned.""" - - def __init__( - self, - vocab: Vocab, - model: Model, - name: str = "simple_ner", - *, - labels: Iterable[str], - ) -> None: - self.vocab = vocab - self.model = model - self.name = name - self.cfg = {"labels": []} - for label in labels: - self.add_label(label) - self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), normalize=True, missing_value=None - ) - assert self.model is not None - - @property - def is_biluo(self) -> bool: - return self.model.name.startswith("biluo") - - @property - def labels(self) -> Tuple[str]: - return tuple(self.cfg["labels"]) - - def add_label(self, label: str) -> None: - """Add a new label to the pipe. - label (str): The label to add. - DOCS: https://nightly.spacy.io/api/simplener#add_label - """ - if not isinstance(label, str): - raise ValueError(Errors.E187) - if label not in self.labels: - self.cfg["labels"].append(label) - self.vocab.strings.add(label) - - def get_tag_names(self) -> List[str]: - if self.is_biluo: - return ( - [f"B-{label}" for label in self.labels] - + [f"I-{label}" for label in self.labels] - + [f"L-{label}" for label in self.labels] - + [f"U-{label}" for label in self.labels] - + ["O"] - ) - else: - return ( - [f"B-{label}" for label in self.labels] - + [f"I-{label}" for label in self.labels] - + ["O"] - ) - - def predict(self, docs: List[Doc]) -> List[Floats2d]: - scores = self.model.predict(docs) - return scores - - def set_annotations(self, docs: List[Doc], scores: List[Floats2d]) -> None: - """Set entities on a batch of documents from a batch of scores.""" - tag_names = self.get_tag_names() - for i, doc in enumerate(docs): - actions = to_numpy(scores[i].argmax(axis=1)) - tags = [tag_names[actions[j]] for j in range(len(doc))] - if not self.is_biluo: - tags = iob_to_biluo(tags) - doc.ents = spans_from_biluo_tags(doc, tags) - - def update( - self, - examples: List[Example], - *, - set_annotations: bool = False, - drop: float = 0.0, - sgd: Optional[Optimizer] = None, - losses: Optional[Dict[str, float]] = None, - ) -> Dict[str, float]: - if losses is None: - losses = {} - losses.setdefault("ner", 0.0) - validate_examples(examples, "SimpleNER.update") - if not any(_has_ner(eg) for eg in examples): - return losses - docs = [eg.predicted for eg in examples] - set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update(docs) - loss, d_scores = self.get_loss(examples, scores) - bp_scores(d_scores) - if set_annotations: - self.set_annotations(docs, scores) - if sgd is not None: - self.model.finish_update(sgd) - losses["ner"] += loss - return losses - - def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]: - validate_examples(examples, "SimpleNER.get_loss") - truths = [] - for eg in examples: - tags = eg.get_aligned_ner() - gold_tags = [(tag if tag != "-" else None) for tag in tags] - if not self.is_biluo: - gold_tags = biluo_to_iob(gold_tags) - truths.append(gold_tags) - for i in range(len(scores)): - if len(scores[i]) != len(truths[i]): - raise ValueError( - f"Mismatched output and gold sizes.\n" - f"Output: {len(scores[i])}, gold: {len(truths[i])}." - f"Input: {len(examples[i].doc)}" - ) - d_scores, loss = self.loss_func(scores, truths) - return loss, d_scores - - def begin_training( - self, - get_examples: Callable[[], Iterable[Example]], - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, - sgd: Optional[Optimizer] = None, - ): - self._ensure_examples(get_examples) - all_labels = set() - for example in get_examples(): - all_labels.update(_get_labels(example)) - for label in sorted(all_labels): - if label != "": - self.add_label(label) - doc_sample = [] - label_sample = [] - self._require_labels() - for example in islice(get_examples(), 10): - doc_sample.append(example.x) - gold_tags = example.get_aligned_ner() - if not self.is_biluo: - gold_tags = biluo_to_iob(gold_tags) - gold_array = [ - [1.0 if tag == gold_tag else 0.0 for tag in self.get_tag_names()] - for gold_tag in gold_tags - ] - label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) - assert len(doc_sample) > 0, Errors.E923.format(name=self.name) - assert len(label_sample) > 0, Errors.E923.format(name=self.name) - self.model.initialize(X=doc_sample, Y=label_sample) - if pipeline is not None: - self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) - self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), normalize=True, missing_value=None - ) - return sgd - - def init_multitask_objectives(self, *args, **kwargs): - pass - - def score(self, examples, **kwargs): - validate_examples(examples, "SimpleNER.score") - return Scorer.score_spans(examples, "ents", **kwargs) - - -def _has_ner(example: Example) -> bool: - for ner_tag in example.get_aligned_ner(): - if ner_tag != "-" and ner_tag is not None: - return True - else: - return False - - -def _get_labels(example: Example) -> Set[str]: - labels = set() - for ner_tag in example.get_aligned("ENT_TYPE", as_string=True): - if ner_tag != "O" and ner_tag != "-" and ner_tag != "": - labels.add(ner_tag) - return labels diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py deleted file mode 100644 index 940743ce0..000000000 --- a/spacy/tests/pipeline/test_simple_ner.py +++ /dev/null @@ -1,106 +0,0 @@ -import pytest -from spacy.lang.en import English -from spacy.training import Example -from spacy import util -from ..util import make_tempdir - - -TRAIN_DATA = [ - ("Who is Shaka S Khan?", {"entities": [(7, 19, "PERSON")]}), - ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), -] - - -def test_no_label(): - nlp = English() - nlp.add_pipe("simple_ner") - with pytest.raises(ValueError): - nlp.begin_training() - - -def test_implicit_label(): - nlp = English() - ner = nlp.add_pipe("simple_ner") - train_examples = [] - ner.add_label("ORG") - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) - - -@pytest.mark.skip(reason="Should be fixed") -def test_untrained(): - # This shouldn't crash, but it does when the simple_ner produces an invalid sequence like ['L-PERSON', 'L-ORG'] - nlp = English() - ner = nlp.add_pipe("simple_ner") - ner.add_label("PERSON") - ner.add_label("LOC") - ner.add_label("ORG") - nlp.begin_training() - nlp("Example sentence") - - -def test_resize(): - nlp = English() - ner = nlp.add_pipe("simple_ner") - ner.add_label("PERSON") - ner.add_label("LOC") - nlp.begin_training() - assert len(ner.labels) == 2 - ner.add_label("ORG") - nlp.begin_training() - assert len(ner.labels) == 3 - - -def test_begin_training_examples(): - nlp = English() - ner = nlp.add_pipe("simple_ner") - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for ent in annotations.get("entities"): - ner.add_label(ent[2]) - # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) - with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) - with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: train_examples[0]) - with pytest.raises(ValueError): - nlp.begin_training(get_examples=lambda: []) - with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) - - -def test_overfitting_IO(): - # Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly - nlp = English() - ner = nlp.add_pipe("simple_ner") - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) - - for i in range(50): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - assert losses["ner"] < 0.0001 - - # test the trained model - test_text = "I like London." - doc = nlp(test_text) - ents = doc.ents - assert len(ents) == 1 - assert ents[0].text == "London" - assert ents[0].label_ == "LOC" - - # Also test the results are still the same after IO - with make_tempdir() as tmp_dir: - nlp.to_disk(tmp_dir) - nlp2 = util.load_model_from_path(tmp_dir) - doc2 = nlp2(test_text) - ents2 = doc2.ents - assert len(ents2) == 1 - assert ents2[0].text == "London" - assert ents2[0].label_ == "LOC" diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index ee844d961..8767c93dd 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -456,62 +456,6 @@ consists of either two or three subnetworks: | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | | **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | -### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"} - -> #### Example Config -> -> ```ini -> [model] -> @architectures = "spacy.BILUOTagger.v1 " -> -> [model.tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" -> # etc. -> ``` - -Construct a simple NER tagger that predicts -[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and -uses greedy decoding with transition-constraints to return a valid BILUO tag -sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled -spans into tags assigned to each token. The first token of a span is given the -tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens -within the span are given the tag `U-LABEL`. Single-token spans are given the -tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme -generally results in better linear separation between classes, especially for -non-CRF models, because there are more distinct classes for the different -situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)). - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------ | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | - -### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"} - -> #### Example Config -> -> ```ini -> [model] -> @architectures = "spacy.IOBTagger.v1 " -> -> [model.tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" -> # etc. -> ``` - -Construct a simple NER tagger, that predicts -[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and -uses greedy decoding with transition-constraints to return a valid IOB tag -sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled -spans into tags assigned to each token. The first token of a span is given the -tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens -are assigned the tag O. - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------ | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | - ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} ### spacy.Tagger.v1 {#Tagger}