spaCy/spacy/pipeline/tagger.pyx
Adriane Boyd e962784531
Add Lemmatizer and simplify related components (#5848)
* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
`Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods

* Implement strict when lang is not found in lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add helper from `load_lemmatizer_lookups` that loads required and
optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Update docstrings for Lemmatizer.

Additionally modify `is_base_form` API to take `Token` instead of
individual features.

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
2020-08-07 15:27:13 +02:00

391 lines
14 KiB
Cython

# cython: infer_types=True, profile=True, binding=True
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
import warnings
from ..tokens.doc cimport Doc
from ..morphology cimport Morphology
from ..vocab cimport Vocab
from .pipe import Pipe, deserialize_config
from ..language import Language
from ..attrs import POS, ID
from ..parts_of_speech import X
from ..errors import Errors, TempErrors, Warnings
from ..scorer import Scorer
from .. import util
default_model_config = """
[model]
@architectures = "spacy.Tagger.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL},
scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model):
return Tagger(nlp.vocab, model, name)
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.
DOCS: https://spacy.io/api/tagger
"""
def __init__(self, vocab, model, name="tagger", *, labels=None):
"""Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
labels (List): The set of labels. Defaults to None.
set_morphology (bool): Whether to set morphological features.
DOCS: https://spacy.io/api/tagger#init
"""
self.vocab = vocab
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {"labels": labels or []}
self.cfg = dict(sorted(cfg.items()))
@property
def labels(self):
"""The labels currently added to the component. Note that even for a
blank component, this will always include the built-in coarse-grained
part-of-speech tags by default.
RETURNS (Tuple[str]): The labels.
DOCS: https://spacy.io/api/tagger#labels
"""
return tuple(self.cfg["labels"])
def __call__(self, doc):
"""Apply the pipe to a Doc.
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.
DOCS: https://spacy.io/api/tagger#call
"""
tags = self.predict([doc])
self.set_annotations([doc], tags)
return doc
def pipe(self, stream, *, batch_size=128):
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://spacy.io/api/tagger#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
tag_ids = self.predict(docs)
self.set_annotations(docs, tag_ids)
yield from docs
def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
RETURNS: The models prediction for each document.
DOCS: https://spacy.io/api/tagger#predict
"""
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.labels)
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
assert len(guesses) == len(docs)
return guesses
scores = self.model.predict(docs)
assert len(scores) == len(docs), (len(scores), len(docs))
guesses = self._scores2guesses(scores)
assert len(guesses) == len(docs)
return guesses
def _scores2guesses(self, scores):
guesses = []
for doc_scores in scores:
doc_guesses = doc_scores.argmax(axis=1)
if not isinstance(doc_guesses, numpy.ndarray):
doc_guesses = doc_guesses.get()
guesses.append(doc_guesses)
return guesses
def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
batch_tag_ids: The IDs to set, produced by Tagger.predict.
DOCS: https://spacy.io/api/tagger#set_annotations
"""
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef Vocab vocab = self.vocab
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags
if doc.c[j].tag == 0:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
doc.is_tagged = True
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
set_annotations (bool): Whether or not to update the Example objects
with the predictions.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/tagger#update
"""
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
try:
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return
except AttributeError:
types = set([type(eg) for eg in examples])
raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) from None
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update(
[eg.predicted for eg in examples])
for sc in tag_scores:
if self.model.ops.xp.isnan(sc.sum()):
raise ValueError("nan value in scores")
loss, d_tag_scores = self.get_loss(examples, tag_scores)
bp_tag_scores(d_tag_scores)
if sgd not in (None, False):
self.model.finish_update(sgd)
losses[self.name] += loss
if set_annotations:
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, self._scores2guesses(tag_scores))
return losses
def rehearse(self, examples, *, drop=0., sgd=None, losses=None):
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
teach the current model to make predictions similar to an initial model,
to try to address the "catastrophic forgetting" problem. This feature is
experimental.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/tagger#rehearse
"""
try:
docs = [eg.predicted for eg in examples]
except AttributeError:
types = set([type(eg) for eg in examples])
raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) from None
if self._rehearsal_model is None:
return
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
set_dropout_rate(self.model, drop)
guesses, backprop = self.model.begin_update(docs)
target = self._rehearsal_model(examples)
gradient = guesses - target
backprop(gradient)
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/tagger#get_loss
"""
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that
returns gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to
nlp.pipeline.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://spacy.io/api/tagger#begin_training
"""
tags = set()
for example in get_examples():
try:
y = example.y
except AttributeError:
raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
for token in y:
tags.add(token.tag_)
for tag in sorted(tags):
self.add_label(tag)
self.set_output(len(self.labels))
self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
def add_label(self, label):
"""Add a new label to the pipe.
label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.
DOCS: https://spacy.io/api/tagger#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
return 1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag", "pos" and "lemma".
DOCS: https://spacy.io/api/tagger#score
"""
return Scorer.score_token_attr(examples, "tag", **kwargs)
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
DOCS: https://spacy.io/api/tagger#to_bytes
"""
serialize = {}
serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the pipe from a bytestring.
bytes_data (bytes): The serialized pipe.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Tagger): The loaded Tagger.
DOCS: https://spacy.io/api/tagger#from_bytes
"""
def load_model(b):
try:
self.model.from_bytes(b)
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize = {
"vocab": lambda b: self.vocab.from_bytes(b),
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
"model": lambda b: load_model(b),
}
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, *, exclude=tuple()):
"""Serialize the pipe to disk.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/tagger#to_disk
"""
serialize = {
"vocab": lambda p: self.vocab.to_disk(p),
"model": lambda p: self.model.to_disk(p),
"cfg": lambda p: srsly.write_json(p, self.cfg),
}
util.to_disk(path, serialize, exclude)
def from_disk(self, path, *, exclude=tuple()):
"""Load the pipe from disk. Modifies the object in place and returns it.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Tagger): The modified Tagger object.
DOCS: https://spacy.io/api/tagger#from_disk
"""
def load_model(p):
with p.open("rb") as file_:
try:
self.model.from_bytes(file_.read())
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize = {
"vocab": lambda p: self.vocab.from_disk(p),
"cfg": lambda p: self.cfg.update(deserialize_config(p)),
"model": load_model,
}
util.from_disk(path, deserialize, exclude)
return self