Update morphologizer (#5108)

* Add pos and morph scoring to Scorer

Add `pos`, `morphs`, and `morphs_per_type` scores to `Scorer`. Report pos
and morph accuracy in `spacy evaluate`.
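As a rough illustration, the new scores can be read off a scored example like this (a minimal sketch condensed from the `tagged_doc` fixture and `test_tag_score` test added in this PR):

from spacy.lang.en import English
from spacy.gold import GoldParse
from spacy.scorer import Scorer

nlp = English()
doc = nlp("She was blue")
pos = ["PRON", "VERB", "ADJ"]
morphs = ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"]
# Annotate the doc by hand, the same way the PR's test fixture does.
for i, token in enumerate(doc):
    token.pos_ = pos[i]
    token.morph_ = morphs[i]
doc.is_tagged = True

gold = GoldParse(doc, pos=pos, morphs=morphs)
scorer = Scorer()
scorer.score((doc, gold))
print(scorer.scores["pos_acc"])          # 100.0
print(scorer.scores["morphs_acc"])       # 100.0
print(scorer.scores["morphs_per_type"])  # per-feature P/R/F, e.g. {"PronType": {...}}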

* Update morphologizer for v3

* switch to tagger-based morphologizer
* use `spacy.HashCharEmbedCNN` for morphologizer defaults
* add `Doc.is_morphed` flag (see the sketch below)
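The bullets above boil down to the following tagger-style workflow (a minimal sketch condensed from the example script and pipeline test added in this PR):

from spacy.lang.en import English

nlp = English()
morphologizer = nlp.create_pipe("morphologizer")
# Each label is a full feature string; the coarse tag rides along as a POS= feature.
for label in ["Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]:
    morphologizer.add_label(label)
nlp.add_pipe(morphologizer)
optimizer = nlp.begin_training()
# ... train with nlp.update(batch, sgd=optimizer, losses=losses) ...
doc = nlp("Eat blue ham")
print([(t.text, t.morph_, t.pos_) for t in doc])  # the morphologizer sets Token.pos too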

* Add morphologizer to train CLI
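With the defaults wired up, an invocation along the lines of `python -m spacy train en ./output train.json dev.json --pipeline morphologizer` (assuming the CLI's existing `--pipeline` option) should pick up `morphologizer_defaults.cfg` and report the Morph Loss, Morph % and POS % columns during training.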

* Add basic morphologizer pipeline tests

* Add simple morphologizer training example

* Remove subword_features from CharEmbed models

Remove `subword_features` argument from `spacy.HashCharEmbedCNN.v1` and
`spacy.HashCharEmbedBiLSTM.v1` since in these cases `subword_features`
is always `False`.
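For reference, resolving the updated CNN variant now looks roughly like this (a hedged sketch: the numeric values are illustrative placeholders, and registry access via `spacy.util.registry` is an assumption about the surrounding API):

from spacy.util import registry  # assumed home of the architectures registry

make_tok2vec = registry.architectures.get("spacy.HashCharEmbedCNN.v1")
# subword_features is gone from the signature; these architectures always
# embed characters, so it is fixed to False internally.
tok2vec = make_tok2vec(
    pretrained_vectors=None,
    width=128,  # illustrative, not a shipped default
    depth=4,
    embed_size=7000,
    maxout_pieces=3,
    window_size=1,
    nM=64,
    nC=8,
)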

* Rename setting in morphologizer example

Use `with_pos_tags` instead of `without_pos_tags`.

* Fix kwargs for spacy.HashCharEmbedBiLSTM.v1

* Remove defaults for spacy.HashCharEmbedBiLSTM.v1

Remove default `nM/nC` for `spacy.HashCharEmbedBiLSTM.v1`.

* Set random seed for textcat overfitting test
adrianeboyd 2020-04-02 14:46:32 +02:00 committed by GitHub
parent ab59f3124e
commit b71a11ff6d
11 changed files with 458 additions and 128 deletions

View File

@@ -0,0 +1,133 @@
#!/usr/bin/env python
# coding: utf8
"""
A simple example for training a morphologizer. For more details, see
the documentation:
* Training: https://spacy.io/usage/training

Compatible with: spaCy v3.0.0+
Last tested with: v3.0.0
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.morphology import Morphology


# Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's
# tokenization. If not, you can always add a 'words' key to the annotations
# that specifies the gold-standard tokenization, e.g.:
# ("Eat blue ham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [
    (
        "I like green eggs",
        {
            "morphs": [
                "PronType=Prs|Person=1",
                "VerbForm=Fin",
                "Degree=Pos",
                "Number=Plur",
            ],
            "pos": ["PRON", "VERB", "ADJ", "NOUN"],
        },
    ),
    (
        "Eat blue ham",
        {
            "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"],
            "pos": ["VERB", "ADJ", "NOUN"],
        },
    ),
    (
        "She was blue",
        {
            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"],
            "pos": ["PRON", "VERB", "ADJ"],
        },
    ),
    (
        "He was blue today",
        {
            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""],
            "pos": ["PRON", "VERB", "ADJ", "ADV"],
        },
    ),
]

# The POS tags are optional, set `with_pos_tags = False` to omit them for
# this example:
with_pos_tags = True

if not with_pos_tags:
    for i in range(len(TRAIN_DATA)):
        del TRAIN_DATA[i][1]["pos"]


@plac.annotations(
    lang=("ISO Code of language to use", "option", "l", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(lang="en", output_dir=None, n_iter=25):
    """Create a new model, set up the pipeline and train the morphologizer."""
    nlp = spacy.blank(lang)
    # add the morphologizer to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    morphologizer = nlp.create_pipe("morphologizer")
    nlp.add_pipe(morphologizer)

    # add labels
    for _, annotations in TRAIN_DATA:
        morph_labels = annotations.get("morphs")
        pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs")))
        assert len(morph_labels) == len(pos_labels)
        for morph, pos in zip(morph_labels, pos_labels):
            morph_dict = Morphology.feats_to_dict(morph)
            if pos:
                morph_dict["POS"] = pos
            morph = Morphology.dict_to_feats(morph_dict)
            morphologizer.add_label(morph)

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
        print("Losses", losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print("Morphs", [(t.text, t.morph) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Morphs", [(t.text, t.morph) for t in doc])


if __name__ == "__main__":
    plac.call(main)

    # Expected output:
    # Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)]

View File

@@ -43,7 +43,9 @@ def evaluate(
         "Words": nwords,
         "Words/s": f"{nwords / (end - begin):.0f}",
         "TOK": f"{scorer.token_acc:.2f}",
-        "POS": f"{scorer.tags_acc:.2f}",
+        "TAG": f"{scorer.tags_acc:.2f}",
+        "POS": f"{scorer.pos_acc:.2f}",
+        "MORPH": f"{scorer.morphs_acc:.2f}",
         "UAS": f"{scorer.uas:.2f}",
         "LAS": f"{scorer.las:.2f}",
         "NER P": f"{scorer.ents_p:.2f}",

View File

@@ -221,6 +221,8 @@ def train(
                 config_loc = default_dir / "parser_defaults.cfg"
             elif pipe == "tagger":
                 config_loc = default_dir / "tagger_defaults.cfg"
+            elif pipe == "morphologizer":
+                config_loc = default_dir / "morphologizer_defaults.cfg"
             elif pipe == "ner":
                 config_loc = default_dir / "ner_defaults.cfg"
             elif pipe == "textcat":

@@ -590,6 +592,8 @@ def _score_for_model(meta):
     acc = meta["accuracy"]
     if "tagger" in pipes:
         mean_acc.append(acc["tags_acc"])
+    if "morphologizer" in pipes:
+        mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2)
     if "parser" in pipes:
         mean_acc.append((acc["uas"] + acc["las"]) / 2)
     if "ner" in pipes:

@@ -672,13 +676,15 @@ def _find_best(experiment_dir, component):
 def _get_metrics(component):
     if component == "parser":
-        return ("las", "uas", "las_per_type", "token_acc", "sent_f")
+        return ("las", "uas", "las_per_type", "sent_f", "token_acc")
     elif component == "tagger":
         return ("tags_acc", "token_acc")
+    elif component == "morphologizer":
+        return ("morphs_acc", "pos_acc", "token_acc")
     elif component == "ner":
         return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc")
     elif component == "senter":
-        return ("sent_f", "sent_p", "sent_r")
+        return ("sent_f", "sent_p", "sent_r", "token_acc")
     elif component == "textcat":
         return ("textcat_score", "token_acc")
     return ("token_acc",)

@@ -691,6 +697,9 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
         if pipe == "tagger":
             row_head.extend(["Tag Loss ", " Tag % "])
             output_stats.extend(["tag_loss", "tags_acc"])
+        elif pipe == "morphologizer" or pipe == "morphologizertagger":
+            row_head.extend(["Morph Loss ", " Morph % ", " POS % "])
+            output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"])
         elif pipe == "parser":
             row_head.extend(
                 ["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"]

@@ -731,6 +740,7 @@ def _get_progress(
     scores["dep_loss"] = losses.get("parser", 0.0)
     scores["ner_loss"] = losses.get("ner", 0.0)
     scores["tag_loss"] = losses.get("tagger", 0.0)
+    scores["morph_loss"] = losses.get("morphologizer", 0.0)
     scores["textcat_loss"] = losses.get("textcat", 0.0)
     scores["senter_loss"] = losses.get("senter", 0.0)
     scores["cpu_wps"] = cpu_wps

View File

@@ -9,6 +9,5 @@ depth = 4
 embed_size = 7000
 window_size = 1
 maxout_pieces = 3
-subword_features = true
 nM = 64
 nC = 8

View File

@@ -74,7 +74,6 @@ def hash_charembed_cnn(
     embed_size,
     maxout_pieces,
     window_size,
-    subword_features,
     nM,
     nC,
 ):

@@ -87,7 +86,7 @@ def hash_charembed_cnn(
         bilstm_depth=0,
         maxout_pieces=maxout_pieces,
         window_size=window_size,
-        subword_features=subword_features,
+        subword_features=False,
         char_embed=True,
         nM=nM,
         nC=nC,

@@ -116,7 +115,7 @@ def hash_embed_bilstm_v1(
 @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
 def hash_char_embed_bilstm_v1(
-    pretrained_vectors, width, depth, embed_size, subword_features, nM, nC, maxout_pieces
+    pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC
 ):
     # Allows using character embeddings by setting nC, nM and char_embed=True
     return build_Tok2Vec_model(

@@ -127,7 +126,7 @@ def hash_char_embed_bilstm_v1(
         conv_depth=0,
         maxout_pieces=maxout_pieces,
         window_size=1,
-        subword_features=subword_features,
+        subword_features=False,
         char_embed=True,
         nM=nM,
         nC=nC,

View File

@@ -1,166 +1,169 @@
+# cython: infer_types=True, profile=True
 cimport numpy as np

 import numpy
-from collections import defaultdict
-from thinc.api import chain, list2array, to_categorical, get_array_module
-from thinc.util import copy_array
+import srsly
+from thinc.api import to_categorical

 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 from ..morphology cimport Morphology
+from ..parts_of_speech import IDS as POS_IDS
+from ..symbols import POS

 from .. import util
 from ..language import component
 from ..util import link_vectors_to_models, create_default_optimizer
 from ..errors import Errors, TempErrors
-from .pipes import Pipe
+from .pipes import Tagger, _load_cfg
+from .. import util


 @component("morphologizer", assigns=["token.morph", "token.pos"])
-class Morphologizer(Pipe):
+class Morphologizer(Tagger):

     def __init__(self, vocab, model, **cfg):
         self.vocab = vocab
         self.model = model
+        self._rehearsal_model = None
         self.cfg = dict(sorted(cfg.items()))
-        self._class_map = self.vocab.morphology.create_class_map()  # Morphology.create_class_map() ?
+        self.cfg.setdefault("labels", {})
+        self.cfg.setdefault("morph_pos", {})

     @property
     def labels(self):
-        return self.vocab.morphology.tag_names
+        return tuple(self.cfg["labels"].keys())

-    @property
-    def tok2vec(self):
-        if self.model in (None, True, False):
-            return None
-        else:
-            return chain(self.model.get_ref("tok2vec"), list2array())
-
-    def __call__(self, doc):
-        features, tokvecs = self.predict([doc])
-        self.set_annotations([doc], features, tensors=tokvecs)
-        return doc
-
-    def pipe(self, stream, batch_size=128, n_threads=-1):
-        for docs in util.minibatch(stream, size=batch_size):
-            docs = list(docs)
-            features, tokvecs = self.predict(docs)
-            self.set_annotations(docs, features, tensors=tokvecs)
-            yield from docs
+    def add_label(self, label):
+        if not isinstance(label, str):
+            raise ValueError(Errors.E187)
+        if label in self.labels:
+            return 0
+        morph = Morphology.feats_to_dict(label)
+        norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
+        pos = morph.get("POS", "")
+        if norm_morph_pos not in self.cfg["labels"]:
+            self.cfg["labels"][norm_morph_pos] = norm_morph_pos
+            self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+        return 1

     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
+        for example in get_examples():
+            for i, morph in enumerate(example.token_annotation.morphs):
+                pos = example.token_annotation.get_pos(i)
+                morph = Morphology.feats_to_dict(morph)
+                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
+                if pos:
+                    morph["POS"] = pos
+                norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
+                if norm_morph_pos not in self.cfg["labels"]:
+                    self.cfg["labels"][norm_morph_pos] = norm_morph
+                    self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
         self.set_output(len(self.labels))
         self.model.initialize()
         link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd

-    def predict(self, docs):
-        if not any(len(doc) for doc in docs):
-            # Handle case where there are no tokens in any docs.
-            n_labels = self.model.get_dim("nO")
-            guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
-            tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO")))
-            return guesses, tokvecs
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        scores = self.model.get_ref("softmax")(tokvecs)
-        return scores, tokvecs
-
-    def set_annotations(self, docs, batch_scores, tensors=None):
+    def set_annotations(self, docs, batch_tag_ids):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef Vocab vocab = self.vocab
-        offsets = [self._class_map.get_field_offset(field)
-                   for field in self._class_map.fields]
         for i, doc in enumerate(docs):
-            doc_scores = batch_scores[i]
-            doc_guesses = scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"])
-            # Convert the neuron indices into feature IDs.
-            doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
-            for j in range(len(doc)):
-                for k, offset in enumerate(offsets):
-                    if doc_guesses[j, k] == 0:
-                        doc_feat_ids[j, k] = 0
-                    else:
-                        doc_feat_ids[j, k] = offset + doc_guesses[j, k]
-                # Get the set of feature names.
-                feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
-                if "NIL" in feats:
-                    feats.remove("NIL")
-                # Now add the analysis, and set the hash.
-                doc.c[j].morph = self.vocab.morphology.add(feats)
-                if doc[j].morph.pos != 0:
-                    doc.c[j].pos = doc[j].morph.pos
-
-    def update(self, examples, drop=0., sgd=None, losses=None):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
-        docs = [self._get_doc(ex) for ex in examples]
-        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
-        loss, d_tag_scores = self.get_loss(examples, tag_scores)
-        bp_tag_scores(d_tag_scores, sgd=sgd)
-        if losses is not None:
-            losses[self.name] += loss
+            doc_tag_ids = batch_tag_ids[i]
+            if hasattr(doc_tag_ids, "get"):
+                doc_tag_ids = doc_tag_ids.get()
+            for j, tag_id in enumerate(doc_tag_ids):
+                morph = self.labels[tag_id]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph])
+                doc.c[j].pos = self.cfg["morph_pos"][morph]
+            doc.is_morphed = True

     def get_loss(self, examples, scores):
-        guesses = []
-        for doc_scores in scores:
-            guesses.append(scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"]))
-        guesses = self.model.ops.xp.vstack(guesses)
-        scores = self.model.ops.xp.vstack(scores)
-        if not isinstance(scores, numpy.ndarray):
-            scores = scores.get()
-        if not isinstance(guesses, numpy.ndarray):
-            guesses = guesses.get()
+        scores = self.model.ops.flatten(scores)
+        tag_index = {tag: i for i, tag in enumerate(self.labels)}
         cdef int idx = 0
-        # Do this on CPU, as we can't vectorize easily.
-        target = numpy.zeros(scores.shape, dtype='f')
-        field_sizes = self.model.get_ref("softmax").attrs["nOs"]
-        for example in examples:
-            doc = example.doc
-            gold = example.gold
-            for t, features in enumerate(gold.morphology):
-                if features is None:
-                    target[idx] = scores[idx]
-                else:
-                    gold_fields = {}
-                    for feature in features:
-                        field = self._class_map.feat2field[feature]
-                        gold_fields[field] = self._class_map.feat2offset[feature]
-                    for field in self._class_map.fields:
-                        field_id = self._class_map.field2id[field]
-                        col_offset = self._class_map.field2col[field]
-                        if field_id in gold_fields:
-                            target[idx, col_offset + gold_fields[field_id]] = 1.
-                        else:
-                            target[idx, col_offset] = 1.
-                    #print(doc[t])
-                    #for col, info in enumerate(self._class_map.col2info):
-                    #    print(col, info, scores[idx, col], target[idx, col])
-                idx += 1
-        target = self.model.ops.asarray(target, dtype='f')
-        scores = self.model.ops.asarray(scores, dtype='f')
-        d_scores = scores - target
+        correct = numpy.zeros((scores.shape[0],), dtype="i")
+        guesses = scores.argmax(axis=1)
+        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        for ex in examples:
+            gold = ex.gold
+            for i in range(len(gold.morphs)):
+                pos = gold.pos[i] if i < len(gold.pos) else ""
+                morph = gold.morphs[i]
+                feats = Morphology.feats_to_dict(morph)
+                if pos:
+                    feats["POS"] = pos
+                if len(feats) > 0:
+                    morph = self.vocab.strings[self.vocab.morphology.add(feats)]
+                if morph == "":
+                    morph = Morphology.EMPTY_MORPH
+                if morph is None:
+                    correct[idx] = guesses[idx]
+                elif morph in tag_index:
+                    correct[idx] = tag_index[morph]
+                else:
+                    correct[idx] = 0
+                    known_labels[idx] = 0.
+                idx += 1
+        correct = self.model.ops.xp.array(correct, dtype="i")
+        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
+        d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [self._get_doc(ex) for ex in examples]
+        docs = [ex.doc for ex in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores

-    def use_params(self, params):
-        with self.model.use_params(params):
-            yield
-
-
-def scores_to_guesses(scores, out_sizes):
-    xp = get_array_module(scores)
-    guesses = xp.zeros((scores.shape[0], len(out_sizes)), dtype='i')
-    offset = 0
-    for i, size in enumerate(out_sizes):
-        slice_ = scores[:, offset : offset + size]
-        col_guesses = slice_.argmax(axis=1)
-        guesses[:, i] = col_guesses
-        offset += size
-    return guesses
+    def to_bytes(self, exclude=tuple(), **kwargs):
+        serialize = {}
+        serialize["model"] = self.model.to_bytes
+        serialize["vocab"] = self.vocab.to_bytes
+        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
+        def load_model(b):
+            try:
+                self.model.from_bytes(b)
+            except AttributeError:
+                raise ValueError(Errors.E149)
+
+        deserialize = {
+            "vocab": lambda b: self.vocab.from_bytes(b),
+            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
+            "model": lambda b: load_model(b),
+        }
+        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+        util.from_bytes(bytes_data, deserialize, exclude)
+        return self
+
+    def to_disk(self, path, exclude=tuple(), **kwargs):
+        serialize = {
+            "vocab": lambda p: self.vocab.to_disk(p),
+            "model": lambda p: p.open("wb").write(self.model.to_bytes()),
+            "cfg": lambda p: srsly.write_json(p, self.cfg),
+        }
+        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+        util.to_disk(path, serialize, exclude)
+
+    def from_disk(self, path, exclude=tuple(), **kwargs):
+        def load_model(p):
+            with p.open("rb") as file_:
+                try:
+                    self.model.from_bytes(file_.read())
+                except AttributeError:
+                    raise ValueError(Errors.E149)
+
+        deserialize = {
+            "vocab": lambda p: self.vocab.from_disk(p),
+            "cfg": lambda p: self.cfg.update(_load_cfg(p)),
+            "model": load_model,
+        }
+        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+        util.from_disk(path, deserialize, exclude)
+        return self

View File

@@ -81,6 +81,9 @@ class Scorer(object):
         self.labelled = PRFScore()
         self.labelled_per_dep = dict()
         self.tags = PRFScore()
+        self.pos = PRFScore()
+        self.morphs = PRFScore()
+        self.morphs_per_feat = dict()
         self.sent_starts = PRFScore()
         self.ner = PRFScore()
         self.ner_per_ents = dict()
@@ -111,6 +114,29 @@ class Scorer(object):
         """
         return self.tags.fscore * 100

+    @property
+    def pos_acc(self):
+        """RETURNS (float): Part-of-speech tag accuracy (coarse-grained POS,
+        i.e. `Token.pos`).
+        """
+        return self.pos.fscore * 100
+
+    @property
+    def morphs_acc(self):
+        """RETURNS (float): Morph tag accuracy (morphological features,
+        i.e. `Token.morph`).
+        """
+        return self.morphs.fscore * 100
+
+    @property
+    def morphs_per_type(self):
+        """RETURNS (dict): Scores per morphological feature type.
+        """
+        return {
+            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
+            for k, v in self.morphs_per_feat.items()
+        }
+
     @property
     def sent_p(self):
         """RETURNS (float): F-score for identification of sentence starts.
@@ -231,6 +257,9 @@ class Scorer(object):
             "ents_f": self.ents_f,
             "ents_per_type": self.ents_per_type,
             "tags_acc": self.tags_acc,
+            "pos_acc": self.pos_acc,
+            "morphs_acc": self.morphs_acc,
+            "morphs_per_type": self.morphs_per_type,
             "sent_p": self.sent_p,
             "sent_r": self.sent_r,
             "sent_f": self.sent_f,
@@ -264,12 +293,23 @@ class Scorer(object):
         gold_deps = set()
         gold_deps_per_dep = {}
         gold_tags = set()
+        gold_pos = set()
+        gold_morphs = set()
+        gold_morphs_per_feat = {}
         gold_sent_starts = set()
         gold_ents = set(tags_to_entities(orig.entities))
-        for id_, tag, head, dep, sent_start in zip(
-            orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts
-        ):
+        for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts):
             gold_tags.add((id_, tag))
+            gold_pos.add((id_, pos))
+            gold_morphs.add((id_, morph))
+            if morph:
+                for feat in morph.split("|"):
+                    field, values = feat.split("=")
+                    if field not in self.morphs_per_feat:
+                        self.morphs_per_feat[field] = PRFScore()
+                    if field not in gold_morphs_per_feat:
+                        gold_morphs_per_feat[field] = set()
+                    gold_morphs_per_feat[field].add((id_, feat))
             if sent_start:
                 gold_sent_starts.add(id_)
             if dep not in (None, "") and dep.lower() not in punct_labels:
@@ -282,6 +322,9 @@ class Scorer(object):
         cand_deps = set()
         cand_deps_per_dep = {}
         cand_tags = set()
+        cand_pos = set()
+        cand_morphs = set()
+        cand_morphs_per_feat = {}
         cand_sent_starts = set()
         for token in doc:
             if token.orth_.isspace():
@@ -292,6 +335,16 @@ class Scorer(object):
             else:
                 self.tokens.tp += 1
                 cand_tags.add((gold_i, token.tag_))
+                cand_pos.add((gold_i, token.pos_))
+                cand_morphs.add((gold_i, token.morph_))
+                if token.morph_:
+                    for feat in token.morph_.split("|"):
+                        field, values = feat.split("=")
+                        if field not in self.morphs_per_feat:
+                            self.morphs_per_feat[field] = PRFScore()
+                        if field not in cand_morphs_per_feat:
+                            cand_morphs_per_feat[field] = set()
+                        cand_morphs_per_feat[field].add((gold_i, feat))
             if token.is_sent_start:
                 cand_sent_starts.add(gold_i)
             if token.dep_.lower() not in punct_labels and token.orth_.strip():
@@ -340,6 +393,10 @@ class Scorer(object):
         # Score for all ents
         self.ner.score_set(cand_ents, gold_ents)
         self.tags.score_set(cand_tags, gold_tags)
+        self.pos.score_set(cand_pos, gold_pos)
+        self.morphs.score_set(cand_morphs, gold_morphs)
+        for field in self.morphs_per_feat:
+            self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
         self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
         self.labelled.score_set(cand_deps, gold_deps)
         for dep in self.labelled_per_dep:

View File

@@ -0,0 +1,49 @@
import pytest

from spacy import util
from spacy.lang.en import English
from spacy.language import Language
from spacy.tests.util import make_tempdir


def test_label_types():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("morphologizer"))
    nlp.get_pipe("morphologizer").add_label("Feat=A")
    with pytest.raises(ValueError):
        nlp.get_pipe("morphologizer").add_label(9)


TRAIN_DATA = [
    ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
    ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
]


def test_overfitting_IO():
    # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
    nlp = English()
    morphologizer = nlp.create_pipe("morphologizer")
    for inst in TRAIN_DATA:
        for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
            morphologizer.add_label(morph + "|POS=" + pos)
    nlp.add_pipe(morphologizer)
    optimizer = nlp.begin_training()

    for i in range(50):
        losses = {}
        nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
    assert gold_morphs == [t.morph_ for t in doc]

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert gold_morphs == [t.morph_ for t in doc2]

View File

@@ -8,6 +8,7 @@ from spacy.language import Language
 from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.gold import GoldParse
+from spacy.util import fix_random_seed

 from ..util import make_tempdir
 from ...ml.models.defaults import default_tok2vec

@@ -82,6 +83,7 @@ def test_label_types():

 def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
+    fix_random_seed(0)
     nlp = English()
     textcat = nlp.create_pipe("textcat")
     for _, annotations in TRAIN_DATA:

View File

@@ -5,6 +5,7 @@ from spacy.gold import Example, GoldParse
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
+from spacy.lang.en import English

 test_las_apple = [
     [

@@ -39,6 +40,43 @@ test_ner_apple = [
     ]
 ]

+@pytest.fixture
+def tagged_doc():
+    text = "Sarah's sister flew to Silicon Valley via London."
+    tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
+    pos = [
+        "PROPN",
+        "PART",
+        "NOUN",
+        "VERB",
+        "ADP",
+        "PROPN",
+        "PROPN",
+        "ADP",
+        "PROPN",
+        "PUNCT",
+    ]
+    morphs = [
+        "NounType=prop|Number=sing",
+        "Poss=yes",
+        "Number=sing",
+        "Tense=past|VerbForm=fin",
+        "",
+        "NounType=prop|Number=sing",
+        "NounType=prop|Number=sing",
+        "",
+        "NounType=prop|Number=sing",
+        "PunctType=peri",
+    ]
+    nlp = English()
+    doc = nlp(text)
+    for i in range(len(tags)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = pos[i]
+        doc[i].morph_ = morphs[i]
+    doc.is_tagged = True
+    return doc
+
+
 def test_las_per_type(en_vocab):
     # Gold and Doc are identical
@@ -139,6 +177,43 @@ def test_ner_per_type(en_vocab):
     assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666)


+def test_tag_score(tagged_doc):
+    # Gold and Doc are identical
+    scorer = Scorer()
+    gold = GoldParse(
+        tagged_doc,
+        tags=[t.tag_ for t in tagged_doc],
+        pos=[t.pos_ for t in tagged_doc],
+        morphs=[t.morph_ for t in tagged_doc]
+    )
+    scorer.score((tagged_doc, gold))
+    results = scorer.scores
+
+    assert results["tags_acc"] == 100
+    assert results["pos_acc"] == 100
+    assert results["morphs_acc"] == 100
+    assert results["morphs_per_type"]["NounType"]["f"] == 100
+
+    # Gold annotation is modified
+    scorer = Scorer()
+    tags = [t.tag_ for t in tagged_doc]
+    tags[0] = "NN"
+    pos = [t.pos_ for t in tagged_doc]
+    pos[1] = "X"
+    morphs = [t.morph_ for t in tagged_doc]
+    morphs[1] = "Number=sing"
+    morphs[2] = "Number=plur"
+    gold = GoldParse(tagged_doc, tags=tags, pos=pos, morphs=morphs)
+    scorer.score((tagged_doc, gold))
+    results = scorer.scores
+
+    assert results["tags_acc"] == 90
+    assert results["pos_acc"] == 90
+    assert results["morphs_acc"] == approx(80)
+    assert results["morphs_per_type"]["Poss"]["f"] == 0.0
+    assert results["morphs_per_type"]["Number"]["f"] == approx(72.727272)
+
+
 def test_roc_auc_score():
     # Binary classification, toy tests from scikit-learn test suite
     y_true = [0, 1]

View File

@@ -50,6 +50,7 @@ cdef class Doc:
     cdef public bint is_tagged
     cdef public bint is_parsed
+    cdef public bint is_morphed
     cdef public float sentiment