Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Update morphologizer (#5108)
* Add pos and morph scoring to Scorer
  Add pos, morph, and morph_per_type to `Scorer`. Report pos and morph accuracy in `spacy evaluate`.
* Update morphologizer for v3
* switch to tagger-based morphologizer
* use `spacy.HashCharEmbedCNN` for morphologizer defaults
* add `Doc.is_morphed` flag
* Add morphologizer to train CLI
* Add basic morphologizer pipeline tests
* Add simple morphologizer training example
* Remove subword_features from CharEmbed models
  Remove `subword_features` argument from `spacy.HashCharEmbedCNN.v1` and `spacy.HashCharEmbedBiLSTM.v1` since in these cases `subword_features` is always `False`.
* Rename setting in morphologizer example
  Use `with_pos_tags` instead of `without_pos_tags`.
* Fix kwargs for spacy.HashCharEmbedBiLSTM.v1
* Remove defaults for spacy.HashCharEmbedBiLSTM.v1
  Remove default `nM/nC` for `spacy.HashCharEmbedBiLSTM.v1`.
* Set random seed for textcat overfitting test
This commit is contained in:
parent
ab59f3124e
commit
b71a11ff6d
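In short, the commit wires "morphologizer" in as a built-in pipe with tagger-style training and adds POS and morph accuracy reporting. A minimal usage sketch, assuming a spaCy v3 development install that already contains this commit (the same flow the example script and tests below use):

import spacy

nlp = spacy.blank("en")
morphologizer = nlp.create_pipe("morphologizer")
# Labels combine the morphological features with the coarse-grained POS tag.
morphologizer.add_label("Degree=Pos|POS=ADJ")
nlp.add_pipe(morphologizer)
optimizer = nlp.begin_training()  # training then proceeds as in the example script below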
examples/training/train_morphologizer.py (new file, 133 lines)
@@ -0,0 +1,133 @@
#!/usr/bin/env python
# coding: utf8
"""
A simple example for training a morphologizer. For more details, see
the documentation:
* Training: https://spacy.io/usage/training

Compatible with: spaCy v3.0.0+
Last tested with: v3.0.0
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.morphology import Morphology


# Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's
# tokenization. If not, you can always add a 'words' key to the annotations
# that specifies the gold-standard tokenization, e.g.:
# ("Eat blue ham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [
    (
        "I like green eggs",
        {
            "morphs": [
                "PronType=Prs|Person=1",
                "VerbForm=Fin",
                "Degree=Pos",
                "Number=Plur",
            ],
            "pos": ["PRON", "VERB", "ADJ", "NOUN"],
        },
    ),
    (
        "Eat blue ham",
        {
            "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"],
            "pos": ["VERB", "ADJ", "NOUN"],
        },
    ),
    (
        "She was blue",
        {
            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"],
            "pos": ["PRON", "VERB", "ADJ"],
        },
    ),
    (
        "He was blue today",
        {
            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""],
            "pos": ["PRON", "VERB", "ADJ", "ADV"],
        },
    ),
]

# The POS tags are optional, set `with_pos_tags = False` to omit them for
# this example:
with_pos_tags = True

if not with_pos_tags:
    for i in range(len(TRAIN_DATA)):
        del TRAIN_DATA[i][1]["pos"]


@plac.annotations(
    lang=("ISO Code of language to use", "option", "l", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(lang="en", output_dir=None, n_iter=25):
    """Create a new model, set up the pipeline and train the morphologizer."""
    nlp = spacy.blank(lang)
    # add the morphologizer to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    morphologizer = nlp.create_pipe("morphologizer")
    nlp.add_pipe(morphologizer)

    # add labels
    for _, annotations in TRAIN_DATA:
        morph_labels = annotations.get("morphs")
        pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs")))
        assert len(morph_labels) == len(pos_labels)
        for morph, pos in zip(morph_labels, pos_labels):
            morph_dict = Morphology.feats_to_dict(morph)
            if pos:
                morph_dict["POS"] = pos
            morph = Morphology.dict_to_feats(morph_dict)
            morphologizer.add_label(morph)

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
        print("Losses", losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print("Morphs", [(t.text, t.morph) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Morphs", [(t.text, t.morph) for t in doc])


if __name__ == "__main__":
    plac.call(main)

# Expected output:
# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)]
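The label-building loop above round-trips each FEATS string through Morphology.feats_to_dict and Morphology.dict_to_feats so that the coarse POS tag becomes part of the label string. A small sketch of that round-trip (an illustration only; the exact ordering of features in the generated string may differ):

from spacy.morphology import Morphology

morph_dict = Morphology.feats_to_dict("Degree=Pos")  # {"Degree": "Pos"}
morph_dict["POS"] = "ADJ"
label = Morphology.dict_to_feats(morph_dict)          # e.g. "Degree=Pos|POS=ADJ"
print(label)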
@@ -43,7 +43,9 @@ def evaluate(
         "Words": nwords,
         "Words/s": f"{nwords / (end - begin):.0f}",
         "TOK": f"{scorer.token_acc:.2f}",
-        "POS": f"{scorer.tags_acc:.2f}",
+        "TAG": f"{scorer.tags_acc:.2f}",
+        "POS": f"{scorer.pos_acc:.2f}",
+        "MORPH": f"{scorer.morphs_acc:.2f}",
         "UAS": f"{scorer.uas:.2f}",
         "LAS": f"{scorer.las:.2f}",
         "NER P": f"{scorer.ents_p:.2f}",
@@ -221,6 +221,8 @@ def train(
                config_loc = default_dir / "parser_defaults.cfg"
            elif pipe == "tagger":
                config_loc = default_dir / "tagger_defaults.cfg"
+           elif pipe == "morphologizer":
+               config_loc = default_dir / "morphologizer_defaults.cfg"
            elif pipe == "ner":
                config_loc = default_dir / "ner_defaults.cfg"
            elif pipe == "textcat":

@@ -590,6 +592,8 @@ def _score_for_model(meta):
    acc = meta["accuracy"]
    if "tagger" in pipes:
        mean_acc.append(acc["tags_acc"])
+   if "morphologizer" in pipes:
+       mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2)
    if "parser" in pipes:
        mean_acc.append((acc["uas"] + acc["las"]) / 2)
    if "ner" in pipes:

@@ -672,13 +676,15 @@ def _find_best(experiment_dir, component):

 def _get_metrics(component):
     if component == "parser":
-        return ("las", "uas", "las_per_type", "token_acc", "sent_f")
+        return ("las", "uas", "las_per_type", "sent_f", "token_acc")
     elif component == "tagger":
         return ("tags_acc", "token_acc")
+    elif component == "morphologizer":
+        return ("morphs_acc", "pos_acc", "token_acc")
     elif component == "ner":
         return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc")
     elif component == "senter":
-        return ("sent_f", "sent_p", "sent_r")
+        return ("sent_f", "sent_p", "sent_r", "token_acc")
     elif component == "textcat":
         return ("textcat_score", "token_acc")
     return ("token_acc",)

@@ -691,6 +697,9 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
        if pipe == "tagger":
            row_head.extend(["Tag Loss ", " Tag % "])
            output_stats.extend(["tag_loss", "tags_acc"])
+       elif pipe == "morphologizer" or pipe == "morphologizertagger":
+           row_head.extend(["Morph Loss ", " Morph % ", " POS % "])
+           output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"])
        elif pipe == "parser":
            row_head.extend(
                ["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"]

@@ -731,6 +740,7 @@ def _get_progress(
    scores["dep_loss"] = losses.get("parser", 0.0)
    scores["ner_loss"] = losses.get("ner", 0.0)
    scores["tag_loss"] = losses.get("tagger", 0.0)
+   scores["morph_loss"] = losses.get("morphologizer", 0.0)
    scores["textcat_loss"] = losses.get("textcat", 0.0)
    scores["senter_loss"] = losses.get("senter", 0.0)
    scores["cpu_wps"] = cpu_wps
@@ -9,6 +9,5 @@ depth = 4
 embed_size = 7000
 window_size = 1
 maxout_pieces = 3
-subword_features = true
 nM = 64
 nC = 8
@@ -74,7 +74,6 @@ def hash_charembed_cnn(
     embed_size,
     maxout_pieces,
     window_size,
-    subword_features,
     nM,
     nC,
 ):

@@ -87,7 +86,7 @@ def hash_charembed_cnn(
         bilstm_depth=0,
         maxout_pieces=maxout_pieces,
         window_size=window_size,
-        subword_features=subword_features,
+        subword_features=False,
         char_embed=True,
         nM=nM,
         nC=nC,

@@ -116,7 +115,7 @@ def hash_embed_bilstm_v1(

 @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
 def hash_char_embed_bilstm_v1(
-    pretrained_vectors, width, depth, embed_size, subword_features, nM, nC, maxout_pieces
+    pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC
 ):
     # Allows using character embeddings by setting nC, nM and char_embed=True
     return build_Tok2Vec_model(

@@ -127,7 +126,7 @@ def hash_char_embed_bilstm_v1(
         conv_depth=0,
         maxout_pieces=maxout_pieces,
         window_size=1,
-        subword_features=subword_features,
+        subword_features=False,
         char_embed=True,
         nM=nM,
         nC=nC,
@@ -1,166 +1,169 @@
+# cython: infer_types=True, profile=True
 cimport numpy as np
 
 import numpy
-from collections import defaultdict
-from thinc.api import chain, list2array, to_categorical, get_array_module
-from thinc.util import copy_array
+import srsly
+from thinc.api import to_categorical
 
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 from ..morphology cimport Morphology
+from ..parts_of_speech import IDS as POS_IDS
+from ..symbols import POS
 
 from .. import util
 from ..language import component
 from ..util import link_vectors_to_models, create_default_optimizer
 from ..errors import Errors, TempErrors
-from .pipes import Pipe
+from .pipes import Tagger, _load_cfg
+from .. import util
 
 
 @component("morphologizer", assigns=["token.morph", "token.pos"])
-class Morphologizer(Pipe):
+class Morphologizer(Tagger):
 
     def __init__(self, vocab, model, **cfg):
         self.vocab = vocab
         self.model = model
+        self._rehearsal_model = None
         self.cfg = dict(sorted(cfg.items()))
-        self._class_map = self.vocab.morphology.create_class_map()  # Morphology.create_class_map() ?
+        self.cfg.setdefault("labels", {})
+        self.cfg.setdefault("morph_pos", {})
 
     @property
     def labels(self):
-        return self.vocab.morphology.tag_names
+        return tuple(self.cfg["labels"].keys())
 
-    @property
-    def tok2vec(self):
-        if self.model in (None, True, False):
-            return None
-        else:
-            return chain(self.model.get_ref("tok2vec"), list2array())
-
-    def __call__(self, doc):
-        features, tokvecs = self.predict([doc])
-        self.set_annotations([doc], features, tensors=tokvecs)
-        return doc
-
-    def pipe(self, stream, batch_size=128, n_threads=-1):
-        for docs in util.minibatch(stream, size=batch_size):
-            docs = list(docs)
-            features, tokvecs = self.predict(docs)
-            self.set_annotations(docs, features, tensors=tokvecs)
-            yield from docs
+    def add_label(self, label):
+        if not isinstance(label, str):
+            raise ValueError(Errors.E187)
+        if label in self.labels:
+            return 0
+        morph = Morphology.feats_to_dict(label)
+        norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
+        pos = morph.get("POS", "")
+        if norm_morph_pos not in self.cfg["labels"]:
+            self.cfg["labels"][norm_morph_pos] = norm_morph_pos
+            self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+        return 1
 
     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
+        for example in get_examples():
+            for i, morph in enumerate(example.token_annotation.morphs):
+                pos = example.token_annotation.get_pos(i)
+                morph = Morphology.feats_to_dict(morph)
+                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
+                if pos:
+                    morph["POS"] = pos
+                norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
+                if norm_morph_pos not in self.cfg["labels"]:
+                    self.cfg["labels"][norm_morph_pos] = norm_morph
+                    self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
         self.set_output(len(self.labels))
         self.model.initialize()
+        link_vectors_to_models(self.vocab)
         if sgd is None:
            sgd = self.create_optimizer()
        return sgd
 
-    def predict(self, docs):
-        if not any(len(doc) for doc in docs):
-            # Handle case where there are no tokens in any docs.
-            n_labels = self.model.get_dim("nO")
-            guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
-            tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO")))
-            return guesses, tokvecs
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        scores = self.model.get_ref("softmax")(tokvecs)
-        return scores, tokvecs
-
-    def set_annotations(self, docs, batch_scores, tensors=None):
+    def set_annotations(self, docs, batch_tag_ids):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef Vocab vocab = self.vocab
-        offsets = [self._class_map.get_field_offset(field)
-                   for field in self._class_map.fields]
         for i, doc in enumerate(docs):
-            doc_scores = batch_scores[i]
-            doc_guesses = scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"])
-            # Convert the neuron indices into feature IDs.
-            doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
-            for j in range(len(doc)):
-                for k, offset in enumerate(offsets):
-                    if doc_guesses[j, k] == 0:
-                        doc_feat_ids[j, k] = 0
-                    else:
-                        doc_feat_ids[j, k] = offset + doc_guesses[j, k]
-                # Get the set of feature names.
-                feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
-                if "NIL" in feats:
-                    feats.remove("NIL")
-                # Now add the analysis, and set the hash.
-                doc.c[j].morph = self.vocab.morphology.add(feats)
-                if doc[j].morph.pos != 0:
-                    doc.c[j].pos = doc[j].morph.pos
-
-    def update(self, examples, drop=0., sgd=None, losses=None):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
-
-        docs = [self._get_doc(ex) for ex in examples]
-        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
-        loss, d_tag_scores = self.get_loss(examples, tag_scores)
-        bp_tag_scores(d_tag_scores, sgd=sgd)
-
-        if losses is not None:
-            losses[self.name] += loss
+            doc_tag_ids = batch_tag_ids[i]
+            if hasattr(doc_tag_ids, "get"):
+                doc_tag_ids = doc_tag_ids.get()
+            for j, tag_id in enumerate(doc_tag_ids):
+                morph = self.labels[tag_id]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph])
+                doc.c[j].pos = self.cfg["morph_pos"][morph]
+            doc.is_morphed = True
 
     def get_loss(self, examples, scores):
-        guesses = []
-        for doc_scores in scores:
-            guesses.append(scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"]))
-        guesses = self.model.ops.xp.vstack(guesses)
-        scores = self.model.ops.xp.vstack(scores)
-        if not isinstance(scores, numpy.ndarray):
-            scores = scores.get()
-        if not isinstance(guesses, numpy.ndarray):
-            guesses = guesses.get()
+        scores = self.model.ops.flatten(scores)
+        tag_index = {tag: i for i, tag in enumerate(self.labels)}
         cdef int idx = 0
-        # Do this on CPU, as we can't vectorize easily.
-        target = numpy.zeros(scores.shape, dtype='f')
-        field_sizes = self.model.get_ref("softmax").attrs["nOs"]
-        for example in examples:
-            doc = example.doc
-            gold = example.gold
-            for t, features in enumerate(gold.morphology):
-                if features is None:
-                    target[idx] = scores[idx]
+        correct = numpy.zeros((scores.shape[0],), dtype="i")
+        guesses = scores.argmax(axis=1)
+        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        for ex in examples:
+            gold = ex.gold
+            for i in range(len(gold.morphs)):
+                pos = gold.pos[i] if i < len(gold.pos) else ""
+                morph = gold.morphs[i]
+                feats = Morphology.feats_to_dict(morph)
+                if pos:
+                    feats["POS"] = pos
+                if len(feats) > 0:
+                    morph = self.vocab.strings[self.vocab.morphology.add(feats)]
+                if morph == "":
+                    morph = Morphology.EMPTY_MORPH
+                if morph is None:
+                    correct[idx] = guesses[idx]
+                elif morph in tag_index:
+                    correct[idx] = tag_index[morph]
                 else:
-                    gold_fields = {}
-                    for feature in features:
-                        field = self._class_map.feat2field[feature]
-                        gold_fields[field] = self._class_map.feat2offset[feature]
-                    for field in self._class_map.fields:
-                        field_id = self._class_map.field2id[field]
-                        col_offset = self._class_map.field2col[field]
-                        if field_id in gold_fields:
-                            target[idx, col_offset + gold_fields[field_id]] = 1.
-                        else:
-                            target[idx, col_offset] = 1.
-                    #print(doc[t])
-                    #for col, info in enumerate(self._class_map.col2info):
-                    #    print(col, info, scores[idx, col], target[idx, col])
+                    correct[idx] = 0
+                    known_labels[idx] = 0.
                 idx += 1
-        target = self.model.ops.asarray(target, dtype='f')
-        scores = self.model.ops.asarray(scores, dtype='f')
-        d_scores = scores - target
+        correct = self.model.ops.xp.array(correct, dtype="i")
+        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
+        d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [self._get_doc(ex) for ex in examples]
+        docs = [ex.doc for ex in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
 
-    def use_params(self, params):
-        with self.model.use_params(params):
-            yield
-
-
-def scores_to_guesses(scores, out_sizes):
-    xp = get_array_module(scores)
-    guesses = xp.zeros((scores.shape[0], len(out_sizes)), dtype='i')
-    offset = 0
-    for i, size in enumerate(out_sizes):
-        slice_ = scores[:, offset : offset + size]
-        col_guesses = slice_.argmax(axis=1)
-        guesses[:, i] = col_guesses
-        offset += size
-    return guesses
+    def to_bytes(self, exclude=tuple(), **kwargs):
+        serialize = {}
+        serialize["model"] = self.model.to_bytes
+        serialize["vocab"] = self.vocab.to_bytes
+        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
+        def load_model(b):
+            try:
+                self.model.from_bytes(b)
+            except AttributeError:
+                raise ValueError(Errors.E149)
+
+        deserialize = {
+            "vocab": lambda b: self.vocab.from_bytes(b),
+            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
+            "model": lambda b: load_model(b),
+        }
+        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+        util.from_bytes(bytes_data, deserialize, exclude)
+        return self
+
+    def to_disk(self, path, exclude=tuple(), **kwargs):
+        serialize = {
+            "vocab": lambda p: self.vocab.to_disk(p),
+            "model": lambda p: p.open("wb").write(self.model.to_bytes()),
+            "cfg": lambda p: srsly.write_json(p, self.cfg),
+        }
+        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+        util.to_disk(path, serialize, exclude)
+
+    def from_disk(self, path, exclude=tuple(), **kwargs):
+        def load_model(p):
+            with p.open("rb") as file_:
+                try:
+                    self.model.from_bytes(file_.read())
+                except AttributeError:
+                    raise ValueError(Errors.E149)
+
+        deserialize = {
+            "vocab": lambda p: self.vocab.from_disk(p),
+            "cfg": lambda p: self.cfg.update(_load_cfg(p)),
+            "model": load_model,
+        }
+        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+        util.from_disk(path, deserialize, exclude)
+        return self
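The new get_loss computes a squared-error gradient against one-hot targets and masks out tokens whose gold analysis is not among the known labels (known_labels set to 0). A toy numpy illustration of that masking, with made-up scores and a hypothetical three-label inventory (not spaCy code):

import numpy

scores = numpy.asarray([[0.7, 0.2, 0.1],
                        [0.3, 0.4, 0.3]], dtype="f")
correct = numpy.asarray([0, 2], dtype="i")                # gold label index per token
known_labels = numpy.asarray([[1.0], [0.0]], dtype="f")   # second token's gold morph is unknown

one_hot = numpy.eye(scores.shape[1], dtype="f")[correct]  # same role as to_categorical
d_scores = (scores - one_hot) * known_labels              # masked gradient
loss = (d_scores ** 2).sum()
print(d_scores)
print(loss)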
@@ -81,6 +81,9 @@ class Scorer(object):
         self.labelled = PRFScore()
         self.labelled_per_dep = dict()
         self.tags = PRFScore()
+        self.pos = PRFScore()
+        self.morphs = PRFScore()
+        self.morphs_per_feat = dict()
         self.sent_starts = PRFScore()
         self.ner = PRFScore()
         self.ner_per_ents = dict()

@@ -111,6 +114,29 @@ class Scorer(object):
         """
         return self.tags.fscore * 100
 
+    @property
+    def pos_acc(self):
+        """RETURNS (float): Part-of-speech tag accuracy (coarse grained pos,
+        i.e. `Token.pos`).
+        """
+        return self.pos.fscore * 100
+
+    @property
+    def morphs_acc(self):
+        """RETURNS (float): Morph tag accuracy (morphological features,
+        i.e. `Token.morph`).
+        """
+        return self.morphs.fscore * 100
+
+    @property
+    def morphs_per_type(self):
+        """RETURNS (dict): Scores per dependency label.
+        """
+        return {
+            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
+            for k, v in self.morphs_per_feat.items()
+        }
+
     @property
     def sent_p(self):
         """RETURNS (float): F-score for identification of sentence starts.

@@ -231,6 +257,9 @@ class Scorer(object):
             "ents_f": self.ents_f,
             "ents_per_type": self.ents_per_type,
             "tags_acc": self.tags_acc,
+            "pos_acc": self.pos_acc,
+            "morphs_acc": self.morphs_acc,
+            "morphs_per_type": self.morphs_per_type,
             "sent_p": self.sent_p,
             "sent_r": self.sent_r,
             "sent_f": self.sent_f,

@@ -264,12 +293,23 @@ class Scorer(object):
         gold_deps = set()
         gold_deps_per_dep = {}
         gold_tags = set()
+        gold_pos = set()
+        gold_morphs = set()
+        gold_morphs_per_feat = {}
         gold_sent_starts = set()
         gold_ents = set(tags_to_entities(orig.entities))
-        for id_, tag, head, dep, sent_start in zip(
-            orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts
-        ):
+        for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts):
             gold_tags.add((id_, tag))
+            gold_pos.add((id_, pos))
+            gold_morphs.add((id_, morph))
+            if morph:
+                for feat in morph.split("|"):
+                    field, values = feat.split("=")
+                    if field not in self.morphs_per_feat:
+                        self.morphs_per_feat[field] = PRFScore()
+                    if field not in gold_morphs_per_feat:
+                        gold_morphs_per_feat[field] = set()
+                    gold_morphs_per_feat[field].add((id_, feat))
             if sent_start:
                 gold_sent_starts.add(id_)
             if dep not in (None, "") and dep.lower() not in punct_labels:

@@ -282,6 +322,9 @@ class Scorer(object):
         cand_deps = set()
         cand_deps_per_dep = {}
         cand_tags = set()
+        cand_pos = set()
+        cand_morphs = set()
+        cand_morphs_per_feat = {}
         cand_sent_starts = set()
         for token in doc:
             if token.orth_.isspace():

@@ -292,6 +335,16 @@ class Scorer(object):
             else:
                 self.tokens.tp += 1
                 cand_tags.add((gold_i, token.tag_))
+                cand_pos.add((gold_i, token.pos_))
+                cand_morphs.add((gold_i, token.morph_))
+                if token.morph_:
+                    for feat in token.morph_.split("|"):
+                        field, values = feat.split("=")
+                        if field not in self.morphs_per_feat:
+                            self.morphs_per_feat[field] = PRFScore()
+                        if field not in cand_morphs_per_feat:
+                            cand_morphs_per_feat[field] = set()
+                        cand_morphs_per_feat[field].add((gold_i, feat))
             if token.is_sent_start:
                 cand_sent_starts.add(gold_i)
             if token.dep_.lower() not in punct_labels and token.orth_.strip():

@@ -340,6 +393,10 @@ class Scorer(object):
         # Score for all ents
         self.ner.score_set(cand_ents, gold_ents)
         self.tags.score_set(cand_tags, gold_tags)
+        self.pos.score_set(cand_pos, gold_pos)
+        self.morphs.score_set(cand_morphs, gold_morphs)
+        for field in self.morphs_per_feat:
+            self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
         self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
         self.labelled.score_set(cand_deps, gold_deps)
         for dep in self.labelled_per_dep:
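As the Scorer changes above show, POS and morph accuracy are computed the same way as tag accuracy: gold and candidate annotations become sets of (token_index, value) pairs and are compared with a PRFScore. A toy stand-alone illustration of that set comparison (plain Python, not spaCy's PRFScore class):

gold_pos = {(0, "PRON"), (1, "VERB"), (2, "ADJ")}
cand_pos = {(0, "PRON"), (1, "AUX"), (2, "ADJ")}

tp = len(cand_pos & gold_pos)           # 2
precision = tp / len(cand_pos)
recall = tp / len(gold_pos)
fscore = 2 * precision * recall / (precision + recall)
print(f"pos_acc ~ {fscore * 100:.2f}")  # 66.67 for this toy example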
spacy/tests/pipeline/test_morphologizer.py (new file, 49 lines)

@@ -0,0 +1,49 @@
import pytest

from spacy import util
from spacy.lang.en import English
from spacy.language import Language
from spacy.tests.util import make_tempdir


def test_label_types():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("morphologizer"))
    nlp.get_pipe("morphologizer").add_label("Feat=A")
    with pytest.raises(ValueError):
        nlp.get_pipe("morphologizer").add_label(9)


TRAIN_DATA = [
    ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
    ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
]


def test_overfitting_IO():
    # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
    nlp = English()
    morphologizer = nlp.create_pipe("morphologizer")
    for inst in TRAIN_DATA:
        for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
            morphologizer.add_label(morph + "|POS=" + pos)
    nlp.add_pipe(morphologizer)
    optimizer = nlp.begin_training()

    for i in range(50):
        losses = {}
        nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
    assert gold_morphs == [t.morph_ for t in doc]

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert gold_morphs == [t.morph_ for t in doc2]
@@ -8,6 +8,7 @@ from spacy.language import Language
 from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.gold import GoldParse
+from spacy.util import fix_random_seed
 
 from ..util import make_tempdir
 from ...ml.models.defaults import default_tok2vec

@@ -82,6 +83,7 @@ def test_label_types():
 
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
+    fix_random_seed(0)
     nlp = English()
     textcat = nlp.create_pipe("textcat")
     for _, annotations in TRAIN_DATA:
@@ -5,6 +5,7 @@ from spacy.gold import Example, GoldParse
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
+from spacy.lang.en import English
 
 test_las_apple = [
     [

@@ -39,6 +40,43 @@ test_ner_apple = [
     ]
 ]
 
+
+@pytest.fixture
+def tagged_doc():
+    text = "Sarah's sister flew to Silicon Valley via London."
+    tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
+    pos = [
+        "PROPN",
+        "PART",
+        "NOUN",
+        "VERB",
+        "ADP",
+        "PROPN",
+        "PROPN",
+        "ADP",
+        "PROPN",
+        "PUNCT",
+    ]
+    morphs = [
+        "NounType=prop|Number=sing",
+        "Poss=yes",
+        "Number=sing",
+        "Tense=past|VerbForm=fin",
+        "",
+        "NounType=prop|Number=sing",
+        "NounType=prop|Number=sing",
+        "",
+        "NounType=prop|Number=sing",
+        "PunctType=peri",
+    ]
+    nlp = English()
+    doc = nlp(text)
+    for i in range(len(tags)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = pos[i]
+        doc[i].morph_ = morphs[i]
+    doc.is_tagged = True
+    return doc
+
+
 def test_las_per_type(en_vocab):
     # Gold and Doc are identical

@@ -139,6 +177,43 @@ def test_ner_per_type(en_vocab):
     assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666)
 
 
+def test_tag_score(tagged_doc):
+    # Gold and Doc are identical
+    scorer = Scorer()
+    gold = GoldParse(
+        tagged_doc,
+        tags=[t.tag_ for t in tagged_doc],
+        pos=[t.pos_ for t in tagged_doc],
+        morphs=[t.morph_ for t in tagged_doc]
+    )
+    scorer.score((tagged_doc, gold))
+    results = scorer.scores
+
+    assert results["tags_acc"] == 100
+    assert results["pos_acc"] == 100
+    assert results["morphs_acc"] == 100
+    assert results["morphs_per_type"]["NounType"]["f"] == 100
+
+    # Gold and Doc are identical
+    scorer = Scorer()
+    tags = [t.tag_ for t in tagged_doc]
+    tags[0] = "NN"
+    pos = [t.pos_ for t in tagged_doc]
+    pos[1] = "X"
+    morphs = [t.morph_ for t in tagged_doc]
+    morphs[1] = "Number=sing"
+    morphs[2] = "Number=plur"
+    gold = GoldParse(tagged_doc, tags=tags, pos=pos, morphs=morphs)
+    scorer.score((tagged_doc, gold))
+    results = scorer.scores
+
+    assert results["tags_acc"] == 90
+    assert results["pos_acc"] == 90
+    assert results["morphs_acc"] == approx(80)
+    assert results["morphs_per_type"]["Poss"]["f"] == 0.0
+    assert results["morphs_per_type"]["Number"]["f"] == approx(72.727272)
+
+
 def test_roc_auc_score():
     # Binary classification, toy tests from scikit-learn test suite
     y_true = [0, 1]
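A quick worked check of the expected numbers in test_tag_score above (pure arithmetic, nothing spaCy-specific): the doc has 10 tokens, the modified gold changes one tag, one coarse POS value and two morph strings, and the per-feature scores follow from the same set-based precision/recall comparison:

tags_acc = 9 / 10 * 100    # 90: one of ten tags differs
pos_acc = 9 / 10 * 100     # 90: one of ten POS values differs
morphs_acc = 8 / 10 * 100  # 80: two of ten morph strings differ
# "Number" feature: the doc contributes 5 (index, "Number=sing") pairs, the modified gold
# contributes 6 Number pairs, and 4 of them match, so p = 4/5, r = 4/6 and f ~ 72.73
p, r = 4 / 5, 4 / 6
number_f = 2 * p * r / (p + r) * 100
print(tags_acc, pos_acc, morphs_acc, round(number_f, 5))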
@@ -50,6 +50,7 @@ cdef class Doc:
 
     cdef public bint is_tagged
     cdef public bint is_parsed
+    cdef public bint is_morphed
 
     cdef public float sentiment