mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
b71a11ff6d
* Add pos and morph scoring to Scorer

  Add pos, morph, and morph_per_type to `Scorer`. Report pos and morph
  accuracy in `spacy evaluate`.

* Update morphologizer for v3
* switch to tagger-based morphologizer
* use `spacy.HashCharEmbedCNN` for morphologizer defaults
* add `Doc.is_morphed` flag
* Add morphologizer to train CLI
* Add basic morphologizer pipeline tests
* Add simple morphologizer training example
* Remove subword_features from CharEmbed models

  Remove `subword_features` argument from `spacy.HashCharEmbedCNN.v1` and
  `spacy.HashCharEmbedBiLSTM.v1` since in these cases `subword_features` is
  always `False`.

* Rename setting in morphologizer example

  Use `with_pos_tags` instead of `without_pos_tags`.

* Fix kwargs for spacy.HashCharEmbedBiLSTM.v1
* Remove defaults for spacy.HashCharEmbedBiLSTM.v1

  Remove default `nM/nC` for `spacy.HashCharEmbedBiLSTM.v1`.

* Set random seed for textcat overfitting test
134 lines
4.1 KiB
Python
#!/usr/bin/env python
# coding: utf8
"""
A simple example for training a morphologizer. For more details, see
the documentation:
* Training: https://spacy.io/usage/training

Compatible with: spaCy v3.0.0+
Last tested with: v3.0.0
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.morphology import Morphology


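# Note on the CLI: plac (https://pypi.org/project/plac/) drives the command
# line interface here; `plac.call(main)` at the bottom parses the arguments
# declared in `@plac.annotations` and passes them to main().
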
# Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's
# tokenization. If not, you can always add a 'words' key to the annotations
# that specifies the gold-standard tokenization, e.g.:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [
    (
        "I like green eggs",
        {
            "morphs": [
                "PronType=Prs|Person=1",
                "VerbForm=Fin",
                "Degree=Pos",
                "Number=Plur",
            ],
            "pos": ["PRON", "VERB", "ADJ", "NOUN"],
        },
    ),
    (
        "Eat blue ham",
        {
            "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"],
            "pos": ["VERB", "ADJ", "NOUN"],
        },
    ),
    (
        "She was blue",
        {
            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"],
            "pos": ["PRON", "VERB", "ADJ"],
        },
    ),
    (
        "He was blue today",
        {
            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""],
            "pos": ["PRON", "VERB", "ADJ", "ADV"],
        },
    ),
]

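# Note that every token needs an entry in "morphs", even when it has no
# morphological features: in the last example above, "today" is annotated
# with the empty string "" so the list stays aligned with the tokenization.
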
# The POS tags are optional, set `with_pos_tags = False` to omit them for
# this example:
with_pos_tags = True

if not with_pos_tags:
    for i in range(len(TRAIN_DATA)):
        del TRAIN_DATA[i][1]["pos"]


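# For illustration, with `with_pos_tags = False` the second training example
# above reduces to just the morphological features:
#   ("Eat blue ham", {"morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"]})
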
@plac.annotations(
    lang=("ISO Code of language to use", "option", "l", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(lang="en", output_dir=None, n_iter=25):
    """Create a new blank model, set up the pipeline and train the
    morphologizer on the example data.
    """
    nlp = spacy.blank(lang)
    # add the morphologizer to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    morphologizer = nlp.create_pipe("morphologizer")
    nlp.add_pipe(morphologizer)

    # add labels: combine each token's morphological features with its
    # (optional) POS tag into a single label string
    for _, annotations in TRAIN_DATA:
        morph_labels = annotations.get("morphs")
        pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs")))
        assert len(morph_labels) == len(pos_labels)
        for morph, pos in zip(morph_labels, pos_labels):
            morph_dict = Morphology.feats_to_dict(morph)
            if pos:
                morph_dict["POS"] = pos
            morph = Morphology.dict_to_feats(morph_dict)
            morphologizer.add_label(morph)

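    # A sketch of the round-trip above, assuming the FEATS helpers sort
    # fields the way the expected output at the bottom of this file shows:
    #   Morphology.feats_to_dict("PronType=Prs|Person=1")
    #       -> {"PronType": "Prs", "Person": "1"}
    #   Morphology.dict_to_feats({"PronType": "Prs", "Person": "1", "POS": "PRON"})
    #       -> "POS=PRON|Person=1|PronType=Prs"
    # so each label the morphologizer learns is one canonical UFEATS string.
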
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch;
        # compounding(4.0, 32.0, 1.001) yields batch sizes that start at 4
        # and grow by a factor of 1.001 per batch, capped at 32
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
        print("Losses", losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print("Morphs", [(t.text, t.morph) for t in doc])

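    # `t.morph` renders as the token's FEATS string with the predicted POS
    # folded in as a `POS` field, e.g. "POS=PRON|Person=1|PronType=Prs"
    # (see the expected output below).
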
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Morphs", [(t.text, t.morph) for t in doc])


if __name__ == "__main__":
|
|
plac.call(main)
|
|
|
|
# Expected output:
|
|
# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)]
|