Fix --gold-preproc train cli command (#4392)

* Fix get labels for textcat

* Fix char_embed for gpu

* Revert "Fix char_embed for gpu"

This reverts commit 055b9a9e85.

* Fix passing of cats in gold.pyx

* Revert "Match pop with append for training format (#4516)"

This reverts commit 8e7414dace.

* Fix popping gold parses

* Fix handling of cats in gold tuples

* Fix name

* Fix ner_multitask_objective script

* Add test for 4402
Matthew Honnibal 2019-10-27 21:58:50 +01:00 committed by GitHub
parent 8e7414dace
commit f8d740bfb1
8 changed files with 43 additions and 78 deletions
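
The diffs below all revolve around one format change: the document-level cats dict no longer rides along as a trailing element of each paragraph's sentence list (which every consumer had to pop() before iterating and append() back afterwards), but is stored per sentence next to the brackets as a (cats, brackets) pair. A minimal sketch of the two layouts, with invented annotation values (real tuples come from spacy.gold.read_json_file):

# Minimal sketch of the format change; annotation values are made up.
annots = [[0, 1], ["London", "calling"], ["NNP", "VBG"],
          [1, 1], ["nsubj", "ROOT"], ["U-GPE", "O"]]
cats = {"MUSIC": 1.0}
brackets = []

# Old layout: cats appended to the paragraph; every consumer had to
# pop() it before iterating and append() it back to restore the data.
old_paragraph = [(annots, brackets), cats]
trailing_cats = old_paragraph.pop()
for sent_tuples, sent_brackets in old_paragraph:
    pass  # use sent_tuples here
old_paragraph.append(trailing_cats)  # restore original data

# New layout: cats sits beside each sentence's brackets, so it can be
# destructured in the loop header without mutating the training data.
new_paragraph = [(annots, (cats, brackets))]
for sent_tuples, (sent_cats, sent_brackets) in new_paragraph:
    pass  # use sent_tuples and sent_cats here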

examples/training/ner_multitask_objective.py View File

@@ -18,21 +18,21 @@ during training. We discard the auxiliary model before run-time.
 The specific example here is not necessarily a good idea --- but it shows
 how an arbitrary objective function for some word can be used.
-Developed for spaCy 2.0.6 and last tested for 2.2.2
+Developed and tested for spaCy 2.0.6. Updated for v2.2.2
 """
 import random
 import plac
 import spacy
 import os.path
-from spacy.gold import read_json_file, GoldParse
 from spacy.tokens import Doc
+from spacy.gold import read_json_file, GoldParse
 random.seed(0)
 PWD = os.path.dirname(__file__)
-TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
+TRAIN_DATA = list(read_json_file(
+    os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
 def get_position_label(i, words, tags, heads, labels, ents):
@@ -57,22 +57,17 @@ def main(n_iter=10):
     ner = nlp.create_pipe("ner")
     ner.add_multitask_objective(get_position_label)
     nlp.add_pipe(ner)
     print(nlp.pipeline)
-    _, sents = TRAIN_DATA[0]
-    print("Create data, # of sentences =", len(sents) - 1)  # not counting the cats attribute
+    print("Create data", len(TRAIN_DATA))
     optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for raw_text, annots_brackets in TRAIN_DATA:
-            cats = annots_brackets.pop()
-            for annotations, _ in annots_brackets:
-                annotations.append(cats)  # temporarily add it here for from_annot_tuples to work
+        for text, annot_brackets in TRAIN_DATA:
+            for annotations, _ in annot_brackets:
                 doc = Doc(nlp.vocab, words=annotations[1])
                 gold = GoldParse.from_annot_tuples(doc, annotations)
-                annotations.pop()  # restore data
                 nlp.update(
                     [doc],  # batch of texts
                     [gold],  # batch of annotations
@@ -80,14 +75,14 @@ def main(n_iter=10):
                     sgd=optimizer,  # callable to update weights
                     losses=losses,
                 )
-            annots_brackets.append(cats)  # restore data
         print(losses.get("nn_labeller", 0.0), losses["ner"])
     # test the trained model
     for text, _ in TRAIN_DATA:
-        doc = nlp(text)
-        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
-        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
+        if text is not None:
+            doc = nlp(text)
+            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
+            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 if __name__ == "__main__":

spacy/gold.pyx View File

@@ -55,22 +55,22 @@ def tags_to_entities(tags):
 def merge_sents(sents):
-    m_sents = [[], [], [], [], [], []]
+    m_deps = [[], [], [], [], [], []]
+    m_cats = {}
     m_brackets = []
-    m_cats = sents.pop()
     i = 0
-    for (ids, words, tags, heads, labels, ner), brackets in sents:
-        m_sents[0].extend(id_ + i for id_ in ids)
-        m_sents[1].extend(words)
-        m_sents[2].extend(tags)
-        m_sents[3].extend(head + i for head in heads)
-        m_sents[4].extend(labels)
-        m_sents[5].extend(ner)
+    for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
+        m_deps[0].extend(id_ + i for id_ in ids)
+        m_deps[1].extend(words)
+        m_deps[2].extend(tags)
+        m_deps[3].extend(head + i for head in heads)
+        m_deps[4].extend(labels)
+        m_deps[5].extend(ner)
         m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
                           for b in brackets)
+        m_cats.update(cats)
         i += len(ids)
-    sents.append(m_cats)  # restore original data
-    return [[(m_sents, m_brackets)], m_cats]
+    return [(m_deps, (m_cats, m_brackets))]
 _NORM_MAP = {"``": '"', "''": '"'}
@@ -242,13 +242,11 @@ class GoldCorpus(object):
         n = 0
         i = 0
         for raw_text, paragraph_tuples in self.train_tuples:
-            cats = paragraph_tuples.pop()
             for sent_tuples, brackets in paragraph_tuples:
                 n += len(sent_tuples[1])
             if self.limit and i >= self.limit:
                 break
             i += 1
-            paragraph_tuples.append(cats)  # restore original data
         return n
     def train_docs(self, nlp, gold_preproc=False, max_length=None,
@@ -289,36 +287,26 @@
     @classmethod
     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
-        cats = paragraph_tuples.pop()
         if raw_text is not None:
             raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
             raw_text = add_noise(raw_text, noise_level)
-            result = [nlp.make_doc(raw_text)], paragraph_tuples
+            return [nlp.make_doc(raw_text)], paragraph_tuples
         else:
             docs = []
             raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
-            result = [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
+            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
                     for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
-        paragraph_tuples.append(cats)
-        return result
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples, make_projective):
-        cats = paragraph_tuples.pop()
         if len(docs) != len(paragraph_tuples):
             n_annots = len(paragraph_tuples)
             raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots))
-        result = []
-        for doc, brack_annot in zip(docs, paragraph_tuples):
-            if len(brack_annot) == 1:
-                brack_annot = brack_annot[0]
-            sent_tuples, brackets = brack_annot
-            sent_tuples.append(cats)
-            result.append(GoldParse.from_annot_tuples(doc, sent_tuples, make_projective=make_projective))
-            sent_tuples.pop()
-        paragraph_tuples.append(cats)
-        return result
+        return [GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats,
+                                            make_projective=make_projective)
+                for doc, (sent_tuples, (cats, brackets))
+                in zip(docs, paragraph_tuples)]
 def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
@@ -333,7 +321,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
     # modify words in paragraph_tuples
     variant_paragraph_tuples = []
     for sent_tuples, brackets in paragraph_tuples:
-        ids, words, tags, heads, labels, ner, cats = sent_tuples
+        ids, words, tags, heads, labels, ner = sent_tuples
         if lower:
             words = [w.lower() for w in words]
         # single variants
@@ -362,7 +350,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
                     pair_idx = pair.index(words[word_idx])
                 words[word_idx] = punct_choices[punct_idx][pair_idx]
-        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner, cats), brackets))
+        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets))
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []
@@ -381,7 +369,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
                 variant_raw += raw[raw_idx]
                 raw_idx += 1
         for sent_tuples, brackets in variant_paragraph_tuples:
-            ids, words, tags, heads, labels, ner, cats = sent_tuples
+            ids, words, tags, heads, labels, ner = sent_tuples
             for word in words:
                 match_found = False
                 # add identical word
@@ -452,6 +440,9 @@ def json_to_tuple(doc):
     paragraphs = []
     for paragraph in doc["paragraphs"]:
         sents = []
+        cats = {}
+        for cat in paragraph.get("cats", {}):
+            cats[cat["label"]] = cat["value"]
         for sent in paragraph["sentences"]:
             words = []
             ids = []
@@ -471,11 +462,7 @@ def json_to_tuple(doc):
                 ner.append(token.get("ner", "-"))
             sents.append([
                 [ids, words, tags, heads, labels, ner],
-                sent.get("brackets", [])])
-        cats = {}
-        for cat in paragraph.get("cats", {}):
-            cats[cat["label"]] = cat["value"]
-        sents.append(cats)
+                [cats, sent.get("brackets", [])]])
         if sents:
             yield [paragraph.get("raw", None), sents]
@@ -589,8 +576,8 @@ cdef class GoldParse:
     DOCS: https://spacy.io/api/goldparse
     """
     @classmethod
-    def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
-        _, words, tags, heads, deps, entities, cats = annot_tuples
+    def from_annot_tuples(cls, doc, annot_tuples, cats=None, make_projective=False):
+        _, words, tags, heads, deps, entities = annot_tuples
         return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
                    entities=entities, cats=cats,
                    make_projective=make_projective)
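
For reference, a short sketch of calling the updated classmethod. The pipeline, tokens and category label here are invented; only the cats keyword argument is taken from the new signature above:

import spacy
from spacy.gold import GoldParse
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["I", "like", "London"]
# annot_tuples no longer carries a trailing cats element; the invented
# values follow the (ids, words, tags, heads, deps, ner) order.
annot_tuples = [[0, 1, 2], words, ["PRP", "VBP", "NNP"],
                [1, 1, 1], ["nsubj", "ROOT", "dobj"],
                ["O", "O", "U-GPE"]]
doc = Doc(nlp.vocab, words=words)
gold = GoldParse.from_annot_tuples(doc, annot_tuples, cats={"TRAVEL": 1.0})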

spacy/language.py View File

@@ -598,11 +598,10 @@ class Language(object):
         # Populate vocab
         else:
             for _, annots_brackets in get_gold_tuples():
-                cats = annots_brackets.pop()
+                _ = annots_brackets.pop()
                 for annots, _ in annots_brackets:
                     for word in annots[1]:
                         _ = self.vocab[word]  # noqa: F841
-                annots_brackets.append(cats)  # restore original data
         if cfg.get("device", -1) >= 0:
             util.use_gpu(cfg["device"])
             if self.vocab.vectors.data.shape[1] >= 1:

spacy/pipeline/pipes.pyx View File

@@ -517,7 +517,6 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for raw_text, annots_brackets in get_gold_tuples():
-            cats = annots_brackets.pop()
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
                 for tag in tags:
@@ -525,7 +524,6 @@ class Tagger(Pipe):
                         new_tag_map[tag] = orig_tag_map[tag]
                     else:
                         new_tag_map[tag] = {POS: X}
-            annots_brackets.append(cats)  # restore original data
         cdef Vocab vocab = self.vocab
         if new_tag_map:
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
@@ -704,14 +702,12 @@ class MultitaskObjective(Tagger):
                        sgd=None, **kwargs):
         gold_tuples = nonproj.preprocess_training_data(get_gold_tuples())
         for raw_text, annots_brackets in gold_tuples:
-            cats = annots_brackets.pop()
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
                 for i in range(len(ids)):
                     label = self.make_label(i, words, tags, heads, deps, ents)
                     if label is not None and label not in self.labels:
                         self.labels[label] = len(self.labels)
-            annots_brackets.append(cats)
         if self.model is True:
             token_vector_width = util.env_opt("token_vector_width")
             self.model = self.Model(len(self.labels), tok2vec=tok2vec)
@@ -1037,10 +1033,10 @@ class TextCategorizer(Pipe):
         return 1
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
-        for raw_text, annots_brackets in get_gold_tuples():
-            cats = annots_brackets[-1]
-            for cat in cats:
-                self.add_label(cat)
+        for raw_text, annot_brackets in get_gold_tuples():
+            for _, (cats, _2) in annot_brackets:
+                for cat in cats:
+                    self.add_label(cat)
         if self.model is True:
             self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
         self.require_labels()
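
The rewritten TextCategorizer.begin_training loop can be exercised on its own. This standalone sketch reuses the invented paragraph layout from the first example, with a plain set standing in for self.add_label:

# Standalone sketch of the new per-sentence label discovery.
gold_tuples = [
    ("London calling",
     [(([0, 1], ["London", "calling"], ["NNP", "VBG"],
        [1, 1], ["nsubj", "ROOT"], ["U-GPE", "O"]),
       ({"MUSIC": 1.0}, []))]),
]
labels = set()
for raw_text, annot_brackets in gold_tuples:
    for _, (cats, _brackets) in annot_brackets:
        labels.update(cats)
print(sorted(labels))  # ['MUSIC']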

spacy/syntax/arc_eager.pyx View File

@@ -342,7 +342,6 @@ cdef class ArcEager(TransitionSystem):
                     actions[RIGHT][label] = 1
                     actions[REDUCE][label] = 1
         for raw_text, sents in kwargs.get('gold_parses', []):
-            cats = sents.pop()
             for (ids, words, tags, heads, labels, iob), ctnts in sents:
                 heads, labels = nonproj.projectivize(heads, labels)
                 for child, head, label in zip(ids, heads, labels):
@@ -356,7 +355,6 @@ cdef class ArcEager(TransitionSystem):
                     elif head > child:
                         actions[LEFT][label] += 1
                         actions[SHIFT][''] += 1
-            sents.append(cats)  # restore original data
         if min_freq is not None:
             for action, label_freqs in actions.items():
                 for label, freq in list(label_freqs.items()):

spacy/syntax/ner.pyx View File

@@ -73,14 +73,12 @@ cdef class BiluoPushDown(TransitionSystem):
                 actions[action][entity_type] = 1
         moves = ('M', 'B', 'I', 'L', 'U')
         for raw_text, sents in kwargs.get('gold_parses', []):
-            cats = sents.pop()
             for (ids, words, tags, heads, labels, biluo), _ in sents:
                 for i, ner_tag in enumerate(biluo):
                     if ner_tag != 'O' and ner_tag != '-':
                         _, label = ner_tag.split('-', 1)
                         for action in (BEGIN, IN, LAST, UNIT):
                             actions[action][label] += 1
-            sents.append(cats)  # restore original data
         return actions
     @property

spacy/syntax/nn_parser.pyx View File

@@ -606,13 +606,11 @@ cdef class Parser:
         doc_sample = []
         gold_sample = []
         for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
-            cats = annots_brackets.pop()
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
                 doc_sample.append(Doc(self.vocab, words=words))
                 gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
                                              heads=heads, deps=deps, entities=ents))
-            annots_brackets.append(cats)  # restore original data
         self.model.begin_training(doc_sample, gold_sample)
         if pipeline is not None:
             self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)

spacy/syntax/nonproj.pyx View File

@@ -97,7 +97,6 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
     freqs = {}
     for raw_text, sents in gold_tuples:
         prepro_sents = []
-        cats = sents.pop()
         for (ids, words, tags, heads, labels, iob), ctnts in sents:
             proj_heads, deco_labels = projectivize(heads, labels)
             # set the label to ROOT for each root dependent
@@ -110,8 +109,6 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
                 freqs[label] = freqs.get(label, 0) + 1
             prepro_sents.append(
                 ((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
-        sents.append(cats)
-        prepro_sents.append(cats)
         preprocessed.append((raw_text, prepro_sents))
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -212,7 +209,6 @@ def _filter_labels(gold_tuples, cutoff, freqs):
     filtered = []
     for raw_text, sents in gold_tuples:
         filtered_sents = []
-        cats = sents.pop()
         for (ids, words, tags, heads, labels, iob), ctnts in sents:
             filtered_labels = []
             for label in labels:
@@ -222,7 +218,5 @@ def _filter_labels(gold_tuples, cutoff, freqs):
                     filtered_labels.append(label)
             filtered_sents.append(
                 ((ids, words, tags, heads, filtered_labels, iob), ctnts))
-        sents.append(cats)
-        filtered_sents.append(cats)
         filtered.append((raw_text, filtered_sents))
     return filtered