From 01f9ae774cca1cdd66353575c9cbe55a91c31813 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jun 2020 14:01:19 +0200 Subject: [PATCH 1/8] small fixes --- spacy/tests/regression/test_issue4313.py | 4 ++++ spacy/tests/regression/test_issue4529.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index 946316d85..46f79d6f5 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -1,5 +1,7 @@ from collections import defaultdict +import pytest + from spacy.pipeline.defaults import default_ner from spacy.pipeline import EntityRecognizer @@ -7,6 +9,8 @@ from spacy.lang.en import English from spacy.tokens import Span +# skipped after removing Beam stuff during the Example/GoldParse refactor +@pytest.mark.skip def test_issue4313(): """ This should not crash or exit with some strange error code """ beam_width = 16 diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py index 89b16a484..0708499de 100644 --- a/spacy/tests/regression/test_issue4529.py +++ b/spacy/tests/regression/test_issue4529.py @@ -1,9 +1,11 @@ import pytest +from spacy.gold import Example + @pytest.mark.parametrize( "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] ) def test_gold_misaligned(en_tokenizer, text, words): doc = en_tokenizer(text) - GoldParse(doc, words=words) + Example.from_dict(doc, {"words": words}) From 64fc840a5dc04161c89e8ce399fe9ceebdc6b0ba Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jun 2020 15:24:40 +0200 Subject: [PATCH 2/8] bugfix tok2vec --- spacy/pipeline/tok2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index afd9b554f..75654145b 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -59,10 +59,10 @@ class Tok2Vec(Pipe): YIELDS (iterator): A sequence of `Doc` objects, in order of input. """ for docs in minibatch(stream, batch_size): - batch = list(batch) + docs = list(docs) tokvecses = self.predict(docs) self.set_annotations(docs, tokvecses) - yield from batch + yield from docs def predict(self, docs): """Return a single tensor for a batch of documents. From 1c71f2310c6f8cd44a62fc6f427592c61227dd1d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jun 2020 15:33:28 +0200 Subject: [PATCH 3/8] fix renames and simple_ner labels --- spacy/cli/train_from_config.py | 14 ++++++-------- spacy/pipeline/simple_ner.py | 5 ++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 95fdb10d5..7dc6143f2 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -231,8 +231,8 @@ def train( # check whether the setting 'exclusive_classes' corresponds to the provided training data if textcat_multilabel: multilabel_found = False - for ex in corpus.train_examples: - cats = ex.doc_annotation.cats + for eg in corpus.train_annotations: + cats = eg.reference.cats textcat_labels.update(cats.keys()) if list(cats.values()).count(1.0) != 1: multilabel_found = True @@ -244,8 +244,8 @@ def train( "mutually exclusive classes more accurately." 
) else: - for ex in corpus.train_examples: - cats = ex.doc_annotation.cats + for eg in corpus.train_annotations: + cats = eg.reference.cats textcat_labels.update(cats.keys()) if list(cats.values()).count(1.0) != 1: msg.fail( @@ -346,10 +346,8 @@ def train( progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) # Clean up the objects to faciliate garbage collection. for eg in batch: - eg.doc = None - eg.goldparse = None - eg.doc_annotation = None - eg.token_annotation = None + eg.reference = None + eg.predicted = None except Exception as e: msg.warn( f"Aborting and saving the final best model. " diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index 692c74a38..3ef6a48ce 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -143,8 +143,7 @@ def _has_ner(eg): def _get_labels(examples): labels = set() for eg in examples: - for ner_tag in eg.token_annotation.entities: + for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True): if ner_tag != 'O' and ner_tag != '-': - _, label = ner_tag.split('-', 1) - labels.add(label) + labels.add(ner_tag) return list(sorted(labels)) From 0b6d45eae1b5f5bc6abe2031954f95227c96dbea Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jun 2020 15:55:00 +0200 Subject: [PATCH 4/8] various small fixes --- examples/training/rehearsal.py | 12 ++++++------ examples/training/train_textcat.py | 5 ++--- spacy/cli/train_from_config.py | 2 +- spacy/tests/pipeline/test_textcat.py | 14 +++++++------- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 92002b5e5..8c94ab14e 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -4,8 +4,10 @@ import random import warnings import srsly import spacy +from spacy.gold import Example from spacy.util import minibatch, compounding +# TODO: further fix & test this script for v.3 ? 
(read_gold_data is never called) LABEL = "ANIMAL" TRAIN_DATA = [ @@ -35,15 +37,13 @@ def read_raw_data(nlp, jsonl_loc): def read_gold_data(nlp, gold_loc): - docs = [] - golds = [] + examples = [] for json_obj in srsly.read_jsonl(gold_loc): doc = nlp.make_doc(json_obj["text"]) ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]] - gold = GoldParse(doc, entities=ents) - docs.append(doc) - golds.append(gold) - return list(zip(docs, golds)) + example = Example.from_dict(doc, {"entities": ents}) + examples.append(example) + return examples def main(model_name, unlabelled_loc): diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 256aaa293..cb65b8c8b 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -62,11 +62,10 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non train_examples = [] for text, cats in zip(train_texts, train_cats): doc = nlp.make_doc(text) - gold = GoldParse(doc, cats=cats) + example = Example.from_dict(doc, {"cats": cats}) for cat in cats: textcat.add_label(cat) - ex = Example.from_gold(gold, doc=doc) - train_examples.append(ex) + train_examples.append(example) with nlp.select_pipes(enable="textcat"): # only train textcat optimizer = nlp.begin_training() diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 7dc6143f2..3a4d28356 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -467,7 +467,7 @@ def train_while_improving( Every iteration, the function yields out a tuple with: - * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs. + * batch: A list of Example objects. * info: A dict with various information about the last update (see below). * is_best_checkpoint: A value in None, False, True, indicating whether this was the best evaluation so far. 
You should use this to save the model diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index fc54cc5b5..6f01ada69 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -11,6 +11,7 @@ from spacy.util import fix_random_seed from ..util import make_tempdir from spacy.pipeline.defaults import default_tok2vec +from ...gold import Example TRAIN_DATA = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), @@ -50,21 +51,20 @@ def test_textcat_learns_multilabel(): cats = {letter: float(w2 == letter) for letter in letters} docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats)) random.shuffle(docs) - model = TextCategorizer(nlp.vocab, width=8) + textcat = TextCategorizer(nlp.vocab, width=8) for letter in letters: - model.add_label(letter) - optimizer = model.begin_training() + textcat.add_label(letter) + optimizer = textcat.begin_training() for i in range(30): losses = {} - Ys = [GoldParse(doc, cats=cats) for doc, cats in docs] - Xs = [doc for doc, cats in docs] - model.update(Xs, Ys, sgd=optimizer, losses=losses) + examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs] + textcat.update(examples, sgd=optimizer, losses=losses) random.shuffle(docs) for w1 in letters: for w2 in letters: doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3) truth = {letter: w2 == letter for letter in letters} - model(doc) + textcat(doc) for cat, score in doc.cats.items(): if not truth[cat]: assert score < 0.5 From e822367cf7fd429b70c0b61abe29b200a26e3f5c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jun 2020 17:47:59 +0200 Subject: [PATCH 5/8] prevent writing dummy values like deps because that could interfer with sent_start values --- spacy/gold/corpus.py | 2 +- spacy/gold/gold_io.pyx | 45 ++++++++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index f3eaabc4e..d55845fb8 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -72,7 +72,7 @@ class GoldCorpus(object): @staticmethod def read_annotations(locs, limit=0): - """ Yield training examples """ + """ Yield training examples as example dicts """ i = 0 for loc in locs: loc = util.ensure_path(loc) diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index ea37df9f2..47f2c0451 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -108,15 +108,21 @@ def json_to_annotations(doc): words.append(token["orth"]) spaces.append(token.get("space", True)) ids.append(token.get('id', sent_start_i + i)) - tags.append(token.get('tag', "-")) - pos.append(token.get("pos", "")) - morphs.append(token.get("morph", "")) - lemmas.append(token.get("lemma", "")) - heads.append(token.get("head", 0) + sent_start_i + i) - labels.append(token.get("dep", "")) - # Ensure ROOT label is case-insensitive - if labels[-1].lower() == "root": - labels[-1] = "ROOT" + if "tag" in token: + tags.append(token["tag"]) + if "pos" in token: + pos.append(token["pos"]) + if "morph" in token: + morphs.append(token["morph"]) + if "lemma" in token: + lemmas.append(token["lemma"]) + if "head" in token: + heads.append(token["head"]) + if "dep" in token: + labels.append(token["dep"]) + # Ensure ROOT label is case-insensitive + if labels[-1].lower() == "root": + labels[-1] = "ROOT" if i == 0: sent_starts.append(1) else: @@ -130,15 +136,24 @@ def json_to_annotations(doc): ids=ids, words=words, spaces=spaces, - tags=tags, - pos=pos, - morphs=morphs, - 
lemmas=lemmas, - heads=heads, - deps=labels, sent_starts=sent_starts, brackets=brackets ) + # avoid including dummy values that looks like gold info was present + if tags: + example["token_annotation"]["tags"] = tags + if pos: + example["token_annotation"]["pos"] = pos + if morphs: + example["token_annotation"]["morphs"] = morphs + if lemmas: + example["token_annotation"]["lemmas"] = lemmas + if heads: + example["token_annotation"]["heads"] = heads + if labels: + example["token_annotation"]["deps"] = labels + if pos: + example["token_annotation"]["pos"] = pos cats = {} for cat in paragraph.get("cats", {}): From d1d6f167763d2a4cbd1ff78b295c61abda68e7c3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jun 2020 19:15:32 +0200 Subject: [PATCH 6/8] fix the fix --- spacy/gold/gold_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 47f2c0451..2d105b6cd 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -117,7 +117,7 @@ def json_to_annotations(doc): if "lemma" in token: lemmas.append(token["lemma"]) if "head" in token: - heads.append(token["head"]) + heads.append(token["head"] + sent_start_i + i) if "dep" in token: labels.append(token["dep"]) # Ensure ROOT label is case-insensitive From 1951921230902ea0e18abf171551b2f8c8895b60 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jun 2020 19:41:53 +0200 Subject: [PATCH 7/8] implement split_sent with aligned SENT_START attribute --- spacy/gold/example.pyx | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 663c8cc6d..402228994 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -117,7 +117,7 @@ cdef class Example: i = j2i_multi[j] if output[i] is None: output[i] = gold_values[j] - if as_string and field not in ["ENT_IOB"]: + if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] return output @@ -146,22 +146,19 @@ cdef class Example: sent_starts and return a list of the new Examples""" if not self.reference.is_sentenced: return [self] - # TODO: Do this for misaligned somehow? - predicted_words = [t.text for t in self.predicted] - reference_words = [t.text for t in self.reference] - if predicted_words != reference_words: - raise NotImplementedError("TODO: Implement this") - # Implement the easy case. + + sent_starts = self.get_aligned("SENT_START") + sent_starts.append(1) # appending virtual start of a next sentence to facilitate search + output = [] - cls = self.__class__ + pred_start = 0 for sent in self.reference.sents: - # I guess for misaligned we just need to use the gold_to_cand? 
- output.append( - cls( - self.predicted[sent.start : sent.end + 1].as_doc(), - sent.as_doc() - ) - ) + new_ref = sent.as_doc() + pred_end = sent_starts.index(1, pred_start+1) # find where the next sentence starts + new_pred = self.predicted[pred_start : pred_end].as_doc() + output.append(Example(new_pred, new_ref)) + pred_start = pred_end + return output property text: From 6ca6d7d6b4d2ea8fa596f3f7be4f244aa7902e15 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jun 2020 20:01:02 +0200 Subject: [PATCH 8/8] test for split sentences with various alignment issues, works --- spacy/tests/test_gold.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 8e1399fd0..d98a93f2f 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -90,6 +90,7 @@ def merged_dict(): return { "ids": [1, 2, 3, 4, 5, 6, 7], "words": ["Hi", "there", "everyone", "It", "is", "just", "me"], + "spaces": [True, True, True, True, True, True, False], "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"], "sent_starts": [1, 0, 0, 1, 0, 0, 0], } @@ -150,6 +151,30 @@ def test_gold_biluo_misalign(en_vocab): assert tags == ["O", "O", "O", "-", "-", "-"] +def test_split_sentences(en_vocab): + words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"] + doc = Doc(en_vocab, words=words) + gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of", "fun"] + sent_starts = [True, False, False, False, False, False, True, False, False, False] + example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts}) + assert example.text == "I flew to San Francisco Valley had loads of fun " + split_examples = example.split_sents() + assert len(split_examples) == 2 + assert split_examples[0].text == "I flew to San Francisco Valley " + assert split_examples[1].text == "had loads of fun " + + words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"] + doc = Doc(en_vocab, words=words) + gold_words = ["I", "flew", "to", "San Francisco", "Valley", "had", "loads of", "fun"] + sent_starts = [True, False, False, False, False, True, False, False] + example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts}) + assert example.text == "I flew to San Francisco Valley had loads of fun " + split_examples = example.split_sents() + assert len(split_examples) == 2 + assert split_examples[0].text == "I flew to San Francisco Valley " + assert split_examples[1].text == "had loads of fun " + + def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): # one-to-many words = ["I", "flew to", "San Francisco Valley", "."] @@ -466,7 +491,7 @@ def _train(train_data): def test_split_sents(merged_dict): nlp = English() example = Example.from_dict( - Doc(nlp.vocab, words=merged_dict["words"]), + Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]), merged_dict ) assert len(get_parses_from_example( @@ -484,6 +509,8 @@ def test_split_sents(merged_dict): split_examples = example.split_sents() assert len(split_examples) == 2 + assert split_examples[0].text == "Hi there everyone " + assert split_examples[1].text == "It is just me" token_annotation_1 = split_examples[0].to_dict()["token_annotation"] assert token_annotation_1["words"] == ["Hi", "there", "everyone"]
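
The recurring change across these patches is the move from pairing a Doc with a GoldParse to building a single Example from the predicted Doc plus a dict of gold annotations. Below is a minimal sketch of that pattern, using only calls that appear in the diffs above (spaCy v3 nightly API at this point in the refactor; exact behaviour may still shift):

    from spacy.lang.en import English
    from spacy.gold import Example

    nlp = English()
    doc = nlp.make_doc("I'm so happy.")

    # Old pattern, removed in these patches:
    #     gold = GoldParse(doc, cats={"POSITIVE": 1.0, "NEGATIVE": 0.0})
    # New pattern: one Example holding both the predicted Doc and a reference
    # Doc built from the annotation dict; components now update on Examples.
    example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})

    print(example.predicted)        # the Doc the model will see
    print(example.reference.cats)   # gold annotations live on the reference Doc

The updated textcat test shows the matching change on the component side: update() now takes the list of Examples directly (textcat.update(examples, sgd=optimizer, losses=losses)) instead of parallel Xs/Ys lists.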
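
The simple_ner change in patch 3 drops the BILUO prefix handling because the values coming out of get_aligned("ENT_TYPE", as_string=True) are plain entity labels, one per token of the predicted Doc. A small sketch of that, with the caveat that the exact value returned for non-entity tokens ("" or None) is an assumption here, hence the defensive filter:

    from spacy.lang.en import English
    from spacy.gold import Example

    nlp = English()
    doc = nlp.make_doc("I like London and Berlin")
    # Character-offset entities, the same format the updated rehearsal.py uses:
    example = Example.from_dict(doc, {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]})

    labels = set()
    for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
        # Entity tokens carry the bare label, so nothing needs to be split off;
        # non-entity tokens may come back as "", None, "O" or "-".
        if ner_tag not in (None, "", "O", "-"):
            labels.add(ner_tag)
    print(sorted(labels))  # expected: ['LOC'] (illustrative)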
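
Patch 6 is a one-line follow-up to patch 5: when the per-sentence JSON is flattened into document-level annotations, a token's "head" (stored in the JSON as an offset relative to the token itself, as I read the conversion code) has to be shifted by both the sentence's start index and the token's position in the sentence. The arithmetic, with made-up indices for illustration:

    # Document-level head index reconstructed from the sentence-relative JSON
    # value, following the expression in json_to_annotations:
    #     heads.append(token["head"] + sent_start_i + i)
    def doc_level_head(relative_head, sent_start_i, i):
        return relative_head + sent_start_i + i

    # Hypothetical case: second token (i=1) of a sentence starting at document
    # token 3, whose head is the previous token (relative head -1).
    print(doc_level_head(-1, sent_start_i=3, i=1))  # -> 3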
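
Patches 7 and 8 make Example.split_sents work when the predicted and reference tokenizations differ: the aligned SENT_START values are projected onto the predicted Doc and used as cut points. A condensed version of the behaviour the new test checks (same data as test_split_sentences, so nothing here is new gold):

    from spacy.lang.en import English
    from spacy.gold import Example
    from spacy.tokens import Doc

    nlp = English()
    words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
    doc = Doc(nlp.vocab, words=words)
    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley",
                  "had", "loads", "of", "fun"]
    sent_starts = [True, False, False, False, False, False,
                   True, False, False, False]
    example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})

    for sub in example.split_sents():
        print(repr(sub.text))
    # 'I flew to San Francisco Valley '
    # 'had loads of fun '

Each sub-example pairs a slice of the predicted Doc (cut where the aligned SENT_START hits 1) with the reference sentence's as_doc(), so training on single sentences no longer requires identical tokenization on both sides.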