From d9289712ba76d4c67450fe1969642416d0ac57f4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 8 Jun 2020 22:28:50 +0200
Subject: [PATCH] * Make GoldCorpus return dict, not Example

* Make Example require a Doc object (previously optional)

Clarify methods in GoldCorpus

WIP refactor Example

Refactor Example.split_sents

Fix test

Fix augment

Update test

Update test

Fix import

Update test_scorer

Update Example
---
 spacy/cli/converters/conllu2json.py           |  10 +-
 spacy/gold/annotation.py                      |   3 +
 spacy/gold/augment.py                         |   7 +-
 spacy/gold/corpus.py                          |  45 ++---
 spacy/gold/example.py                         | 155 +++++++++++-------
 spacy/gold/gold_io.pyx                        |   4 +-
 spacy/syntax/nonproj.pyx                      |   4 +-
 spacy/tests/regression/test_issue1501-2000.py |  15 +-
 spacy/tests/test_gold.py                      |  24 ++-
 spacy/tests/test_scorer.py                    |  18 +-
 spacy/tokens/doc.pyx                          |   2 +
 11 files changed, 176 insertions(+), 111 deletions(-)

diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 1ece755b8..2cf5f7942 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -2,6 +2,7 @@ import re
 
 from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...gold import TokenAnnotation
 from ...language import Language
 from ...tokens import Doc, Token
 from .conll_ner2json import n_sents_info
@@ -284,13 +285,8 @@ def example_from_conllu_sentence(
         spaces.append(t._.merged_spaceafter)
     ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
     ents = biluo_tags_from_offsets(doc, ent_offsets)
-    raw = ""
-    for word, space in zip(words, spaces):
-        raw += word
-        if space:
-            raw += " "
-    example = Example(doc=raw)
-    example.set_token_annotation(
+    example = Example(doc=Doc(vocab, words=words, spaces=spaces))
+    example.token_annotation = TokenAnnotation(
         ids=ids,
         words=words,
         tags=tags,
diff --git a/spacy/gold/annotation.py b/spacy/gold/annotation.py
index 6bae679c3..5f78902ab 100644
--- a/spacy/gold/annotation.py
+++ b/spacy/gold/annotation.py
@@ -1,3 +1,6 @@
+from .iob_utils import biluo_tags_from_offsets
+
+
 class TokenAnnotation:
     def __init__(
         self,
diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py
index 656308214..f938f540f 100644
--- a/spacy/gold/augment.py
+++ b/spacy/gold/augment.py
@@ -1,6 +1,7 @@
 import random
 import itertools
 from .example import Example
+from .annotation import TokenAnnotation
 
 
 def make_orth_variants(nlp, example, orth_variant_level=0.0):
@@ -17,14 +18,14 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
     ndsv = nlp.Defaults.single_orth_variants
     ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
-    variant_example = Example(doc=raw)
+    variant_example = Example(doc=nlp.make_doc(raw))
     token_annotation = example.token_annotation
     words = token_annotation.words
     tags = token_annotation.tags
     if not words or not tags:
         # add the unmodified annotation
         token_dict = token_annotation.to_dict()
-        variant_example.set_token_annotation(**token_dict)
+        variant_example.token_annotation = TokenAnnotation(**token_dict)
     else:
         if lower:
             words = [w.lower() for w in words]
@@ -60,7 +61,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         token_dict = token_annotation.to_dict()
         token_dict["words"] = words
         token_dict["tags"] = tags
-        variant_example.set_token_annotation(**token_dict)
+        variant_example.token_annotation = TokenAnnotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []
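The call-site change in a nutshell: Example no longer accepts raw text, and set_token_annotation() is gone in favour of assigning a TokenAnnotation directly, as the two hunks above do. A minimal sketch of the new pattern (not part of the diff; assumes an English() pipeline and the import paths used on this branch):

    from spacy.gold import Example, TokenAnnotation
    from spacy.lang.en import English

    nlp = English()
    # Before: Example(doc="London is big"); example.set_token_annotation(...)
    # After: the Doc is required up front, the annotation is assigned.
    example = Example(doc=nlp.make_doc("London is big"))
    example.token_annotation = TokenAnnotation(
        ids=[0, 1, 2],
        words=["London", "is", "big"],
        tags=["NNP", "VBZ", "JJ"],
    )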
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 9462f0aa4..df13ab505 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -28,8 +28,8 @@ class GoldCorpus(object):
         """
         self.limit = limit
         if isinstance(train, str) or isinstance(train, Path):
-            train = self.read_examples(self.walk_corpus(train))
-            dev = self.read_examples(self.walk_corpus(dev))
+            train = self.read_annotations(self.walk_corpus(train))
+            dev = self.read_annotations(self.walk_corpus(dev))
         # Write temp directory with one doc per file, so we can shuffle and stream
         self.tmp_dir = Path(tempfile.mkdtemp())
         self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
@@ -71,7 +71,7 @@ class GoldCorpus(object):
         return locs
 
     @staticmethod
-    def read_examples(locs, limit=0):
+    def read_annotations(locs, limit=0):
        """ Yield training examples """
        i = 0
        for loc in locs:
@@ -101,11 +101,11 @@
                         or isinstance(doc, str)
                     ):
                         raise ValueError(Errors.E987.format(type=type(doc)))
-                    examples.append(Example.from_dict(ex_dict, doc=doc))
+                    examples.append(ex_dict)
             elif file_name.endswith("msg"):
                 text, ex_dict = srsly.read_msgpack(loc)
-                examples = [Example.from_dict(ex_dict, doc=text)]
+                examples = [ex_dict]
             else:
                 supported = ("json", "jsonl", "msg")
                 raise ValueError(Errors.E124.format(path=loc, formats=supported))
@@ -123,21 +123,21 @@
             raise ValueError(Errors.E996.format(file=file_name, msg=msg))
 
     @property
-    def dev_examples(self):
+    def dev_annotations(self):
         locs = (self.tmp_dir / "dev").iterdir()
-        yield from self.read_examples(locs, limit=self.limit)
+        yield from self.read_annotations(locs, limit=self.limit)
 
     @property
-    def train_examples(self):
+    def train_annotations(self):
         locs = (self.tmp_dir / "train").iterdir()
-        yield from self.read_examples(locs, limit=self.limit)
+        yield from self.read_annotations(locs, limit=self.limit)
 
     def count_train(self):
         """Returns count of words in train examples"""
         n = 0
         i = 0
-        for example in self.train_examples:
-            n += len(example.token_annotation.words)
+        for eg_dict in self.train_annotations:
+            n += len(eg_dict["token_annotation"]["words"])
             if self.limit and i >= self.limit:
                 break
             i += 1
@@ -154,10 +154,10 @@
     ):
         locs = list((self.tmp_dir / "train").iterdir())
         random.shuffle(locs)
-        train_examples = self.read_examples(locs, limit=self.limit)
-        gold_examples = self.iter_gold_docs(
+        train_annotations = self.read_annotations(locs, limit=self.limit)
+        examples = self.iter_examples(
             nlp,
-            train_examples,
+            train_annotations,
             gold_preproc,
             max_length=max_length,
             noise_level=noise_level,
@@ -165,33 +165,33 @@
             make_projective=True,
             ignore_misaligned=ignore_misaligned,
         )
-        yield from gold_examples
+        yield from examples
 
     def train_dataset_without_preprocessing(
         self, nlp, gold_preproc=False, ignore_misaligned=False
     ):
-        examples = self.iter_gold_docs(
+        examples = self.iter_examples(
             nlp,
-            self.train_examples,
+            self.train_annotations,
             gold_preproc=gold_preproc,
             ignore_misaligned=ignore_misaligned,
         )
         yield from examples
 
     def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
-        examples = self.iter_gold_docs(
+        examples = self.iter_examples(
             nlp,
-            self.dev_examples,
+            self.dev_annotations,
             gold_preproc=gold_preproc,
             ignore_misaligned=ignore_misaligned,
         )
         yield from examples
 
     @classmethod
-    def iter_gold_docs(
+    def iter_examples(
         cls,
         nlp,
-        examples,
+        annotations,
         gold_preproc,
         max_length=None,
         noise_level=0.0,
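The corpus side of the refactor, sketched end to end: read_annotations() now yields plain dicts that can be written to disk and shuffled cheaply, and Example objects are only materialised inside iter_examples(), where an nlp object is available to build the Doc. Illustrative usage with hypothetical file names, assuming training data in spaCy's JSON format:

    from spacy.gold import GoldCorpus
    from spacy.lang.en import English

    nlp = English()
    corpus = GoldCorpus("train.json", "dev.json")  # hypothetical paths
    # dev_dataset() drives iter_examples() under the hood and yields
    # Example objects built from the stored annotation dicts.
    for example in corpus.dev_dataset(nlp):
        print(example.doc, example.token_annotation.words)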
@@ -200,7 +200,8 @@
         ignore_misaligned=False,
     ):
         """ Setting gold_preproc will result in creating a doc per sentence """
-        for example in examples:
+        for eg_dict in annotations:
+            example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"]))
             example_docs = []
             if gold_preproc:
                 split_examples = example.split_sents()
diff --git a/spacy/gold/example.py b/spacy/gold/example.py
index 1d8665572..c8ad58da7 100644
--- a/spacy/gold/example.py
+++ b/spacy/gold/example.py
@@ -1,18 +1,69 @@
+import numpy
 from .annotation import TokenAnnotation, DocAnnotation
+from .iob_utils import spans_from_biluo_tags, biluo_tags_from_offsets
 from .align import Alignment
 from ..errors import Errors, AlignmentError
 from ..tokens import Doc
 
 
+def annotations2doc(doc, doc_annot, tok_annot):
+    # TODO: Improve and test this
+    words = tok_annot.words or [tok.text for tok in doc]
+    fields = {
+        "tags": "TAG",
+        "pos": "POS",
+        "lemmas": "LEMMA",
+        "deps": "DEP",
+    }
+    attrs = []
+    values = []
+    for field, attr in fields.items():
+        value = getattr(tok_annot, field)
+        # Unset fields will be empty lists.
+        if value:
+            attrs.append(attr)
+            values.append([doc.vocab.strings.add(v) for v in value])
+    if tok_annot.heads:
+        attrs.append("HEAD")
+        values.append([h - i for i, h in enumerate(tok_annot.heads)])
+    output = Doc(doc.vocab, words=words)
+    if values:
+        array = numpy.array(values, dtype="uint64")
+        output = output.from_array(attrs, array.T)
+    if tok_annot.entities:
+        output.ents = spans_from_biluo_tags(output, tok_annot.entities)
+    output.cats = dict(doc_annot.cats)
+    # TODO: Calculate token.ent_kb_id from links.
+    # We need to fix this and the doc.ents thing, both should be doc
+    # annotations.
+    return output
+
+
 class Example:
-    def __init__(self, doc=None, doc_annotation=None, token_annotation=None):
+    def __init__(self, doc, doc_annotation=None, token_annotation=None):
         """ Doc can either be text, or an actual Doc """
+        if not isinstance(doc, Doc):
+            raise TypeError("Must pass Doc instance")
+        self.predicted = doc
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
         self.token_annotation = (
             token_annotation if token_annotation else TokenAnnotation()
         )
         self._alignment = None
+        self.reference = annotations2doc(
+            self.doc,
+            self.doc_annotation,
+            self.token_annotation
+        )
+
+    @property
+    def x(self):
+        return self.predicted
+
+    @property
+    def y(self):
+        return self.reference
 
     def _deprecated_get_gold(self, make_projective=False):
         from ..syntax.gold_parse import get_parses_from_example
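The new predicted/reference pairing, sketched (assuming the constructor wiring above; x and y are thin aliases): the Doc you pass in becomes `predicted`, while annotations2doc() builds a second Doc carrying the gold-standard attributes:

    from spacy.gold import Example, TokenAnnotation
    from spacy.lang.en import English

    nlp = English()
    example = Example(
        doc=nlp.make_doc("I like London"),
        token_annotation=TokenAnnotation(
            words=["I", "like", "London"], tags=["PRP", "VBP", "NNP"]
        ),
    )
    print(example.x is example.predicted)   # True
    print([t.tag_ for t in example.y])      # ['PRP', 'VBP', 'NNP']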
@@ -24,6 +75,8 @@ class Example:
     def from_dict(cls, example_dict, doc=None):
         if example_dict is None:
             raise ValueError("Example.from_dict expected dict, received None")
+        if doc is None:
+            raise ValueError("Must pass doc")
         # TODO: This is ridiculous...
         token_dict = example_dict.get("token_annotation", {})
         doc_dict = example_dict.get("doc_annotation", {})
@@ -34,6 +87,10 @@
                 doc_dict[key] = value
             else:
                 token_dict[key] = value
+        if token_dict.get("entities"):
+            entities = token_dict["entities"]
+            if isinstance(entities[0], (list, tuple)):
+                token_dict["entities"] = biluo_tags_from_offsets(doc, entities)
         token_annotation = TokenAnnotation.from_dict(token_dict)
         doc_annotation = DocAnnotation.from_dict(doc_dict)
         return cls(
@@ -45,8 +102,8 @@
         if self._alignment is None:
             if self.doc is None:
                 return None
-            spacy_words = [token.orth_ for token in self.doc]
-            gold_words = self.token_annotation.words
+            spacy_words = [token.orth_ for token in self.predicted]
+            gold_words = [token.orth_ for token in self.reference]
             if gold_words == []:
                 gold_words = spacy_words
             self._alignment = Alignment(spacy_words, gold_words)
@@ -92,34 +149,6 @@
                 output.append(gold_values[gold_i])
         return output
 
-    def set_token_annotation(
-        self,
-        ids=None,
-        words=None,
-        tags=None,
-        pos=None,
-        morphs=None,
-        lemmas=None,
-        heads=None,
-        deps=None,
-        entities=None,
-        sent_starts=None,
-        brackets=None,
-    ):
-        self.token_annotation = TokenAnnotation(
-            ids=ids,
-            words=words,
-            tags=tags,
-            pos=pos,
-            morphs=morphs,
-            lemmas=lemmas,
-            heads=heads,
-            deps=deps,
-            entities=entities,
-            sent_starts=sent_starts,
-            brackets=brackets,
-        )
-
     def set_doc_annotation(self, cats=None, links=None):
         if cats:
             self.doc_annotation.cats = cats
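Worth noting for from_dict(): entities may now be given either as BILUO tags or as (start_char, end_char, label) offsets, and offsets are converted against the supplied Doc. A sketch of the round trip, with made-up character offsets that line up with the example text:

    from spacy.gold import Example
    from spacy.lang.en import English

    nlp = English()
    doc = nlp.make_doc("I flew to London")
    example = Example.from_dict(
        {"words": ["I", "flew", "to", "London"], "entities": [(10, 16, "LOC")]},
        doc=doc,
    )
    print(example.token_annotation.entities)  # ['O', 'O', 'O', 'U-LOC']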
@@ -131,7 +160,6 @@
         sent_starts and return a list of the new Examples"""
         if not self.token_annotation.words:
             return [self]
-        s_example = Example(doc=None, doc_annotation=self.doc_annotation)
         s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
         s_brackets = []
@@ -140,21 +168,25 @@
         split_examples = []
         for i in range(len(t.words)):
             if i > 0 and t.sent_starts[i] == 1:
-                s_example.set_token_annotation(
-                    ids=s_ids,
-                    words=s_words,
-                    tags=s_tags,
-                    pos=s_pos,
-                    morphs=s_morphs,
-                    lemmas=s_lemmas,
-                    heads=s_heads,
-                    deps=s_deps,
-                    entities=s_ents,
-                    sent_starts=s_sent_starts,
-                    brackets=s_brackets,
+                split_examples.append(
+                    Example(
+                        doc=Doc(self.doc.vocab, words=s_words),
+                        token_annotation=TokenAnnotation(
+                            ids=s_ids,
+                            words=s_words,
+                            tags=s_tags,
+                            pos=s_pos,
+                            morphs=s_morphs,
+                            lemmas=s_lemmas,
+                            heads=s_heads,
+                            deps=s_deps,
+                            entities=s_ents,
+                            sent_starts=s_sent_starts,
+                            brackets=s_brackets,
+                        ),
+                        doc_annotation=self.doc_annotation
+                    )
                 )
-                split_examples.append(s_example)
-                s_example = Example(doc=None, doc_annotation=self.doc_annotation)
                 s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
                 s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
                 s_sent_starts, s_brackets = [], []
@@ -172,20 +204,25 @@
             for b_end, b_label in t.brackets_by_start.get(i, []):
                 s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
             i += 1
-        s_example.set_token_annotation(
-            ids=s_ids,
-            words=s_words,
-            tags=s_tags,
-            pos=s_pos,
-            morphs=s_morphs,
-            lemmas=s_lemmas,
-            heads=s_heads,
-            deps=s_deps,
-            entities=s_ents,
-            sent_starts=s_sent_starts,
-            brackets=s_brackets,
+        split_examples.append(
+            Example(
+                doc=Doc(self.doc.vocab, words=s_words),
+                token_annotation=TokenAnnotation(
+                    ids=s_ids,
+                    words=s_words,
+                    tags=s_tags,
+                    pos=s_pos,
+                    morphs=s_morphs,
+                    lemmas=s_lemmas,
+                    heads=s_heads,
+                    deps=s_deps,
+                    entities=s_ents,
+                    sent_starts=s_sent_starts,
+                    brackets=s_brackets,
+                ),
+                doc_annotation=self.doc_annotation
+            )
         )
-        split_examples.append(s_example)
         return split_examples
 
     @classmethod
diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx
index 424e44f72..8aa5f4017 100644
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@@ -76,12 +76,12 @@ def read_json_file(loc, docs_filter=None, limit=None):
                 yield json_data
 
 
-def json_to_examples(doc):
+def json_to_annotations(doc):
     """Convert an item in the JSON-formatted training data to the format
     used by GoldParse.
 
     doc (dict): One entry in the training data.
-    YIELDS (Example): The reformatted data - one training example per paragraph
+    YIELDS (dict): The reformatted data - one training example per paragraph
     """
     for paragraph in doc["paragraphs"]:
         example = {"text": paragraph.get("raw", None)}
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 1edb2e65c..a91176f44 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -108,7 +108,7 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
             proj_token_dict = example.token_annotation.to_dict()
             proj_token_dict["heads"] = proj_heads
             proj_token_dict["deps"] = deco_deps
-            new_example.set_token_annotation(**proj_token_dict)
+            new_example.token_annotation = TokenAnnotation(**proj_token_dict)
             preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -216,6 +216,6 @@ def _filter_labels(examples, cutoff, freqs):
                 filtered_labels.append(label)
             filtered_token_dict = example.token_annotation.to_dict()
             filtered_token_dict["deps"] = filtered_labels
-            new_example.set_token_annotation(**filtered_token_dict)
+            new_example.token_annotation = TokenAnnotation(**filtered_token_dict)
             filtered.append(new_example)
     return filtered
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 5a76697bc..ed1f33351 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -3,7 +3,7 @@ import gc
 import numpy
 import copy
 
-from spacy.gold import Example
+from spacy.gold import Example, TokenAnnotation
 from spacy.lang.en import English
 from spacy.lang.en.stop_words import STOP_WORDS
 from spacy.lang.lex_attrs import is_stop
@@ -271,9 +271,16 @@ def test_issue1963(en_tokenizer):
 
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab(), default_ner())
-    example = Example(doc=None)
-    example.set_token_annotation(
-        ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
+    example = Example(
+        doc=Doc(ner.vocab, words=["word"]),
+        token_annotation=TokenAnnotation(
+            ids=[0],
+            words=["word"],
+            tags=["tag"],
+            heads=[0],
+            deps=["dep"],
+            entities=[label]
+        )
     )
     ner.moves.get_actions(gold_parses=[example])
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 4b4250179..29ddc7456 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -95,6 +95,12 @@ def merged_dict():
     }
 
 
+@pytest.fixture
+def vocab():
+    nlp = English()
+    return nlp.vocab
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -475,8 +481,10 @@ def _train(train_data):
 
 def test_split_sents(merged_dict):
     nlp = English()
-    example = Example()
-    example.set_token_annotation(**merged_dict)
+    example = Example.from_dict(
+        merged_dict,
+        doc=Doc(nlp.vocab, words=merged_dict["words"])
+    )
     assert len(get_parses_from_example(
         example,
         merge=False,
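split_sents() now returns fully formed Examples, each with its own Doc built from the sentence's words, instead of mutating a shared s_example. A sketch with made-up values in the same shape as the merged_dict fixture (ids, words, tags, sent_starts):

    from spacy.gold import Example
    from spacy.lang.en import English
    from spacy.tokens import Doc

    nlp = English()
    merged = {
        "ids": [1, 2, 3, 4, 5, 6, 7],
        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
        "sent_starts": [1, 0, 0, 1, 0, 0, 0],
    }
    example = Example.from_dict(merged, doc=Doc(nlp.vocab, words=merged["words"]))
    for sent in example.split_sents():
        print(sent.token_annotation.words)
    # ['Hi', 'there', 'everyone'] then ['It', 'is', 'just', 'me']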
@@ -506,13 +514,15 @@
     assert token_annotation_2.sent_starts == [1, 0, 0, 0]
 
 
-def test_tuples_to_example(merged_dict):
-    ex = Example()
-    ex.set_token_annotation(**merged_dict)
+def test_tuples_to_example(vocab, merged_dict):
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
-    ex.set_doc_annotation(cats=cats)
+    merged_dict = dict(merged_dict)
+    merged_dict["cats"] = cats
+    ex = Example.from_dict(
+        merged_dict,
+        doc=Doc(vocab, words=merged_dict["words"])
+    )
     ex_dict = ex.to_dict()
-
     assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
     assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
     assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index d750a8202..5eaf8d5b3 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -1,12 +1,14 @@
 from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
-from spacy.gold import Example, GoldParse
+from spacy.gold import Example, GoldParse, TokenAnnotation
+from spacy.gold.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
 from spacy.lang.en import English
 
+
 test_las_apple = [
     [
         "Apple is looking at buying U.K. startup for $ 1 billion",
@@ -134,8 +136,11 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
         )
-        ex = Example(doc=doc)
-        ex.set_token_annotation(entities=annot["entities"])
+        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        ex = Example(
+            doc=doc,
+            token_annotation=TokenAnnotation(entities=entities)
+        )
         scorer.score(ex)
     results = scorer.scores
 
@@ -155,8 +160,11 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
         )
-        ex = Example(doc=doc)
-        ex.set_token_annotation(entities=annot["entities"])
+        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        ex = Example(
+            doc=doc,
+            token_annotation=TokenAnnotation(entities=entities)
+        )
         scorer.score(ex)
     results = scorer.scores
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 3aa27e451..81cef4492 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -799,6 +799,8 @@ cdef class Doc:
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
+        if length != len(self):
+            raise ValueError("Cannot set array values: length must match the number of tokens in the document.")
         # Get set up for fast loading
         cdef Pool mem = Pool()
         cdef int n_attrs = len(attrs)
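Finally, the new guard in Doc.from_array in action (illustrative only; the zero TAG values just stand in for real attribute hashes): an array whose row count differs from the number of tokens is now rejected up front instead of silently corrupting token data:

    import numpy
    from spacy.attrs import TAG
    from spacy.lang.en import English

    nlp = English()
    doc = nlp.make_doc("London is big")
    doc.from_array([TAG], numpy.zeros((3, 1), dtype="uint64"))  # 3 rows, 3 tokens: OK
    try:
        doc.from_array([TAG], numpy.zeros((2, 1), dtype="uint64"))  # mismatch
    except ValueError as err:
        print(err)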