Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-11 17:10:36 +03:00)
* Make GoldCorpus return dict, not Example
* Make Example require a Doc object (previously optional)
* Clarify methods in GoldCorpus
* WIP refactor Example
* Refactor Example.split_sents
* Fix test
* Fix augment
* Update test
* Update test
* Fix import
* Update test_scorer
* Update Example
This commit is contained in:
parent 084271c9e9
commit d9289712ba
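
In short: Example previously accepted raw text (or nothing) and grew its annotations through set_token_annotation(); after this commit it requires a real Doc up front and takes a TokenAnnotation at construction time, while GoldCorpus passes plain dicts around until they are materialized into Examples. A minimal before/after sketch, assuming this branch's spacy.gold API (names taken from the diff below):

# Hypothetical usage sketch; not part of the commit itself.
from spacy.gold import Example, TokenAnnotation
from spacy.lang.en import English

nlp = English()
words = ["London", "calling"]

# Before: text was allowed, annotation attached after the fact.
#   example = Example(doc="London calling")
#   example.set_token_annotation(words=words, tags=["NNP", "VBG"])

# After: a Doc is mandatory and the annotation is a constructor argument.
example = Example(
    doc=nlp.make_doc("London calling"),
    token_annotation=TokenAnnotation(words=words, tags=["NNP", "VBG"]),
)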
@@ -2,6 +2,7 @@ import re

 from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...gold import TokenAnnotation
 from ...language import Language
 from ...tokens import Doc, Token
 from .conll_ner2json import n_sents_info

@@ -284,13 +285,8 @@ def example_from_conllu_sentence(
         spaces.append(t._.merged_spaceafter)
     ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
     ents = biluo_tags_from_offsets(doc, ent_offsets)
-    raw = ""
-    for word, space in zip(words, spaces):
-        raw += word
-        if space:
-            raw += " "
-    example = Example(doc=raw)
-    example.set_token_annotation(
+    example = Example(doc=Doc(vocab, words=words, spaces=spaces))
+    example.token_annotation = TokenAnnotation(
         ids=ids,
         words=words,
         tags=tags,

@@ -1,3 +1,6 @@
+from .iob_utils import biluo_tags_from_offsets
+
+
 class TokenAnnotation:
     def __init__(
         self,

@@ -1,6 +1,7 @@
 import random
 import itertools
 from .example import Example
+from .annotation import TokenAnnotation


 def make_orth_variants(nlp, example, orth_variant_level=0.0):

@@ -17,14 +18,14 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
     ndsv = nlp.Defaults.single_orth_variants
     ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
-    variant_example = Example(doc=raw)
+    variant_example = Example(doc=nlp.make_doc(raw))
     token_annotation = example.token_annotation
     words = token_annotation.words
     tags = token_annotation.tags
     if not words or not tags:
         # add the unmodified annotation
         token_dict = token_annotation.to_dict()
-        variant_example.set_token_annotation(**token_dict)
+        variant_example.token_annotation = TokenAnnotation(**token_dict)
     else:
         if lower:
             words = [w.lower() for w in words]

@@ -60,7 +61,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         token_dict = token_annotation.to_dict()
         token_dict["words"] = words
         token_dict["tags"] = tags
-        variant_example.set_token_annotation(**token_dict)
+        variant_example.token_annotation = TokenAnnotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []

@@ -28,8 +28,8 @@ class GoldCorpus(object):
         """
         self.limit = limit
         if isinstance(train, str) or isinstance(train, Path):
-            train = self.read_examples(self.walk_corpus(train))
-            dev = self.read_examples(self.walk_corpus(dev))
+            train = self.read_annotations(self.walk_corpus(train))
+            dev = self.read_annotations(self.walk_corpus(dev))
         # Write temp directory with one doc per file, so we can shuffle and stream
         self.tmp_dir = Path(tempfile.mkdtemp())
         self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)

@@ -71,7 +71,7 @@ class GoldCorpus(object):
         return locs

     @staticmethod
-    def read_examples(locs, limit=0):
+    def read_annotations(locs, limit=0):
         """ Yield training examples """
         i = 0
         for loc in locs:

@@ -101,11 +101,11 @@ class GoldCorpus(object):
                     or isinstance(doc, str)
                 ):
                     raise ValueError(Errors.E987.format(type=type(doc)))
-                examples.append(Example.from_dict(ex_dict, doc=doc))
+                examples.append(ex_dict)

         elif file_name.endswith("msg"):
             text, ex_dict = srsly.read_msgpack(loc)
-            examples = [Example.from_dict(ex_dict, doc=text)]
+            examples = [ex_dict]
         else:
             supported = ("json", "jsonl", "msg")
             raise ValueError(Errors.E124.format(path=loc, formats=supported))

@@ -123,21 +123,21 @@ class GoldCorpus(object):
             raise ValueError(Errors.E996.format(file=file_name, msg=msg))

     @property
-    def dev_examples(self):
+    def dev_annotations(self):
         locs = (self.tmp_dir / "dev").iterdir()
-        yield from self.read_examples(locs, limit=self.limit)
+        yield from self.read_annotations(locs, limit=self.limit)

     @property
-    def train_examples(self):
+    def train_annotations(self):
         locs = (self.tmp_dir / "train").iterdir()
-        yield from self.read_examples(locs, limit=self.limit)
+        yield from self.read_annotations(locs, limit=self.limit)

     def count_train(self):
         """Returns count of words in train examples"""
         n = 0
         i = 0
-        for example in self.train_examples:
-            n += len(example.token_annotation.words)
+        for eg_dict in self.train_annotations:
+            n += len(eg_dict["token_annotation"]["words"])
             if self.limit and i >= self.limit:
                 break
             i += 1

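Note: count_train now walks plain dicts rather than Example objects. A sketch of the dict shape it expects, inferred from the lines above (the "text" key is read later by iter_examples):

eg_dict = {
    "text": "I flew to London .",
    "token_annotation": {"words": ["I", "flew", "to", "London", "."]},
}
assert len(eg_dict["token_annotation"]["words"]) == 5
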
@@ -154,10 +154,10 @@ class GoldCorpus(object):
     ):
         locs = list((self.tmp_dir / "train").iterdir())
         random.shuffle(locs)
-        train_examples = self.read_examples(locs, limit=self.limit)
-        gold_examples = self.iter_gold_docs(
+        train_annotations = self.read_annotations(locs, limit=self.limit)
+        examples = self.iter_examples(
             nlp,
-            train_examples,
+            train_annotations,
             gold_preproc,
             max_length=max_length,
             noise_level=noise_level,

@@ -165,33 +165,33 @@ class GoldCorpus(object):
             make_projective=True,
             ignore_misaligned=ignore_misaligned,
         )
-        yield from gold_examples
+        yield from examples

     def train_dataset_without_preprocessing(
         self, nlp, gold_preproc=False, ignore_misaligned=False
     ):
-        examples = self.iter_gold_docs(
+        examples = self.iter_examples(
             nlp,
-            self.train_examples,
+            self.train_annotations,
             gold_preproc=gold_preproc,
             ignore_misaligned=ignore_misaligned,
         )
         yield from examples

     def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
-        examples = self.iter_gold_docs(
+        examples = self.iter_examples(
             nlp,
-            self.dev_examples,
+            self.dev_annotations,
             gold_preproc=gold_preproc,
             ignore_misaligned=ignore_misaligned,
         )
         yield from examples

     @classmethod
-    def iter_gold_docs(
+    def iter_examples(
         cls,
         nlp,
-        examples,
+        annotations,
         gold_preproc,
         max_length=None,
         noise_level=0.0,

@@ -200,7 +200,8 @@ class GoldCorpus(object):
         ignore_misaligned=False,
     ):
         """ Setting gold_preproc will result in creating a doc per sentence """
-        for example in examples:
+        for eg_dict in annotations:
+            example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"]))
             example_docs = []
             if gold_preproc:
                 split_examples = example.split_sents()

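Note: this is the point where the raw annotation dicts become Example objects. A minimal sketch of the same call outside GoldCorpus, assuming this branch's Example.from_dict signature as shown in this commit:

from spacy.gold import Example
from spacy.lang.en import English

nlp = English()
eg_dict = {
    "text": "I like London",
    "token_annotation": {"words": ["I", "like", "London"]},
}
example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"]))
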
@@ -1,18 +1,69 @@
 import numpy
 from .annotation import TokenAnnotation, DocAnnotation
+from .iob_utils import spans_from_biluo_tags, biluo_tags_from_offsets
 from .align import Alignment
 from ..errors import Errors, AlignmentError
+from ..tokens import Doc


+def annotations2doc(doc, doc_annot, tok_annot):
+    # TODO: Improve and test this
+    words = tok_annot.words or [tok.text for tok in doc]
+    fields = {
+        "tags": "TAG",
+        "pos": "POS",
+        "lemmas": "LEMMA",
+        "deps": "DEP",
+    }
+    attrs = []
+    values = []
+    for field, attr in fields.items():
+        value = getattr(tok_annot, field)
+        # Unset fields will be empty lists.
+        if value:
+            attrs.append(attr)
+            values.append([doc.vocab.strings.add(v) for v in value])
+    if tok_annot.heads:
+        attrs.append("HEAD")
+        values.append([h - i for i, h in enumerate(tok_annot.heads)])
+    output = Doc(doc.vocab, words=words)
+    if values:
+        array = numpy.array(values, dtype="uint64")
+        output = output.from_array(attrs, array.T)
+    if tok_annot.entities:
+        output.ents = spans_from_biluo_tags(output, tok_annot.entities)
+    doc.cats = dict(doc_annot.cats)
+    # TODO: Calculate token.ent_kb_id from links.
+    # We need to fix this and the doc.ents thing, both should be doc
+    # annotations.
+    return doc
+
+
 class Example:
-    def __init__(self, doc=None, doc_annotation=None, token_annotation=None):
+    def __init__(self, doc, doc_annotation=None, token_annotation=None):
         """ Doc can either be text, or an actual Doc """
+        if not isinstance(doc, Doc):
+            raise TypeError("Must pass Doc instance")
+        self.predicted = doc
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
         self.token_annotation = (
             token_annotation if token_annotation else TokenAnnotation()
         )
         self._alignment = None
+        self.reference = annotations2doc(
+            self.doc,
+            self.doc_annotation,
+            self.token_annotation
+        )
+
+    @property
+    def x(self):
+        return self.predicted
+
+    @property
+    def y(self):
+        return self.reference

     def _deprecated_get_gold(self, make_projective=False):
         from ..syntax.gold_parse import get_parses_from_example

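Note: the new __init__ builds two Docs per Example: the input Doc (aliased as predicted/x) and a reference Doc assembled from the annotations by annotations2doc (aliased as y). A usage sketch under this branch's API:

from spacy.gold import Example, TokenAnnotation
from spacy.lang.en import English

nlp = English()
example = Example(
    doc=nlp.make_doc("I flew"),
    token_annotation=TokenAnnotation(words=["I", "flew"], tags=["PRP", "VBD"]),
)
assert example.x is example.predicted   # the Doc to predict over
assert example.y is example.reference   # the gold-standard Doc
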
@@ -24,6 +75,8 @@ class Example:
     def from_dict(cls, example_dict, doc=None):
         if example_dict is None:
             raise ValueError("Example.from_dict expected dict, received None")
+        if doc is None:
+            raise ValueError("Must pass doc")
         # TODO: This is ridiculous...
         token_dict = example_dict.get("token_annotation", {})
         doc_dict = example_dict.get("doc_annotation", {})

@@ -34,6 +87,10 @@ class Example:
                 doc_dict[key] = value
             else:
                 token_dict[key] = value
+        if token_dict.get("entities"):
+            entities = token_dict["entities"]
+            if isinstance(entities[0], (list, tuple)):
+                token_dict["entities"] = biluo_tags_from_offsets(doc, entities)
         token_annotation = TokenAnnotation.from_dict(token_dict)
         doc_annotation = DocAnnotation.from_dict(doc_dict)
         return cls(

@@ -45,8 +102,8 @@ class Example:
         if self._alignment is None:
             if self.doc is None:
                 return None
-            spacy_words = [token.orth_ for token in self.doc]
-            gold_words = self.token_annotation.words
+            spacy_words = [token.orth_ for token in self.predicted]
+            gold_words = [token.orth_ for token in self.reference]
             if gold_words == []:
                 gold_words = spacy_words
             self._alignment = Alignment(spacy_words, gold_words)

@@ -92,34 +149,6 @@ class Example:
             output.append(gold_values[gold_i])
         return output

-    def set_token_annotation(
-        self,
-        ids=None,
-        words=None,
-        tags=None,
-        pos=None,
-        morphs=None,
-        lemmas=None,
-        heads=None,
-        deps=None,
-        entities=None,
-        sent_starts=None,
-        brackets=None,
-    ):
-        self.token_annotation = TokenAnnotation(
-            ids=ids,
-            words=words,
-            tags=tags,
-            pos=pos,
-            morphs=morphs,
-            lemmas=lemmas,
-            heads=heads,
-            deps=deps,
-            entities=entities,
-            sent_starts=sent_starts,
-            brackets=brackets,
-        )
-
     def set_doc_annotation(self, cats=None, links=None):
         if cats:
             self.doc_annotation.cats = cats

@@ -131,7 +160,6 @@ class Example:
         sent_starts and return a list of the new Examples"""
         if not self.token_annotation.words:
             return [self]
-        s_example = Example(doc=None, doc_annotation=self.doc_annotation)
         s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
         s_brackets = []

@@ -140,21 +168,25 @@ class Example:
         split_examples = []
         for i in range(len(t.words)):
             if i > 0 and t.sent_starts[i] == 1:
-                s_example.set_token_annotation(
-                    ids=s_ids,
-                    words=s_words,
-                    tags=s_tags,
-                    pos=s_pos,
-                    morphs=s_morphs,
-                    lemmas=s_lemmas,
-                    heads=s_heads,
-                    deps=s_deps,
-                    entities=s_ents,
-                    sent_starts=s_sent_starts,
-                    brackets=s_brackets,
-                )
-                split_examples.append(s_example)
-                s_example = Example(doc=None, doc_annotation=self.doc_annotation)
+                split_examples.append(
+                    Example(
+                        doc=Doc(self.doc.vocab, words=s_words),
+                        token_annotation=TokenAnnotation(
+                            ids=s_ids,
+                            words=s_words,
+                            tags=s_tags,
+                            pos=s_pos,
+                            morphs=s_morphs,
+                            lemmas=s_lemmas,
+                            heads=s_heads,
+                            deps=s_deps,
+                            entities=s_ents,
+                            sent_starts=s_sent_starts,
+                            brackets=s_brackets,
+                        ),
+                        doc_annotation=self.doc_annotation
+                    )
+                )
                 s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
                 s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
                 s_sent_starts, s_brackets = [], []

@@ -172,20 +204,25 @@ class Example:
             for b_end, b_label in t.brackets_by_start.get(i, []):
                 s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
             i += 1
-        s_example.set_token_annotation(
-            ids=s_ids,
-            words=s_words,
-            tags=s_tags,
-            pos=s_pos,
-            morphs=s_morphs,
-            lemmas=s_lemmas,
-            heads=s_heads,
-            deps=s_deps,
-            entities=s_ents,
-            sent_starts=s_sent_starts,
-            brackets=s_brackets,
-        )
-        split_examples.append(s_example)
+        split_examples.append(
+            Example(
+                doc=Doc(self.doc.vocab, words=s_words),
+                token_annotation=TokenAnnotation(
+                    ids=s_ids,
+                    words=s_words,
+                    tags=s_tags,
+                    pos=s_pos,
+                    morphs=s_morphs,
+                    lemmas=s_lemmas,
+                    heads=s_heads,
+                    deps=s_deps,
+                    entities=s_ents,
+                    sent_starts=s_sent_starts,
+                    brackets=s_brackets,
+                ),
+                doc_annotation=self.doc_annotation
+            )
+        )
         return split_examples

     @classmethod

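Note: split_sents no longer accumulates into a doc-less scratch Example; each sentence now becomes a self-contained Example with its own Doc. A sketch, assuming this branch's API and a TokenAnnotation populated with the same fields as the merged_dict test fixture:

from spacy.gold import Example, TokenAnnotation
from spacy.lang.en import English

nlp = English()
words = ["Hi", ".", "Bye", "."]
example = Example(
    doc=nlp.make_doc("Hi . Bye ."),
    token_annotation=TokenAnnotation(
        ids=[0, 1, 2, 3],
        words=words,
        tags=["UH", ".", "UH", "."],
        sent_starts=[1, 0, 1, 0],
    ),
)
assert len(example.split_sents()) == 2  # one Example per sentence
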
@@ -76,12 +76,12 @@ def read_json_file(loc, docs_filter=None, limit=None):
             yield json_data


-def json_to_examples(doc):
+def json_to_annotations(doc):
     """Convert an item in the JSON-formatted training data to the format
     used by GoldParse.

     doc (dict): One entry in the training data.
-    YIELDS (Example): The reformatted data - one training example per paragraph
+    YIELDS (tuple): The reformatted data - one training example per paragraph
     """
     for paragraph in doc["paragraphs"]:
         example = {"text": paragraph.get("raw", None)}

@@ -108,7 +108,7 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
             proj_token_dict = example.token_annotation.to_dict()
             proj_token_dict["heads"] = proj_heads
             proj_token_dict["deps"] = deco_deps
-            new_example.set_token_annotation(**proj_token_dict)
+            new_example.token_annotation = TokenAnnotation(**proj_token_dict)
             preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)

@@ -216,6 +216,6 @@ def _filter_labels(examples, cutoff, freqs):
                 filtered_labels.append(label)
         filtered_token_dict = example.token_annotation.to_dict()
         filtered_token_dict["deps"] = filtered_labels
-        new_example.set_token_annotation(**filtered_token_dict)
+        new_example.token_annotation = TokenAnnotation(**filtered_token_dict)
         filtered.append(new_example)
     return filtered

@@ -3,7 +3,7 @@ import gc
 import numpy
 import copy

-from spacy.gold import Example
+from spacy.gold import Example, TokenAnnotation
 from spacy.lang.en import English
 from spacy.lang.en.stop_words import STOP_WORDS
 from spacy.lang.lex_attrs import is_stop

@@ -271,9 +271,16 @@ def test_issue1963(en_tokenizer):
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab(), default_ner())
-    example = Example(doc=None)
-    example.set_token_annotation(
-        ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
-    )
+    example = Example(
+        doc=Doc(ner.vocab, words=["word"]),
+        token_annotation=TokenAnnotation(
+            ids=[0],
+            words=["word"],
+            tags=["tag"],
+            heads=[0],
+            deps=["dep"],
+            entities=[label]
+        )
+    )
     ner.moves.get_actions(gold_parses=[example])

@@ -95,6 +95,12 @@ def merged_dict():
     }


+@pytest.fixture
+def vocab():
+    nlp = English()
+    return nlp.vocab
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]

@@ -475,8 +481,10 @@ def _train(train_data):

 def test_split_sents(merged_dict):
     nlp = English()
-    example = Example()
-    example.set_token_annotation(**merged_dict)
+    example = Example.from_dict(
+        merged_dict,
+        doc=Doc(nlp.vocab, words=merged_dict["words"])
+    )
     assert len(get_parses_from_example(
         example,
         merge=False,

@@ -506,13 +514,15 @@ def test_split_sents(merged_dict):
     assert token_annotation_2.sent_starts == [1, 0, 0, 0]


-def test_tuples_to_example(merged_dict):
-    ex = Example()
-    ex.set_token_annotation(**merged_dict)
+def test_tuples_to_example(vocab, merged_dict):
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
-    ex.set_doc_annotation(cats=cats)
+    merged_dict = dict(merged_dict)
+    merged_dict["cats"] = cats
+    ex = Example.from_dict(
+        merged_dict,
+        doc=Doc(vocab, words=merged_dict["words"])
+    )
     ex_dict = ex.to_dict()

     assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
     assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
     assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]

@@ -1,12 +1,14 @@
 from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
-from spacy.gold import Example, GoldParse
+from spacy.gold import Example, GoldParse, TokenAnnotation
+from spacy.gold.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
+from spacy.lang.en import English


 test_las_apple = [
     [
         "Apple is looking at buying U.K. startup for $ 1 billion",

@@ -134,8 +136,11 @@ def test_ner_per_type(en_vocab):
         words=input_.split(" "),
         ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
     )
-    ex = Example(doc=doc)
-    ex.set_token_annotation(entities=annot["entities"])
+    entities = biluo_tags_from_offsets(doc, annot["entities"])
+    ex = Example(
+        doc=doc,
+        token_annotation=TokenAnnotation(entities=entities)
+    )
     scorer.score(ex)
     results = scorer.scores

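Note: the scorer tests now convert character offsets to BILUO tags themselves before building the TokenAnnotation. A quick illustration of that helper, using the import path from this commit:

from spacy.gold.iob_utils import biluo_tags_from_offsets
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("I flew to London")
tags = biluo_tags_from_offsets(doc, [(10, 16, "GPE")])
# tags == ["O", "O", "O", "U-GPE"]
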
@@ -155,8 +160,11 @@ def test_ner_per_type(en_vocab):
         words=input_.split(" "),
         ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
     )
-    ex = Example(doc=doc)
-    ex.set_token_annotation(entities=annot["entities"])
+    entities = biluo_tags_from_offsets(doc, annot["entities"])
+    ex = Example(
+        doc=doc,
+        token_annotation=TokenAnnotation(entities=entities)
+    )
     scorer.score(ex)
     results = scorer.scores

@@ -799,6 +799,8 @@ cdef class Doc:
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
+        if length != len(self):
+            raise ValueError("Cannot set array values longer than the document.")
         # Get set up for fast loading
         cdef Pool mem = Pool()
         cdef int n_attrs = len(attrs)

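Note: Doc.from_array now validates the array length up front instead of writing past the document. A sketch of the failure mode the new guard catches:

import numpy
from spacy.attrs import TAG
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("one two")
bad = numpy.zeros((3, 1), dtype="uint64")  # 3 rows, but the doc has 2 tokens
try:
    doc.from_array([TAG], bad)
except ValueError as err:
    print(err)  # Cannot set array values longer than the document.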