* Make GoldCorpus return dict, not Example

* Make Example require a Doc object (previously optional)

* Clarify methods in GoldCorpus

* WIP refactor Example

* Refactor Example.split_sents

* Fix test

* Fix augment

* Update test

* Update test

* Fix import

* Update test_scorer

* Update Example
Matthew Honnibal 2020-06-08 22:28:50 +02:00
parent 084271c9e9
commit d9289712ba
11 changed files with 176 additions and 111 deletions

View File

@@ -2,6 +2,7 @@ import re
 from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...gold import TokenAnnotation
 from ...language import Language
 from ...tokens import Doc, Token
 from .conll_ner2json import n_sents_info
@@ -284,13 +285,8 @@ def example_from_conllu_sentence(
         spaces.append(t._.merged_spaceafter)
     ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
     ents = biluo_tags_from_offsets(doc, ent_offsets)
-    raw = ""
-    for word, space in zip(words, spaces):
-        raw += word
-        if space:
-            raw += " "
-    example = Example(doc=raw)
-    example.set_token_annotation(
+    example = Example(doc=Doc(vocab, words=words, spaces=spaces))
+    example.token_annotation = TokenAnnotation(
         ids=ids,
         words=words,
         tags=tags,
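Note on the hunk above: instead of reconstructing a raw string and letting `Example` re-tokenize it, the converter now builds the `Doc` directly from the CoNLL-U words and spaces. A minimal sketch of the same pattern outside the converter (the sample words and tags are invented, and `vocab` here comes from a blank English pipeline rather than the converter's argument):

```python
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.gold import Example, TokenAnnotation

vocab = English().vocab
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Build the predicted side as a real Doc; no raw-string round trip
example = Example(doc=Doc(vocab, words=words, spaces=spaces))
example.token_annotation = TokenAnnotation(words=words, tags=["UH", "NN", "."])
```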

View File

@@ -1,3 +1,6 @@
+from .iob_utils import biluo_tags_from_offsets
+
+
 class TokenAnnotation:
     def __init__(
         self,

View File

@ -1,6 +1,7 @@
import random
import itertools
from .example import Example
from .annotation import TokenAnnotation
def make_orth_variants(nlp, example, orth_variant_level=0.0):
@@ -17,14 +18,14 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
     ndsv = nlp.Defaults.single_orth_variants
     ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
-    variant_example = Example(doc=raw)
+    variant_example = Example(doc=nlp.make_doc(raw))
     token_annotation = example.token_annotation
     words = token_annotation.words
     tags = token_annotation.tags
     if not words or not tags:
         # add the unmodified annotation
         token_dict = token_annotation.to_dict()
-        variant_example.set_token_annotation(**token_dict)
+        variant_example.token_annotation = TokenAnnotation(**token_dict)
     else:
         if lower:
             words = [w.lower() for w in words]
@@ -60,7 +61,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         token_dict = token_annotation.to_dict()
         token_dict["words"] = words
         token_dict["tags"] = tags
-        variant_example.set_token_annotation(**token_dict)
+        variant_example.token_annotation = TokenAnnotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []
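With `set_token_annotation` gone, the augmenter follows a copy-modify-rebuild recipe: dump the existing `TokenAnnotation` to a dict, patch the fields, and construct a fresh object. A self-contained sketch of that recipe (sample values invented):

```python
from spacy.gold import TokenAnnotation

orig = TokenAnnotation(words=["You", "Are", "Right"], tags=["PRP", "VBP", "JJ"])
token_dict = orig.to_dict()
token_dict["words"] = [w.lower() for w in token_dict["words"]]  # e.g. the `lower` variant
variant = TokenAnnotation(**token_dict)
assert variant.words == ["you", "are", "right"]
```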

View File

@@ -28,8 +28,8 @@ class GoldCorpus(object):
         """
         self.limit = limit
         if isinstance(train, str) or isinstance(train, Path):
-            train = self.read_examples(self.walk_corpus(train))
-            dev = self.read_examples(self.walk_corpus(dev))
+            train = self.read_annotations(self.walk_corpus(train))
+            dev = self.read_annotations(self.walk_corpus(dev))
         # Write temp directory with one doc per file, so we can shuffle and stream
         self.tmp_dir = Path(tempfile.mkdtemp())
         self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
@@ -71,7 +71,7 @@ class GoldCorpus(object):
         return locs
 
     @staticmethod
-    def read_examples(locs, limit=0):
+    def read_annotations(locs, limit=0):
         """ Yield training examples """
         i = 0
         for loc in locs:
@@ -101,11 +101,11 @@ class GoldCorpus(object):
                     or isinstance(doc, str)
                 ):
                     raise ValueError(Errors.E987.format(type=type(doc)))
-                examples.append(Example.from_dict(ex_dict, doc=doc))
+                examples.append(ex_dict)
         elif file_name.endswith("msg"):
             text, ex_dict = srsly.read_msgpack(loc)
-            examples = [Example.from_dict(ex_dict, doc=text)]
+            examples = [ex_dict]
         else:
             supported = ("json", "jsonl", "msg")
             raise ValueError(Errors.E124.format(path=loc, formats=supported))
@@ -123,21 +123,21 @@ class GoldCorpus(object):
             raise ValueError(Errors.E996.format(file=file_name, msg=msg))
 
     @property
-    def dev_examples(self):
+    def dev_annotations(self):
         locs = (self.tmp_dir / "dev").iterdir()
-        yield from self.read_examples(locs, limit=self.limit)
+        yield from self.read_annotations(locs, limit=self.limit)
 
     @property
-    def train_examples(self):
+    def train_annotations(self):
         locs = (self.tmp_dir / "train").iterdir()
-        yield from self.read_examples(locs, limit=self.limit)
+        yield from self.read_annotations(locs, limit=self.limit)
 
     def count_train(self):
         """Returns count of words in train examples"""
         n = 0
         i = 0
-        for example in self.train_examples:
-            n += len(example.token_annotation.words)
+        for eg_dict in self.train_annotations:
+            n += len(eg_dict["token_annotation"]["words"])
             if self.limit and i >= self.limit:
                 break
             i += 1
@@ -154,10 +154,10 @@ class GoldCorpus(object):
     ):
         locs = list((self.tmp_dir / "train").iterdir())
         random.shuffle(locs)
-        train_examples = self.read_examples(locs, limit=self.limit)
-        gold_examples = self.iter_gold_docs(
+        train_annotations = self.read_annotations(locs, limit=self.limit)
+        examples = self.iter_examples(
             nlp,
-            train_examples,
+            train_annotations,
             gold_preproc,
             max_length=max_length,
             noise_level=noise_level,
@@ -165,33 +165,33 @@ class GoldCorpus(object):
             make_projective=True,
             ignore_misaligned=ignore_misaligned,
         )
-        yield from gold_examples
+        yield from examples
 
     def train_dataset_without_preprocessing(
         self, nlp, gold_preproc=False, ignore_misaligned=False
     ):
-        examples = self.iter_gold_docs(
+        examples = self.iter_examples(
             nlp,
-            self.train_examples,
+            self.train_annotations,
             gold_preproc=gold_preproc,
             ignore_misaligned=ignore_misaligned,
         )
         yield from examples
 
     def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
-        examples = self.iter_gold_docs(
+        examples = self.iter_examples(
             nlp,
-            self.dev_examples,
+            self.dev_annotations,
             gold_preproc=gold_preproc,
             ignore_misaligned=ignore_misaligned,
         )
         yield from examples
 
     @classmethod
-    def iter_gold_docs(
+    def iter_examples(
         cls,
         nlp,
-        examples,
+        annotations,
         gold_preproc,
         max_length=None,
         noise_level=0.0,
@@ -200,7 +200,8 @@ class GoldCorpus(object):
         ignore_misaligned=False,
     ):
         """ Setting gold_preproc will result in creating a doc per sentence """
-        for example in examples:
+        for eg_dict in annotations:
+            example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"]))
             example_docs = []
             if gold_preproc:
                 split_examples = example.split_sents()
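With this change the corpus layer deals only in plain dicts until the last moment: `iter_examples` is now the single place where a `Doc` is made from the stored text and wrapped in an `Example`. A sketch of the annotation-dict shape this implies, based on the keys read above (`text`, `token_annotation`); the sentence itself is invented:

```python
from spacy.lang.en import English
from spacy.gold import Example

nlp = English()
eg_dict = {
    "text": "I like London",
    "token_annotation": {
        "words": ["I", "like", "London"],
        "tags": ["PRP", "VBP", "NNP"],
    },
}
# What iter_examples now does for each annotation dict:
example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"]))
```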

View File

@@ -1,18 +1,69 @@
+import numpy
 from .annotation import TokenAnnotation, DocAnnotation
+from .iob_utils import spans_from_biluo_tags, biluo_tags_from_offsets
 from .align import Alignment
 from ..errors import Errors, AlignmentError
 from ..tokens import Doc
 
 
+def annotations2doc(doc, doc_annot, tok_annot):
+    # TODO: Improve and test this
+    words = tok_annot.words or [tok.text for tok in doc]
+    fields = {
+        "tags": "TAG",
+        "pos": "POS",
+        "lemmas": "LEMMA",
+        "deps": "DEP",
+    }
+    attrs = []
+    values = []
+    for field, attr in fields.items():
+        value = getattr(tok_annot, field)
+        # Unset fields will be empty lists.
+        if value:
+            attrs.append(attr)
+            values.append([doc.vocab.strings.add(v) for v in value])
+    if tok_annot.heads:
+        attrs.append("HEAD")
+        values.append([h - i for i, h in enumerate(tok_annot.heads)])
+    output = Doc(doc.vocab, words=words)
+    if values:
+        array = numpy.array(values, dtype="uint64")
+        output = output.from_array(attrs, array.T)
+    if tok_annot.entities:
+        output.ents = spans_from_biluo_tags(output, tok_annot.entities)
+    output.cats = dict(doc_annot.cats)
+    # TODO: Calculate token.ent_kb_id from links.
+    # We need to fix this and the doc.ents thing, both should be doc
+    # annotations.
+    return output
+
+
 class Example:
-    def __init__(self, doc=None, doc_annotation=None, token_annotation=None):
+    def __init__(self, doc, doc_annotation=None, token_annotation=None):
         """ Doc can either be text, or an actual Doc """
+        if not isinstance(doc, Doc):
+            raise TypeError("Must pass Doc instance")
+        self.predicted = doc
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
         self.token_annotation = (
             token_annotation if token_annotation else TokenAnnotation()
         )
         self._alignment = None
+        self.reference = annotations2doc(
+            self.doc,
+            self.doc_annotation,
+            self.token_annotation
+        )
+
+    @property
+    def x(self):
+        return self.predicted
+
+    @property
+    def y(self):
+        return self.reference
 
     def _deprecated_get_gold(self, make_projective=False):
        from ..syntax.gold_parse import get_parses_from_example
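The design in the hunk above: an `Example` now eagerly carries two `Doc`s, the `predicted` one passed in and a `reference` one built by projecting the gold annotations onto a fresh `Doc` via `annotations2doc`. The `x`/`y` properties expose the pair in dataset terms. A sketch using only names from this hunk (sample words and tags invented; the last assert assumes the reference doc picks up the TAG values as sketched in `annotations2doc`):

```python
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.gold import Example, TokenAnnotation

nlp = English()
words = ["London", "calling"]
eg = Example(
    doc=Doc(nlp.vocab, words=words),
    token_annotation=TokenAnnotation(words=words, tags=["NNP", "VBG"]),
)
assert eg.x is eg.predicted  # the Doc the model sees
assert eg.y is eg.reference  # the Doc carrying gold annotations
assert [t.tag_ for t in eg.y] == ["NNP", "VBG"]
```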
@@ -24,6 +75,8 @@ class Example:
     def from_dict(cls, example_dict, doc=None):
         if example_dict is None:
             raise ValueError("Example.from_dict expected dict, received None")
+        if doc is None:
+            raise ValueError("Must pass doc")
         # TODO: This is ridiculous...
         token_dict = example_dict.get("token_annotation", {})
         doc_dict = example_dict.get("doc_annotation", {})
@@ -34,6 +87,10 @@ class Example:
                 doc_dict[key] = value
             else:
                 token_dict[key] = value
+        if token_dict.get("entities"):
+            entities = token_dict["entities"]
+            if isinstance(entities[0], (list, tuple)):
+                token_dict["entities"] = biluo_tags_from_offsets(doc, entities)
         token_annotation = TokenAnnotation.from_dict(token_dict)
         doc_annotation = DocAnnotation.from_dict(doc_dict)
         return cls(
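`from_dict` now normalizes the two ways entities arrive: character-offset triples are converted to BILUO tags against the `Doc` up front, while BILUO strings pass through untouched. A sketch of both accepted forms (sentence invented):

```python
from spacy.lang.en import English
from spacy.gold import Example

nlp = English()
doc = nlp.make_doc("I flew to London .")

# Offset form, converted via biluo_tags_from_offsets:
eg1 = Example.from_dict({"entities": [(10, 16, "GPE")]}, doc=doc)
# BILUO form, used as-is:
eg2 = Example.from_dict({"entities": ["O", "O", "O", "U-GPE", "O"]}, doc=doc)
```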
@@ -45,8 +102,8 @@ class Example:
         if self._alignment is None:
             if self.doc is None:
                 return None
-            spacy_words = [token.orth_ for token in self.doc]
-            gold_words = self.token_annotation.words
+            spacy_words = [token.orth_ for token in self.predicted]
+            gold_words = [token.orth_ for token in self.reference]
             if gold_words == []:
                 gold_words = spacy_words
             self._alignment = Alignment(spacy_words, gold_words)
@@ -92,34 +149,6 @@ class Example:
                 output.append(gold_values[gold_i])
         return output
 
-    def set_token_annotation(
-        self,
-        ids=None,
-        words=None,
-        tags=None,
-        pos=None,
-        morphs=None,
-        lemmas=None,
-        heads=None,
-        deps=None,
-        entities=None,
-        sent_starts=None,
-        brackets=None,
-    ):
-        self.token_annotation = TokenAnnotation(
-            ids=ids,
-            words=words,
-            tags=tags,
-            pos=pos,
-            morphs=morphs,
-            lemmas=lemmas,
-            heads=heads,
-            deps=deps,
-            entities=entities,
-            sent_starts=sent_starts,
-            brackets=brackets,
-        )
-
     def set_doc_annotation(self, cats=None, links=None):
         if cats:
             self.doc_annotation.cats = cats
@@ -131,7 +160,6 @@ class Example:
         sent_starts and return a list of the new Examples"""
         if not self.token_annotation.words:
             return [self]
-        s_example = Example(doc=None, doc_annotation=self.doc_annotation)
         s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
         s_brackets = []
@@ -140,21 +168,25 @@ class Example:
         split_examples = []
         for i in range(len(t.words)):
             if i > 0 and t.sent_starts[i] == 1:
-                s_example.set_token_annotation(
-                    ids=s_ids,
-                    words=s_words,
-                    tags=s_tags,
-                    pos=s_pos,
-                    morphs=s_morphs,
-                    lemmas=s_lemmas,
-                    heads=s_heads,
-                    deps=s_deps,
-                    entities=s_ents,
-                    sent_starts=s_sent_starts,
-                    brackets=s_brackets,
-                )
-                split_examples.append(s_example)
-                s_example = Example(doc=None, doc_annotation=self.doc_annotation)
+                split_examples.append(
+                    Example(
+                        doc=Doc(self.doc.vocab, words=s_words),
+                        token_annotation=TokenAnnotation(
+                            ids=s_ids,
+                            words=s_words,
+                            tags=s_tags,
+                            pos=s_pos,
+                            morphs=s_morphs,
+                            lemmas=s_lemmas,
+                            heads=s_heads,
+                            deps=s_deps,
+                            entities=s_ents,
+                            sent_starts=s_sent_starts,
+                            brackets=s_brackets,
+                        ),
+                        doc_annotation=self.doc_annotation
+                    )
+                )
                 s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
                 s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
                 s_sent_starts, s_brackets = [], []
@@ -172,20 +204,25 @@ class Example:
             for b_end, b_label in t.brackets_by_start.get(i, []):
                 s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
             i += 1
-        s_example.set_token_annotation(
-            ids=s_ids,
-            words=s_words,
-            tags=s_tags,
-            pos=s_pos,
-            morphs=s_morphs,
-            lemmas=s_lemmas,
-            heads=s_heads,
-            deps=s_deps,
-            entities=s_ents,
-            sent_starts=s_sent_starts,
-            brackets=s_brackets,
-        )
-        split_examples.append(s_example)
+        split_examples.append(
+            Example(
+                doc=Doc(self.doc.vocab, words=s_words),
+                token_annotation=TokenAnnotation(
+                    ids=s_ids,
+                    words=s_words,
+                    tags=s_tags,
+                    pos=s_pos,
+                    morphs=s_morphs,
+                    lemmas=s_lemmas,
+                    heads=s_heads,
+                    deps=s_deps,
+                    entities=s_ents,
+                    sent_starts=s_sent_starts,
+                    brackets=s_brackets,
+                ),
+                doc_annotation=self.doc_annotation
+            )
+        )
         return split_examples
 
     @classmethod
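`split_sents` now returns real per-sentence `Example`s, each wrapping a fresh `Doc` built from that sentence's words. A usage sketch, assuming gold `sent_starts` mark the boundaries and the remaining `TokenAnnotation` fields default to empty (data invented):

```python
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.gold import Example

nlp = English()
words = ["I", "like", "it", "What", "a", "day"]
eg = Example.from_dict(
    {
        "ids": [0, 1, 2, 3, 4, 5],
        "words": words,
        "sent_starts": [1, 0, 0, 1, 0, 0],
    },
    doc=Doc(nlp.vocab, words=words),
)
first, second = eg.split_sents()
assert first.token_annotation.words == ["I", "like", "it"]
assert second.token_annotation.words == ["What", "a", "day"]
```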

View File

@@ -76,12 +76,12 @@ def read_json_file(loc, docs_filter=None, limit=None):
             yield json_data
 
 
-def json_to_examples(doc):
+def json_to_annotations(doc):
     """Convert an item in the JSON-formatted training data to the format
     used by GoldParse.
 
     doc (dict): One entry in the training data.
-    YIELDS (Example): The reformatted data - one training example per paragraph
+    YIELDS (dict): The reformatted data - one training example per paragraph
     """
     for paragraph in doc["paragraphs"]:
         example = {"text": paragraph.get("raw", None)}

View File

@@ -108,7 +108,7 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
         proj_token_dict = example.token_annotation.to_dict()
         proj_token_dict["heads"] = proj_heads
         proj_token_dict["deps"] = deco_deps
-        new_example.set_token_annotation(**proj_token_dict)
+        new_example.token_annotation = TokenAnnotation(**proj_token_dict)
         preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -216,6 +216,6 @@ def _filter_labels(examples, cutoff, freqs):
                 filtered_labels.append(label)
         filtered_token_dict = example.token_annotation.to_dict()
         filtered_token_dict["deps"] = filtered_labels
-        new_example.set_token_annotation(**filtered_token_dict)
+        new_example.token_annotation = TokenAnnotation(**filtered_token_dict)
         filtered.append(new_example)
     return filtered

View File

@@ -3,7 +3,7 @@ import gc
 import numpy
 import copy
 
-from spacy.gold import Example
+from spacy.gold import Example, TokenAnnotation
 from spacy.lang.en import English
 from spacy.lang.en.stop_words import STOP_WORDS
 from spacy.lang.lex_attrs import is_stop
@@ -271,9 +271,16 @@ def test_issue1963(en_tokenizer):
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab(), default_ner())
-    example = Example(doc=None)
-    example.set_token_annotation(
-        ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
+    example = Example(
+        doc=Doc(ner.vocab, words=["word"]),
+        token_annotation=TokenAnnotation(
+            ids=[0],
+            words=["word"],
+            tags=["tag"],
+            heads=[0],
+            deps=["dep"],
+            entities=[label]
+        )
     )
     ner.moves.get_actions(gold_parses=[example])

View File

@@ -95,6 +95,12 @@ def merged_dict():
     }
 
 
+@pytest.fixture
+def vocab():
+    nlp = English()
+    return nlp.vocab
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -475,8 +481,10 @@ def _train(train_data):
 
 def test_split_sents(merged_dict):
     nlp = English()
-    example = Example()
-    example.set_token_annotation(**merged_dict)
+    example = Example.from_dict(
+        merged_dict,
+        doc=Doc(nlp.vocab, words=merged_dict["words"])
+    )
     assert len(get_parses_from_example(
         example,
         merge=False,
@@ -506,13 +514,15 @@ def test_split_sents(merged_dict):
     assert token_annotation_2.sent_starts == [1, 0, 0, 0]
 
 
-def test_tuples_to_example(merged_dict):
-    ex = Example()
-    ex.set_token_annotation(**merged_dict)
+def test_tuples_to_example(vocab, merged_dict):
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
-    ex.set_doc_annotation(cats=cats)
+    merged_dict = dict(merged_dict)
+    merged_dict["cats"] = cats
+    ex = Example.from_dict(
+        merged_dict,
+        doc=Doc(vocab, words=merged_dict["words"])
+    )
     ex_dict = ex.to_dict()
     assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
     assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
     assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]

View File

@@ -1,12 +1,14 @@
 from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
-from spacy.gold import Example, GoldParse
+from spacy.gold import Example, GoldParse, TokenAnnotation
+from spacy.gold.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
+from spacy.lang.en import English
 
 
 test_las_apple = [
     [
         "Apple is looking at buying U.K. startup for $ 1 billion",
@@ -134,8 +136,11 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
         )
-        ex = Example(doc=doc)
-        ex.set_token_annotation(entities=annot["entities"])
+        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        ex = Example(
+            doc=doc,
+            token_annotation=TokenAnnotation(entities=entities)
+        )
         scorer.score(ex)
 
     results = scorer.scores
@@ -155,8 +160,11 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
        )
-        ex = Example(doc=doc)
-        ex.set_token_annotation(entities=annot["entities"])
+        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        ex = Example(
+            doc=doc,
+            token_annotation=TokenAnnotation(entities=entities)
+        )
         scorer.score(ex)
 
     results = scorer.scores
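Same migration in both scorer tests: entity offsets are converted to BILUO tags first, then handed to the `Example` via a `TokenAnnotation`. A condensed, self-contained version of the scoring call (sentence and entity invented; note that `annotations2doc` falls back to the predicted doc's words when the annotation has none):

```python
from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy.scorer import Scorer
from spacy.gold import Example, TokenAnnotation
from spacy.gold.iob_utils import biluo_tags_from_offsets

nlp = English()
doc = Doc(nlp.vocab, words=["London", "is", "big"])
doc.ents = [Span(doc, 0, 1, label="GPE")]  # the "prediction"

entities = biluo_tags_from_offsets(doc, [(0, 6, "GPE")])  # the gold
ex = Example(doc=doc, token_annotation=TokenAnnotation(entities=entities))
scorer = Scorer()
scorer.score(ex)
print(scorer.scores["ents_f"])  # 100.0 here, since gold matches prediction
```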

View File

@@ -799,6 +799,8 @@ cdef class Doc:
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
+        if length != len(self):
+            raise ValueError("Cannot set array values longer than the document.")
         # Get set up for fast loading
         cdef Pool mem = Pool()
         cdef int n_attrs = len(attrs)
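The guard gives `Doc.from_array` a fast failure when the array's row count doesn't match the token count, instead of reading or writing out of step. A sketch of the failure it catches (array values invented):

```python
import numpy
from spacy.attrs import TAG
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = Doc(nlp.vocab, words=["a", "b", "c"])
bad = numpy.zeros((5, 1), dtype="uint64")  # five rows for a three-token doc
try:
    doc.from_array([TAG], bad)
except ValueError as err:
    print(err)  # Cannot set array values longer than the document.
```

Since the check is `!=`, a too-short array trips the same error despite the message's wording.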