From fcb4f7a6db10b94a5ae2f2b961009c67382295ef Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:15:12 +0200
Subject: [PATCH 01/56] Start breaking down gold.pyx

---
 spacy/_gold/align.py | 81 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 spacy/_gold/align.py

diff --git a/spacy/_gold/align.py b/spacy/_gold/align.py
new file mode 100644
index 000000000..7703232b2
--- /dev/null
+++ b/spacy/_gold/align.py
@@ -0,0 +1,81 @@
+import numpy
+from ..errors import Errors, AlignmentError
+
+
+def align(tokens_a, tokens_b):
+    """Calculate alignment tables between two tokenizations.
+
+    tokens_a (List[str]): The candidate tokenization.
+    tokens_b (List[str]): The reference tokenization.
+    RETURNS (tuple): A 5-tuple consisting of the following information:
+      * cost (int): The number of misaligned tokens.
+      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
+        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
+        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
+        it has the value -1.
+      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
+      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
+        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
+        the same token of `tokens_b`.
+      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
+        direction.
+    """
+    tokens_a = _normalize_for_alignment(tokens_a)
+    tokens_b = _normalize_for_alignment(tokens_b)
+    cost = 0
+    a2b = numpy.empty(len(tokens_a), dtype="i")
+    b2a = numpy.empty(len(tokens_b), dtype="i")
+    a2b.fill(-1)
+    b2a.fill(-1)
+    a2b_multi = {}
+    b2a_multi = {}
+    i = 0
+    j = 0
+    offset_a = 0
+    offset_b = 0
+    while i < len(tokens_a) and j < len(tokens_b):
+        a = tokens_a[i][offset_a:]
+        b = tokens_b[j][offset_b:]
+        if a == b:
+            if offset_a == offset_b == 0:
+                a2b[i] = j
+                b2a[j] = i
+            elif offset_a == 0:
+                cost += 2
+                a2b_multi[i] = j
+            elif offset_b == 0:
+                cost += 2
+                b2a_multi[j] = i
+            offset_a = offset_b = 0
+            i += 1
+            j += 1
+        elif a == "":
+            assert offset_a == 0
+            cost += 1
+            i += 1
+        elif b == "":
+            assert offset_b == 0
+            cost += 1
+            j += 1
+        elif b.startswith(a):
+            cost += 1
+            if offset_a == 0:
+                a2b_multi[i] = j
+            i += 1
+            offset_a = 0
+            offset_b += len(a)
+        elif a.startswith(b):
+            cost += 1
+            if offset_b == 0:
+                b2a_multi[j] = i
+            j += 1
+            offset_b = 0
+            offset_a += len(b)
+        else:
+            assert "".join(tokens_a) != "".join(tokens_b)
+            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
+    return cost, a2b, b2a, a2b_multi, b2a_multi
+
+
+def _normalize_for_alignment(tokens):
+    return [w.replace(" ", "").lower() for w in tokens]

From 6005b94e741eb4125894d9200b185ba5e590b245 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:19:06 +0200
Subject: [PATCH 02/56] Add data augmentation

---
 spacy/_gold/augment.py | 126 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 spacy/_gold/augment.py

diff --git a/spacy/_gold/augment.py b/spacy/_gold/augment.py
new file mode 100644
index 000000000..02c812825
--- /dev/null
+++ b/spacy/_gold/augment.py
@@ -0,0 +1,126 @@
+import random
+import itertools
+from .example import Example
+
+
+def make_orth_variants(nlp, example, orth_variant_level=0.0):
+    if random.random() >= orth_variant_level:
+        return example
+    if not example.token_annotation:
+        return example
+    raw = example.text
+    lower = False
+    if random.random() >= 0.5:
+        lower = True
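+    # `single_orth_variants` / `paired_orth_variants` (fetched below) are
+    # per-language lists of dicts; an illustrative sketch based on the
+    # English defaults -- other languages may define different entries:
+    #
+    #     single_orth_variants = [
+    #         {"tags": ["NFP"], "variants": ["…", "..."]},
+    #         {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
+    #     ]
+    #     paired_orth_variants = [
+    #         {"tags": ["``", "''"], "variants": [["'", "'"], ["‘", "’"]]},
+    #     ]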
+ if raw is not None: + raw = raw.lower() + ndsv = nlp.Defaults.single_orth_variants + ndpv = nlp.Defaults.paired_orth_variants + # modify words in paragraph_tuples + variant_example = Example(doc=raw) + token_annotation = example.token_annotation + words = token_annotation.words + tags = token_annotation.tags + if not words or not tags: + # add the unmodified annotation + token_dict = token_annotation.to_dict() + variant_example.set_token_annotation(**token_dict) + else: + if lower: + words = [w.lower() for w in words] + # single variants + punct_choices = [random.choice(x["variants"]) for x in ndsv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndsv)): + if tags[word_idx] in ndsv[punct_idx]["tags"] \ + and words[word_idx] in ndsv[punct_idx]["variants"]: + words[word_idx] = punct_choices[punct_idx] + # paired variants + punct_choices = [random.choice(x["variants"]) for x in ndpv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndpv)): + if tags[word_idx] in ndpv[punct_idx]["tags"] \ + and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + # backup option: random left vs. right from pair + pair_idx = random.choice([0, 1]) + # best option: rely on paired POS tags like `` / '' + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + # next best option: rely on position in variants + # (may not be unambiguous, so order of variants matters) + else: + for pair in ndpv[punct_idx]["variants"]: + if words[word_idx] in pair: + pair_idx = pair.index(words[word_idx]) + words[word_idx] = punct_choices[punct_idx][pair_idx] + + token_dict = token_annotation.to_dict() + token_dict["words"] = words + token_dict["tags"] = tags + variant_example.set_token_annotation(**token_dict) + # modify raw to match variant_paragraph_tuples + if raw is not None: + variants = [] + for single_variants in ndsv: + variants.extend(single_variants["variants"]) + for paired_variants in ndpv: + variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) + # store variants in reverse length order to be able to prioritize + # longer matches (e.g., "---" before "--") + variants = sorted(variants, key=lambda x: len(x)) + variants.reverse() + variant_raw = "" + raw_idx = 0 + # add initial whitespace + while raw_idx < len(raw) and raw[raw_idx].isspace(): + variant_raw += raw[raw_idx] + raw_idx += 1 + for word in variant_example.token_annotation.words: + match_found = False + # skip whitespace words + if word.isspace(): + match_found = True + # add identical word + elif word not in variants and raw[raw_idx:].startswith(word): + variant_raw += word + raw_idx += len(word) + match_found = True + # add variant word + else: + for variant in variants: + if not match_found and \ + raw[raw_idx:].startswith(variant): + raw_idx += len(variant) + variant_raw += word + match_found = True + # something went wrong, abort + # (add a warning message?) 
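+                # for reference, a successful variant match in this loop --
+                # an illustrative trace, assuming the ":"-tagged token "--"
+                # was swapped for an em dash above: with raw = "1984--2020"
+                # and words = ["1984", "—", "2020"], the scan consumes "--"
+                # from raw but appends the variant word, giving
+                # variant_raw == "1984—2020"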
+ if not match_found: + return example + # add following whitespace + while raw_idx < len(raw) and raw[raw_idx].isspace(): + variant_raw += raw[raw_idx] + raw_idx += 1 + variant_example.doc = variant_raw + return variant_example + return variant_example + + +def add_noise(orig, noise_level): + if random.random() >= noise_level: + return orig + elif type(orig) == list: + corrupted = [_corrupt(word, noise_level) for word in orig] + corrupted = [w for w in corrupted if w] + return corrupted + else: + return "".join(_corrupt(c, noise_level) for c in orig) + + +def _corrupt(c, noise_level): + if random.random() >= noise_level: + return c + elif c in [".", "'", "!", "?", ","]: + return "\n" + else: + return c.lower() From cce6a51a9cacae940f81d78c2408b5cd235209db Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 14:22:27 +0200 Subject: [PATCH 03/56] Add annotation classes --- spacy/_gold/annotation.py | 123 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 spacy/_gold/annotation.py diff --git a/spacy/_gold/annotation.py b/spacy/_gold/annotation.py new file mode 100644 index 000000000..cd8ac0717 --- /dev/null +++ b/spacy/_gold/annotation.py @@ -0,0 +1,123 @@ +class TokenAnnotation: + def __init__( + self, + ids=None, + words=None, + tags=None, + pos=None, + morphs=None, + lemmas=None, + heads=None, + deps=None, + entities=None, + sent_starts=None, + brackets=None, + ): + self.ids = ids if ids else [] + self.words = words if words else [] + self.tags = tags if tags else [] + self.pos = pos if pos else [] + self.morphs = morphs if morphs else [] + self.lemmas = lemmas if lemmas else [] + self.heads = heads if heads else [] + self.deps = deps if deps else [] + self.entities = entities if entities else [] + self.sent_starts = sent_starts if sent_starts else [] + self.brackets_by_start = {} + if brackets: + for b_start, b_end, b_label in brackets: + self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) + + @property + def brackets(self): + brackets = [] + for start, ends_labels in self.brackets_by_start.items(): + for end, label in ends_labels: + brackets.append((start, end, label)) + return brackets + + @classmethod + def from_dict(cls, token_dict): + return cls( + ids=token_dict.get("ids", None), + words=token_dict.get("words", None), + tags=token_dict.get("tags", None), + pos=token_dict.get("pos", None), + morphs=token_dict.get("morphs", None), + lemmas=token_dict.get("lemmas", None), + heads=token_dict.get("heads", None), + deps=token_dict.get("deps", None), + entities=token_dict.get("entities", None), + sent_starts=token_dict.get("sent_starts", None), + brackets=token_dict.get("brackets", None), + ) + + def to_dict(self): + return { + "ids": self.ids, + "words": self.words, + "tags": self.tags, + "pos": self.pos, + "morphs": self.morphs, + "lemmas": self.lemmas, + "heads": self.heads, + "deps": self.deps, + "entities": self.entities, + "sent_starts": self.sent_starts, + "brackets": self.brackets, + } + + def get_id(self, i): + return self.ids[i] if i < len(self.ids) else i + + def get_word(self, i): + return self.words[i] if i < len(self.words) else "" + + def get_tag(self, i): + return self.tags[i] if i < len(self.tags) else "-" + + def get_pos(self, i): + return self.pos[i] if i < len(self.pos) else "" + + def get_morph(self, i): + return self.morphs[i] if i < len(self.morphs) else "" + + def get_lemma(self, i): + return self.lemmas[i] if i < len(self.lemmas) else "" + + def get_head(self, i): + return self.heads[i] 
if i < len(self.heads) else i
+
+    def get_dep(self, i):
+        return self.deps[i] if i < len(self.deps) else ""
+
+    def get_entity(self, i):
+        return self.entities[i] if i < len(self.entities) else "-"
+
+    def get_sent_start(self, i):
+        return self.sent_starts[i] if i < len(self.sent_starts) else None
+
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class DocAnnotation:
+    def __init__(self, cats=None, links=None):
+        self.cats = cats if cats else {}
+        self.links = links if links else {}
+
+    @classmethod
+    def from_dict(cls, doc_dict):
+        return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None))
+
+    def to_dict(self):
+        return {"cats": self.cats, "links": self.links}
+
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()

From 1fb8fc6ea9e9f290af12c1ea1b9755757e64c610 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:24:35 +0200
Subject: [PATCH 04/56] Add Example class

---
 spacy/_gold/example.py | 201 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 spacy/_gold/example.py

diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py
new file mode 100644
index 000000000..db9e10093
--- /dev/null
+++ b/spacy/_gold/example.py
@@ -0,0 +1,201 @@
+from ..tokens import Doc
+from ..errors import Errors, AlignmentError
+from .annotation import TokenAnnotation, DocAnnotation
+from .gold_parse import GoldParse
+
+
+class Example:
+    def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
+                 goldparse=None):
+        """Doc can either be text, or an actual Doc."""
+        self.doc = doc
+        self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
+        self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
+        self.goldparse = goldparse
+
+    @classmethod
+    def from_gold(cls, goldparse, doc=None):
+        doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
+        token_annotation = goldparse.get_token_annotation()
+        return cls(doc_annotation, token_annotation, doc)
+
+    @classmethod
+    def from_dict(cls, example_dict, doc=None):
+        token_dict = example_dict.get("token_annotation", {})
+        token_annotation = TokenAnnotation.from_dict(token_dict)
+        doc_dict = example_dict.get("doc_annotation", {})
+        doc_annotation = DocAnnotation.from_dict(doc_dict)
+        return cls(doc_annotation, token_annotation, doc)
+
+    def to_dict(self):
+        """Note that this method does NOT export the doc, only the annotations!
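+
+        RETURNS (dict): The annotations, keyed by "token_annotation" and
+            "doc_annotation".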
""" + token_dict = self.token_annotation.to_dict() + doc_dict = self.doc_annotation.to_dict() + return {"token_annotation": token_dict, "doc_annotation": doc_dict} + + @property + def text(self): + if self.doc is None: + return None + if isinstance(self.doc, Doc): + return self.doc.text + return self.doc + + @property + def gold(self): + if self.goldparse is None: + doc, gold = self.get_gold_parses()[0] + self.goldparse = gold + return self.goldparse + + def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, + morphs=None, lemmas=None, heads=None, deps=None, + entities=None, sent_starts=None, brackets=None): + self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, + pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, + deps=deps, entities=entities, + sent_starts=sent_starts, brackets=brackets) + + def set_doc_annotation(self, cats=None, links=None): + if cats: + self.doc_annotation.cats = cats + if links: + self.doc_annotation.links = links + + def split_sents(self): + """ Split the token annotations into multiple Examples based on + sent_starts and return a list of the new Examples""" + if not self.token_annotation.words: + return [self] + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] + s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] + s_brackets = [] + sent_start_i = 0 + t = self.token_annotation + split_examples = [] + for i in range(len(t.words)): + if i > 0 and t.sent_starts[i] == 1: + s_example.set_token_annotation(ids=s_ids, + words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, + lemmas=s_lemmas, heads=s_heads, deps=s_deps, + entities=s_ents, sent_starts=s_sent_starts, + brackets=s_brackets) + split_examples.append(s_example) + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] + s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] + s_sent_starts, s_brackets = [], [] + sent_start_i = i + s_ids.append(t.get_id(i)) + s_words.append(t.get_word(i)) + s_tags.append(t.get_tag(i)) + s_pos.append(t.get_pos(i)) + s_morphs.append(t.get_morph(i)) + s_lemmas.append(t.get_lemma(i)) + s_heads.append(t.get_head(i) - sent_start_i) + s_deps.append(t.get_dep(i)) + s_ents.append(t.get_entity(i)) + s_sent_starts.append(t.get_sent_start(i)) + for b_end, b_label in t.brackets_by_start.get(i, []): + s_brackets.append( + (i - sent_start_i, b_end - sent_start_i, b_label) + ) + i += 1 + s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, + pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, + deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, + brackets=s_brackets) + split_examples.append(s_example) + return split_examples + + + def get_gold_parses(self, merge=True, vocab=None, make_projective=False, + ignore_misaligned=False): + """Return a list of (doc, GoldParse) objects. 
+        If merge is set to True, keep all Token annotations as one big list."""
+        d = self.doc_annotation
+        # merge == do not modify Example
+        if merge:
+            t = self.token_annotation
+            doc = self.doc
+            if doc is None or not isinstance(doc, Doc):
+                if not vocab:
+                    raise ValueError(Errors.E998)
+                doc = Doc(vocab, words=t.words)
+            try:
+                gp = GoldParse.from_annotation(doc, d, t,
+                                               make_projective=make_projective)
+            except AlignmentError:
+                if ignore_misaligned:
+                    gp = None
+                else:
+                    raise
+            return [(doc, gp)]
+        # not merging: one GoldParse per sentence, defining docs with the words
+        # from each sentence
+        else:
+            parses = []
+            split_examples = self.split_sents()
+            for split_example in split_examples:
+                if not vocab:
+                    raise ValueError(Errors.E998)
+                split_doc = Doc(vocab, words=split_example.token_annotation.words)
+                try:
+                    gp = GoldParse.from_annotation(split_doc, d,
+                                                   split_example.token_annotation,
+                                                   make_projective=make_projective)
+                except AlignmentError:
+                    if ignore_misaligned:
+                        gp = None
+                    else:
+                        raise
+                if gp is not None:
+                    parses.append((split_doc, gp))
+            return parses
+
+    @classmethod
+    def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
+        """
+        Return a list of Example objects, from a variety of input formats.
+        make_doc needs to be provided when the examples contain text strings
+        and keep_raw_text=False.
+        """
+        if isinstance(examples, Example):
+            return [examples]
+        if isinstance(examples, tuple):
+            examples = [examples]
+        converted_examples = []
+        for ex in examples:
+            if isinstance(ex, Example):
+                converted_examples.append(ex)
+            # convert string to Doc to Example
+            elif isinstance(ex, str):
+                if keep_raw_text:
+                    converted_examples.append(Example(doc=ex))
+                else:
+                    doc = make_doc(ex)
+                    converted_examples.append(Example(doc=doc))
+            # convert Doc to Example
+            elif isinstance(ex, Doc):
+                converted_examples.append(Example(doc=ex))
+            # convert tuples to Example
+            elif isinstance(ex, tuple) and len(ex) == 2:
+                doc, gold = ex
+                gold_dict = {}
+                # convert string to Doc
+                if isinstance(doc, str) and not keep_raw_text:
+                    doc = make_doc(doc)
+                # convert dict to GoldParse
+                if isinstance(gold, dict):
+                    gold_dict = gold
+                    if doc is not None or gold.get("words", None) is not None:
+                        gold = GoldParse(doc, **gold)
+                    else:
+                        gold = None
+                if gold is not None:
+                    converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
+                else:
+                    raise ValueError(Errors.E999.format(gold_dict=gold_dict))
+            else:
+                converted_examples.append(ex)
+        return converted_examples

From a663d44b1b93e34f5247402d551a3045968812bc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:28:37 +0200
Subject: [PATCH 05/56] Add GoldCorpus

---
 spacy/_gold/corpus.py | 277 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 spacy/_gold/corpus.py

diff --git a/spacy/_gold/corpus.py b/spacy/_gold/corpus.py
new file mode 100644
index 000000000..2fdfd8d2a
--- /dev/null
+++ b/spacy/_gold/corpus.py
@@ -0,0 +1,277 @@
+import random
+import shutil
+import tempfile
+import srsly
+from pathlib import Path
+import itertools
+from ..tokens import Doc
+from .. import util
+from ..errors import Errors
+from .gold_io import read_json_file, read_json_object
+from .augment import make_orth_variants, add_noise
+from .example import Example
+
+
+class GoldCorpus(object):
+    """An annotated corpus, using the JSON file format. Manages
+    annotations for tagging, dependency parsing and NER.
+ + DOCS: https://spacy.io/api/goldcorpus + """ + + def __init__(self, train, dev, gold_preproc=False, limit=None): + """Create a GoldCorpus. + + train (str / Path): File or directory of training data. + dev (str / Path): File or directory of development data. + RETURNS (GoldCorpus): The newly created object. + """ + self.limit = limit + if isinstance(train, str) or isinstance(train, Path): + train = self.read_examples(self.walk_corpus(train)) + dev = self.read_examples(self.walk_corpus(dev)) + # Write temp directory with one doc per file, so we can shuffle and stream + self.tmp_dir = Path(tempfile.mkdtemp()) + self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) + self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) + + def __del__(self): + shutil.rmtree(self.tmp_dir) + + @staticmethod + def write_msgpack(directory, examples, limit=0): + if not directory.exists(): + directory.mkdir() + n = 0 + for i, example in enumerate(examples): + ex_dict = example.to_dict() + text = example.text + srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) + n += 1 + if limit and n >= limit: + break + + @staticmethod + def walk_corpus(path): + path = util.ensure_path(path) + if not path.is_dir(): + return [path] + paths = [path] + locs = [] + seen = set() + for path in paths: + if str(path) in seen: + continue + seen.add(str(path)) + if path.parts[-1].startswith("."): + continue + elif path.is_dir(): + paths.extend(path.iterdir()) + elif path.parts[-1].endswith((".json", ".jsonl")): + locs.append(path) + return locs + + @staticmethod + def read_examples(locs, limit=0): + """ Yield training examples """ + i = 0 + for loc in locs: + loc = util.ensure_path(loc) + file_name = loc.parts[-1] + if file_name.endswith("json"): + examples = read_json_file(loc) + elif file_name.endswith("jsonl"): + gold_tuples = srsly.read_jsonl(loc) + first_gold_tuple = next(gold_tuples) + gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) + # TODO: proper format checks with schemas + if isinstance(first_gold_tuple, dict): + if first_gold_tuple.get("paragraphs", None): + examples = read_json_object(gold_tuples) + elif first_gold_tuple.get("doc_annotation", None): + examples = [] + for ex_dict in gold_tuples: + doc = ex_dict.get("doc", None) + if doc is None: + doc = ex_dict.get("text", None) + if not ( + doc is None + or isinstance(doc, Doc) + or isinstance(doc, str) + ): + raise ValueError(Errors.E987.format(type=type(doc))) + examples.append(Example.from_dict(ex_dict, doc=doc)) + + elif file_name.endswith("msg"): + text, ex_dict = srsly.read_msgpack(loc) + examples = [Example.from_dict(ex_dict, doc=text)] + else: + supported = ("json", "jsonl", "msg") + raise ValueError(Errors.E124.format(path=loc, formats=supported)) + try: + for example in examples: + yield example + i += 1 + if limit and i >= limit: + return + except KeyError as e: + msg = "Missing key {}".format(e) + raise KeyError(Errors.E996.format(file=file_name, msg=msg)) + except UnboundLocalError: + msg = "Unexpected document structure" + raise ValueError(Errors.E996.format(file=file_name, msg=msg)) + + @property + def dev_examples(self): + locs = (self.tmp_dir / "dev").iterdir() + yield from self.read_examples(locs, limit=self.limit) + + @property + def train_examples(self): + locs = (self.tmp_dir / "train").iterdir() + yield from self.read_examples(locs, limit=self.limit) + + def count_train(self): + """Returns count of words in train examples""" + n = 0 + i = 0 + for example in self.train_examples: + n += 
len(example.token_annotation.words) + if self.limit and i >= self.limit: + break + i += 1 + return n + + def train_dataset( + self, + nlp, + gold_preproc=False, + max_length=None, + noise_level=0.0, + orth_variant_level=0.0, + ignore_misaligned=False, + ): + locs = list((self.tmp_dir / "train").iterdir()) + random.shuffle(locs) + train_examples = self.read_examples(locs, limit=self.limit) + gold_examples = self.iter_gold_docs( + nlp, + train_examples, + gold_preproc, + max_length=max_length, + noise_level=noise_level, + orth_variant_level=orth_variant_level, + make_projective=True, + ignore_misaligned=ignore_misaligned, + ) + yield from gold_examples + + def train_dataset_without_preprocessing( + self, nlp, gold_preproc=False, ignore_misaligned=False + ): + examples = self.iter_gold_docs( + nlp, + self.train_examples, + gold_preproc=gold_preproc, + ignore_misaligned=ignore_misaligned, + ) + yield from examples + + def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): + examples = self.iter_gold_docs( + nlp, + self.dev_examples, + gold_preproc=gold_preproc, + ignore_misaligned=ignore_misaligned, + ) + yield from examples + + @classmethod + def iter_gold_docs( + cls, + nlp, + examples, + gold_preproc, + max_length=None, + noise_level=0.0, + orth_variant_level=0.0, + make_projective=False, + ignore_misaligned=False, + ): + """ Setting gold_preproc will result in creating a doc per sentence """ + for example in examples: + if gold_preproc: + split_examples = example.split_sents() + example_golds = [] + for split_example in split_examples: + split_example_docs = cls._make_docs( + nlp, + split_example, + gold_preproc, + noise_level=noise_level, + orth_variant_level=orth_variant_level, + ) + split_example_golds = cls._make_golds( + split_example_docs, + vocab=nlp.vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned, + ) + example_golds.extend(split_example_golds) + else: + example_docs = cls._make_docs( + nlp, + example, + gold_preproc, + noise_level=noise_level, + orth_variant_level=orth_variant_level, + ) + example_golds = cls._make_golds( + example_docs, + vocab=nlp.vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned, + ) + for ex in example_golds: + if ex.goldparse is not None: + if (not max_length) or len(ex.doc) < max_length: + yield ex + + @classmethod + def _make_docs( + cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0 + ): + var_example = make_orth_variants( + nlp, example, orth_variant_level=orth_variant_level + ) + # gold_preproc is not used ?! 
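+        # for reference, `add_noise` (augment.py) corrupts strings character
+        # by character; with noise_level=1.0 every character is hit, e.g.
+        #     add_noise("Hello, world!", 1.0) == "hello\n world\n"
+        # (characters in .,'!? become newlines, letters are lowercased)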
+ if example.text is not None: + var_text = add_noise(var_example.text, noise_level) + var_doc = nlp.make_doc(var_text) + var_example.doc = var_doc + else: + var_doc = Doc( + nlp.vocab, + words=add_noise(var_example.token_annotation.words, noise_level), + ) + var_example.doc = var_doc + return [var_example] + + @classmethod + def _make_golds( + cls, examples, vocab=None, make_projective=False, ignore_misaligned=False + ): + filtered_examples = [] + for example in examples: + gold_parses = example.get_gold_parses( + vocab=vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned, + ) + assert len(gold_parses) == 1 + doc, gold = gold_parses[0] + if doc: + assert doc == example.doc + example.goldparse = gold + filtered_examples.append(example) + return filtered_examples From 53e6473e2466a247a007d2e6c87d22a65bf4bff3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 14:29:06 +0200 Subject: [PATCH 06/56] Add to/from dict helpers --- spacy/util.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index bc6c98a82..e7d4c8697 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -812,16 +812,23 @@ def filter_spans(spans): def to_bytes(getters, exclude): + return srsly.msgpack_dumps(to_dict(getters, exclude)) + + +def from_bytes(bytes_data, setters, exclude): + return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude) + + +def to_dict(getters, exclude): serialized = {} for key, getter in getters.items(): # Split to support file names like meta.json if key.split(".")[0] not in exclude: serialized[key] = getter() - return srsly.msgpack_dumps(serialized) + return serialized -def from_bytes(bytes_data, setters, exclude): - msg = srsly.msgpack_loads(bytes_data) +def from_dict(msg, setters, exclude): for key, setter in setters.items(): # Split to support file names like meta.json if key.split(".")[0] not in exclude and key in msg: From 156466ca69355de61c537541db22bc1632a7daff Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 14:39:14 +0200 Subject: [PATCH 07/56] Add iob_utils --- spacy/_gold/iob_utils.py | 189 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 spacy/_gold/iob_utils.py diff --git a/spacy/_gold/iob_utils.py b/spacy/_gold/iob_utils.py new file mode 100644 index 000000000..2f0f116a1 --- /dev/null +++ b/spacy/_gold/iob_utils.py @@ -0,0 +1,189 @@ +import warnings +from ..errors import Errors, Warnings +from ..tokens import Span + + +def iob_to_biluo(tags): + out = [] + tags = list(tags) + while tags: + out.extend(_consume_os(tags)) + out.extend(_consume_ent(tags)) + return out + + +def biluo_to_iob(tags): + out = [] + for tag in tags: + tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) + out.append(tag) + return out + + +def _consume_os(tags): + while tags and tags[0] == "O": + yield tags.pop(0) + + +def _consume_ent(tags): + if not tags: + return [] + tag = tags.pop(0) + target_in = "I" + tag[1:] + target_last = "L" + tag[1:] + length = 1 + while tags and tags[0] in {target_in, target_last}: + length += 1 + tags.pop(0) + label = tag[2:] + if length == 1: + if len(label) == 0: + raise ValueError(Errors.E177.format(tag=tag)) + return ["U-" + label] + else: + start = "B-" + label + end = "L-" + label + middle = [f"I-{label}" for _ in range(1, length - 1)] + return [start] + middle + [end] + + +def biluo_tags_from_offsets(doc, entities, missing="O"): + """Encode labelled spans into per-token tags, using the + 
Begin/In/Last/Unit/Out scheme (BILUO). + + doc (Doc): The document that the entity offsets refer to. The output tags + will refer to the token boundaries within the document. + entities (iterable): A sequence of `(start, end, label)` triples. `start` + and `end` should be character-offset integers denoting the slice into + the original string. + RETURNS (list): A list of unicode strings, describing the tags. Each tag + string will be of the form either "", "O" or "{action}-{label}", where + action is one of "B", "I", "L", "U". The string "-" is used where the + entity offsets don't align with the tokenization in the `Doc` object. + The training algorithm will view these as missing values. "O" denotes a + non-entity token. "B" denotes the beginning of a multi-token entity, + "I" the inside of an entity of three or more tokens, and "L" the end + of an entity of two or more tokens. "U" denotes a single-token entity. + + EXAMPLE: + >>> text = 'I like London.' + >>> entities = [(len('I like '), len('I like London'), 'LOC')] + >>> doc = nlp.tokenizer(text) + >>> tags = biluo_tags_from_offsets(doc, entities) + >>> assert tags == ["O", "O", 'U-LOC', "O"] + """ + # Ensure no overlapping entity labels exist + tokens_in_ents = {} + + starts = {token.idx: token.i for token in doc} + ends = {token.idx + len(token): token.i for token in doc} + biluo = ["-" for _ in doc] + # Handle entity cases + for start_char, end_char, label in entities: + for token_index in range(start_char, end_char): + if token_index in tokens_in_ents.keys(): + raise ValueError( + Errors.E103.format( + span1=( + tokens_in_ents[token_index][0], + tokens_in_ents[token_index][1], + tokens_in_ents[token_index][2], + ), + span2=(start_char, end_char, label), + ) + ) + tokens_in_ents[token_index] = (start_char, end_char, label) + + start_token = starts.get(start_char) + end_token = ends.get(end_char) + # Only interested if the tokenization is correct + if start_token is not None and end_token is not None: + if start_token == end_token: + biluo[start_token] = f"U-{label}" + else: + biluo[start_token] = f"B-{label}" + for i in range(start_token + 1, end_token): + biluo[i] = f"I-{label}" + biluo[end_token] = f"L-{label}" + # Now distinguish the O cases from ones where we miss the tokenization + entity_chars = set() + for start_char, end_char, label in entities: + for i in range(start_char, end_char): + entity_chars.add(i) + for token in doc: + for i in range(token.idx, token.idx + len(token)): + if i in entity_chars: + break + else: + biluo[token.i] = missing + if "-" in biluo: + ent_str = str(entities) + warnings.warn( + Warnings.W030.format( + text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text, + entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str, + ) + ) + return biluo + + +def spans_from_biluo_tags(doc, tags): + """Encode per-token tags following the BILUO scheme into Span object, e.g. + to overwrite the doc.ents. + + doc (Doc): The document that the BILUO tags refer to. + entities (iterable): A sequence of BILUO tags with each tag describing one + token. Each tags string will be of the form of either "", "O" or + "{action}-{label}", where action is one of "B", "I", "L", "U". + RETURNS (list): A sequence of Span objects. 
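+
+    EXAMPLE (assumes an `nlp` object, as in `biluo_tags_from_offsets` above):
+        >>> doc = nlp.tokenizer("I like London.")
+        >>> tags = ["O", "O", "U-LOC", "O"]
+        >>> spans = spans_from_biluo_tags(doc, tags)
+        >>> [(span.text, span.label_) for span in spans]
+        [('London', 'LOC')]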
+ """ + token_offsets = tags_to_entities(tags) + spans = [] + for label, start_idx, end_idx in token_offsets: + span = Span(doc, start_idx, end_idx + 1, label=label) + spans.append(span) + return spans + + +def offsets_from_biluo_tags(doc, tags): + """Encode per-token tags following the BILUO scheme into entity offsets. + + doc (Doc): The document that the BILUO tags refer to. + entities (iterable): A sequence of BILUO tags with each tag describing one + token. Each tags string will be of the form of either "", "O" or + "{action}-{label}", where action is one of "B", "I", "L", "U". + RETURNS (list): A sequence of `(start, end, label)` triples. `start` and + `end` will be character-offset integers denoting the slice into the + original string. + """ + spans = spans_from_biluo_tags(doc, tags) + return [(span.start_char, span.end_char, span.label_) for span in spans] + + +def tags_to_entities(tags): + entities = [] + start = None + for i, tag in enumerate(tags): + if tag is None: + continue + if tag.startswith("O"): + # TODO: We shouldn't be getting these malformed inputs. Fix this. + if start is not None: + start = None + continue + elif tag == "-": + continue + elif tag.startswith("I"): + if start is None: + raise ValueError(Errors.E067.format(tags=tags[: i + 1])) + continue + if tag.startswith("U"): + entities.append((tag[2:], i, i)) + elif tag.startswith("B"): + start = i + elif tag.startswith("L"): + entities.append((tag[2:], start, i)) + start = None + else: + raise ValueError(Errors.E068.format(tag=tag)) + return entities From 32c8fb1372a8f143d471352192440d5ca2d33740 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 14:41:49 +0200 Subject: [PATCH 08/56] Add gold_io.pyx --- spacy/_gold/gold_io.pyx | 202 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 spacy/_gold/gold_io.pyx diff --git a/spacy/_gold/gold_io.pyx b/spacy/_gold/gold_io.pyx new file mode 100644 index 000000000..15581c151 --- /dev/null +++ b/spacy/_gold/gold_io.pyx @@ -0,0 +1,202 @@ +import warnings +import srsly +from .. import util +from ..errors import Warnings +from ..tokens import Token, Doc +from .example import Example +from .iob_utils import biluo_tags_from_offsets + + +def merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_cats = {} + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) + for b in brackets) + m_cats.update(cats) + i += len(ids) + return [(m_deps, (m_cats, m_brackets))] + + +def docs_to_json(docs, id=0, ner_missing_tag="O"): + """Convert a list of Doc objects into the JSON-serializable format used by + the spacy train command. + + docs (iterable / Doc): The Doc object(s) to convert. + id (int): Id for the JSON. 
+ RETURNS (dict): The data in spaCy's JSON format + - each input doc will be treated as a paragraph in the output doc + """ + if isinstance(docs, Doc): + docs = [docs] + json_doc = {"id": id, "paragraphs": []} + for i, doc in enumerate(docs): + json_para = {'raw': doc.text, "sentences": [], "cats": []} + for cat, val in doc.cats.items(): + json_cat = {"label": cat, "value": val} + json_para["cats"].append(json_cat) + ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] + biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) + for j, sent in enumerate(doc.sents): + json_sent = {"tokens": [], "brackets": []} + for token in sent: + json_token = {"id": token.i, "orth": token.text} + if doc.is_tagged: + json_token["tag"] = token.tag_ + json_token["pos"] = token.pos_ + json_token["morph"] = token.morph_ + json_token["lemma"] = token.lemma_ + if doc.is_parsed: + json_token["head"] = token.head.i-token.i + json_token["dep"] = token.dep_ + json_token["ner"] = biluo_tags[token.i] + json_sent["tokens"].append(json_token) + json_para["sentences"].append(json_sent) + json_doc["paragraphs"].append(json_para) + return json_doc + + +def json_to_examples(doc): + """Convert an item in the JSON-formatted training data to the format + used by GoldParse. + + doc (dict): One entry in the training data. + YIELDS (Example): The reformatted data - one training example per paragraph + """ + for paragraph in doc["paragraphs"]: + example = Example(doc=paragraph.get("raw", None)) + words = [] + ids = [] + tags = [] + pos = [] + morphs = [] + lemmas = [] + heads = [] + labels = [] + ner = [] + sent_starts = [] + brackets = [] + for sent in paragraph["sentences"]: + sent_start_i = len(words) + for i, token in enumerate(sent["tokens"]): + words.append(token["orth"]) + ids.append(token.get('id', sent_start_i + i)) + tags.append(token.get('tag', "-")) + pos.append(token.get("pos", "")) + morphs.append(token.get("morph", "")) + lemmas.append(token.get("lemma", "")) + heads.append(token.get("head", 0) + sent_start_i + i) + labels.append(token.get("dep", "")) + # Ensure ROOT label is case-insensitive + if labels[-1].lower() == "root": + labels[-1] = "ROOT" + ner.append(token.get("ner", "-")) + if i == 0: + sent_starts.append(1) + else: + sent_starts.append(0) + if "brackets" in sent: + brackets.extend((b["first"] + sent_start_i, + b["last"] + sent_start_i, b["label"]) + for b in sent["brackets"]) + cats = {} + for cat in paragraph.get("cats", {}): + cats[cat["label"]] = cat["value"] + example.set_token_annotation(ids=ids, words=words, tags=tags, + pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, + deps=labels, entities=ner, sent_starts=sent_starts, + brackets=brackets) + example.set_doc_annotation(cats=cats) + yield example + + +def read_json_file(loc, docs_filter=None, limit=None): + loc = util.ensure_path(loc) + if loc.is_dir(): + for filename in loc.iterdir(): + yield from read_json_file(loc / filename, limit=limit) + else: + for doc in json_iterate(loc): + if docs_filter is not None and not docs_filter(doc): + continue + for json_data in json_to_examples(doc): + yield json_data + + +def read_json_object(json_corpus_section): + """Take a list of JSON-formatted documents (e.g. from an already loaded + training data file) and yield annotations in the GoldParse format. + + json_corpus_section (list): The data. 
+ YIELDS (Example): The reformatted data - one training example per paragraph + """ + for json_doc in json_corpus_section: + examples = json_to_examples(json_doc) + for ex in examples: + yield ex + + +def json_iterate(loc): + # We should've made these files jsonl...But since we didn't, parse out + # the docs one-by-one to reduce memory usage. + # It's okay to read in the whole file -- just don't parse it into JSON. + cdef bytes py_raw + loc = util.ensure_path(loc) + with loc.open("rb") as file_: + py_raw = file_.read() + cdef long file_length = len(py_raw) + if file_length > 2 ** 30: + warnings.warn(Warnings.W027.format(size=file_length)) + + raw = py_raw + cdef int square_depth = 0 + cdef int curly_depth = 0 + cdef int inside_string = 0 + cdef int escape = 0 + cdef long start = -1 + cdef char c + cdef char quote = ord('"') + cdef char backslash = ord("\\") + cdef char open_square = ord("[") + cdef char close_square = ord("]") + cdef char open_curly = ord("{") + cdef char close_curly = ord("}") + for i in range(file_length): + c = raw[i] + if escape: + escape = False + continue + if c == backslash: + escape = True + continue + if c == quote: + inside_string = not inside_string + continue + if inside_string: + continue + if c == open_square: + square_depth += 1 + elif c == close_square: + square_depth -= 1 + elif c == open_curly: + if square_depth == 1 and curly_depth == 0: + start = i + curly_depth += 1 + elif c == close_curly: + curly_depth -= 1 + if square_depth == 1 and curly_depth == 0: + py_str = py_raw[start : i + 1].decode("utf8") + try: + yield srsly.json_loads(py_str) + except Exception: + print(py_str) + raise + start = -1 From 7b873ce2b15494c4da9ea5919fabe1e360681d5c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:09:25 +0200 Subject: [PATCH 09/56] Move GoldParse under spacy.syntax --- spacy/syntax/arc_eager.pxd | 2 +- spacy/syntax/arc_eager.pyx | 2 +- spacy/syntax/gold_parse.pxd | 39 ++++ spacy/syntax/gold_parse.pyx | 311 +++++++++++++++++++++++++++++ spacy/syntax/ner.pxd | 2 +- spacy/syntax/ner.pyx | 2 +- spacy/syntax/nn_parser.pyx | 2 +- spacy/syntax/transition_system.pxd | 4 +- 8 files changed, 357 insertions(+), 7 deletions(-) create mode 100644 spacy/syntax/gold_parse.pxd create mode 100644 spacy/syntax/gold_parse.pyx diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 14d706548..96dd37a36 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -3,7 +3,7 @@ from cymem.cymem cimport Pool from .stateclass cimport StateClass from ..typedefs cimport weight_t, attr_t from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParseC cdef class ArcEager(TransitionSystem): diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 19be95f3f..df8c7d563 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -8,7 +8,7 @@ import json from ..typedefs cimport hash_t, attr_t from ..strings cimport hash_string -from ..gold cimport GoldParse, GoldParseC +from .gold_parse cimport GoldParse, GoldParseC from ..structs cimport TokenC from ..tokens.doc cimport Doc, set_children_from_heads from .stateclass cimport StateClass diff --git a/spacy/syntax/gold_parse.pxd b/spacy/syntax/gold_parse.pxd new file mode 100644 index 000000000..9815513d0 --- /dev/null +++ b/spacy/syntax/gold_parse.pxd @@ -0,0 +1,39 @@ +from cymem.cymem cimport Pool +from .transition_system cimport Transition +from ..typedefs cimport attr_t + + +cdef struct 
GoldParseC:
+    int* tags
+    int* heads
+    int* has_dep
+    int* sent_start
+    attr_t* labels
+    int** brackets
+    Transition* ner
+
+
+cdef class GoldParse:
+    cdef Pool mem
+
+    cdef GoldParseC c
+    cdef readonly object orig
+
+    cdef int length
+    cdef public int loss
+    cdef public list words
+    cdef public list tags
+    cdef public list pos
+    cdef public list morphs
+    cdef public list lemmas
+    cdef public list sent_starts
+    cdef public list heads
+    cdef public list labels
+    cdef public dict orths
+    cdef public list ner
+    cdef public dict brackets
+    cdef public dict cats
+    cdef public dict links
+
+    cdef readonly list cand_to_gold
+    cdef readonly list gold_to_cand
diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx
new file mode 100644
index 000000000..59e8f4bbb
--- /dev/null
+++ b/spacy/syntax/gold_parse.pyx
@@ -0,0 +1,311 @@
+# cython: profile=True
+import re
+import random
+import numpy
+import tempfile
+import shutil
+import itertools
+from pathlib import Path
+import srsly
+import warnings
+
+from .. import util
+from ..syntax import nonproj
+from ..tokens import Doc, Span
+from ..errors import Errors, AlignmentError, Warnings
+from .._gold.annotation import TokenAnnotation
+from .._gold.iob_utils import offsets_from_biluo_tags, biluo_tags_from_offsets
+from .._gold.align import align
+
+
+punct_re = re.compile(r"\W")
+
+
+def is_punct_label(label):
+    return label == "P" or label.lower() == "punct"
+
+
+cdef class GoldParse:
+    """Collection for training annotations.
+
+    DOCS: https://spacy.io/api/goldparse
+    """
+    @classmethod
+    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
+        return cls(doc, words=token_annotation.words,
+                   tags=token_annotation.tags,
+                   pos=token_annotation.pos,
+                   morphs=token_annotation.morphs,
+                   lemmas=token_annotation.lemmas,
+                   heads=token_annotation.heads,
+                   deps=token_annotation.deps,
+                   entities=token_annotation.entities,
+                   sent_starts=token_annotation.sent_starts,
+                   cats=doc_annotation.cats,
+                   links=doc_annotation.links,
+                   make_projective=make_projective)
+
+    def get_token_annotation(self):
+        ids = None
+        if self.words:
+            ids = list(range(len(self.words)))
+
+        return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
+                               pos=self.pos, morphs=self.morphs,
+                               lemmas=self.lemmas, heads=self.heads,
+                               deps=self.labels, entities=self.ner,
+                               sent_starts=self.sent_starts)
+
+    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
+                 lemmas=None, heads=None, deps=None, entities=None,
+                 sent_starts=None, make_projective=False, cats=None,
+                 links=None):
+        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
+
+        doc (Doc): The document the annotations refer to.
+        words (iterable): A sequence of unicode word strings.
+        tags (iterable): A sequence of strings, representing tag annotations.
+        pos (iterable): A sequence of strings, representing UPOS annotations.
+        morphs (iterable): A sequence of strings, representing morph
+            annotations.
+        lemmas (iterable): A sequence of strings, representing lemma
+            annotations.
+        heads (iterable): A sequence of integers, representing syntactic
+            head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic
+            relation types.
+        entities (iterable): A sequence of named entity annotations, either as
+            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
+            representing the entity positions.
+        sent_starts (iterable): A sequence of sentence position tags, 1 for
+            the first word in a sentence, 0 for all others.
+        cats (dict): Labels for text classification.
Each key in the dictionary + may be a string or an int, or a `(start_char, end_char, label)` + tuple, indicating that the label is applied to only part of the + document (usually a sentence). Unlike entity annotations, label + annotations can overlap, i.e. a single word can be covered by + multiple labelled spans. The TextCategorizer component expects + true examples of a label to have the value 1.0, and negative + examples of a label to have the value 0.0. Labels not in the + dictionary are treated as missing - the gradient for those labels + will be zero. + links (dict): A dict with `(start_char, end_char)` keys, + and the values being dicts with kb_id:value entries, + representing the external IDs in a knowledge base (KB) + mapped to either 1.0 or 0.0, indicating positive and + negative examples respectively. + RETURNS (GoldParse): The newly constructed object. + """ + self.mem = Pool() + self.loss = 0 + self.length = len(doc) + + self.cats = {} if cats is None else dict(cats) + self.links = {} if links is None else dict(links) + + # temporary doc for aligning entity annotation + entdoc = None + + # avoid allocating memory if the doc does not contain any tokens + if self.length == 0: + self.words = [] + self.tags = [] + self.heads = [] + self.labels = [] + self.ner = [] + self.morphs = [] + # set a minimal orig so that the scorer can score an empty doc + self.orig = TokenAnnotation(ids=[]) + else: + if not words: + words = [token.text for token in doc] + if not tags: + tags = [None for _ in words] + if not pos: + pos = [None for _ in words] + if not morphs: + morphs = [None for _ in words] + if not lemmas: + lemmas = [None for _ in words] + if not heads: + heads = [None for _ in words] + if not deps: + deps = [None for _ in words] + if not sent_starts: + sent_starts = [None for _ in words] + if entities is None: + entities = ["-" for _ in words] + elif len(entities) == 0: + entities = ["O" for _ in words] + else: + # Translate the None values to '-', to make processing easier. + # See Issue #2603 + entities = [(ent if ent is not None else "-") for ent in entities] + if not isinstance(entities[0], str): + # Assume we have entities specified by character offset. + # Create a temporary Doc corresponding to provided words + # (to preserve gold tokenization) and text (to preserve + # character offsets). + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + entdoc_entities = biluo_tags_from_offsets(entdoc, entities) + # There may be some additional whitespace tokens in the + # temporary doc, so check that the annotations align with + # the provided words while building a list of BILUO labels. 
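+                # e.g. with words ["I", "like", "London", "."] and
+                # doc.text = "I like  London." (double space), entdoc gains
+                # an extra whitespace token whose BILUO tag is skipped below
+                # so that `entities` stays aligned with the gold words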
+ entities = [] + words_offset = 0 + for i in range(len(entdoc_words)): + if words[i + words_offset] == entdoc_words[i]: + entities.append(entdoc_entities[i]) + else: + words_offset -= 1 + if len(entities) != len(words): + warnings.warn(Warnings.W029.format(text=doc.text)) + entities = ["-" for _ in words] + + # These are filled by the tagger/parser/entity recogniser + self.c.tags = self.mem.alloc(len(doc), sizeof(int)) + self.c.heads = self.mem.alloc(len(doc), sizeof(int)) + self.c.labels = self.mem.alloc(len(doc), sizeof(attr_t)) + self.c.has_dep = self.mem.alloc(len(doc), sizeof(int)) + self.c.sent_start = self.mem.alloc(len(doc), sizeof(int)) + self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) + + self.words = [None] * len(doc) + self.tags = [None] * len(doc) + self.pos = [None] * len(doc) + self.morphs = [None] * len(doc) + self.lemmas = [None] * len(doc) + self.heads = [None] * len(doc) + self.labels = [None] * len(doc) + self.ner = [None] * len(doc) + self.sent_starts = [None] * len(doc) + + # This needs to be done before we align the words + if make_projective and any(heads) and any(deps) : + heads, deps = nonproj.projectivize(heads, deps) + + # Do many-to-one alignment for misaligned tokens. + # If we over-segment, we'll have one gold word that covers a sequence + # of predicted words + # If we under-segment, we'll have one predicted word that covers a + # sequence of gold words. + # If we "mis-segment", we'll have a sequence of predicted words covering + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. + cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) + + self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] + self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] + + self.orig = TokenAnnotation(ids=list(range(len(words))), + words=words, tags=tags, pos=pos, morphs=morphs, + lemmas=lemmas, heads=heads, deps=deps, entities=entities, + sent_starts=sent_starts, brackets=[]) + + for i, gold_i in enumerate(self.cand_to_gold): + if doc[i].text.isspace(): + self.words[i] = doc[i].text + self.tags[i] = "_SP" + self.pos[i] = "SPACE" + self.morphs[i] = None + self.lemmas[i] = None + self.heads[i] = None + self.labels[i] = None + self.ner[i] = None + self.sent_starts[i] = 0 + if gold_i is None: + if i in i2j_multi: + self.words[i] = words[i2j_multi[i]] + self.tags[i] = tags[i2j_multi[i]] + self.pos[i] = pos[i2j_multi[i]] + self.morphs[i] = morphs[i2j_multi[i]] + self.lemmas[i] = lemmas[i2j_multi[i]] + self.sent_starts[i] = sent_starts[i2j_multi[i]] + is_last = i2j_multi[i] != i2j_multi.get(i+1) + # Set next word in multi-token span as head, until last + if not is_last: + self.heads[i] = i+1 + self.labels[i] = "subtok" + else: + head_i = heads[i2j_multi[i]] + if head_i: + self.heads[i] = self.gold_to_cand[head_i] + self.labels[i] = deps[i2j_multi[i]] + ner_tag = entities[i2j_multi[i]] + # Assign O/- for many-to-one O/- NER tags + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag + else: + self.words[i] = words[gold_i] + self.tags[i] = tags[gold_i] + self.pos[i] = pos[gold_i] + self.morphs[i] = morphs[gold_i] + self.lemmas[i] = lemmas[gold_i] + self.sent_starts[i] = sent_starts[gold_i] + if heads[gold_i] is None: + self.heads[i] = None + else: + self.heads[i] = self.gold_to_cand[heads[gold_i]] + self.labels[i] = deps[gold_i] + self.ner[i] = entities[gold_i] + # Assign O/- for one-to-many O/- NER tags + for j, cand_j in enumerate(self.gold_to_cand): + if cand_j is 
None: + if j in j2i_multi: + i = j2i_multi[j] + ner_tag = entities[j] + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag + + # If there is entity annotation and some tokens remain unaligned, + # align all entities at the character level to account for all + # possible token misalignments within the entity spans + if any([e not in ("O", "-") for e in entities]) and None in self.ner: + # If the temporary entdoc wasn't created above, initialize it + if not entdoc: + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + # Get offsets based on gold words and BILUO entities + entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) + aligned_offsets = [] + aligned_spans = [] + # Filter offsets to identify those that align with doc tokens + for offset in entdoc_offsets: + span = doc.char_span(offset[0], offset[1]) + if span and not span.text.isspace(): + aligned_offsets.append(offset) + aligned_spans.append(span) + # Convert back to BILUO for doc tokens and assign NER for all + # aligned spans + biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) + for span in aligned_spans: + for i in range(span.start, span.end): + self.ner[i] = biluo_tags[i] + + # Prevent whitespace that isn't within entities from being tagged as + # an entity. + for i in range(len(self.ner)): + if self.tags[i] == "_SP": + prev_ner = self.ner[i-1] if i >= 1 else None + next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None + if prev_ner == "O" or next_ner == "O": + self.ner[i] = "O" + + cycle = nonproj.contains_cycle(self.heads) + if cycle is not None: + raise ValueError(Errors.E069.format(cycle=cycle, + cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]), + doc_tokens=" ".join(words[:50]))) + + def __len__(self): + """Get the number of gold-standard tokens. + + RETURNS (int): The number of gold-standard tokens. + """ + return self.length + + @property + def is_projective(self): + """Whether the provided syntactic annotations form a projective + dependency tree. 
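+
+        RETURNS (bool): True if the head annotations form a projective
+            tree (no crossing arcs).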
+ """ + return not nonproj.is_nonproj_tree(self.heads) diff --git a/spacy/syntax/ner.pxd b/spacy/syntax/ner.pxd index 647f98fc0..739b8dc1f 100644 --- a/spacy/syntax/ner.pxd +++ b/spacy/syntax/ner.pxd @@ -1,6 +1,6 @@ from .transition_system cimport TransitionSystem from .transition_system cimport Transition -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParseC from ..typedefs cimport attr_t diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index ff74be601..4061304d8 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -7,7 +7,7 @@ from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition from .transition_system cimport do_func_t -from ..gold cimport GoldParseC, GoldParse +from .gold_parse cimport GoldParseC, GoldParse from ..lexeme cimport Lexeme from ..attrs cimport IS_SPACE diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fcaff444e..12f56ba67 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -20,7 +20,7 @@ import numpy import warnings from ..tokens.doc cimport Doc -from ..gold cimport GoldParse +from .gold_parse cimport GoldParse from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 5fd3b5c5f..33f96c331 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -2,8 +2,8 @@ from cymem.cymem cimport Pool from ..typedefs cimport attr_t, weight_t from ..structs cimport TokenC -from ..gold cimport GoldParse -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParse +from .gold_parse cimport GoldParseC from ..strings cimport StringStore from .stateclass cimport StateClass from ._state cimport StateC From 1d2e39d97476ed1f01b3a711db69b1ce9a4917d6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:10:10 +0200 Subject: [PATCH 10/56] Support to_dict in Doc --- spacy/tokens/doc.pyx | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index debab6aeb..3aa27e451 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -881,6 +881,32 @@ cdef class Doc: def to_bytes(self, exclude=tuple(), **kwargs): """Serialize, i.e. export the document contents to a binary string. + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): A losslessly serialized copy of the `Doc`, including + all annotations. + + DOCS: https://spacy.io/api/doc#to_bytes + """ + return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs)) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + """Deserialize, i.e. import the document contents from a binary string. + + data (bytes): The string to load from. + exclude (list): String names of serialization fields to exclude. + RETURNS (Doc): Itself. + + DOCS: https://spacy.io/api/doc#from_bytes + """ + return self.from_dict( + srsly.msgpack_loads(bytes_data), + exclude=exclude, + **kwargs + ) + + def to_dict(self, exclude=tuple(), **kwargs): + """Export the document contents to a dictionary for serialization. + exclude (list): String names of serialization fields to exclude. RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. 
@@ -917,9 +943,9 @@ cdef class Doc: serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) if "user_data_values" not in exclude: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) - return util.to_bytes(serializers, exclude) + return util.to_dict(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_dict(self, msg, exclude=tuple(), **kwargs): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. @@ -943,7 +969,6 @@ cdef class Doc: for key in kwargs: if key in deserializers or key in ("user_data",): raise ValueError(Errors.E128.format(arg=key)) - msg = util.from_bytes(bytes_data, deserializers, exclude) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. In values we just have to hope @@ -975,6 +1000,7 @@ cdef class Doc: self.from_array(msg["array_head"][2:], attrs[:, 2:]) return self + + def extend_tensor(self, tensor): """Concatenate a new tensor onto the doc.tensor object. From 3baa1ada03d4b4746091be74e733bc94984d3f36 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:10:33 +0200 Subject: [PATCH 11/56] Refactor spacy.gold --- spacy/gold.pxd | 68 --- spacy/gold.pyx | 1407 ------------------------------------------------ 2 files changed, 1475 deletions(-) delete mode 100644 spacy/gold.pxd delete mode 100644 spacy/gold.pyx diff --git a/spacy/gold.pxd b/spacy/gold.pxd deleted file mode 100644 index bf724868f..000000000 --- a/spacy/gold.pxd +++ /dev/null @@ -1,68 +0,0 @@ -from cymem.cymem cimport Pool - -from .typedefs cimport attr_t -from .syntax.transition_system cimport Transition - -from .tokens import Doc - - -cdef struct GoldParseC: - int* tags - int* heads - int* has_dep - int* sent_start - attr_t* labels - int** brackets - Transition* ner - - -cdef class GoldParse: - cdef Pool mem - - cdef GoldParseC c - cdef readonly TokenAnnotation orig - - cdef int length - cdef public int loss - cdef public list words - cdef public list tags - cdef public list pos - cdef public list morphs - cdef public list lemmas - cdef public list sent_starts - cdef public list heads - cdef public list labels - cdef public dict orths - cdef public list ner - cdef public dict brackets - cdef public dict cats - cdef public dict links - - cdef readonly list cand_to_gold - cdef readonly list gold_to_cand - - -cdef class TokenAnnotation: - cdef public list ids - cdef public list words - cdef public list tags - cdef public list pos - cdef public list morphs - cdef public list lemmas - cdef public list heads - cdef public list deps - cdef public list entities - cdef public list sent_starts - cdef public dict brackets_by_start - - -cdef class DocAnnotation: - cdef public object cats - cdef public object links - - -cdef class Example: - cdef public object doc - cdef public TokenAnnotation token_annotation - cdef public DocAnnotation doc_annotation - cdef public object goldparse diff --git a/spacy/gold.pyx b/spacy/gold.pyx deleted file mode 100644 index 1e58f0635..000000000 --- a/spacy/gold.pyx +++ /dev/null @@ -1,1407 +0,0 @@ -# cython: profile=True -import re -import random -import numpy -import tempfile -import shutil -import itertools -from pathlib import Path -import srsly -import warnings - -from .syntax import nonproj -from .tokens import Doc, Span -from .errors import Errors, AlignmentError, Warnings -from .
import util - - -punct_re = re.compile(r"\W") - - -def tags_to_entities(tags): - entities = [] - start = None - for i, tag in enumerate(tags): - if tag is None: - continue - if tag.startswith("O"): - # TODO: We shouldn't be getting these malformed inputs. Fix this. - if start is not None: - start = None - continue - elif tag == "-": - continue - elif tag.startswith("I"): - if start is None: - raise ValueError(Errors.E067.format(tags=tags[:i + 1])) - continue - if tag.startswith("U"): - entities.append((tag[2:], i, i)) - elif tag.startswith("B"): - start = i - elif tag.startswith("L"): - entities.append((tag[2:], start, i)) - start = None - else: - raise ValueError(Errors.E068.format(tag=tag)) - return entities - - -def merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_cats = {} - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) - for b in brackets) - m_cats.update(cats) - i += len(ids) - return [(m_deps, (m_cats, m_brackets))] - - -def _normalize_for_alignment(tokens): - return [w.replace(" ", "").lower() for w in tokens] - - -def align(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - tokens_a = _normalize_for_alignment(tokens_a) - tokens_b = _normalize_for_alignment(tokens_b) - cost = 0 - a2b = numpy.empty(len(tokens_a), dtype="i") - b2a = numpy.empty(len(tokens_b), dtype="i") - a2b.fill(-1) - b2a.fill(-1) - a2b_multi = {} - b2a_multi = {} - i = 0 - j = 0 - offset_a = 0 - offset_b = 0 - while i < len(tokens_a) and j < len(tokens_b): - a = tokens_a[i][offset_a:] - b = tokens_b[j][offset_b:] - if a == b: - if offset_a == offset_b == 0: - a2b[i] = j - b2a[j] = i - elif offset_a == 0: - cost += 2 - a2b_multi[i] = j - elif offset_b == 0: - cost += 2 - b2a_multi[j] = i - offset_a = offset_b = 0 - i += 1 - j += 1 - elif a == "": - assert offset_a == 0 - cost += 1 - i += 1 - elif b == "": - assert offset_b == 0 - cost += 1 - j += 1 - elif b.startswith(a): - cost += 1 - if offset_a == 0: - a2b_multi[i] = j - i += 1 - offset_a = 0 - offset_b += len(a) - elif a.startswith(b): - cost += 1 - if offset_b == 0: - b2a_multi[j] = i - j += 1 - offset_b = 0 - offset_a += len(b) - else: - assert "".join(tokens_a) != "".join(tokens_b) - raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b)) - return cost, a2b, b2a, a2b_multi, b2a_multi - - -class GoldCorpus(object): - """An annotated corpus, using the JSON file format. 
Manages - annotations for tagging, dependency parsing and NER. - - DOCS: https://spacy.io/api/goldcorpus - """ - def __init__(self, train, dev, gold_preproc=False, limit=None): - """Create a GoldCorpus. - - train (str / Path): File or directory of training data. - dev (str / Path): File or directory of development data. - RETURNS (GoldCorpus): The newly created object. - """ - self.limit = limit - if isinstance(train, str) or isinstance(train, Path): - train = self.read_examples(self.walk_corpus(train)) - dev = self.read_examples(self.walk_corpus(dev)) - # Write temp directory with one doc per file, so we can shuffle and stream - self.tmp_dir = Path(tempfile.mkdtemp()) - self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) - self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) - - def __del__(self): - shutil.rmtree(self.tmp_dir) - - @staticmethod - def write_msgpack(directory, examples, limit=0): - if not directory.exists(): - directory.mkdir() - n = 0 - for i, example in enumerate(examples): - ex_dict = example.to_dict() - text = example.text - srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) - n += 1 - if limit and n >= limit: - break - - @staticmethod - def walk_corpus(path): - path = util.ensure_path(path) - if not path.is_dir(): - return [path] - paths = [path] - locs = [] - seen = set() - for path in paths: - if str(path) in seen: - continue - seen.add(str(path)) - if path.parts[-1].startswith("."): - continue - elif path.is_dir(): - paths.extend(path.iterdir()) - elif path.parts[-1].endswith((".json", ".jsonl")): - locs.append(path) - return locs - - @staticmethod - def read_examples(locs, limit=0): - """ Yield training examples """ - i = 0 - for loc in locs: - loc = util.ensure_path(loc) - file_name = loc.parts[-1] - if file_name.endswith("json"): - examples = read_json_file(loc) - elif file_name.endswith("jsonl"): - gold_tuples = srsly.read_jsonl(loc) - first_gold_tuple = next(gold_tuples) - gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) - # TODO: proper format checks with schemas - if isinstance(first_gold_tuple, dict): - if first_gold_tuple.get("paragraphs", None): - examples = read_json_object(gold_tuples) - elif first_gold_tuple.get("doc_annotation", None): - examples = [] - for ex_dict in gold_tuples: - doc = ex_dict.get("doc", None) - if doc is None: - doc = ex_dict.get("text", None) - if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)): - raise ValueError(Errors.E987.format(type=type(doc))) - examples.append(Example.from_dict(ex_dict, doc=doc)) - - elif file_name.endswith("msg"): - text, ex_dict = srsly.read_msgpack(loc) - examples = [Example.from_dict(ex_dict, doc=text)] - else: - supported = ("json", "jsonl", "msg") - raise ValueError(Errors.E124.format(path=loc, formats=supported)) - try: - for example in examples: - yield example - i += 1 - if limit and i >= limit: - return - except KeyError as e: - msg = "Missing key {}".format(e) - raise KeyError(Errors.E996.format(file=file_name, msg=msg)) - except UnboundLocalError as e: - msg = "Unexpected document structure" - raise ValueError(Errors.E996.format(file=file_name, msg=msg)) - - @property - def dev_examples(self): - locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_examples(locs, limit=self.limit) - - @property - def train_examples(self): - locs = (self.tmp_dir / "train").iterdir() - yield from self.read_examples(locs, limit=self.limit) - - def count_train(self): - """Returns count of words in train examples""" - n = 0 - i = 0 - for 
example in self.train_examples: - n += len(example.token_annotation.words) - if self.limit and i >= self.limit: - break - i += 1 - return n - - def train_dataset(self, nlp, gold_preproc=False, max_length=None, - noise_level=0.0, orth_variant_level=0.0, - ignore_misaligned=False): - locs = list((self.tmp_dir / 'train').iterdir()) - random.shuffle(locs) - train_examples = self.read_examples(locs, limit=self.limit) - gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc, - max_length=max_length, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - make_projective=True, - ignore_misaligned=ignore_misaligned) - yield from gold_examples - - def train_dataset_without_preprocessing(self, nlp, gold_preproc=False, - ignore_misaligned=False): - examples = self.iter_gold_docs(nlp, self.train_examples, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned) - yield from examples - - def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_gold_docs(nlp, self.dev_examples, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned) - yield from examples - - @classmethod - def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, - noise_level=0.0, orth_variant_level=0.0, - make_projective=False, ignore_misaligned=False): - """ Setting gold_preproc will result in creating a doc per sentence """ - for example in examples: - if gold_preproc: - split_examples = example.split_sents() - example_golds = [] - for split_example in split_examples: - split_example_docs = cls._make_docs(nlp, split_example, - gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - split_example_golds = cls._make_golds(split_example_docs, - vocab=nlp.vocab, make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - example_golds.extend(split_example_golds) - else: - example_docs = cls._make_docs(nlp, example, - gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - for ex in example_golds: - if ex.goldparse is not None: - if (not max_length) or len(ex.doc) < max_length: - yield ex - - @classmethod - def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0): - var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) - # gold_preproc is not used ?! 
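# Usage sketch for the corpus pipeline above (illustrative; assumes an `nlp`
# object and JSON training files on disk):
#
#     corpus = GoldCorpus("train.json", "dev.json", limit=1000)
#     for ex in corpus.train_dataset(nlp, gold_preproc=True, max_length=200):
#         assert ex.goldparse is not None   # attached by _make_golds
#
# With gold_preproc=True, iter_gold_docs runs split_sents() first, so each
# yielded Example wraps one gold-tokenized doc per sentence.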
- if example.text is not None: - var_text = add_noise(var_example.text, noise_level) - var_doc = nlp.make_doc(var_text) - var_example.doc = var_doc - else: - var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level)) - var_example.doc = var_doc - return [var_example] - - @classmethod - def _make_golds(cls, examples, vocab=None, make_projective=False, - ignore_misaligned=False): - filtered_examples = [] - for example in examples: - gold_parses = example.get_gold_parses(vocab=vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - assert len(gold_parses) == 1 - doc, gold = gold_parses[0] - if doc: - assert doc == example.doc - example.goldparse = gold - filtered_examples.append(example) - return filtered_examples - - -def make_orth_variants(nlp, example, orth_variant_level=0.0): - if random.random() >= orth_variant_level: - return example - if not example.token_annotation: - return example - raw = example.text - lower = False - if random.random() >= 0.5: - lower = True - if raw is not None: - raw = raw.lower() - ndsv = nlp.Defaults.single_orth_variants - ndpv = nlp.Defaults.paired_orth_variants - # modify words in paragraph_tuples - variant_example = Example(doc=raw) - token_annotation = example.token_annotation - words = token_annotation.words - tags = token_annotation.tags - if not words or not tags: - # add the unmodified annotation - token_dict = token_annotation.to_dict() - variant_example.set_token_annotation(**token_dict) - else: - if lower: - words = [w.lower() for w in words] - # single variants - punct_choices = [random.choice(x["variants"]) for x in ndsv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndsv)): - if tags[word_idx] in ndsv[punct_idx]["tags"] \ - and words[word_idx] in ndsv[punct_idx]["variants"]: - words[word_idx] = punct_choices[punct_idx] - # paired variants - punct_choices = [random.choice(x["variants"]) for x in ndpv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] \ - and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): - # backup option: random left vs. 
right from pair - pair_idx = random.choice([0, 1]) - # best option: rely on paired POS tags like `` / '' - if len(ndpv[punct_idx]["tags"]) == 2: - pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) - # next best option: rely on position in variants - # (may not be unambiguous, so order of variants matters) - else: - for pair in ndpv[punct_idx]["variants"]: - if words[word_idx] in pair: - pair_idx = pair.index(words[word_idx]) - words[word_idx] = punct_choices[punct_idx][pair_idx] - - token_dict = token_annotation.to_dict() - token_dict["words"] = words - token_dict["tags"] = tags - variant_example.set_token_annotation(**token_dict) - # modify raw to match variant_paragraph_tuples - if raw is not None: - variants = [] - for single_variants in ndsv: - variants.extend(single_variants["variants"]) - for paired_variants in ndpv: - variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) - # store variants in reverse length order to be able to prioritize - # longer matches (e.g., "---" before "--") - variants = sorted(variants, key=lambda x: len(x)) - variants.reverse() - variant_raw = "" - raw_idx = 0 - # add initial whitespace - while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): - variant_raw += raw[raw_idx] - raw_idx += 1 - for word in variant_example.token_annotation.words: - match_found = False - # skip whitespace words - if word.isspace(): - match_found = True - # add identical word - elif word not in variants and raw[raw_idx:].startswith(word): - variant_raw += word - raw_idx += len(word) - match_found = True - # add variant word - else: - for variant in variants: - if not match_found and \ - raw[raw_idx:].startswith(variant): - raw_idx += len(variant) - variant_raw += word - match_found = True - # something went wrong, abort - # (add a warning message?) - if not match_found: - return example - # add following whitespace - while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): - variant_raw += raw[raw_idx] - raw_idx += 1 - variant_example.doc = variant_raw - return variant_example - return variant_example - - -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return "".join(_corrupt(c, noise_level) for c in orig) - - -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c in [".", "'", "!", "?", ","]: - return "\n" - else: - return c.lower() - - -def read_json_object(json_corpus_section): - """Take a list of JSON-formatted documents (e.g. from an already loaded - training data file) and yield annotations in the GoldParse format. - - json_corpus_section (list): The data. - YIELDS (Example): The reformatted data - one training example per paragraph - """ - for json_doc in json_corpus_section: - examples = json_to_examples(json_doc) - for ex in examples: - yield ex - - -def json_to_examples(doc): - """Convert an item in the JSON-formatted training data to the format - used by GoldParse. - - doc (dict): One entry in the training data. 
- YIELDS (Example): The reformatted data - one training example per paragraph - """ - paragraphs = [] - for paragraph in doc["paragraphs"]: - example = Example(doc=paragraph.get("raw", None)) - words = [] - ids = [] - tags = [] - pos = [] - morphs = [] - lemmas = [] - heads = [] - labels = [] - ner = [] - sent_starts = [] - brackets = [] - for sent in paragraph["sentences"]: - sent_start_i = len(words) - for i, token in enumerate(sent["tokens"]): - words.append(token["orth"]) - ids.append(token.get('id', sent_start_i + i)) - tags.append(token.get('tag', "-")) - pos.append(token.get("pos", "")) - morphs.append(token.get("morph", "")) - lemmas.append(token.get("lemma", "")) - heads.append(token.get("head", 0) + sent_start_i + i) - labels.append(token.get("dep", "")) - # Ensure ROOT label is case-insensitive - if labels[-1].lower() == "root": - labels[-1] = "ROOT" - ner.append(token.get("ner", "-")) - if i == 0: - sent_starts.append(1) - else: - sent_starts.append(0) - if "brackets" in sent: - brackets.extend((b["first"] + sent_start_i, - b["last"] + sent_start_i, b["label"]) - for b in sent["brackets"]) - cats = {} - for cat in paragraph.get("cats", {}): - cats[cat["label"]] = cat["value"] - example.set_token_annotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=labels, entities=ner, sent_starts=sent_starts, - brackets=brackets) - example.set_doc_annotation(cats=cats) - yield example - - -def read_json_file(loc, docs_filter=None, limit=None): - loc = util.ensure_path(loc) - if loc.is_dir(): - for filename in loc.iterdir(): - yield from read_json_file(loc / filename, limit=limit) - else: - for doc in _json_iterate(loc): - if docs_filter is not None and not docs_filter(doc): - continue - for json_data in json_to_examples(doc): - yield json_data - - -def _json_iterate(loc): - # We should've made these files jsonl...But since we didn't, parse out - # the docs one-by-one to reduce memory usage. - # It's okay to read in the whole file -- just don't parse it into JSON. 
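# The scanner below is a minimal hand-rolled JSON tokenizer: it walks the
# raw bytes once, tracking square/curly bracket depth while skipping string
# contents and backslash escapes, and yields each top-level {...} object of
# the outer [...] array as soon as its closing brace is seen -- so only one
# document at a time is ever parsed into Python objects.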
- cdef bytes py_raw - loc = util.ensure_path(loc) - with loc.open("rb") as file_: - py_raw = file_.read() - cdef long file_length = len(py_raw) - if file_length > 2 ** 30: - warnings.warn(Warnings.W027.format(size=file_length)) - - raw = <char*>py_raw - cdef int square_depth = 0 - cdef int curly_depth = 0 - cdef int inside_string = 0 - cdef int escape = 0 - cdef long start = -1 - cdef char c - cdef char quote = ord('"') - cdef char backslash = ord("\\") - cdef char open_square = ord("[") - cdef char close_square = ord("]") - cdef char open_curly = ord("{") - cdef char close_curly = ord("}") - for i in range(file_length): - c = raw[i] - if escape: - escape = False - continue - if c == backslash: - escape = True - continue - if c == quote: - inside_string = not inside_string - continue - if inside_string: - continue - if c == open_square: - square_depth += 1 - elif c == close_square: - square_depth -= 1 - elif c == open_curly: - if square_depth == 1 and curly_depth == 0: - start = i - curly_depth += 1 - elif c == close_curly: - curly_depth -= 1 - if square_depth == 1 and curly_depth == 0: - py_str = py_raw[start : i + 1].decode("utf8") - try: - yield srsly.json_loads(py_str) - except Exception: - print(py_str) - raise - start = -1 - - -def iob_to_biluo(tags): - out = [] - tags = list(tags) - while tags: - out.extend(_consume_os(tags)) - out.extend(_consume_ent(tags)) - return out - - -def biluo_to_iob(tags): - out = [] - for tag in tags: - tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) - out.append(tag) - return out - - -def _consume_os(tags): - while tags and tags[0] == "O": - yield tags.pop(0) - - -def _consume_ent(tags): - if not tags: - return [] - tag = tags.pop(0) - target_in = "I" + tag[1:] - target_last = "L" + tag[1:] - length = 1 - while tags and tags[0] in {target_in, target_last}: - length += 1 - tags.pop(0) - label = tag[2:] - if length == 1: - if len(label) == 0: - raise ValueError(Errors.E177.format(tag=tag)) - return ["U-" + label] - else: - start = "B-" + label - end = "L-" + label - middle = [f"I-{label}" for _ in range(1, length - 1)] - return [start] + middle + [end] - - -cdef class TokenAnnotation: - def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None, - lemmas=None, heads=None, deps=None, entities=None, sent_starts=None, - brackets=None): - self.ids = ids if ids else [] - self.words = words if words else [] - self.tags = tags if tags else [] - self.pos = pos if pos else [] - self.morphs = morphs if morphs else [] - self.lemmas = lemmas if lemmas else [] - self.heads = heads if heads else [] - self.deps = deps if deps else [] - self.entities = entities if entities else [] - self.sent_starts = sent_starts if sent_starts else [] - self.brackets_by_start = {} - if brackets: - for b_start, b_end, b_label in brackets: - self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) - - @property - def brackets(self): - brackets = [] - for start, ends_labels in self.brackets_by_start.items(): - for end, label in ends_labels: - brackets.append((start, end, label)) - return brackets - - @classmethod - def from_dict(cls, token_dict): - return cls(ids=token_dict.get("ids", None), - words=token_dict.get("words", None), - tags=token_dict.get("tags", None), - pos=token_dict.get("pos", None), - morphs=token_dict.get("morphs", None), - lemmas=token_dict.get("lemmas", None), - heads=token_dict.get("heads", None), - deps=token_dict.get("deps", None), - entities=token_dict.get("entities", None), - sent_starts=token_dict.get("sent_starts", None), -
brackets=token_dict.get("brackets", None)) - - def to_dict(self): - return {"ids": self.ids, - "words": self.words, - "tags": self.tags, - "pos": self.pos, - "morphs": self.morphs, - "lemmas": self.lemmas, - "heads": self.heads, - "deps": self.deps, - "entities": self.entities, - "sent_starts": self.sent_starts, - "brackets": self.brackets} - - def get_id(self, i): - return self.ids[i] if i < len(self.ids) else i - - def get_word(self, i): - return self.words[i] if i < len(self.words) else "" - - def get_tag(self, i): - return self.tags[i] if i < len(self.tags) else "-" - - def get_pos(self, i): - return self.pos[i] if i < len(self.pos) else "" - - def get_morph(self, i): - return self.morphs[i] if i < len(self.morphs) else "" - - def get_lemma(self, i): - return self.lemmas[i] if i < len(self.lemmas) else "" - - def get_head(self, i): - return self.heads[i] if i < len(self.heads) else i - - def get_dep(self, i): - return self.deps[i] if i < len(self.deps) else "" - - def get_entity(self, i): - return self.entities[i] if i < len(self.entities) else "-" - - def get_sent_start(self, i): - return self.sent_starts[i] if i < len(self.sent_starts) else None - - def __str__(self): - return str(self.to_dict()) - - def __repr__(self): - return self.__str__() - - -cdef class DocAnnotation: - def __init__(self, cats=None, links=None): - self.cats = cats if cats else {} - self.links = links if links else {} - - @classmethod - def from_dict(cls, doc_dict): - return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None)) - - def to_dict(self): - return {"cats": self.cats, "links": self.links} - - def __str__(self): - return str(self.to_dict()) - - def __repr__(self): - return self.__str__() - - -cdef class Example: - def __init__(self, doc_annotation=None, token_annotation=None, doc=None, - goldparse=None): - """ Doc can either be text, or an actual Doc """ - self.doc = doc - self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() - self.token_annotation = token_annotation if token_annotation else TokenAnnotation() - self.goldparse = goldparse - - @classmethod - def from_gold(cls, goldparse, doc=None): - doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) - token_annotation = goldparse.get_token_annotation() - return cls(doc_annotation, token_annotation, doc) - - @classmethod - def from_dict(cls, example_dict, doc=None): - token_dict = example_dict.get("token_annotation", {}) - token_annotation = TokenAnnotation.from_dict(token_dict) - doc_dict = example_dict.get("doc_annotation", {}) - doc_annotation = DocAnnotation.from_dict(doc_dict) - return cls(doc_annotation, token_annotation, doc) - - def to_dict(self): - """ Note that this method does NOT export the doc, only the annotations ! 
""" - token_dict = self.token_annotation.to_dict() - doc_dict = self.doc_annotation.to_dict() - return {"token_annotation": token_dict, "doc_annotation": doc_dict} - - @property - def text(self): - if self.doc is None: - return None - if isinstance(self.doc, Doc): - return self.doc.text - return self.doc - - @property - def gold(self): - if self.goldparse is None: - doc, gold = self.get_gold_parses()[0] - self.goldparse = gold - return self.goldparse - - def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, - morphs=None, lemmas=None, heads=None, deps=None, - entities=None, sent_starts=None, brackets=None): - self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=deps, entities=entities, - sent_starts=sent_starts, brackets=brackets) - - def set_doc_annotation(self, cats=None, links=None): - if cats: - self.doc_annotation.cats = cats - if links: - self.doc_annotation.links = links - - def split_sents(self): - """ Split the token annotations into multiple Examples based on - sent_starts and return a list of the new Examples""" - if not self.token_annotation.words: - return [self] - s_example = Example(doc=None, doc_annotation=self.doc_annotation) - s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] - s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] - s_brackets = [] - sent_start_i = 0 - cdef TokenAnnotation t = self.token_annotation - split_examples = [] - cdef int b_start, b_end - cdef unicode b_label - for i in range(len(t.words)): - if i > 0 and t.sent_starts[i] == 1: - s_example.set_token_annotation(ids=s_ids, - words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, - lemmas=s_lemmas, heads=s_heads, deps=s_deps, - entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) - split_examples.append(s_example) - s_example = Example(doc=None, doc_annotation=self.doc_annotation) - s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] - s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] - s_sent_starts, s_brackets = [], [] - sent_start_i = i - s_ids.append(t.get_id(i)) - s_words.append(t.get_word(i)) - s_tags.append(t.get_tag(i)) - s_pos.append(t.get_pos(i)) - s_morphs.append(t.get_morph(i)) - s_lemmas.append(t.get_lemma(i)) - s_heads.append(t.get_head(i) - sent_start_i) - s_deps.append(t.get_dep(i)) - s_ents.append(t.get_entity(i)) - s_sent_starts.append(t.get_sent_start(i)) - for b_end, b_label in t.brackets_by_start.get(i, []): - s_brackets.append( - (i - sent_start_i, b_end - sent_start_i, b_label) - ) - i += 1 - s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, - pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, - deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) - split_examples.append(s_example) - return split_examples - - - def get_gold_parses(self, merge=True, vocab=None, make_projective=False, - ignore_misaligned=False): - """Return a list of (doc, GoldParse) objects. 
- If merge is set to True, keep all Token annotations as one big list.""" - d = self.doc_annotation - # merge == do not modify Example - if merge: - t = self.token_annotation - doc = self.doc - if doc is None or not isinstance(doc, Doc): - if not vocab: - raise ValueError(Errors.E998) - doc = Doc(vocab, words=t.words) - try: - gp = GoldParse.from_annotation(doc, d, t, - make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - return [(doc, gp)] - # not merging: one GoldParse per sentence, defining docs with the words - # from each sentence - else: - parses = [] - split_examples = self.split_sents() - for split_example in split_examples: - if not vocab: - raise ValueError(Errors.E998) - split_doc = Doc(vocab, words=split_example.token_annotation.words) - try: - gp = GoldParse.from_annotation(split_doc, d, - split_example.token_annotation, - make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - if gp is not None: - parses.append((split_doc, gp)) - return parses - - @classmethod - def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): - """ - Return a list of Example objects, from a variety of input formats. - make_doc needs to be provided when the examples contain text strings and keep_raw_text=False - """ - if isinstance(examples, Example): - return [examples] - if isinstance(examples, tuple): - examples = [examples] - converted_examples = [] - for ex in examples: - if isinstance(ex, Example): - converted_examples.append(ex) - # convert string to Doc to Example - elif isinstance(ex, str): - if keep_raw_text: - converted_examples.append(Example(doc=ex)) - else: - doc = make_doc(ex) - converted_examples.append(Example(doc=doc)) - # convert Doc to Example - elif isinstance(ex, Doc): - converted_examples.append(Example(doc=ex)) - # convert tuples to Example - elif isinstance(ex, tuple) and len(ex) == 2: - doc, gold = ex - gold_dict = {} - # convert string to Doc - if isinstance(doc, str) and not keep_raw_text: - doc = make_doc(doc) - # convert dict to GoldParse - if isinstance(gold, dict): - gold_dict = gold - if doc is not None or gold.get("words", None) is not None: - gold = GoldParse(doc, **gold) - else: - gold = None - if gold is not None: - converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) - else: - raise ValueError(Errors.E999.format(gold_dict=gold_dict)) - else: - converted_examples.append(ex) - return converted_examples - - -cdef class GoldParse: - """Collection for training annotations. 
- - DOCS: https://spacy.io/api/goldparse - """ - @classmethod - def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): - return cls(doc, words=token_annotation.words, - tags=token_annotation.tags, - pos=token_annotation.pos, - morphs=token_annotation.morphs, - lemmas=token_annotation.lemmas, - heads=token_annotation.heads, - deps=token_annotation.deps, - entities=token_annotation.entities, - sent_starts=token_annotation.sent_starts, - cats=doc_annotation.cats, - links=doc_annotation.links, - make_projective=make_projective) - - def get_token_annotation(self): - ids = None - if self.words: - ids = list(range(len(self.words))) - - return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, - pos=self.pos, morphs=self.morphs, - lemmas=self.lemmas, heads=self.heads, - deps=self.labels, entities=self.ner, - sent_starts=self.sent_starts) - - def __init__(self, doc, words=None, tags=None, pos=None, morphs=None, - lemmas=None, heads=None, deps=None, entities=None, - sent_starts=None, make_projective=False, cats=None, - links=None): - """Create a GoldParse. The fields will not be initialized if len(doc) is zero. - - doc (Doc): The document the annotations refer to. - words (iterable): A sequence of unicode word strings. - tags (iterable): A sequence of strings, representing tag annotations. - pos (iterable): A sequence of strings, representing UPOS annotations. - morphs (iterable): A sequence of strings, representing morph - annotations. - lemmas (iterable): A sequence of strings, representing lemma - annotations. - heads (iterable): A sequence of integers, representing syntactic - head offsets. - deps (iterable): A sequence of strings, representing the syntactic - relation types. - entities (iterable): A sequence of named entity annotations, either as - BILUO tag strings, or as `(start_char, end_char, label)` tuples, - representing the entity positions. - sent_starts (iterable): A sequence of sentence position tags, 1 for - the first word in a sentence, 0 for all others. - cats (dict): Labels for text classification. Each key in the dictionary - may be a string or an int, or a `(start_char, end_char, label)` - tuple, indicating that the label is applied to only part of the - document (usually a sentence). Unlike entity annotations, label - annotations can overlap, i.e. a single word can be covered by - multiple labelled spans. The TextCategorizer component expects - true examples of a label to have the value 1.0, and negative - examples of a label to have the value 0.0. Labels not in the - dictionary are treated as missing - the gradient for those labels - will be zero. - links (dict): A dict with `(start_char, end_char)` keys, - and the values being dicts with kb_id:value entries, - representing the external IDs in a knowledge base (KB) - mapped to either 1.0 or 0.0, indicating positive and - negative examples respectively. - RETURNS (GoldParse): The newly constructed object. 
- """ - self.mem = Pool() - self.loss = 0 - self.length = len(doc) - - self.cats = {} if cats is None else dict(cats) - self.links = {} if links is None else dict(links) - - # temporary doc for aligning entity annotation - entdoc = None - - # avoid allocating memory if the doc does not contain any tokens - if self.length == 0: - self.words = [] - self.tags = [] - self.heads = [] - self.labels = [] - self.ner = [] - self.morphs = [] - # set a minimal orig so that the scorer can score an empty doc - self.orig = TokenAnnotation(ids=[]) - else: - if not words: - words = [token.text for token in doc] - if not tags: - tags = [None for _ in words] - if not pos: - pos = [None for _ in words] - if not morphs: - morphs = [None for _ in words] - if not lemmas: - lemmas = [None for _ in words] - if not heads: - heads = [None for _ in words] - if not deps: - deps = [None for _ in words] - if not sent_starts: - sent_starts = [None for _ in words] - if entities is None: - entities = ["-" for _ in words] - elif len(entities) == 0: - entities = ["O" for _ in words] - else: - # Translate the None values to '-', to make processing easier. - # See Issue #2603 - entities = [(ent if ent is not None else "-") for ent in entities] - if not isinstance(entities[0], str): - # Assume we have entities specified by character offset. - # Create a temporary Doc corresponding to provided words - # (to preserve gold tokenization) and text (to preserve - # character offsets). - entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) - entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) - entdoc_entities = biluo_tags_from_offsets(entdoc, entities) - # There may be some additional whitespace tokens in the - # temporary doc, so check that the annotations align with - # the provided words while building a list of BILUO labels. - entities = [] - words_offset = 0 - for i in range(len(entdoc_words)): - if words[i + words_offset] == entdoc_words[i]: - entities.append(entdoc_entities[i]) - else: - words_offset -= 1 - if len(entities) != len(words): - warnings.warn(Warnings.W029.format(text=doc.text)) - entities = ["-" for _ in words] - - # These are filled by the tagger/parser/entity recogniser - self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t)) - self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition)) - - self.words = [None] * len(doc) - self.tags = [None] * len(doc) - self.pos = [None] * len(doc) - self.morphs = [None] * len(doc) - self.lemmas = [None] * len(doc) - self.heads = [None] * len(doc) - self.labels = [None] * len(doc) - self.ner = [None] * len(doc) - self.sent_starts = [None] * len(doc) - - # This needs to be done before we align the words - if make_projective and any(heads) and any(deps) : - heads, deps = nonproj.projectivize(heads, deps) - - # Do many-to-one alignment for misaligned tokens. - # If we over-segment, we'll have one gold word that covers a sequence - # of predicted words - # If we under-segment, we'll have one predicted word that covers a - # sequence of gold words. - # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. That's many-to-many -- we don't do that - # except for NER spans where the start and end can be aligned.
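# Worked example for the alignment tables computed below (illustrative):
# aligning candidate ["New", "York"] to gold ["New York"] leaves both i2j
# entries at -1 and records i2j_multi == {0: 0, 1: 0} (two candidate tokens
# covering one gold token); the mirror case, candidate ["NewYork"] vs. gold
# ["New", "York"], yields j2i_multi == {0: 0, 1: 0} instead.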
- cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) - - self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] - self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - - self.orig = TokenAnnotation(ids=list(range(len(words))), - words=words, tags=tags, pos=pos, morphs=morphs, - lemmas=lemmas, heads=heads, deps=deps, entities=entities, - sent_starts=sent_starts, brackets=[]) - - for i, gold_i in enumerate(self.cand_to_gold): - if doc[i].text.isspace(): - self.words[i] = doc[i].text - self.tags[i] = "_SP" - self.pos[i] = "SPACE" - self.morphs[i] = None - self.lemmas[i] = None - self.heads[i] = None - self.labels[i] = None - self.ner[i] = None - self.sent_starts[i] = 0 - if gold_i is None: - if i in i2j_multi: - self.words[i] = words[i2j_multi[i]] - self.tags[i] = tags[i2j_multi[i]] - self.pos[i] = pos[i2j_multi[i]] - self.morphs[i] = morphs[i2j_multi[i]] - self.lemmas[i] = lemmas[i2j_multi[i]] - self.sent_starts[i] = sent_starts[i2j_multi[i]] - is_last = i2j_multi[i] != i2j_multi.get(i+1) - # Set next word in multi-token span as head, until last - if not is_last: - self.heads[i] = i+1 - self.labels[i] = "subtok" - else: - head_i = heads[i2j_multi[i]] - if head_i: - self.heads[i] = self.gold_to_cand[head_i] - self.labels[i] = deps[i2j_multi[i]] - ner_tag = entities[i2j_multi[i]] - # Assign O/- for many-to-one O/- NER tags - if ner_tag in ("O", "-"): - self.ner[i] = ner_tag - else: - self.words[i] = words[gold_i] - self.tags[i] = tags[gold_i] - self.pos[i] = pos[gold_i] - self.morphs[i] = morphs[gold_i] - self.lemmas[i] = lemmas[gold_i] - self.sent_starts[i] = sent_starts[gold_i] - if heads[gold_i] is None: - self.heads[i] = None - else: - self.heads[i] = self.gold_to_cand[heads[gold_i]] - self.labels[i] = deps[gold_i] - self.ner[i] = entities[gold_i] - # Assign O/- for one-to-many O/- NER tags - for j, cand_j in enumerate(self.gold_to_cand): - if cand_j is None: - if j in j2i_multi: - i = j2i_multi[j] - ner_tag = entities[j] - if ner_tag in ("O", "-"): - self.ner[i] = ner_tag - - # If there is entity annotation and some tokens remain unaligned, - # align all entities at the character level to account for all - # possible token misalignments within the entity spans - if any([e not in ("O", "-") for e in entities]) and None in self.ner: - # If the temporary entdoc wasn't created above, initialize it - if not entdoc: - entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) - entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) - # Get offsets based on gold words and BILUO entities - entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) - aligned_offsets = [] - aligned_spans = [] - # Filter offsets to identify those that align with doc tokens - for offset in entdoc_offsets: - span = doc.char_span(offset[0], offset[1]) - if span and not span.text.isspace(): - aligned_offsets.append(offset) - aligned_spans.append(span) - # Convert back to BILUO for doc tokens and assign NER for all - # aligned spans - biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) - for span in aligned_spans: - for i in range(span.start, span.end): - self.ner[i] = biluo_tags[i] - - # Prevent whitespace that isn't within entities from being tagged as - # an entity. 
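# e.g. a "_SP" token with an "O" neighbour on either side is forced to "O"
# below, while whitespace strictly inside an entity span (both neighbours
# tagged B-/I-/L-) keeps its value and can stay part of the entity.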
- for i in range(len(self.ner)): - if self.tags[i] == "_SP": - prev_ner = self.ner[i-1] if i >= 1 else None - next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None - if prev_ner == "O" or next_ner == "O": - self.ner[i] = "O" - - cycle = nonproj.contains_cycle(self.heads) - if cycle is not None: - raise ValueError(Errors.E069.format(cycle=cycle, - cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]), - doc_tokens=" ".join(words[:50]))) - - def __len__(self): - """Get the number of gold-standard tokens. - - RETURNS (int): The number of gold-standard tokens. - """ - return self.length - - @property - def is_projective(self): - """Whether the provided syntactic annotations form a projective - dependency tree. - """ - return not nonproj.is_nonproj_tree(self.heads) - - -def docs_to_json(docs, id=0, ner_missing_tag="O"): - """Convert a list of Doc objects into the JSON-serializable format used by - the spacy train command. - - docs (iterable / Doc): The Doc object(s) to convert. - id (int): Id for the JSON. - RETURNS (dict): The data in spaCy's JSON format - - each input doc will be treated as a paragraph in the output doc - """ - if isinstance(docs, Doc): - docs = [docs] - json_doc = {"id": id, "paragraphs": []} - for i, doc in enumerate(docs): - json_para = {'raw': doc.text, "sentences": [], "cats": []} - for cat, val in doc.cats.items(): - json_cat = {"label": cat, "value": val} - json_para["cats"].append(json_cat) - ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) - for j, sent in enumerate(doc.sents): - json_sent = {"tokens": [], "brackets": []} - for token in sent: - json_token = {"id": token.i, "orth": token.text} - if doc.is_tagged: - json_token["tag"] = token.tag_ - json_token["pos"] = token.pos_ - json_token["morph"] = token.morph_ - json_token["lemma"] = token.lemma_ - if doc.is_parsed: - json_token["head"] = token.head.i-token.i - json_token["dep"] = token.dep_ - json_token["ner"] = biluo_tags[token.i] - json_sent["tokens"].append(json_token) - json_para["sentences"].append(json_sent) - json_doc["paragraphs"].append(json_para) - return json_doc - - -def biluo_tags_from_offsets(doc, entities, missing="O"): - """Encode labelled spans into per-token tags, using the - Begin/In/Last/Unit/Out scheme (BILUO). - - doc (Doc): The document that the entity offsets refer to. The output tags - will refer to the token boundaries within the document. - entities (iterable): A sequence of `(start, end, label)` triples. `start` - and `end` should be character-offset integers denoting the slice into - the original string. - RETURNS (list): A list of unicode strings, describing the tags. Each tag - string will be of the form either "", "O" or "{action}-{label}", where - action is one of "B", "I", "L", "U". The string "-" is used where the - entity offsets don't align with the tokenization in the `Doc` object. - The training algorithm will view these as missing values. "O" denotes a - non-entity token. "B" denotes the beginning of a multi-token entity, - "I" the inside of an entity of three or more tokens, and "L" the end - of an entity of two or more tokens. "U" denotes a single-token entity. - - EXAMPLE: - >>> text = 'I like London.' 
- >>> entities = [(len('I like '), len('I like London'), 'LOC')] - >>> doc = nlp.tokenizer(text) - >>> tags = biluo_tags_from_offsets(doc, entities) - >>> assert tags == ["O", "O", 'U-LOC', "O"] - """ - # Ensure no overlapping entity labels exist - tokens_in_ents = {} - - starts = {token.idx: token.i for token in doc} - ends = {token.idx + len(token): token.i for token in doc} - biluo = ["-" for _ in doc] - # Handle entity cases - for start_char, end_char, label in entities: - for token_index in range(start_char, end_char): - if token_index in tokens_in_ents.keys(): - raise ValueError(Errors.E103.format( - span1=(tokens_in_ents[token_index][0], - tokens_in_ents[token_index][1], - tokens_in_ents[token_index][2]), - span2=(start_char, end_char, label))) - tokens_in_ents[token_index] = (start_char, end_char, label) - - start_token = starts.get(start_char) - end_token = ends.get(end_char) - # Only interested if the tokenization is correct - if start_token is not None and end_token is not None: - if start_token == end_token: - biluo[start_token] = f"U-{label}" - else: - biluo[start_token] = f"B-{label}" - for i in range(start_token+1, end_token): - biluo[i] = f"I-{label}" - biluo[end_token] = f"L-{label}" - # Now distinguish the O cases from ones where we miss the tokenization - entity_chars = set() - for start_char, end_char, label in entities: - for i in range(start_char, end_char): - entity_chars.add(i) - for token in doc: - for i in range(token.idx, token.idx + len(token)): - if i in entity_chars: - break - else: - biluo[token.i] = missing - if "-" in biluo: - ent_str = str(entities) - warnings.warn(Warnings.W030.format( - text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text, - entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str - )) - return biluo - - -def spans_from_biluo_tags(doc, tags): - """Encode per-token tags following the BILUO scheme into Span object, e.g. - to overwrite the doc.ents. - - doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or - "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of Span objects. - """ - token_offsets = tags_to_entities(tags) - spans = [] - for label, start_idx, end_idx in token_offsets: - span = Span(doc, start_idx, end_idx + 1, label=label) - spans.append(span) - return spans - - -def offsets_from_biluo_tags(doc, tags): - """Encode per-token tags following the BILUO scheme into entity offsets. - - doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or - "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of `(start, end, label)` triples. `start` and - `end` will be character-offset integers denoting the slice into the - original string. 
- """ - spans = spans_from_biluo_tags(doc, tags) - return [(span.start_char, span.end_char, span.label_) for span in spans] - - -def is_punct_label(label): - return label == "P" or label.lower() == "punct" From 866179350be1e86d0f56463a52976007fb07e63d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:11:13 +0200 Subject: [PATCH 12/56] Fix import --- spacy/_gold/example.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py index db9e10093..7528c360e 100644 --- a/spacy/_gold/example.py +++ b/spacy/_gold/example.py @@ -1,5 +1,6 @@ from .annotation import TokenAnnotation, DocAnnotation -from .gold_parse import GoldParse +# We're hoping to kill this GoldParse dependency but for now match semantics. +from ..syntax.gold_parse import GoldParse class Example: From 0f9b4bbfea2a7eb9d79797108d3e1e776b0f4a25 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:12:52 +0200 Subject: [PATCH 13/56] Fix imports --- spacy/_gold/example.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py index 7528c360e..969ba0374 100644 --- a/spacy/_gold/example.py +++ b/spacy/_gold/example.py @@ -1,4 +1,6 @@ from .annotation import TokenAnnotation, DocAnnotation +from ..errors import Errors, AlignmentError +from ..tokens import Doc # We're hoping to kill this GoldParse dependency but for now match semantics. from ..syntax.gold_parse import GoldParse From 17533a92863ffc8d7bf34155293e3f74018fe096 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:13:07 +0200 Subject: [PATCH 14/56] Format --- spacy/_gold/augment.py | 18 ++++--- spacy/_gold/example.py | 107 +++++++++++++++++++++++++++++------------ 2 files changed, 87 insertions(+), 38 deletions(-) diff --git a/spacy/_gold/augment.py b/spacy/_gold/augment.py index 02c812825..1fffe6187 100644 --- a/spacy/_gold/augment.py +++ b/spacy/_gold/augment.py @@ -32,15 +32,18 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): punct_choices = [random.choice(x["variants"]) for x in ndsv] for word_idx in range(len(words)): for punct_idx in range(len(ndsv)): - if tags[word_idx] in ndsv[punct_idx]["tags"] \ - and words[word_idx] in ndsv[punct_idx]["variants"]: + if ( + tags[word_idx] in ndsv[punct_idx]["tags"] + and words[word_idx] in ndsv[punct_idx]["variants"] + ): words[word_idx] = punct_choices[punct_idx] # paired variants punct_choices = [random.choice(x["variants"]) for x in ndpv] for word_idx in range(len(words)): for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] \ - and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + if tags[word_idx] in ndpv[punct_idx]["tags"] and words[ + word_idx + ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): # backup option: random left vs. 
right from pair pair_idx = random.choice([0, 1]) # best option: rely on paired POS tags like `` / '' @@ -64,7 +67,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): for single_variants in ndsv: variants.extend(single_variants["variants"]) for paired_variants in ndpv: - variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) + variants.extend( + list(itertools.chain.from_iterable(paired_variants["variants"])) + ) # store variants in reverse length order to be able to prioritize # longer matches (e.g., "---" before "--") variants = sorted(variants, key=lambda x: len(x)) @@ -88,8 +93,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): # add variant word else: for variant in variants: - if not match_found and \ - raw[raw_idx:].startswith(variant): + if not match_found and raw[raw_idx:].startswith(variant): raw_idx += len(variant) variant_raw += word match_found = True diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py index 969ba0374..c637c5540 100644 --- a/spacy/_gold/example.py +++ b/spacy/_gold/example.py @@ -1,17 +1,21 @@ from .annotation import TokenAnnotation, DocAnnotation from ..errors import Errors, AlignmentError from ..tokens import Doc + # We're hoping to kill this GoldParse dependency but for now match semantics. from ..syntax.gold_parse import GoldParse class Example: - def __init__(self, doc_annotation=None, token_annotation=None, doc=None, - goldparse=None): + def __init__( + self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None + ): """ Doc can either be text, or an actual Doc """ self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() - self.token_annotation = token_annotation if token_annotation else TokenAnnotation() + self.token_annotation = ( + token_annotation if token_annotation else TokenAnnotation() + ) self.goldparse = goldparse @classmethod @@ -49,13 +53,33 @@ class Example: self.goldparse = gold return self.goldparse - def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, - morphs=None, lemmas=None, heads=None, deps=None, - entities=None, sent_starts=None, brackets=None): - self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=deps, entities=entities, - sent_starts=sent_starts, brackets=brackets) + def set_token_annotation( + self, + ids=None, + words=None, + tags=None, + pos=None, + morphs=None, + lemmas=None, + heads=None, + deps=None, + entities=None, + sent_starts=None, + brackets=None, + ): + self.token_annotation = TokenAnnotation( + ids=ids, + words=words, + tags=tags, + pos=pos, + morphs=morphs, + lemmas=lemmas, + heads=heads, + deps=deps, + entities=entities, + sent_starts=sent_starts, + brackets=brackets, + ) def set_doc_annotation(self, cats=None, links=None): if cats: @@ -77,11 +101,19 @@ class Example: split_examples = [] for i in range(len(t.words)): if i > 0 and t.sent_starts[i] == 1: - s_example.set_token_annotation(ids=s_ids, - words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, - lemmas=s_lemmas, heads=s_heads, deps=s_deps, - entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) + s_example.set_token_annotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ) split_examples.append(s_example) s_example = Example(doc=None, doc_annotation=self.doc_annotation) 
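# (Context for the hunk being reformatted here: on each flush, split_sents
# re-bases head indices via t.get_head(i) - sent_start_i, so every
# per-sentence Example carries a self-contained, sentence-relative
# dependency tree; bracket offsets are shifted the same way.)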
s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] @@ -99,20 +131,27 @@ class Example: s_ents.append(t.get_entity(i)) s_sent_starts.append(t.get_sent_start(i)) for b_end, b_label in t.brackets_by_start.get(i, []): - s_brackets.append( - (i - sent_start_i, b_end - sent_start_i, b_label) - ) + s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label)) i += 1 - s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, - pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, - deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) + s_example.set_token_annotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ) split_examples.append(s_example) return split_examples - - def get_gold_parses(self, merge=True, vocab=None, make_projective=False, - ignore_misaligned=False): + def get_gold_parses( + self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False + ): """Return a list of (doc, GoldParse) objects. If merge is set to True, keep all Token annotations as one big list.""" d = self.doc_annotation @@ -125,8 +164,9 @@ class Example: raise ValueError(Errors.E998) doc = Doc(vocab, words=t.words) try: - gp = GoldParse.from_annotation(doc, d, t, - make_projective=make_projective) + gp = GoldParse.from_annotation( + doc, d, t, make_projective=make_projective + ) except AlignmentError: if ignore_misaligned: gp = None @@ -143,9 +183,12 @@ class Example: raise ValueError(Errors.E998) split_doc = Doc(vocab, words=split_example.token_annotation.words) try: - gp = GoldParse.from_annotation(split_doc, d, - split_example.token_annotation, - make_projective=make_projective) + gp = GoldParse.from_annotation( + split_doc, + d, + split_example.token_annotation, + make_projective=make_projective, + ) except AlignmentError: if ignore_misaligned: gp = None @@ -194,7 +237,9 @@ class Example: else: gold = None if gold is not None: - converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) + converted_examples.append( + Example.from_gold(goldparse=gold, doc=doc) + ) else: raise ValueError(Errors.E999.format(gold_dict=gold_dict)) else: From 7f135736f4528a7b9551dc49d0457e95dcc42deb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:28:52 +0200 Subject: [PATCH 15/56] Fix imports --- spacy/_gold/align.py | 2 +- spacy/_gold/augment.py | 2 +- spacy/_gold/corpus.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/_gold/align.py b/spacy/_gold/align.py index 7703232b2..ac2700c1f 100644 --- a/spacy/_gold/align.py +++ b/spacy/_gold/align.py @@ -1,5 +1,5 @@ import numpy -from .errors import Errors, AlignmentError +from ..errors import Errors, AlignmentError def align(tokens_a, tokens_b): diff --git a/spacy/_gold/augment.py b/spacy/_gold/augment.py index 1fffe6187..656308214 100644 --- a/spacy/_gold/augment.py +++ b/spacy/_gold/augment.py @@ -1,6 +1,6 @@ import random import itertools -from .exmaple import Example +from .example import Example def make_orth_variants(nlp, example, orth_variant_level=0.0): diff --git a/spacy/_gold/corpus.py b/spacy/_gold/corpus.py index 2fdfd8d2a..b0b454745 100644 --- a/spacy/_gold/corpus.py +++ b/spacy/_gold/corpus.py @@ -7,9 +7,9 @@ import itertools from ..tokens import Doc from .. 
import util from ..errors import Errors -from .gold_utils import read_json_file, read_json_object +from .gold_io import read_json_file, read_json_object from .augment import make_orth_variants, add_noise -from .exmaple import Example +from .example import Example class GoldCorpus(object): From 74204116a3e83708589a253a120c5941d9b6863e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:29:32 +0200 Subject: [PATCH 16/56] Rename _gold -> gold --- spacy/{_gold => gold}/align.py | 0 spacy/{_gold => gold}/annotation.py | 0 spacy/{_gold => gold}/augment.py | 0 spacy/{_gold => gold}/corpus.py | 0 spacy/{_gold => gold}/example.py | 0 spacy/{_gold => gold}/gold_io.pyx | 0 spacy/{_gold => gold}/iob_utils.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename spacy/{_gold => gold}/align.py (100%) rename spacy/{_gold => gold}/annotation.py (100%) rename spacy/{_gold => gold}/augment.py (100%) rename spacy/{_gold => gold}/corpus.py (100%) rename spacy/{_gold => gold}/example.py (100%) rename spacy/{_gold => gold}/gold_io.pyx (100%) rename spacy/{_gold => gold}/iob_utils.py (100%) diff --git a/spacy/_gold/align.py b/spacy/gold/align.py similarity index 100% rename from spacy/_gold/align.py rename to spacy/gold/align.py diff --git a/spacy/_gold/annotation.py b/spacy/gold/annotation.py similarity index 100% rename from spacy/_gold/annotation.py rename to spacy/gold/annotation.py diff --git a/spacy/_gold/augment.py b/spacy/gold/augment.py similarity index 100% rename from spacy/_gold/augment.py rename to spacy/gold/augment.py diff --git a/spacy/_gold/corpus.py b/spacy/gold/corpus.py similarity index 100% rename from spacy/_gold/corpus.py rename to spacy/gold/corpus.py diff --git a/spacy/_gold/example.py b/spacy/gold/example.py similarity index 100% rename from spacy/_gold/example.py rename to spacy/gold/example.py diff --git a/spacy/_gold/gold_io.pyx b/spacy/gold/gold_io.pyx similarity index 100% rename from spacy/_gold/gold_io.pyx rename to spacy/gold/gold_io.pyx diff --git a/spacy/_gold/iob_utils.py b/spacy/gold/iob_utils.py similarity index 100% rename from spacy/_gold/iob_utils.py rename to spacy/gold/iob_utils.py From 53b00991fd844892e3efa6090426a99babf91e9d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:36:46 +0200 Subject: [PATCH 17/56] Fix imports --- setup.py | 3 ++- spacy/syntax/_beam_utils.pyx | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d16615f5f..864a4036a 100755 --- a/setup.py +++ b/setup.py @@ -35,13 +35,14 @@ MOD_NAMES = [ "spacy.syntax.stateclass", "spacy.syntax._state", "spacy.tokenizer", + "spacy.syntax.gold_parse", "spacy.syntax.nn_parser", "spacy.syntax._parser_model", "spacy.syntax._beam_utils", "spacy.syntax.nonproj", "spacy.syntax.transition_system", "spacy.syntax.arc_eager", - "spacy.gold", + "spacy.gold.gold_io", "spacy.tokens.doc", "spacy.tokens.span", "spacy.tokens.token", diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 03702e54e..46bff1af9 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -9,7 +9,7 @@ import numpy from ..typedefs cimport hash_t, class_t from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParse +from .gold_parse cimport GoldParse from .stateclass cimport StateC, StateClass from ..errors import Errors From 6e87ca1f452b3a12aa6100d38a4dd0e1bf8bba4b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:36:58 +0200 Subject: [PATCH 18/56] Fix imports --- 
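
The import fixes in patches 15 through 18 are easier to follow with the resulting package layout in mind. A rough sketch, using only paths that appear in the diffs (comments added for orientation):

    spacy/
        errors.py            # Errors, AlignmentError: hence "from ..errors"
        syntax/
            gold_parse.pyx   # GoldParse lives here, compiled via setup.py
            _beam_utils.pyx  # "from .gold_parse cimport GoldParse"
        gold/                # renamed from _gold in patch 16
            align.py         # "from ..errors import Errors, AlignmentError"
            augment.py       # "from .example import Example" (same package)
            corpus.py        # "from .gold_io import read_json_file, ..."
            gold_io.pyx      # listed as "spacy.gold.gold_io" in MOD_NAMES
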
spacy/syntax/_parser_model.pyx | 2 +- spacy/syntax/gold_parse.pyx | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 60d22a1ab..7a4eccfc4 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -16,7 +16,7 @@ from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop from ..typedefs cimport weight_t, class_t, hash_t from ..tokens.doc cimport Doc -from ..gold cimport GoldParse +from .gold_parse cimport GoldParse from .stateclass cimport StateClass from .transition_system cimport Transition diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index 59e8f4bbb..df4059a21 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -10,11 +10,12 @@ import srsly import warnings from .. import util -from ..syntax import nonproj +from . import nonproj from ..tokens import Doc, Span from ..errors import Errors, AlignmentError, Warnings -from .iob_utils import offsets_from_biluo_tags -from .align import align +from ..gold.annotation import TokenAnnotation +from ..gold.iob_utils import offsets_from_biluo_tags, biluo_tags_from_offsets +from ..gold.align import align punct_re = re.compile(r"\W") From b69fa77ccc854d69a00654ecbdfb6ec069a794b7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:38:46 +0200 Subject: [PATCH 19/56] Add missing inits --- spacy/gold/__init__.pxd | 0 spacy/gold/__init__.py | 13 +++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 spacy/gold/__init__.pxd create mode 100644 spacy/gold/__init__.py diff --git a/spacy/gold/__init__.pxd b/spacy/gold/__init__.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py new file mode 100644 index 000000000..b8d35972d --- /dev/null +++ b/spacy/gold/__init__.py @@ -0,0 +1,13 @@ +from .corpus import GoldCorpus +from ..syntax.gold_parse import GoldParse +from .example import Example +from .annotation import TokenAnnotation, DocAnnotation +from .align import align + +from .iob_utils import iob_to_biluo, biluo_to_iob +from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags +from .iob_utils import spans_from_biluo_tags +from .iob_utils import tags_to_entities + +from .gold_io import docs_to_json +from .gold_io import read_json_file, read_json_object From 084271c9e9a3f50095e7c1e55a0218d42e21205e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Jun 2020 22:09:57 +0200 Subject: [PATCH 20/56] Remove GoldParse from public API * Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import --- spacy/cli/train_from_config.py | 4 +- spacy/gold/__init__.py | 2 +- 
spacy/gold/align.py | 20 +++ spacy/gold/annotation.py | 24 ++++ spacy/gold/corpus.py | 62 +++------- spacy/gold/example.py | 149 ++++++++++------------- spacy/gold/gold_io.pyx | 60 +++++---- spacy/language.py | 1 + spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/pipes.pyx | 52 ++++---- spacy/scorer.py | 2 +- spacy/syntax/gold_parse.pyx | 51 ++++++++ spacy/syntax/nn_parser.pyx | 15 ++- spacy/tests/parser/test_add_label.py | 12 +- spacy/tests/parser/test_neural_parser.py | 2 +- spacy/tests/parser/test_preset_sbd.py | 3 +- spacy/tests/test_gold.py | 137 ++++++++------------- spacy/tests/test_language.py | 20 +-- 18 files changed, 315 insertions(+), 303 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index a6d0a0abc..c4db5f6ba 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -11,6 +11,7 @@ from thinc.api import Model, use_pytorch_for_gpu_memory import random from ..gold import GoldCorpus +from ..gold import Example from .. import util from ..errors import Errors from ..ml import models # don't remove - required to load the built-in architectures @@ -243,7 +244,7 @@ def create_train_batches(nlp, corpus, cfg): orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], - ignore_misaligned=True, + ignore_misaligned=True )) if len(train_examples) == 0: raise ValueError(Errors.E988) @@ -271,6 +272,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True ) ) + n_words = sum(len(ex.doc) for ex in dev_examples) start_time = timer() diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py index b8d35972d..5e41d30cb 100644 --- a/spacy/gold/__init__.py +++ b/spacy/gold/__init__.py @@ -10,4 +10,4 @@ from .iob_utils import spans_from_biluo_tags from .iob_utils import tags_to_entities from .gold_io import docs_to_json -from .gold_io import read_json_file, read_json_object +from .gold_io import read_json_file diff --git a/spacy/gold/align.py b/spacy/gold/align.py index ac2700c1f..49e8aaa98 100644 --- a/spacy/gold/align.py +++ b/spacy/gold/align.py @@ -2,6 +2,26 @@ import numpy from ..errors import Errors, AlignmentError +class Alignment: + def __init__(self, spacy_words, gold_words): + # Do many-to-one alignment for misaligned tokens. + # If we over-segment, we'll have one gold word that covers a sequence + # of predicted words + # If we under-segment, we'll have one predicted word that covers a + # sequence of gold words. + # If we "mis-segment", we'll have a sequence of predicted words covering + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. + cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words) + self.cost = cost + self.i2j = i2j + self.j2i = j2i + self.i2j_multi = i2j_multi + self.j2i_multi = j2i_multi + self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] + self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] + + def align(tokens_a, tokens_b): """Calculate alignment tables between two tokenizations. 
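
As a quick sanity check on the new wrapper, here is what Alignment produces for a small mis-segmented pair; the token lists are invented, and the values follow the cost accounting and table semantics of the align() function it delegates to:

    a = Alignment(
        ["i", "listened", "to", "obama", "'", "s", "podcasts", "."],  # candidate
        ["i", "listened", "to", "obama", "'s", "podcasts", "."],      # reference
    )
    # a.cost == 3
    # a.cand_to_gold == [0, 1, 2, 3, None, None, 5, 6]
    # a.gold_to_cand == [0, 1, 2, 3, None, 6, 7]
    # a.i2j_multi == {4: 4, 5: 4}    both "'" and "s" fold into gold "'s"
    # a.j2i_multi == {}
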
diff --git a/spacy/gold/annotation.py b/spacy/gold/annotation.py index cd8ac0717..6bae679c3 100644 --- a/spacy/gold/annotation.py +++ b/spacy/gold/annotation.py @@ -28,6 +28,30 @@ class TokenAnnotation: for b_start, b_end, b_label in brackets: self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) + def get_field(self, field): + if field == "id": + return self.ids + elif field == "word": + return self.words + elif field == "tag": + return self.tags + elif field == "pos": + return self.pos + elif field == "morph": + return self.morphs + elif field == "lemma": + return self.lemmas + elif field == "head": + return self.heads + elif field == "dep": + return self.deps + elif field == "ner": + return self.entities + elif field == "sent_start": + return self.sent_starts + else: + raise ValueError(f"Unknown field: {field}") + @property def brackets(self): brackets = [] diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index b0b454745..9462f0aa4 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -6,8 +6,8 @@ from pathlib import Path import itertools from ..tokens import Doc from .. import util -from ..errors import Errors -from .gold_io import read_json_file, read_json_object +from ..errors import Errors, AlignmentError +from .gold_io import read_json_file, json_to_examples from .augment import make_orth_variants, add_noise from .example import Example @@ -43,9 +43,8 @@ class GoldCorpus(object): if not directory.exists(): directory.mkdir() n = 0 - for i, example in enumerate(examples): - ex_dict = example.to_dict() - text = example.text + for i, ex_dict in enumerate(examples): + text = ex_dict["text"] srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) n += 1 if limit and n >= limit: @@ -87,7 +86,9 @@ class GoldCorpus(object): # TODO: proper format checks with schemas if isinstance(first_gold_tuple, dict): if first_gold_tuple.get("paragraphs", None): - examples = read_json_object(gold_tuples) + examples = [] + for json_doc in gold_tuples: + examples.extend(json_to_examples(json_doc)) elif first_gold_tuple.get("doc_annotation", None): examples = [] for ex_dict in gold_tuples: @@ -117,7 +118,7 @@ class GoldCorpus(object): except KeyError as e: msg = "Missing key {}".format(e) raise KeyError(Errors.E996.format(file=file_name, msg=msg)) - except UnboundLocalError: + except UnboundLocalError as e: msg = "Unexpected document structure" raise ValueError(Errors.E996.format(file=file_name, msg=msg)) @@ -200,9 +201,9 @@ class GoldCorpus(object): ): """ Setting gold_preproc will result in creating a doc per sentence """ for example in examples: + example_docs = [] if gold_preproc: split_examples = example.split_sents() - example_golds = [] for split_example in split_examples: split_example_docs = cls._make_docs( nlp, @@ -211,13 +212,7 @@ class GoldCorpus(object): noise_level=noise_level, orth_variant_level=orth_variant_level, ) - split_example_golds = cls._make_golds( - split_example_docs, - vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned, - ) - example_golds.extend(split_example_golds) + example_docs.extend(split_example_docs) else: example_docs = cls._make_docs( nlp, @@ -226,16 +221,14 @@ class GoldCorpus(object): noise_level=noise_level, orth_variant_level=orth_variant_level, ) - example_golds = cls._make_golds( - example_docs, - vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned, - ) - for ex in example_golds: - if ex.goldparse is not None: - if (not max_length) or len(ex.doc) < 
max_length: - yield ex + for ex in example_docs: + if (not max_length) or len(ex.doc) < max_length: + if ignore_misaligned: + try: + _ = ex._deprecated_get_gold() + except AlignmentError: + continue + yield ex @classmethod def _make_docs( @@ -256,22 +249,3 @@ class GoldCorpus(object): ) var_example.doc = var_doc return [var_example] - - @classmethod - def _make_golds( - cls, examples, vocab=None, make_projective=False, ignore_misaligned=False - ): - filtered_examples = [] - for example in examples: - gold_parses = example.get_gold_parses( - vocab=vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned, - ) - assert len(gold_parses) == 1 - doc, gold = gold_parses[0] - if doc: - assert doc == example.doc - example.goldparse = gold - filtered_examples.append(example) - return filtered_examples diff --git a/spacy/gold/example.py b/spacy/gold/example.py index c637c5540..1d8665572 100644 --- a/spacy/gold/example.py +++ b/spacy/gold/example.py @@ -1,36 +1,56 @@ from .annotation import TokenAnnotation, DocAnnotation +from .align import Alignment from ..errors import Errors, AlignmentError from ..tokens import Doc -# We're hoping to kill this GoldParse dependency but for now match semantics. -from ..syntax.gold_parse import GoldParse - class Example: - def __init__( - self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None - ): + def __init__(self, doc=None, doc_annotation=None, token_annotation=None): """ Doc can either be text, or an actual Doc """ self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() self.token_annotation = ( token_annotation if token_annotation else TokenAnnotation() ) - self.goldparse = goldparse + self._alignment = None - @classmethod - def from_gold(cls, goldparse, doc=None): - doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) - token_annotation = goldparse.get_token_annotation() - return cls(doc_annotation, token_annotation, doc) + def _deprecated_get_gold(self, make_projective=False): + from ..syntax.gold_parse import get_parses_from_example + + _, gold = get_parses_from_example(self, make_projective=make_projective)[0] + return gold @classmethod def from_dict(cls, example_dict, doc=None): + if example_dict is None: + raise ValueError("Example.from_dict expected dict, received None") + # TODO: This is ridiculous... token_dict = example_dict.get("token_annotation", {}) - token_annotation = TokenAnnotation.from_dict(token_dict) doc_dict = example_dict.get("doc_annotation", {}) + for key, value in example_dict.items(): + if key in ("token_annotation", "doc_annotation"): + pass + elif key in ("cats", "links"): + doc_dict[key] = value + else: + token_dict[key] = value + token_annotation = TokenAnnotation.from_dict(token_dict) doc_annotation = DocAnnotation.from_dict(doc_dict) - return cls(doc_annotation, token_annotation, doc) + return cls( + doc=doc, doc_annotation=doc_annotation, token_annotation=token_annotation + ) + + @property + def alignment(self): + if self._alignment is None: + if self.doc is None: + return None + spacy_words = [token.orth_ for token in self.doc] + gold_words = self.token_annotation.words + if gold_words == []: + gold_words = spacy_words + self._alignment = Alignment(spacy_words, gold_words) + return self._alignment def to_dict(self): """ Note that this method does NOT export the doc, only the annotations ! 
""" @@ -46,12 +66,31 @@ class Example: return self.doc.text return self.doc - @property - def gold(self): - if self.goldparse is None: - doc, gold = self.get_gold_parses()[0] - self.goldparse = gold - return self.goldparse + def get_aligned(self, field): + """Return an aligned array for a token annotation field.""" + if self.doc is None: + return self.token_annotation.get_field(field) + doc = self.doc + if field == "word": + return [token.orth_ for token in doc] + gold_values = self.token_annotation.get_field(field) + alignment = self.alignment + i2j_multi = alignment.i2j_multi + gold_to_cand = alignment.gold_to_cand + cand_to_gold = alignment.cand_to_gold + + output = [] + for i, gold_i in enumerate(cand_to_gold): + if doc[i].text.isspace(): + output.append(None) + elif gold_i is None: + if i in i2j_multi: + output.append(gold_values[i2j_multi[i]]) + else: + output.append(None) + else: + output.append(gold_values[gold_i]) + return output def set_token_annotation( self, @@ -149,55 +188,6 @@ class Example: split_examples.append(s_example) return split_examples - def get_gold_parses( - self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False - ): - """Return a list of (doc, GoldParse) objects. - If merge is set to True, keep all Token annotations as one big list.""" - d = self.doc_annotation - # merge == do not modify Example - if merge: - t = self.token_annotation - doc = self.doc - if doc is None or not isinstance(doc, Doc): - if not vocab: - raise ValueError(Errors.E998) - doc = Doc(vocab, words=t.words) - try: - gp = GoldParse.from_annotation( - doc, d, t, make_projective=make_projective - ) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - return [(doc, gp)] - # not merging: one GoldParse per sentence, defining docs with the words - # from each sentence - else: - parses = [] - split_examples = self.split_sents() - for split_example in split_examples: - if not vocab: - raise ValueError(Errors.E998) - split_doc = Doc(vocab, words=split_example.token_annotation.words) - try: - gp = GoldParse.from_annotation( - split_doc, - d, - split_example.token_annotation, - make_projective=make_projective, - ) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - if gp is not None: - parses.append((split_doc, gp)) - return parses - @classmethod def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): """ @@ -219,29 +209,16 @@ class Example: else: doc = make_doc(ex) converted_examples.append(Example(doc=doc)) - # convert Doc to Example - elif isinstance(ex, Doc): - converted_examples.append(Example(doc=ex)) # convert tuples to Example elif isinstance(ex, tuple) and len(ex) == 2: doc, gold = ex - gold_dict = {} # convert string to Doc if isinstance(doc, str) and not keep_raw_text: doc = make_doc(doc) - # convert dict to GoldParse - if isinstance(gold, dict): - gold_dict = gold - if doc is not None or gold.get("words", None) is not None: - gold = GoldParse(doc, **gold) - else: - gold = None - if gold is not None: - converted_examples.append( - Example.from_gold(goldparse=gold, doc=doc) - ) - else: - raise ValueError(Errors.E999.format(gold_dict=gold_dict)) + converted_examples.append(Example.from_dict(gold, doc=doc)) + # convert Doc to Example + elif isinstance(ex, Doc): + converted_examples.append(Example(doc=ex)) else: converted_examples.append(ex) return converted_examples diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 15581c151..424e44f72 100644 --- a/spacy/gold/gold_io.pyx +++ 
b/spacy/gold/gold_io.pyx @@ -3,7 +3,6 @@ import srsly from .. import util from ..errors import Warnings from ..tokens import Token, Doc -from .example import Example from .iob_utils import biluo_tags_from_offsets @@ -64,6 +63,19 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"): return json_doc +def read_json_file(loc, docs_filter=None, limit=None): + loc = util.ensure_path(loc) + if loc.is_dir(): + for filename in loc.iterdir(): + yield from read_json_file(loc / filename, limit=limit) + else: + for doc in json_iterate(loc): + if docs_filter is not None and not docs_filter(doc): + continue + for json_data in json_to_examples(doc): + yield json_data + + def json_to_examples(doc): """Convert an item in the JSON-formatted training data to the format used by GoldParse. @@ -72,7 +84,7 @@ def json_to_examples(doc): YIELDS (Example): The reformatted data - one training example per paragraph """ for paragraph in doc["paragraphs"]: - example = Example(doc=paragraph.get("raw", None)) + example = {"text": paragraph.get("raw", None)} words = [] ids = [] tags = [] @@ -110,39 +122,23 @@ def json_to_examples(doc): cats = {} for cat in paragraph.get("cats", {}): cats[cat["label"]] = cat["value"] - example.set_token_annotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=labels, entities=ner, sent_starts=sent_starts, - brackets=brackets) - example.set_doc_annotation(cats=cats) + example["token_annotation"] = dict( + ids=ids, + words=words, + tags=tags, + pos=pos, + morphs=morphs, + lemmas=lemmas, + heads=heads, + deps=labels, + entities=ner, + sent_starts=sent_starts, + brackets=brackets + ) + example["doc_annotation"] = dict(cats=cats) yield example -def read_json_file(loc, docs_filter=None, limit=None): - loc = util.ensure_path(loc) - if loc.is_dir(): - for filename in loc.iterdir(): - yield from read_json_file(loc / filename, limit=limit) - else: - for doc in json_iterate(loc): - if docs_filter is not None and not docs_filter(doc): - continue - for json_data in json_to_examples(doc): - yield json_data - - -def read_json_object(json_corpus_section): - """Take a list of JSON-formatted documents (e.g. from an already loaded - training data file) and yield annotations in the GoldParse format. - - json_corpus_section (list): The data. - YIELDS (Example): The reformatted data - one training example per paragraph - """ - for json_doc in json_corpus_section: - examples = json_to_examples(json_doc) - for ex in examples: - yield ex - def json_iterate(loc): # We should've made these files jsonl...But since we didn't, parse out diff --git a/spacy/language.py b/spacy/language.py index 6341dc858..57664ec17 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -636,6 +636,7 @@ class Language(object): examples (iterable): `Example` objects. YIELDS (tuple): `Example` objects. """ + # TODO: This is deprecated right? 
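
With read_json_object gone, json_to_examples is the single conversion point, and it now yields plain dicts that GoldCorpus can serialize as-is. One paragraph comes out roughly as follows (values invented for illustration; pos, morphs and lemmas are filled the same way as tags):

    {
        "text": "I like London.",
        "token_annotation": {
            "ids": [0, 1, 2, 3],
            "words": ["I", "like", "London", "."],
            "tags": ["PRP", "VBP", "NNP", "."],
            "heads": [1, 1, 1, 1],
            "deps": ["nsubj", "ROOT", "dobj", "punct"],
            "entities": ["O", "O", "U-GPE", "O"],
            "sent_starts": [1, 0, 0, 0],
            "brackets": [],
        },
        "doc_annotation": {"cats": {"TRAVEL": 1.0}},
    }
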
         for name, proc in self.pipeline:
             if hasattr(proc, "preprocess_gold"):
                 examples = proc.preprocess_gold(examples)
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index c45a72b25..7116d7afd 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -92,7 +92,7 @@ class Morphologizer(Tagger):
         guesses = scores.argmax(axis=1)
         known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
         for ex in examples:
-            gold = ex.gold
+            gold = ex._deprecated_get_gold()
             for i in range(len(gold.morphs)):
                 pos = gold.pos[i] if i < len(gold.pos) else ""
                 morph = gold.morphs[i]
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index a6edf00d9..2c40738f6 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -373,7 +373,7 @@ class Tagger(Pipe):

     def get_loss(self, examples, scores):
         loss_func = SequenceCategoricalCrossentropy(names=self.labels)
-        truths = [eg.gold.tags for eg in examples]
+        truths = [eg.get_aligned("tag") for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError("nan value when computing loss")
@@ -560,9 +560,9 @@ class SentenceRecognizer(Tagger):
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
         known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-        for ex in examples:
-            gold = ex.gold
-            for sent_start in gold.sent_starts:
+        for eg in examples:
+            sent_starts = eg.get_aligned("sent_start")
+            for sent_start in sent_starts:
                 if sent_start is None:
                     correct[idx] = guesses[idx]
                 elif sent_start in tag_index:
@@ -575,7 +575,7 @@ class SentenceRecognizer(Tagger):
         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
         d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [ex.doc for ex in examples]
+        docs = [eg.doc for eg in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
@@ -706,13 +706,13 @@ class MultitaskObjective(Tagger):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
-        golds = [ex.gold for ex in examples]
         docs = [ex.doc for ex in examples]
-        for i, gold in enumerate(golds):
-            for j in range(len(docs[i])):
-                # Handels alignment for tokenization differences
-                token_annotation = gold.get_token_annotation()
-                label = self.make_label(j, token_annotation)
+        for i, eg in enumerate(examples):
+            # Handles alignment for tokenization differences
+            doc_annots = eg.get_aligned()
+            for j in range(len(eg.doc)):
+                tok_annots = {key: values[j] for key, values in doc_annots.items()}
+                label = self.make_label(j, tok_annots)
                 if label is None or label not in self.labels:
                     correct[idx] = guesses[idx]
                 else:
@@ -951,13 +951,12 @@ class TextCategorizer(Pipe):
             losses[self.name] += (gradient**2).sum()

     def _examples_to_truth(self, examples):
-        golds = [ex.gold for ex in examples]
-        truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
-        not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
-        for i, gold in enumerate(golds):
+        truths = numpy.zeros((len(examples), len(self.labels)), dtype="f")
+        not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
+        for i, eg in enumerate(examples):
             for j, label in enumerate(self.labels):
-                if label in gold.cats:
-                    truths[i, j] = gold.cats[label]
+                if label in eg.doc_annotation.cats:
+                    truths[i, j] = eg.doc_annotation.cats[label]
                 else:
                     not_missing[i, j] = 0.
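
The switch from eg.gold.tags to eg.get_aligned("tag") is what makes these losses robust to tokenization mismatches. A sketch of the intended behaviour, with invented tokens and tags, based on the get_aligned logic added to Example above:

    # predicted doc tokens: ["Obama", "'", "s", "podcast"]
    # gold words:           ["Obama", "'s", "podcast"]
    # gold tags:            ["PROPN", "PART", "NOUN"]
    eg.get_aligned("tag")
    # == ["PROPN", "PART", "PART", "NOUN"]
    # Both candidate tokens that fold into the gold "'s" pick up its tag via
    # alignment.i2j_multi; a candidate token with no alignment at all (or one
    # that is pure whitespace) comes back as None, i.e. a missing label.
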
truths = self.model.ops.asarray(truths) @@ -1160,14 +1159,14 @@ class EntityLinker(Pipe): # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( predictions = self.model.predict(docs) - golds = [ex.gold for ex in examples] - for doc, gold in zip(docs, golds): + for eg in examples: + doc = eg.doc ents_by_offset = dict() for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent - for entity, kb_dict in gold.links.items(): + for entity, kb_dict in eg.doc_annotation.links.items(): if isinstance(entity, str): entity = literal_eval(entity) start, end = entity @@ -1188,7 +1187,10 @@ class EntityLinker(Pipe): raise RuntimeError(Errors.E030) set_dropout_rate(self.model, drop) sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) + loss, d_scores = self.get_similarity_loss( + scores=sentence_encodings, + examples=examples + ) bp_context(d_scores) if sgd is not None: self.model.finish_update(sgd) @@ -1199,10 +1201,10 @@ class EntityLinker(Pipe): self.set_annotations(docs, predictions) return loss - def get_similarity_loss(self, golds, scores): + def get_similarity_loss(self, examples, scores): entity_encodings = [] - for gold in golds: - for entity, kb_dict in gold.links.items(): + for eg in examples: + for entity, kb_dict in eg.doc_annotation.links.items(): for kb_id, value in kb_dict.items(): # this loss function assumes we're only using positive examples if value: @@ -1222,7 +1224,7 @@ class EntityLinker(Pipe): def get_loss(self, examples, scores): cats = [] for ex in examples: - for entity, kb_dict in ex.gold.links.items(): + for entity, kb_dict in ex.doc_annotation.links.items(): for kb_id, value in kb_dict.items(): cats.append([value]) diff --git a/spacy/scorer.py b/spacy/scorer.py index 7e2466be7..5e49a90d2 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -282,7 +282,7 @@ class Scorer(object): if isinstance(example, tuple) and len(example) == 2: doc, gold = example else: - gold = example.gold + gold = example._deprecated_get_gold() doc = example.doc if len(doc) != len(gold): diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index df4059a21..05361fd82 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -24,6 +24,57 @@ def is_punct_label(label): return label == "P" or label.lower() == "punct" +def get_parses_from_example( + eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False +): + """Return a list of (doc, GoldParse) objects. 
+ If merge is set to True, keep all Token annotations as one big list.""" + d = eg.doc_annotation + # merge == do not modify Example + if merge: + t = eg.token_annotation + doc = eg.doc + if doc is None or not isinstance(doc, Doc): + if not vocab: + raise ValueError(Errors.E998) + doc = Doc(vocab, words=t.words) + try: + gp = GoldParse.from_annotation( + doc, d, t, make_projective=make_projective + ) + except AlignmentError: + if ignore_misaligned: + gp = None + else: + raise + return [(doc, gp)] + # not merging: one GoldParse per sentence, defining docs with the words + # from each sentence + else: + parses = [] + split_examples = eg.split_sents() + for split_example in split_examples: + if not vocab: + raise ValueError(Errors.E998) + split_doc = Doc(vocab, words=split_example.token_annotation.words) + try: + gp = GoldParse.from_annotation( + split_doc, + d, + split_example.token_annotation, + make_projective=make_projective, + ) + except AlignmentError: + if ignore_misaligned: + gp = None + else: + raise + if gp is not None: + parses.append((split_doc, gp)) + return parses + + + cdef class GoldParse: """Collection for training annotations. diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 12f56ba67..f74f3dd73 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -21,6 +21,7 @@ import warnings from ..tokens.doc cimport Doc from .gold_parse cimport GoldParse +from .gold_parse import get_parses_from_example from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -515,8 +516,8 @@ cdef class Parser: good_golds = [] good_states = [] for i, eg in enumerate(whole_examples): - doc = eg.doc - gold = self.moves.preprocess_gold(eg.gold) + parses = get_parses_from_example(eg) + doc, gold = parses[0] if gold is not None and self.moves.has_gold(gold): good_docs.append(doc) good_golds.append(gold) @@ -535,8 +536,12 @@ cdef class Parser: cdef: StateClass state Transition action - whole_docs = [ex.doc for ex in whole_examples] - whole_golds = [ex.gold for ex in whole_examples] + whole_docs = [] + whole_golds = [] + for eg in whole_examples: + for doc, gold in get_parses_from_example(eg): + whole_docs.append(doc) + whole_golds.append(gold) whole_states = self.moves.init_batch(whole_docs) max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs]))) max_moves = 0 @@ -625,7 +630,7 @@ cdef class Parser: doc_sample = [] gold_sample = [] for example in islice(get_examples(), 10): - parses = example.get_gold_parses(merge=False, vocab=self.vocab) + parses = get_parses_from_example(example, merge=False, vocab=self.vocab) for doc, gold in parses: if len(doc): doc_sample.append(doc) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index ee1bba886..fdab3a2e3 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -34,7 +34,10 @@ def _train_parser(parser): for i in range(5): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) + gold = { + "heads": [1, 1, 3, 3], + "deps": ["left", "ROOT", "left", "ROOT"] + } parser.update((doc, gold), sgd=sgd, losses=losses) return parser @@ -46,9 +49,10 @@ def test_add_label(parser): for i in range(100): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = GoldParse( - doc, heads=[1, 1, 3, 3], 
deps=["right", "ROOT", "left", "ROOT"] - ) + gold = { + "heads": [1, 1, 3, 3], + "deps": ["right", "ROOT", "left", "ROOT"] + } parser.update((doc, gold), sgd=sgd, losses=losses) doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index b648e9a00..c07e6aa38 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -46,7 +46,7 @@ def doc(vocab): @pytest.fixture def gold(doc): - return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"]) + return {"heads": [1, 1, 1], "deps": ["L", "ROOT", "R"]} def test_can_init_nn_parser(parser): diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index dc13fcdf1..3d0726353 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -1,7 +1,6 @@ import pytest from thinc.api import Adam from spacy.attrs import NORM -from spacy.gold import GoldParse from spacy.vocab import Vocab from spacy.pipeline.defaults import default_parser @@ -27,7 +26,7 @@ def parser(vocab): for i in range(10): losses = {} doc = Doc(vocab, words=["a", "b", "c", "d"]) - gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) + gold = dict(heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) parser.update((doc, gold), sgd=sgd, losses=losses) return parser diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 982c0d910..4b4250179 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,9 +1,10 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align +from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree +from spacy.syntax.gold_parse import GoldParse, get_parses_from_example from spacy.tokens import Doc from spacy.util import get_words_and_spaces, compounding, minibatch import pytest @@ -270,10 +271,9 @@ def test_roundtrip_docs_to_json(doc): srsly.write_json(json_file, [docs_to_json(doc)]) goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() + reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) + goldparse = reloaded_example._deprecated_get_gold() + assert len(doc) == goldcorpus.count_train() assert text == reloaded_example.text assert tags == goldparse.tags assert pos == goldparse.pos @@ -287,54 +287,6 @@ def test_roundtrip_docs_to_json(doc): assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] assert cats["BAKING"] == goldparse.cats["BAKING"] - # roundtrip to JSONL train dicts - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "roundtrip.jsonl" - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert pos == goldparse.pos - assert morphs == goldparse.morphs - assert lemmas == goldparse.lemmas - assert deps == goldparse.labels - assert heads == goldparse.heads - 
assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == goldparse.cats["BAKING"] - - # roundtrip to JSONL tuples - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "roundtrip.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - # load and rewrite as JSONL tuples - srsly.write_jsonl(jsonl_file, goldcorpus.train_examples) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert deps == goldparse.labels - assert heads == goldparse.heads - assert lemmas == goldparse.lemmas - assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == goldparse.cats["BAKING"] - def test_projective_train_vs_nonprojective_dev(doc): nlp = English() @@ -342,16 +294,16 @@ def test_projective_train_vs_nonprojective_dev(doc): heads = [t.head.i for t in doc] with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + json_file = tmpdir / "test.json" + # write to JSON train dicts + srsly.write_json(json_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) - train_reloaded_example = next(goldcorpus.train_dataset(nlp)) - train_goldparse = train_reloaded_example.gold + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] - dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) - dev_goldparse = dev_reloaded_example.gold + dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) + dev_goldparse = dev_reloaded_example._deprecated_get_gold() assert is_nonproj_tree([t.head.i for t in doc]) is True assert is_nonproj_tree(train_goldparse.heads) is False @@ -364,45 +316,49 @@ def test_projective_train_vs_nonprojective_dev(doc): assert deps == dev_goldparse.labels +# Hm, not sure where misalignment check would be handled? In the components too? +# I guess that does make sense. A text categorizer doesn't care if it's +# misaligned... 
+@pytest.mark.xfail # TODO def test_ignore_misaligned(doc): nlp = English() text = doc.text with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" + json_file = tmpdir / "test.json" data = [docs_to_json(doc)] data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, data) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + # write to JSON train dicts + srsly.write_json(json_file, data) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) - with pytest.raises(AlignmentError): - train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + with pytest.raises(AlignmentError): + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" + json_file = tmpdir / "test.json" data = [docs_to_json(doc)] data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, data) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + # write to JSON train dicts + srsly.write_json(json_file, data) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) - # doesn't raise an AlignmentError, but there is nothing to iterate over - # because the only example can't be aligned - train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) - assert len(train_reloaded_example) == 0 + # doesn't raise an AlignmentError, but there is nothing to iterate over + # because the only example can't be aligned + train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) + assert len(train_reloaded_example) == 0 def test_make_orth_variants(doc): nlp = English() with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + json_file = tmpdir / "test.json" + # write to JSON train dicts + srsly.write_json(json_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) - # due to randomness, test only that this runs with no errors for now - train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) - train_goldparse = train_reloaded_example.gold # noqa: F841 + # due to randomness, test only that this runs with no errors for now + train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) + train_goldparse = train_reloaded_example._deprecated_get_gold() @pytest.mark.parametrize( @@ -485,6 +441,7 @@ def test_tuple_format_implicit(): _train(train_data) +@pytest.mark.xfail # TODO def test_tuple_format_implicit_invalid(): """Test that an error is thrown for an implicit invalid GoldParse field""" @@ -520,8 +477,18 @@ def test_split_sents(merged_dict): nlp = English() example = Example() example.set_token_annotation(**merged_dict) - assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2 - assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 + assert len(get_parses_from_example( + example, + merge=False, + vocab=nlp.vocab, + make_projective=False) + ) == 2 + assert len(get_parses_from_example( + example, + merge=True, + vocab=nlp.vocab, + make_projective=False + )) == 1 split_examples = example.split_sents() assert len(split_examples) == 2 @@ -557,4 +524,4 @@ def test_empty_example_goldparse(): nlp = English() doc = nlp("") example = Example(doc=doc) - assert len(example.get_gold_parses()) == 1 + 
assert len(get_parses_from_example(example)) == 1 diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 58db0a040..363366eeb 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -19,22 +19,16 @@ def nlp(): return nlp +@pytest.mark.xfail # TODO def test_language_update(nlp): text = "hello world" annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} wrongkeyannots = {"LABEL": True} doc = Doc(nlp.vocab, words=text.split(" ")) - gold = GoldParse(doc, **annots) - # Update with doc and gold objects - nlp.update((doc, gold)) # Update with text and dict nlp.update((text, annots)) # Update with doc object and dict nlp.update((doc, annots)) - # Update with text and gold object - nlp.update((text, gold)) - # Update with empty doc and gold object - nlp.update((None, gold)) # Update badly with pytest.raises(ValueError): nlp.update((doc, None)) @@ -44,20 +38,16 @@ def test_language_update(nlp): def test_language_evaluate(nlp): text = "hello world" - annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} + annots = { + "doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} + } doc = Doc(nlp.vocab, words=text.split(" ")) - gold = GoldParse(doc, **annots) - # Evaluate with doc and gold objects - nlp.evaluate([(doc, gold)]) # Evaluate with text and dict nlp.evaluate([(text, annots)]) # Evaluate with doc object and dict nlp.evaluate([(doc, annots)]) - # Evaluate with text and gold object - nlp.evaluate([(text, gold)]) - # Evaluate badly with pytest.raises(Exception): - nlp.evaluate([text, gold]) + nlp.evaluate([text, annots]) def test_evaluate_no_pipe(nlp): From d9289712ba76d4c67450fe1969642416d0ac57f4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Jun 2020 22:28:50 +0200 Subject: [PATCH 21/56] * Make GoldCorpus return dict, not Example * Make Example require a Doc object (previously optional) Clarify methods in GoldCorpus WIP refactor Example Refactor Example.split_sents Fix test Fix augment Update test Update test Fix import Update test_scorer Update Example --- spacy/cli/converters/conllu2json.py | 10 +- spacy/gold/annotation.py | 3 + spacy/gold/augment.py | 7 +- spacy/gold/corpus.py | 45 ++--- spacy/gold/example.py | 155 +++++++++++------- spacy/gold/gold_io.pyx | 4 +- spacy/syntax/nonproj.pyx | 4 +- spacy/tests/regression/test_issue1501-2000.py | 15 +- spacy/tests/test_gold.py | 24 ++- spacy/tests/test_scorer.py | 18 +- spacy/tokens/doc.pyx | 2 + 11 files changed, 176 insertions(+), 111 deletions(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 1ece755b8..2cf5f7942 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -2,6 +2,7 @@ import re from ...gold import Example from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets +from ...gold import TokenAnnotation from ...language import Language from ...tokens import Doc, Token from .conll_ner2json import n_sents_info @@ -284,13 +285,8 @@ def example_from_conllu_sentence( spaces.append(t._.merged_spaceafter) ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] ents = biluo_tags_from_offsets(doc, ent_offsets) - raw = "" - for word, space in zip(words, spaces): - raw += word - if space: - raw += " " - example = Example(doc=raw) - example.set_token_annotation( + example = Example(doc=Doc(vocab, words=words, spaces=spaces)) + example.token_annotation = TokenAnnotation( ids=ids, words=words, tags=tags, diff --git a/spacy/gold/annotation.py 
b/spacy/gold/annotation.py index 6bae679c3..5f78902ab 100644 --- a/spacy/gold/annotation.py +++ b/spacy/gold/annotation.py @@ -1,3 +1,6 @@ +from .iob_utils import biluo_tags_from_offsets + + class TokenAnnotation: def __init__( self, diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py index 656308214..f938f540f 100644 --- a/spacy/gold/augment.py +++ b/spacy/gold/augment.py @@ -1,6 +1,7 @@ import random import itertools from .example import Example +from .annotation import TokenAnnotation def make_orth_variants(nlp, example, orth_variant_level=0.0): @@ -17,14 +18,14 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): ndsv = nlp.Defaults.single_orth_variants ndpv = nlp.Defaults.paired_orth_variants # modify words in paragraph_tuples - variant_example = Example(doc=raw) + variant_example = Example(doc=nlp.make_doc(raw)) token_annotation = example.token_annotation words = token_annotation.words tags = token_annotation.tags if not words or not tags: # add the unmodified annotation token_dict = token_annotation.to_dict() - variant_example.set_token_annotation(**token_dict) + variant_example.token_annotation = TokenAnnotation(**token_dict) else: if lower: words = [w.lower() for w in words] @@ -60,7 +61,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): token_dict = token_annotation.to_dict() token_dict["words"] = words token_dict["tags"] = tags - variant_example.set_token_annotation(**token_dict) + variant_example.token_annotation = TokenAnnotation(**token_dict) # modify raw to match variant_paragraph_tuples if raw is not None: variants = [] diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 9462f0aa4..df13ab505 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -28,8 +28,8 @@ class GoldCorpus(object): """ self.limit = limit if isinstance(train, str) or isinstance(train, Path): - train = self.read_examples(self.walk_corpus(train)) - dev = self.read_examples(self.walk_corpus(dev)) + train = self.read_annotations(self.walk_corpus(train)) + dev = self.read_annotations(self.walk_corpus(dev)) # Write temp directory with one doc per file, so we can shuffle and stream self.tmp_dir = Path(tempfile.mkdtemp()) self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) @@ -71,7 +71,7 @@ class GoldCorpus(object): return locs @staticmethod - def read_examples(locs, limit=0): + def read_annotations(locs, limit=0): """ Yield training examples """ i = 0 for loc in locs: @@ -101,11 +101,11 @@ class GoldCorpus(object): or isinstance(doc, str) ): raise ValueError(Errors.E987.format(type=type(doc))) - examples.append(Example.from_dict(ex_dict, doc=doc)) + examples.append(ex_dict) elif file_name.endswith("msg"): text, ex_dict = srsly.read_msgpack(loc) - examples = [Example.from_dict(ex_dict, doc=text)] + examples = [ex_dict] else: supported = ("json", "jsonl", "msg") raise ValueError(Errors.E124.format(path=loc, formats=supported)) @@ -123,21 +123,21 @@ class GoldCorpus(object): raise ValueError(Errors.E996.format(file=file_name, msg=msg)) @property - def dev_examples(self): + def dev_annotations(self): locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_examples(locs, limit=self.limit) + yield from self.read_annotations(locs, limit=self.limit) @property - def train_examples(self): + def train_annotations(self): locs = (self.tmp_dir / "train").iterdir() - yield from self.read_examples(locs, limit=self.limit) + yield from self.read_annotations(locs, limit=self.limit) def count_train(self): """Returns count of words in train examples""" n 
= 0 i = 0 - for example in self.train_examples: - n += len(example.token_annotation.words) + for eg_dict in self.train_annotations: + n += len(eg_dict["token_annotation"]["words"]) if self.limit and i >= self.limit: break i += 1 @@ -154,10 +154,10 @@ class GoldCorpus(object): ): locs = list((self.tmp_dir / "train").iterdir()) random.shuffle(locs) - train_examples = self.read_examples(locs, limit=self.limit) - gold_examples = self.iter_gold_docs( + train_annotations = self.read_annotations(locs, limit=self.limit) + examples = self.iter_examples( nlp, - train_examples, + train_annotations, gold_preproc, max_length=max_length, noise_level=noise_level, @@ -165,33 +165,33 @@ class GoldCorpus(object): make_projective=True, ignore_misaligned=ignore_misaligned, ) - yield from gold_examples + yield from examples def train_dataset_without_preprocessing( self, nlp, gold_preproc=False, ignore_misaligned=False ): - examples = self.iter_gold_docs( + examples = self.iter_examples( nlp, - self.train_examples, + self.train_annotations, gold_preproc=gold_preproc, ignore_misaligned=ignore_misaligned, ) yield from examples def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_gold_docs( + examples = self.iter_examples( nlp, - self.dev_examples, + self.dev_annotations, gold_preproc=gold_preproc, ignore_misaligned=ignore_misaligned, ) yield from examples @classmethod - def iter_gold_docs( + def iter_examples( cls, nlp, - examples, + annotations, gold_preproc, max_length=None, noise_level=0.0, @@ -200,7 +200,8 @@ class GoldCorpus(object): ignore_misaligned=False, ): """ Setting gold_preproc will result in creating a doc per sentence """ - for example in examples: + for eg_dict in annotations: + example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"])) example_docs = [] if gold_preproc: split_examples = example.split_sents() diff --git a/spacy/gold/example.py b/spacy/gold/example.py index 1d8665572..c8ad58da7 100644 --- a/spacy/gold/example.py +++ b/spacy/gold/example.py @@ -1,18 +1,69 @@ +import numpy from .annotation import TokenAnnotation, DocAnnotation +from .iob_utils import spans_from_biluo_tags, biluo_tags_from_offsets from .align import Alignment from ..errors import Errors, AlignmentError from ..tokens import Doc +def annotations2doc(doc, doc_annot, tok_annot): + # TODO: Improve and test this + words = tok_annot.words or [tok.text for tok in doc] + fields = { + "tags": "TAG", + "pos": "POS", + "lemmas": "LEMMA", + "deps": "DEP", + } + attrs = [] + values = [] + for field, attr in fields.items(): + value = getattr(tok_annot, field) + # Unset fields will be empty lists. + if value: + attrs.append(attr) + values.append([doc.vocab.strings.add(v) for v in value]) + if tok_annot.heads: + attrs.append("HEAD") + values.append([h - i for i, h in enumerate(tok_annot.heads)]) + output = Doc(doc.vocab, words=words) + if values: + array = numpy.array(values, dtype="uint64") + output = output.from_array(attrs, array.T) + if tok_annot.entities: + output.ents = spans_from_biluo_tags(output, tok_annot.entities) + doc.cats = dict(doc_annot.cats) + # TODO: Calculate token.ent_kb_id from links. + # We need to fix this and the doc.ents thing, both should be doc + # annotations. 
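
The array round trip at the heart of annotations2doc is worth seeing in isolation; a standalone sketch of the same Doc.from_array mechanics, with example values invented:

    import numpy
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["I", "like", "London"])
    # Attribute values are hash IDs interned in the StringStore: one row per
    # attribute, transposed to one row per token for from_array().
    tags = [vocab.strings.add(t) for t in ["PRP", "VBP", "NNP"]]
    array = numpy.array([tags], dtype="uint64")
    doc = doc.from_array(["TAG"], array.T)
    # HEAD is the exception: as in the code above, it is stored as the offset
    # h - i relative to each token rather than as a string hash.
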
+ return doc + + class Example: - def __init__(self, doc=None, doc_annotation=None, token_annotation=None): + def __init__(self, doc, doc_annotation=None, token_annotation=None): """ Doc can either be text, or an actual Doc """ + if not isinstance(doc, Doc): + raise TypeError("Must pass Doc instance") + self.predicted = doc self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() self.token_annotation = ( token_annotation if token_annotation else TokenAnnotation() ) self._alignment = None + self.reference = annotations2doc( + self.doc, + self.doc_annotation, + self.token_annotation + ) + + @property + def x(self): + return self.predicted + + @property + def y(self): + return self.reference def _deprecated_get_gold(self, make_projective=False): from ..syntax.gold_parse import get_parses_from_example @@ -24,6 +75,8 @@ class Example: def from_dict(cls, example_dict, doc=None): if example_dict is None: raise ValueError("Example.from_dict expected dict, received None") + if doc is None: + raise ValueError("Must pass doc") # TODO: This is ridiculous... token_dict = example_dict.get("token_annotation", {}) doc_dict = example_dict.get("doc_annotation", {}) @@ -34,6 +87,10 @@ class Example: doc_dict[key] = value else: token_dict[key] = value + if token_dict.get("entities"): + entities = token_dict["entities"] + if isinstance(entities[0], (list, tuple)): + token_dict["entities"] = biluo_tags_from_offsets(doc, entities) token_annotation = TokenAnnotation.from_dict(token_dict) doc_annotation = DocAnnotation.from_dict(doc_dict) return cls( @@ -45,8 +102,8 @@ class Example: if self._alignment is None: if self.doc is None: return None - spacy_words = [token.orth_ for token in self.doc] - gold_words = self.token_annotation.words + spacy_words = [token.orth_ for token in self.predicted] + gold_words = [token.orth_ for token in self.reference] if gold_words == []: gold_words = spacy_words self._alignment = Alignment(spacy_words, gold_words) @@ -92,34 +149,6 @@ class Example: output.append(gold_values[gold_i]) return output - def set_token_annotation( - self, - ids=None, - words=None, - tags=None, - pos=None, - morphs=None, - lemmas=None, - heads=None, - deps=None, - entities=None, - sent_starts=None, - brackets=None, - ): - self.token_annotation = TokenAnnotation( - ids=ids, - words=words, - tags=tags, - pos=pos, - morphs=morphs, - lemmas=lemmas, - heads=heads, - deps=deps, - entities=entities, - sent_starts=sent_starts, - brackets=brackets, - ) - def set_doc_annotation(self, cats=None, links=None): if cats: self.doc_annotation.cats = cats @@ -131,7 +160,6 @@ class Example: sent_starts and return a list of the new Examples""" if not self.token_annotation.words: return [self] - s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] s_brackets = [] @@ -140,21 +168,25 @@ class Example: split_examples = [] for i in range(len(t.words)): if i > 0 and t.sent_starts[i] == 1: - s_example.set_token_annotation( - ids=s_ids, - words=s_words, - tags=s_tags, - pos=s_pos, - morphs=s_morphs, - lemmas=s_lemmas, - heads=s_heads, - deps=s_deps, - entities=s_ents, - sent_starts=s_sent_starts, - brackets=s_brackets, + split_examples.append( + Example( + doc=Doc(self.doc.vocab, words=s_words), + token_annotation=TokenAnnotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + 
entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ), + doc_annotation=self.doc_annotation + ) ) - split_examples.append(s_example) - s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] s_sent_starts, s_brackets = [], [] @@ -172,20 +204,25 @@ class Example: for b_end, b_label in t.brackets_by_start.get(i, []): s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label)) i += 1 - s_example.set_token_annotation( - ids=s_ids, - words=s_words, - tags=s_tags, - pos=s_pos, - morphs=s_morphs, - lemmas=s_lemmas, - heads=s_heads, - deps=s_deps, - entities=s_ents, - sent_starts=s_sent_starts, - brackets=s_brackets, + split_examples.append( + Example( + doc=Doc(self.doc.vocab, words=s_words), + token_annotation=TokenAnnotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ), + doc_annotation=self.doc_annotation + ) ) - split_examples.append(s_example) return split_examples @classmethod diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 424e44f72..8aa5f4017 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -76,12 +76,12 @@ def read_json_file(loc, docs_filter=None, limit=None): yield json_data -def json_to_examples(doc): +def json_to_annotations(doc): """Convert an item in the JSON-formatted training data to the format used by GoldParse. doc (dict): One entry in the training data. - YIELDS (Example): The reformatted data - one training example per paragraph + YIELDS (tuple): The reformatted data - one training example per paragraph """ for paragraph in doc["paragraphs"]: example = {"text": paragraph.get("raw", None)} diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 1edb2e65c..a91176f44 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -108,7 +108,7 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30): proj_token_dict = example.token_annotation.to_dict() proj_token_dict["heads"] = proj_heads proj_token_dict["deps"] = deco_deps - new_example.set_token_annotation(**proj_token_dict) + new_example.token_annotation = TokenAnnotation(**proj_token_dict) preprocessed.append(new_example) if label_freq_cutoff > 0: return _filter_labels(preprocessed, label_freq_cutoff, freqs) @@ -216,6 +216,6 @@ def _filter_labels(examples, cutoff, freqs): filtered_labels.append(label) filtered_token_dict = example.token_annotation.to_dict() filtered_token_dict["deps"] = filtered_labels - new_example.set_token_annotation(**filtered_token_dict) + new_example.token_annotation = TokenAnnotation(**filtered_token_dict) filtered.append(new_example) return filtered diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 5a76697bc..ed1f33351 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -3,7 +3,7 @@ import gc import numpy import copy -from spacy.gold import Example +from spacy.gold import Example, TokenAnnotation from spacy.lang.en import English from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.lex_attrs import is_stop @@ -271,9 +271,16 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): ner = EntityRecognizer(Vocab(), default_ner()) - example = 
Example(doc=None) - example.set_token_annotation( - ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] + example = Example( + doc=Doc(ner.vocab, words=["word"]), + token_annotation=TokenAnnotation( + ids=[0], + words=["word"], + tags=["tag"], + heads=[0], + deps=["dep"], + entities=[label] + ) ) ner.moves.get_actions(gold_parses=[example]) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 4b4250179..29ddc7456 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -95,6 +95,12 @@ def merged_dict(): } +@pytest.fixture +def vocab(): + nlp = English() + return nlp.vocab + + def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] spaces = [True, True, True, False, True] @@ -475,8 +481,10 @@ def _train(train_data): def test_split_sents(merged_dict): nlp = English() - example = Example() - example.set_token_annotation(**merged_dict) + example = Example.from_dict( + merged_dict, + doc=Doc(nlp.vocab, words=merged_dict["words"]) + ) assert len(get_parses_from_example( example, merge=False, @@ -506,13 +514,15 @@ def test_split_sents(merged_dict): assert token_annotation_2.sent_starts == [1, 0, 0, 0] -def test_tuples_to_example(merged_dict): - ex = Example() - ex.set_token_annotation(**merged_dict) +def test_tuples_to_example(vocab, merged_dict): cats = {"TRAVEL": 1.0, "BAKING": 0.0} - ex.set_doc_annotation(cats=cats) + merged_dict = dict(merged_dict) + merged_dict["cats"] = cats + ex = Example.from_dict( + merged_dict, + doc=Doc(vocab, words=merged_dict["words"]) + ) ex_dict = ex.to_dict() - assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] assert ex_dict["token_annotation"]["words"] == merged_dict["words"] assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index d750a8202..5eaf8d5b3 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,12 +1,14 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.gold import Example, GoldParse +from spacy.gold import Example, GoldParse, TokenAnnotation +from spacy.gold.iob_utils import biluo_tags_from_offsets from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc from spacy.lang.en import English + test_las_apple = [ [ "Apple is looking at buying U.K. 
startup for $ 1 billion", @@ -134,8 +136,11 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) - ex = Example(doc=doc) - ex.set_token_annotation(entities=annot["entities"]) + entities = biluo_tags_from_offsets(doc, annot["entities"]) + ex = Example( + doc=doc, + token_annotation=TokenAnnotation(entities=entities) + ) scorer.score(ex) results = scorer.scores @@ -155,8 +160,11 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) - ex = Example(doc=doc) - ex.set_token_annotation(entities=annot["entities"]) + entities = biluo_tags_from_offsets(doc, annot["entities"]) + ex = Example( + doc=doc, + token_annotation=TokenAnnotation(entities=entities) + ) scorer.score(ex) results = scorer.scores diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3aa27e451..81cef4492 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -799,6 +799,8 @@ cdef class Doc: cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) + if length != len(self): + raise ValueError("Cannot set array values longer than the document.") # Get set up for fast loading cdef Pool mem = Pool() cdef int n_attrs = len(attrs) From 549164c31cf273339487e97aae4f6d4e84ee7779 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 12:33:14 +0200 Subject: [PATCH 22/56] Fix corpus when no raw text supplied --- spacy/gold/corpus.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index df13ab505..e8bb91359 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -201,7 +201,16 @@ class GoldCorpus(object): ): """ Setting gold_preproc will result in creating a doc per sentence """ for eg_dict in annotations: - example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"])) + if eg_dict["text"]: + example = Example.from_dict( + eg_dict, + doc=nlp.make_doc(eg_dict["text"]) + ) + else: + example = Example.from_dict( + eg_dict, + doc=Doc(nlp.vocab, words=eg_dict["words"]) + ) example_docs = [] if gold_preproc: split_examples = example.split_sents() From 20a1bdb29813f509f2de9b55d30cb775e2225732 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 12:33:29 +0200 Subject: [PATCH 23/56] Fix train --- spacy/cli/train_from_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index c4db5f6ba..4fea39064 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -186,7 +186,7 @@ def train( msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) msg.info("Initializing the nlp pipeline") - nlp.begin_training(lambda: corpus.train_examples) + nlp.begin_training(lambda: corpus.train_dataset(nlp)) train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) From cb08ce39362a30f5d589de5f0d219c75ca269a9e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 12:40:41 +0200 Subject: [PATCH 24/56] Move alignment into Cython --- spacy/gold/{align.py => align.pyx} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/gold/{align.py => align.pyx} (100%) diff --git a/spacy/gold/align.py b/spacy/gold/align.pyx similarity index 100% rename from spacy/gold/align.py rename to spacy/gold/align.pyx From 449000c23458788eaaab5390c61452d5062d88d5 
Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 12:43:53 +0200 Subject: [PATCH 25/56] Fix gold_io --- spacy/gold/gold_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 8aa5f4017..83208ad85 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -72,7 +72,7 @@ def read_json_file(loc, docs_filter=None, limit=None): for doc in json_iterate(loc): if docs_filter is not None and not docs_filter(doc): continue - for json_data in json_to_examples(doc): + for json_data in json_to_annotations(doc): yield json_data From 453cfa14d0200e13cf1246406fa7ae8ba58f3987 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:28:42 +0200 Subject: [PATCH 26/56] Start drafting new example class --- spacy/gold/new_example.pxd | 8 + spacy/gold/new_example.pyx | 304 +++++++++++++++++++++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 spacy/gold/new_example.pxd create mode 100644 spacy/gold/new_example.pyx diff --git a/spacy/gold/new_example.pxd b/spacy/gold/new_example.pxd new file mode 100644 index 000000000..9e513b033 --- /dev/null +++ b/spacy/gold/new_example.pxd @@ -0,0 +1,8 @@ +from ..tokens.doc cimport Doc +from .align cimport Alignment + + +cdef class NewExample: + cdef readonly Doc x + cdef readonly Doc y + cdef readonly Alignment _alignment diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx new file mode 100644 index 000000000..7f081ffbd --- /dev/null +++ b/spacy/gold/new_example.pyx @@ -0,0 +1,304 @@ +import numpy +from ..tokens.doc cimport Doc +from ..attrs import IDS +from .align cimport Alignment +from .annotation import TokenAnnotation, DocAnnotation +from .iob_utils import biluo_to_iob, biluo_tags_from_offsets +from .align import Alignment +from ..errors import Errors, AlignmentError + + +cpdef Doc annotations2doc(Doc predicted, doc_annot, tok_annot): + # TODO: Improve and test this + words = tok_annot.get("ORTH", [tok.text for tok in predicted]) + attrs, array = _annot2array(predicted.vocab.strings, tok_annot, doc_annot) + output = Doc(predicted.vocab, words=words) + if array.size: + output = output.from_array(attrs, array) + output.cats.update(doc_annot.get("cats", {})) + return output + + +cdef class NewExample: + def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None): + """ Doc can either be text, or an actual Doc """ + msg = "Example.__init__ got None for '{arg}'. Requires Doc." + if predicted is None: + raise TypeError(msg.format(arg="predicted")) + if reference is None: + raise TypeError(msg.format(arg="reference")) + self.x = predicted + self.y = reference + self._alignment = alignment + + @property + def predicted(self): + return self.x + + @property + def reference(self): + return self.y + + @classmethod + def from_dict(cls, Doc predicted, dict example_dict): + if example_dict is None: + raise ValueError("Example.from_dict expected dict, received None") + if not isinstance(predicted, Doc): + raise TypeError(f"Argument 1 should be Doc. 
Got {type(predicted)}") + example_dict = _fix_legacy_dict_data(predicted, example_dict) + tok_dict, doc_dict = _parse_example_dict_data(example_dict) + return NewExample( + predicted, + annotations2doc(predicted, tok_dict, doc_dict) + ) + + @property + def alignment(self): + if self._alignment is None: + if self.doc is None: + return None + spacy_words = [token.orth_ for token in self.predicted] + gold_words = [token.orth_ for token in self.reference] + if gold_words == []: + gold_words = spacy_words + self._alignment = Alignment(spacy_words, gold_words) + return self._alignment + + def get_aligned(self, field): + raise NotImplementedError + + def to_dict(self): + """ Note that this method does NOT export the doc, only the annotations ! """ + token_dict = self._token_annotation + doc_dict = self._doc_annotation + return {"token_annotation": token_dict, "doc_annotation": doc_dict} + + def text(self): + return self.x.text + + +def _annot2array(strings, tok_annot, doc_annot): + attrs = [] + values = [] + for key, value in tok_annot.items(): + if key not in IDS: + raise ValueError(f"Unknown attr: {key}") + if key == "HEAD": + values.append([h-i for i, h in enumerate(value)]) + else: + values.append([strings.add(v) for v in value]) + attrs.append(key) + # TODO: Calculate token.ent_kb_id from doc_annot["links"]. + # We need to fix this and the doc.ents thing, both should be doc + # annotations. + array = numpy.array(values, dtype="uint64") + return attrs, array + + +def _parse_example_dict_data(example_dict): + return ( + example_dict["token_annotation"], + example_dict["doc_annotation"] + ) + + +def _fix_legacy_dict_data(predicted, example_dict): + token_dict = example_dict.get("token_annotation", {}) + doc_dict = example_dict.get("doc_annotation", {}) + for key, value in example_dict.items(): + if key in ("token_annotation", "doc_annotation"): + pass + elif key in ("cats", "links"): + doc_dict[key] = value + else: + token_dict[key] = value + # Remap keys + remapping = { + "words": "ORTH", + "tags": "TAG", + "pos": "POS", + "lemmas": "LEMMA", + "deps": "DEP", + "heads": "HEAD", + "sent_starts": "SENT_START", + "morphs": "MORPH", + } + old_token_dict = token_dict + token_dict = {} + for key, value in old_token_dict.items(): + if key in remapping: + token_dict[remapping[key]] = value + elif key in ("ner", "entities") and value: + # Arguably it would be smarter to put this in the doc annotation? + words = token_dict.get("words", [t.text for t in predicted]) + ent_iobs, ent_types = _parse_ner_tags(predicted, words, value) + token_dict["ENT_IOB"] = ent_iobs + token_dict["ENT_TYPE"] = ent_types + return { + "token_annotation": token_dict, + "doc_annotation": doc_dict + } + + +def _parse_ner_tags(predicted, words, biluo_or_offsets): + if isinstance(biluo_or_offsets[0], (list, tuple)): + # Convert to biluo if necessary + # This is annoying but to convert the offsets we need a Doc + # that has the target tokenization. 
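+        # For example, with words ["I", "like", "London", "and", "Berlin", "."],
+        # the character offsets [(7, 13, "LOC"), (18, 24, "LOC")] convert to
+        # the BILUO tags ["O", "O", "U-LOC", "O", "U-LOC", "O"].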
+ reference = Doc( + predicted.vocab, + words=words + ) + biluo = biluo_tags_from_offsets(predicted, biluo_or_offsets) + else: + biluo = biluo_or_offsets + ent_iobs = [] + ent_types = [] + for iob_tag in biluo_to_iob(biluo): + ent_iobs.append(iob_tag.split("-")[0]) + if iob_tag.startswith("I") or iob_tag.startswith("B"): + ent_types.append(iob_tag.split("-", 1)[1]) + else: + ent_types.append("") + return ent_iobs, ent_types + + +class Example: + def get_aligned(self, field): + """Return an aligned array for a token annotation field.""" + if self.doc is None: + return self.token_annotation.get_field(field) + doc = self.doc + if field == "word": + return [token.orth_ for token in doc] + gold_values = self.token_annotation.get_field(field) + alignment = self.alignment + i2j_multi = alignment.i2j_multi + gold_to_cand = alignment.gold_to_cand + cand_to_gold = alignment.cand_to_gold + + output = [] + for i, gold_i in enumerate(cand_to_gold): + if doc[i].text.isspace(): + output.append(None) + elif gold_i is None: + if i in i2j_multi: + output.append(gold_values[i2j_multi[i]]) + else: + output.append(None) + else: + output.append(gold_values[gold_i]) + return output + + def split_sents(self): + """ Split the token annotations into multiple Examples based on + sent_starts and return a list of the new Examples""" + if not self.token_annotation.words: + return [self] + s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] + s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] + s_brackets = [] + sent_start_i = 0 + t = self.token_annotation + split_examples = [] + for i in range(len(t.words)): + if i > 0 and t.sent_starts[i] == 1: + split_examples.append( + Example( + doc=Doc(self.doc.vocab, words=s_words), + token_annotation=TokenAnnotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ), + doc_annotation=self.doc_annotation + ) + ) + s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] + s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] + s_sent_starts, s_brackets = [], [] + sent_start_i = i + s_ids.append(t.get_id(i)) + s_words.append(t.get_word(i)) + s_tags.append(t.get_tag(i)) + s_pos.append(t.get_pos(i)) + s_morphs.append(t.get_morph(i)) + s_lemmas.append(t.get_lemma(i)) + s_heads.append(t.get_head(i) - sent_start_i) + s_deps.append(t.get_dep(i)) + s_ents.append(t.get_entity(i)) + s_sent_starts.append(t.get_sent_start(i)) + for b_end, b_label in t.brackets_by_start.get(i, []): + s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label)) + i += 1 + split_examples.append( + Example( + doc=Doc(self.doc.vocab, words=s_words), + token_annotation=TokenAnnotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ), + doc_annotation=self.doc_annotation + ) + ) + return split_examples + + @classmethod + def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): + """ + Return a list of Example objects, from a variety of input formats. 
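+        Accepted inputs: Example objects, Doc objects, plain text strings, and
+        (doc_or_text, gold_dict) tuples.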
+ make_doc needs to be provided when the examples contain text strings and keep_raw_text=False + """ + if isinstance(examples, Example): + return [examples] + if isinstance(examples, tuple): + examples = [examples] + converted_examples = [] + for ex in examples: + if isinstance(ex, Example): + converted_examples.append(ex) + # convert string to Doc to Example + elif isinstance(ex, str): + if keep_raw_text: + converted_examples.append(Example(doc=ex)) + else: + doc = make_doc(ex) + converted_examples.append(Example(doc=doc)) + # convert tuples to Example + elif isinstance(ex, tuple) and len(ex) == 2: + doc, gold = ex + # convert string to Doc + if isinstance(doc, str) and not keep_raw_text: + doc = make_doc(doc) + converted_examples.append(Example.from_dict(gold, doc=doc)) + # convert Doc to Example + elif isinstance(ex, Doc): + converted_examples.append(Example(doc=ex)) + else: + converted_examples.append(ex) + return converted_examples + + def _deprecated_get_gold(self, make_projective=False): + from ..syntax.gold_parse import get_parses_from_example + + _, gold = get_parses_from_example(self, make_projective=make_projective)[0] + return gold + + From c833ebe1ad72154bbac3213832a50bae0caa84f6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:29:05 +0200 Subject: [PATCH 27/56] Start tests for new example class --- spacy/tests/test_new_example.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 spacy/tests/test_new_example.py diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py new file mode 100644 index 000000000..c481ae932 --- /dev/null +++ b/spacy/tests/test_new_example.py @@ -0,0 +1,30 @@ +import pytest +from spacy.gold.new_example import NewExample as Example +from spacy.tokens import Doc +from spacy.vocab import Vocab + + +@pytest.fixture +def vocab(): + return Vocab() + + +def test_Example_init_requires_doc_objects(vocab): + with pytest.raises(TypeError): + eg = Example(None, None) + with pytest.raises(TypeError): + eg = Example(Doc(vocab, words=["hi"]), None) + with pytest.raises(TypeError): + eg = Example(None, Doc(vocab, words=["hi"])) + + + +def test_Example_from_dict(vocab): + eg = Example.from_dict( + Doc(vocab, words=["hello", "world"]), + { + "words": ["hello", "world"] + } + ) + assert isinstance(eg.x, Doc) + assert isinstance(eg.y, Doc) From f1189dc205b76817a8e738463e06f7aad18883a4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:08 +0200 Subject: [PATCH 28/56] Draft tests for new Example class --- spacy/tests/test_new_example.py | 50 +++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index c481ae932..fcd02ee91 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -4,12 +4,8 @@ from spacy.tokens import Doc from spacy.vocab import Vocab -@pytest.fixture -def vocab(): - return Vocab() - - -def test_Example_init_requires_doc_objects(vocab): +def test_Example_init_requires_doc_objects(): + vocab = Vocab() with pytest.raises(TypeError): eg = Example(None, None) with pytest.raises(TypeError): @@ -19,12 +15,50 @@ def test_Example_init_requires_doc_objects(vocab): -def test_Example_from_dict(vocab): +def test_Example_from_dict_basic(): eg = Example.from_dict( - Doc(vocab, words=["hello", "world"]), + Doc(Vocab(), words=["hello", "world"]), { "words": ["hello", "world"] } ) assert isinstance(eg.x, Doc) assert isinstance(eg.y, Doc) 
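+
+# A minimal usage sketch (assuming a blank Vocab):
+#
+#     predicted = Doc(Vocab(), words=["hello", "world"])
+#     eg = Example.from_dict(predicted, {"words": ["hello", "world"]})
+#     eg.x is predicted         # the candidate tokenization
+#     [t.text for t in eg.y]    # ["hello", "world"], from the reference Doc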
+ + +@pytest.mark.parametrize("annots", [ + {"words": ["ice", "cream"], "tags": ["NN", "NN"]}, +]) +def test_Example_from_dict_with_tags(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + for i, token in enumerate(eg.reference): + assert token.tag_ == annots["tags"][i] + + +""" +def test_Example_from_dict_with_entities(vocab): + # TODO + pass + +def test_Example_from_dict_with_parse(vocab): + # TODO + pass + +def test_Example_from_dict_with_morphology(vocab): + # TODO + pass + +def test_Example_from_dict_with_sent_start(vocab): + # TODO + pass + +def test_Example_from_dict_with_cats(vocab): + # TODO + pass + +def test_Example_from_dict_with_links(vocab): + # TODO + pass +""" From 36d49a0f13e8a17185a8ee821738e57c55c3848d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:19 +0200 Subject: [PATCH 29/56] Fix NewExample class --- spacy/gold/new_example.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 7f081ffbd..3c42c0bb1 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -8,7 +8,7 @@ from .align import Alignment from ..errors import Errors, AlignmentError -cpdef Doc annotations2doc(Doc predicted, doc_annot, tok_annot): +cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot): # TODO: Improve and test this words = tok_annot.get("ORTH", [tok.text for tok in predicted]) attrs, array = _annot2array(predicted.vocab.strings, tok_annot, doc_annot) @@ -83,16 +83,19 @@ def _annot2array(strings, tok_annot, doc_annot): for key, value in tok_annot.items(): if key not in IDS: raise ValueError(f"Unknown attr: {key}") - if key == "HEAD": + elif key == "ORTH": + pass + elif key == "HEAD": + attrs.append(key) values.append([h-i for i, h in enumerate(value)]) else: + attrs.append(key) values.append([strings.add(v) for v in value]) - attrs.append(key) # TODO: Calculate token.ent_kb_id from doc_annot["links"]. # We need to fix this and the doc.ents thing, both should be doc # annotations. array = numpy.array(values, dtype="uint64") - return attrs, array + return attrs, array.T def _parse_example_dict_data(example_dict): From 793092d2d82cdbabc2393fad1ffb3bb19575d76e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:38 +0200 Subject: [PATCH 30/56] Fix renaming in GoldCorpus --- spacy/gold/corpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index e8bb91359..84de01665 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -7,7 +7,7 @@ import itertools from ..tokens import Doc from .. 
import util from ..errors import Errors, AlignmentError -from .gold_io import read_json_file, json_to_examples +from .gold_io import read_json_file, json_to_annotations from .augment import make_orth_variants, add_noise from .example import Example @@ -88,7 +88,7 @@ class GoldCorpus(object): if first_gold_tuple.get("paragraphs", None): examples = [] for json_doc in gold_tuples: - examples.extend(json_to_examples(json_doc)) + examples.extend(json_to_annotations(json_doc)) elif first_gold_tuple.get("doc_annotation", None): examples = [] for ex_dict in gold_tuples: From b5ef39763930f6ad838a260ac064ad74c3f37818 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:48 +0200 Subject: [PATCH 31/56] Add header for align.pxd --- spacy/gold/align.pxd | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 spacy/gold/align.pxd diff --git a/spacy/gold/align.pxd b/spacy/gold/align.pxd new file mode 100644 index 000000000..ea3615863 --- /dev/null +++ b/spacy/gold/align.pxd @@ -0,0 +1,8 @@ +cdef class Alignment: + cdef public object cost + cdef public object i2j + cdef public object j2i + cdef public object i2j_multi + cdef public object j2i_multi + cdef public object cand_to_gold + cdef public object gold_to_cand From f4caaa8ad9f36a5bb3c9a040859d781eb81c40b5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:57 +0200 Subject: [PATCH 32/56] Update alignment --- spacy/gold/align.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/align.pyx b/spacy/gold/align.pyx index 49e8aaa98..80ba0346a 100644 --- a/spacy/gold/align.pyx +++ b/spacy/gold/align.pyx @@ -2,7 +2,7 @@ import numpy from ..errors import Errors, AlignmentError -class Alignment: +cdef class Alignment: def __init__(self, spacy_words, gold_words): # Do many-to-one alignment for misaligned tokens. 
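        # For instance, if the candidate is ["New", "York"] and the gold is
        # ["NewYork"], both candidate tokens get gold index 0 via i2j_multi.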
# If we over-segment, we'll have one gold word that covers a sequence From 04569c0b3e6606db70a494f5b5706090bb809646 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:44:08 +0200 Subject: [PATCH 33/56] Fix import --- spacy/syntax/nonproj.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index a91176f44..ee3219392 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -7,7 +7,7 @@ from copy import copy from ..tokens.doc cimport Doc, set_children_from_heads -from ..gold import Example +from ..gold import Example, TokenAnnotation from ..errors import Errors From a20ac36bb7331cf963d62197d099479539387716 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:44:17 +0200 Subject: [PATCH 34/56] Compile new modules --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 864a4036a..c92761f2a 100755 --- a/setup.py +++ b/setup.py @@ -23,6 +23,8 @@ Options.docstrings = True PACKAGES = find_packages() MOD_NAMES = [ + "spacy.gold.align", + "spacy.gold.new_example", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", From ccd332a9fc6290ee0c49dcbfbd6c62349cab1a1f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:49:04 +0200 Subject: [PATCH 35/56] Update test stubs --- spacy/tests/test_new_example.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index fcd02ee91..473666eca 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -38,27 +38,27 @@ def test_Example_from_dict_with_tags(annots): """ -def test_Example_from_dict_with_entities(vocab): +def test_Example_from_dict_with_entities(annots): # TODO pass -def test_Example_from_dict_with_parse(vocab): +def test_Example_from_dict_with_parse(annots): # TODO pass -def test_Example_from_dict_with_morphology(vocab): +def test_Example_from_dict_with_morphology(annots): # TODO pass -def test_Example_from_dict_with_sent_start(vocab): +def test_Example_from_dict_with_sent_start(annots): # TODO pass -def test_Example_from_dict_with_cats(vocab): +def test_Example_from_dict_with_cats(annots): # TODO pass -def test_Example_from_dict_with_links(vocab): +def test_Example_from_dict_with_links(annots): # TODO pass """ From b3868cd1f8d8c1a71d81fbbf16ab8ffaaa3e21d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:06:48 +0200 Subject: [PATCH 36/56] Update NewExample --- spacy/gold/new_example.pyx | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 3c42c0bb1..136eca130 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -31,13 +31,19 @@ cdef class NewExample: self.y = reference self._alignment = alignment - @property - def predicted(self): - return self.x + property predicted: + def __get__(self): + return self.x + + def __set__(self, doc): + self.x = doc - @property - def reference(self): - return self.y + property reference: + def __get__(self): + return self.y + + def __set__(self, doc): + self.y = doc @classmethod def from_dict(cls, Doc predicted, dict example_dict): From 0714f1fa5c84da386e4dc771e83d9d639fb9d301 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:30:06 +0200 Subject: [PATCH 37/56] Remove the 'pass example into __call__' thing --- spacy/pipeline/pipes.pyx | 237 
+++++++++++++++------------------------ 1 file changed, 88 insertions(+), 149 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2c40738f6..c6233be90 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj -from ..gold import Example +from ..gold.new_example import NewExample as Example from ..attrs import POS, ID from ..util import link_vectors_to_models, create_default_optimizer from ..parts_of_speech import X @@ -48,12 +48,6 @@ class Pipe(object): def from_nlp(cls, nlp, model, **cfg): return cls(nlp.vocab, model, **cfg) - def _get_doc(self, example): - """ Use this method if the `example` can be both a Doc or an Example """ - if isinstance(example, Doc): - return example - return example.doc - def __init__(self, vocab, model, **cfg): """Create a new pipe instance.""" raise NotImplementedError @@ -73,18 +67,17 @@ class Pipe(object): else: self.set_annotations([doc], predictions) if isinstance(example, Example): - example.doc = doc + example.predicted = doc return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): """Apply the pipe to a stream of documents. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions @@ -94,7 +87,7 @@ class Pipe(object): if as_example: for ex, doc in zip(examples, docs): - ex.doc = doc + ex.predicted = doc yield ex else: yield from docs @@ -116,7 +109,6 @@ class Pipe(object): Delegates to predict() and get_loss(). """ if set_annotations: - docs = (self._get_doc(ex) for ex in examples) docs = list(self.pipe(docs)) def rehearse(self, examples, sgd=None, losses=None, **config): @@ -256,28 +248,18 @@ class Tagger(Pipe): return tuple(self.vocab.morphology.tag_names) def __call__(self, example): - doc = self._get_doc(example) tags = self.predict([doc]) self.set_annotations([doc], tags) if isinstance(example, Example): - example.doc = doc + example.predicted = doc return example return doc def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): tag_ids = self.predict(docs) - assert len(docs) == len(examples) - assert len(tag_ids) == len(examples) self.set_annotations(docs, tag_ids) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): if not any(len(doc) for doc in docs): @@ -327,15 +309,17 @@ class Tagger(Pipe): doc.is_tagged = True def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): - examples = Example.to_example_objects(examples) + for eg in examples: + assert isinstance(eg, Example) if losses is not None and self.name not in losses: losses[self.name] = 0. - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): # Handle cases where there are no tokens in any docs. 
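            # (Nothing to learn from, so bail out before calling begin_update.)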
return set_dropout_rate(self.model, drop) - tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) + tag_scores, bp_tag_scores = self.model.begin_update( + [eg.predicted for eg in examples]) for sc in tag_scores: if self.model.ops.xp.isnan(sc.sum()): raise ValueError("nan value in scores") @@ -347,17 +331,16 @@ class Tagger(Pipe): if losses is not None: losses[self.name] += loss if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, self._scores2guesses(tag_scores)) def rehearse(self, examples, drop=0., sgd=None, losses=None): """Perform a 'rehearsal' update, where we try to match the output of an initial model. """ + docs = [eg.predicted for eg in examples] if self._rehearsal_model is None: return - examples = Example.to_example_objects(examples) - docs = [ex.doc for ex in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return @@ -387,7 +370,8 @@ class Tagger(Pipe): orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = {} for example in get_examples(): - for tag in example.token_annotation.tags: + for token in example.y: + tag = token.tag_ if tag in orig_tag_map: new_tag_map[tag] = orig_tag_map[tag] else: @@ -575,7 +559,7 @@ class SentenceRecognizer(Tagger): d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() - docs = [eg.doc for eg in examples] + docs = [eg.predicted for eg in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores @@ -687,8 +671,8 @@ class MultitaskObjective(Tagger): gold_examples = nonproj.preprocess_training_data(get_examples()) # for raw_text, doc_annot in gold_tuples: for example in gold_examples: - for i in range(len(example.token_annotation.ids)): - label = self.make_label(i, example.token_annotation) + for token in example.y: + label = self.make_label(token) if label is not None and label not in self.labels: self.labels[label] = len(self.labels) self.model.initialize() @@ -706,11 +690,11 @@ class MultitaskObjective(Tagger): cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] for i, eg in enumerate(examples): # Handles alignment for tokenization differences doc_annots = eg.get_aligned() - for j in range(len(eg.doc)): + for j in range(len(eg.predicted)): tok_annots = {key: values[j] for key, values in tok_annots.items()} label = self.make_label(j, tok_annots) if label is None or label not in self.labels: @@ -724,83 +708,49 @@ class MultitaskObjective(Tagger): return float(loss), d_scores @staticmethod - def make_dep(i, token_annotation): - if token_annotation.deps[i] is None or token_annotation.heads[i] is None: - return None - return token_annotation.deps[i] + def make_dep(token): + return token.dep_ @staticmethod - def make_tag(i, token_annotation): - return token_annotation.tags[i] + def make_tag(token): + return token.tag_ @staticmethod - def make_ent(i, token_annotation): - if token_annotation.entities is None: - return None - return token_annotation.entities[i] + def make_ent(token): + if token.ent_iob_ == "O": + return "O" + else: + return token.ent_iob_ + "-" + token.ent_type_ @staticmethod - def make_dep_tag_offset(i, token_annotation): - if token_annotation.deps[i] is None or token_annotation.heads[i] is None: - return 
None - offset = token_annotation.heads[i] - i + def make_dep_tag_offset(token): + dep = token.dep_ + tag = token.tag_ + offset = token.head.i - token.i offset = min(offset, 2) offset = max(offset, -2) - return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}" + return f"{dep}-{tag}:{offset}" @staticmethod - def make_ent_tag(i, token_annotation): - if token_annotation.entities is None or token_annotation.entities[i] is None: - return None + def make_ent_tag(token): + if token.ent_iob_ == "O": + ent = "O" else: - return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}" + ent = token.ent_iob_ + "-" + token.ent_type_ + tag = token.tag_ + return f"{tag}-{ent}" @staticmethod - def make_sent_start(target, token_annotation, cache=True, _cache={}): + def make_sent_start(token): """A multi-task objective for representing sentence boundaries, using BILU scheme. (O is impossible) - - The implementation of this method uses an internal cache that relies - on the identity of the heads array, to avoid requiring a new piece - of gold data. You can pass cache=False if you know the cache will - do the wrong thing. """ - words = token_annotation.words - heads = token_annotation.heads - assert len(words) == len(heads) - assert target < len(words), (target, len(words)) - if cache: - if id(heads) in _cache: - return _cache[id(heads)][target] - else: - for key in list(_cache.keys()): - _cache.pop(key) - sent_tags = ["I-SENT"] * len(words) - _cache[id(heads)] = sent_tags + if token.is_sent_start and token.is_sent_end: + return "U-SENT" + elif token.is_sent_start: + return "B-SENT" else: - sent_tags = ["I-SENT"] * len(words) - - def _find_root(child): - seen = set([child]) - while child is not None and heads[child] != child: - seen.add(child) - child = heads[child] - return child - - sentences = {} - for i in range(len(words)): - root = _find_root(i) - if root is None: - sent_tags[i] = None - else: - sentences.setdefault(root, []).append(i) - for root, span in sorted(sentences.items()): - if len(span) == 1: - sent_tags[span[0]] = "U-SENT" - else: - sent_tags[span[0]] = "B-SENT" - sent_tags[span[-1]] = "L-SENT" - return sent_tags[target] + return "I-SENT" class ClozeMultitask(Pipe): @@ -833,7 +783,7 @@ class ClozeMultitask(Pipe): # token.vector values, but that's a bit inefficient, especially on GPU. # Instead we fetch the index into the vectors table for each of our tokens, # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) + ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples]) target = vectors[ids] gradient = self.distance.get_grad(prediction, target) loss = self.distance.get_loss(prediction, target) @@ -843,11 +793,12 @@ class ClozeMultitask(Pipe): pass def rehearse(self, examples, drop=0., sgd=None, losses=None): - examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. 
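+        # Run the model over the candidate docs; get_loss() then measures the
+        # distance between the predictions and rows of the static vectors table.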
+ docs = [eg.predicted for eg in examples] set_dropout_rate(self.model, drop) - predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples]) + predictions, bp_predictions = self.model.begin_update( + [eg.predicted for eg in examples]) loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: @@ -883,17 +834,10 @@ class TextCategorizer(Pipe): self.cfg["labels"] = tuple(value) def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): tensors = [doc.tensor for doc in docs] @@ -914,12 +858,15 @@ class TextCategorizer(Pipe): doc.cats[label] = float(scores[i, j]) def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): - examples = Example.to_example_objects(examples) - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): + for eg in examples: + assert isinstance(eg, Example) + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): # Handle cases where there are no tokens in any docs. return set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update([ex.doc for ex in examples]) + scores, bp_scores = self.model.begin_update( + [eg.predicted for eg in examples] + ) loss, d_scores = self.get_loss(examples, scores) bp_scores(d_scores) if sgd is not None: @@ -928,14 +875,15 @@ class TextCategorizer(Pipe): losses.setdefault(self.name, 0.0) losses[self.name] += loss if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, scores=scores) def rehearse(self, examples, drop=0., sgd=None, losses=None): if self._rehearsal_model is None: return - examples = Example.to_example_objects(examples) - docs=[ex.doc for ex in examples] + for eg in examples: + assert isinstance(eg, Example) + docs = [eg.predicted for eg in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return @@ -955,8 +903,8 @@ class TextCategorizer(Pipe): not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f") for i, eg in enumerate(examples): for j, label in enumerate(self.labels): - if label in eg.doc_annotation.cats: - truths[i, j] = eg.doc_annotation.cats[label] + if label in eg.predicted.cats: + truths[i, j] = eg.reference.cats[label] else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) @@ -993,7 +941,7 @@ class TextCategorizer(Pipe): # TODO: begin_training is not guaranteed to see all data / labels ? 
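        # The label set ends up only as complete as whatever get_examples()
        # yields here; a category first seen at update() time would be missed.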
examples = list(get_examples()) for example in examples: - for cat in example.doc_annotation.cats: + for cat in example.y.cats: self.add_label(cat) self.require_labels() docs = [Doc(Vocab(), words=["hello"])] @@ -1152,21 +1100,22 @@ class EntityLinker(Pipe): losses.setdefault(self.name, 0.0) if not examples: return 0 - examples = Example.to_example_objects(examples) + for eg in examples: + assert isinstance(eg, Example) sentence_docs = [] - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] if set_annotations: # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( predictions = self.model.predict(docs) for eg in examples: - doc = eg.doc + doc = eg.predicted ents_by_offset = dict() for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent - - for entity, kb_dict in eg.doc_annotation.links.items(): + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): if isinstance(entity, str): entity = literal_eval(entity) start, end = entity @@ -1204,7 +1153,8 @@ class EntityLinker(Pipe): def get_similarity_loss(self, examples, scores): entity_encodings = [] for eg in examples: - for entity, kb_dict in eg.doc_annotation.links.items(): + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): for kb_id, value in kb_dict.items(): # this loss function assumes we're only using positive examples if value: @@ -1223,8 +1173,9 @@ class EntityLinker(Pipe): def get_loss(self, examples, scores): cats = [] - for ex in examples: - for entity, kb_dict in ex.doc_annotation.links.items(): + for eg in examples: + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): for kb_id, value in kb_dict.items(): cats.append([value]) @@ -1237,27 +1188,22 @@ class EntityLinker(Pipe): loss = loss / len(cats) return loss, d_scores - def __call__(self, example): - doc = self._get_doc(example) + def _get_links_from_doc(self, doc): + return {} + + def __call__(self, doc): kb_ids, tensors = self.predict([doc]) self.set_annotations([doc], kb_ids, tensors=tensors) if isinstance(example, Example): - example.doc = doc + example.x = doc return example return doc def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ @@ -1433,7 +1379,7 @@ class Sentencizer(Pipe): ): pass - def __call__(self, example): + def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. example (Doc or Example): The document to process. 
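        EXAMPLE (a sketch, assuming an `nlp` object is available):
            >>> sentencizer = Sentencizer()
            >>> doc = sentencizer(nlp.make_doc("Hello world. No problem."))
            >>> len(list(doc.sents))
            2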
@@ -1441,7 +1387,6 @@ class Sentencizer(Pipe): DOCS: https://spacy.io/api/sentencizer#call """ - doc = self._get_doc(example) start = 0 seen_period = False for i, token in enumerate(doc): @@ -1460,21 +1405,15 @@ class Sentencizer(Pipe): return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without From af1b5f129b8653678291dd5f8a226cc8cfe78893 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:31:19 +0200 Subject: [PATCH 38/56] Use new example class in GoldCorpus --- spacy/gold/corpus.py | 55 ++++++++------------------------------------ 1 file changed, 10 insertions(+), 45 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 84de01665..8dc044639 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -9,7 +9,7 @@ from .. import util from ..errors import Errors, AlignmentError from .gold_io import read_json_file, json_to_annotations from .augment import make_orth_variants, add_noise -from .example import Example +from .new_example import NewExample as Example class GoldCorpus(object): @@ -203,59 +203,24 @@ class GoldCorpus(object): for eg_dict in annotations: if eg_dict["text"]: example = Example.from_dict( - eg_dict, - doc=nlp.make_doc(eg_dict["text"]) + nlp.make_doc(eg_dict["text"]), + eg_dict ) else: example = Example.from_dict( - eg_dict, - doc=Doc(nlp.vocab, words=eg_dict["words"]) + Doc(nlp.vocab, words=eg_dict["words"]), + eg_dict ) - example_docs = [] if gold_preproc: - split_examples = example.split_sents() - for split_example in split_examples: - split_example_docs = cls._make_docs( - nlp, - split_example, - gold_preproc, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - ) - example_docs.extend(split_example_docs) + # TODO: Data augmentation + examples = example.split_sents() else: - example_docs = cls._make_docs( - nlp, - example, - gold_preproc, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - ) - for ex in example_docs: - if (not max_length) or len(ex.doc) < max_length: + examples = [example] + for ex in examples: + if (not max_length) or len(ex.predicted) < max_length: if ignore_misaligned: try: _ = ex._deprecated_get_gold() except AlignmentError: continue yield ex - - @classmethod - def _make_docs( - cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0 - ): - var_example = make_orth_variants( - nlp, example, orth_variant_level=orth_variant_level - ) - # gold_preproc is not used ?! 
- if example.text is not None: - var_text = add_noise(var_example.text, noise_level) - var_doc = nlp.make_doc(var_text) - var_example.doc = var_doc - else: - var_doc = Doc( - nlp.vocab, - words=add_noise(var_example.token_annotation.words, noise_level), - ) - var_example.doc = var_doc - return [var_example] From 82810b98466376daf37602cde75d0ec2b0352577 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:32:07 +0200 Subject: [PATCH 39/56] Update morphologizer --- spacy/pipeline/morphologizer.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 7116d7afd..c5d140a4e 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -51,9 +51,9 @@ class Morphologizer(Tagger): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): for example in get_examples(): - for i, morph in enumerate(example.token_annotation.morphs): - pos = example.token_annotation.get_pos(i) - morph = Morphology.feats_to_dict(morph) + for i, token in enumerate(example.reference): + pos = token.pos_ + morph = token.morph norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)] if pos: morph["POS"] = pos From ad547a4b8fc7d957dc70f6454b1c672a6941b49b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:39:46 +0200 Subject: [PATCH 40/56] Refactor towards new Example class --- spacy/pipeline/pipes.pyx | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index c6233be90..58a76a9a1 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -52,23 +52,19 @@ class Pipe(object): """Create a new pipe instance.""" raise NotImplementedError - def __call__(self, example): + def __call__(self, Doc doc): """Apply the pipe to one document. The document is modified in-place, and returned. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - doc = self._get_doc(example) predictions = self.predict([doc]) if isinstance(predictions, tuple) and len(predictions) == 2: scores, tensors = predictions self.set_annotations([doc], scores, tensors=tensors) else: self.set_annotations([doc], predictions) - if isinstance(example, Example): - example.predicted = doc - return example return doc def pipe(self, stream, batch_size=128, n_threads=-1): @@ -77,20 +73,14 @@ class Pipe(object): Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - for examples in util.minibatch(stream, size=batch_size): + for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - - if as_example: - for ex, doc in zip(examples, docs): - ex.predicted = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without @@ -102,7 +92,7 @@ class Pipe(object): """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError - def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): + def update(self, docs, set_annotations=False, drop=0.0, sgd=None, losses=None): """Learn from a batch of documents and gold-standard information, updating the pipe's model. 
@@ -247,15 +237,12 @@ class Tagger(Pipe): def labels(self): return tuple(self.vocab.morphology.tag_names) - def __call__(self, example): + def __call__(self, doc): tags = self.predict([doc]) self.set_annotations([doc], tags) - if isinstance(example, Example): - example.predicted = doc - return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): tag_ids = self.predict(docs) self.set_annotations(docs, tag_ids) @@ -833,7 +820,7 @@ class TextCategorizer(Pipe): def labels(self, value): self.cfg["labels"] = tuple(value) - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) @@ -1194,12 +1181,9 @@ class EntityLinker(Pipe): def __call__(self, doc): kb_ids, tensors = self.predict([doc]) self.set_annotations([doc], kb_ids, tensors=tensors) - if isinstance(example, Example): - example.x = doc - return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) @@ -1400,9 +1384,6 @@ class Sentencizer(Pipe): seen_period = True if start < len(doc): doc[start].is_sent_start = True - if isinstance(example, Example): - example.doc = doc - return example return doc def pipe(self, stream, batch_size=128, n_threads=-1): From 337d2b5ad65508ce0897d7fea49fa39e33a8d327 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:58:16 +0200 Subject: [PATCH 41/56] Fix sent start in NewExample --- spacy/gold/new_example.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 136eca130..4247f21b5 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -94,13 +94,16 @@ def _annot2array(strings, tok_annot, doc_annot): elif key == "HEAD": attrs.append(key) values.append([h-i for i, h in enumerate(value)]) + elif key == "SENT_START": + attrs.append(key) + values.append(value) else: attrs.append(key) values.append([strings.add(v) for v in value]) # TODO: Calculate token.ent_kb_id from doc_annot["links"]. # We need to fix this and the doc.ents thing, both should be doc # annotations. 
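    # Each row of `values` holds one attribute column; the transpose below
    # yields the (n_tokens, n_attrs) layout that Doc.from_array expects.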
- array = numpy.array(values, dtype="uint64") + array = numpy.asarray(values, dtype="uint64") return attrs, array.T From 488727aee0ef3bee60113264f9348d9c1ad5e422 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:58:28 +0200 Subject: [PATCH 42/56] Start updating test --- spacy/tests/test_gold.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 29ddc7456..3c13259ba 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,7 +1,8 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align -from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation +from spacy.gold import GoldCorpus, docs_to_json, DocAnnotation +from spacy.gold.new_example import NewExample as Example from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree from spacy.syntax.gold_parse import GoldParse, get_parses_from_example @@ -91,7 +92,7 @@ def merged_dict(): "ids": [1, 2, 3, 4, 5, 6, 7], "words": ["Hi", "there", "everyone", "It", "is", "just", "me"], "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"], - "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0], + "sent_starts": [1, 0, 0, 1, 0, 0, 0], } @@ -482,8 +483,8 @@ def _train(train_data): def test_split_sents(merged_dict): nlp = English() example = Example.from_dict( - merged_dict, - doc=Doc(nlp.vocab, words=merged_dict["words"]) + Doc(nlp.vocab, words=merged_dict["words"]), + merged_dict ) assert len(get_parses_from_example( example, @@ -514,24 +515,20 @@ def test_split_sents(merged_dict): assert token_annotation_2.sent_starts == [1, 0, 0, 0] +# This fails on some None value? Need to look into that. 
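+# Token.is_sent_start yields booleans (or None), so the integer sent_starts
+# from the dict are compared via bool() below.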
+@pytest.mark.xfail # TODO def test_tuples_to_example(vocab, merged_dict): cats = {"TRAVEL": 1.0, "BAKING": 0.0} merged_dict = dict(merged_dict) merged_dict["cats"] = cats ex = Example.from_dict( - merged_dict, - doc=Doc(vocab, words=merged_dict["words"]) + Doc(vocab, words=merged_dict["words"]), + merged_dict ) - ex_dict = ex.to_dict() - assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] - assert ex_dict["token_annotation"]["words"] == merged_dict["words"] - assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] - assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"] - assert ex_dict["doc_annotation"]["cats"] == cats - - -def test_empty_example_goldparse(): - nlp = English() - doc = nlp("") - example = Example(doc=doc) - assert len(get_parses_from_example(example)) == 1 + words = [token.text for token in ex.reference] + assert words == merged_dict["words"] + tags = [token.tag_ for token in ex.reference] + assert tags == merged_dict["tags"] + sent_starts = [token.is_sent_start for token in ex.reference] + assert sent_starts == [bool(v) for v in merged_dict["sent_starts"]] + example.reference.cats == cats From 6a67a1168235d75eaf7db95f5ee5cec482451990 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jun 2020 17:43:40 +0200 Subject: [PATCH 43/56] adding tests for new example class (some still failing - WIP) --- spacy/gold/new_example.pyx | 2 + spacy/tests/test_gold.py | 2 +- spacy/tests/test_new_example.py | 137 +++++++++++++++++++++++++++----- 3 files changed, 118 insertions(+), 23 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 4247f21b5..fa50e4369 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -146,6 +146,8 @@ def _fix_legacy_dict_data(predicted, example_dict): ent_iobs, ent_types = _parse_ner_tags(predicted, words, value) token_dict["ENT_IOB"] = ent_iobs token_dict["ENT_TYPE"] = ent_types + else: + raise ValueError(f"Unknown attr: {key}") return { "token_annotation": token_dict, "doc_annotation": doc_dict diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 3c13259ba..f60f52e6e 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -531,4 +531,4 @@ def test_tuples_to_example(vocab, merged_dict): assert tags == merged_dict["tags"] sent_starts = [token.is_sent_start for token in ex.reference] assert sent_starts == [bool(v) for v in merged_dict["sent_starts"]] - example.reference.cats == cats + ex.reference.cats == cats diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 473666eca..a8651dfee 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -14,21 +14,25 @@ def test_Example_init_requires_doc_objects(): eg = Example(None, Doc(vocab, words=["hi"])) - def test_Example_from_dict_basic(): eg = Example.from_dict( - Doc(Vocab(), words=["hello", "world"]), - { - "words": ["hello", "world"] - } + Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]} ) assert isinstance(eg.x, Doc) assert isinstance(eg.y, Doc) -@pytest.mark.parametrize("annots", [ - {"words": ["ice", "cream"], "tags": ["NN", "NN"]}, -]) +@pytest.mark.parametrize( + "annots", [{"words": ["ice", "cream"], "weirdannots": ["something", "such"]}] +) +def test_Example_from_dict_invalid(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + eg = Example.from_dict(predicted, annots) + + +@pytest.mark.parametrize("annots", [{"words": ["ice", 
"cream"], "tags": ["NN", "NN"]}]) def test_Example_from_dict_with_tags(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) @@ -37,28 +41,117 @@ def test_Example_from_dict_with_tags(annots): assert token.tag_ == annots["tags"][i] -""" +@pytest.mark.xfail(reason="TODO - fix") +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "London", "and", "Berlin", "."], + "entities": [(7, 13, "LOC"), (18, 24, "LOC")], + } + ], +) def test_Example_from_dict_with_entities(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert len(list(eg.reference.ents)) == 2 + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "London", "and", "Berlin", "."], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + "heads": [1, 1, 1, 2, 2, 1], + } + ], +) def test_Example_from_dict_with_parse(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + for i, token in enumerate(eg.reference): + assert token.dep_ == annots["deps"][i] + assert token.head.i == annots["heads"][i] + +@pytest.mark.xfail(reason="TODO - fix") +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["Sarah", "'s", "sister", "flew"], + "morphs": [ + "NounType=prop|Number=sing", + "Poss=yes", + "Number=sing", + "Tense=past|VerbForm=fin", + ], + } + ], +) def test_Example_from_dict_with_morphology(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + for i, token in enumerate(eg.reference): + assert token.morph_ == annots["morphs"][i] + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["This", "is", "one", "sentence", "this", "is", "another"], + "sent_starts": [1, 0, 0, 0, 1, 0, 0], + } + ], +) def test_Example_from_dict_with_sent_start(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert len(list(eg.reference.sents)) == 2 + for i, token in enumerate(eg.reference): + assert bool(token.is_sent_start) == bool(annots["sent_starts"][i]) + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["This", "is", "a", "sentence"], + "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5}, + } + ], +) def test_Example_from_dict_with_cats(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert len(list(eg.reference.cats)) == 3 + assert eg.reference.cats["cat1"] == 1.0 + assert eg.reference.cats["cat2"] == 0.0 + assert eg.reference.cats["cat3"] == 0.5 + +@pytest.mark.xfail(reason="TODO - fix") +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["Russ", "Cochran", "made", "reprints"], + "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, + } + ], +) def test_Example_from_dict_with_links(annots): - # TODO - pass -""" + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert eg.reference[0].ent_kb_id_ == "Q7381115" + assert eg.reference[1].ent_kb_id_ == "Q7381115" + assert eg.reference[2].ent_kb_id_ == "" + assert eg.reference[3].ent_kb_id_ == "" From 3aed177a35ced290cd6eee9773cd73d012202745 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Jun 2020 11:30:24 +0200 Subject: [PATCH 44/56] fix ENT_IOB conversion and enable unit test --- spacy/errors.py | 2 ++ spacy/gold/new_example.pyx | 9 +++++++++ 
spacy/tests/test_new_example.py | 16 +++++++++++++--- spacy/tokens/doc.pyx | 7 +++++++ spacy/tokens/token.pyx | 7 +++++-- 5 files changed, 36 insertions(+), 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 94a0218a7..8efef8333 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -581,6 +581,8 @@ class Errors(object): # TODO: fix numbering after merging develop into master + E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing " + "into {values}, but found {value}.") E986 = ("Could not create any training batches: check your input. " "Perhaps discard_oversize should be set to False ?") E987 = ("The text of an example training instance is either a Doc or " diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index fa50e4369..51007e8c3 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -1,4 +1,6 @@ import numpy + +from ..tokens import Token from ..tokens.doc cimport Doc from ..attrs import IDS from .align cimport Alignment @@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot): elif key == "SENT_START": attrs.append(key) values.append(value) + elif key == "ENT_IOB": + iob_strings = Token.iob_strings() + attrs.append(key) + try: + values.append([iob_strings.index(v) for v in value]) + except ValueError: + raise ValueError(Errors.E985.format(values=iob_strings, value=values)) else: attrs.append(key) values.append([strings.add(v) for v in value]) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index a8651dfee..7a43cd9a6 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) with pytest.raises(ValueError): - eg = Example.from_dict(predicted, annots) + Example.from_dict(predicted, annots) @pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}]) @@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots): "annots", [ { - "words": ["I", "like", "London", "and", "Berlin", "."], - "entities": [(7, 13, "LOC"), (18, 24, "LOC")], + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], } ], ) @@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots): predicted = Doc(vocab, words=annots["words"]) eg = Example.from_dict(predicted, annots) assert len(list(eg.reference.ents)) == 2 + assert eg.reference[0].ent_iob_ == "O" + assert eg.reference[1].ent_iob_ == "O" + assert eg.reference[2].ent_iob_ == "B" + assert eg.reference[3].ent_iob_ == "I" + assert eg.reference[4].ent_iob_ == "O" + assert eg.reference[5].ent_iob_ == "B" + assert eg.reference[6].ent_iob_ == "O" + assert eg.reference[2].ent_type_ == "LOC" + assert eg.reference[3].ent_type_ == "LOC" + assert eg.reference[5].ent_type_ == "LOC" @pytest.mark.parametrize( diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 81cef4492..c4581d0a8 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -825,6 +825,13 @@ cdef class Doc: for i in range(length): if array[i, col] != 0: self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) + # Verify ENT_IOB are proper integers + if ENT_IOB in attrs: + iob_strings = Token.iob_strings() + col = attrs.index(ENT_IOB) + for i in range(length): + if array[i, col] not in range(0, len(iob_strings)): + raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col])) # Now load the data for i in range(length): token = 
&self.c[i] diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 320cfaad5..f85a17d69 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -778,6 +778,10 @@ cdef class Token: """ return self.c.ent_iob + @classmethod + def iob_strings(cls): + return ("", "I", "O", "B") + @property def ent_iob_(self): """IOB code of named entity tag. "B" means the token begins an entity, @@ -787,8 +791,7 @@ cdef class Token: RETURNS (str): IOB code of named entity tag. """ - iob_strings = ("", "I", "O", "B") - return iob_strings[self.c.ent_iob] + return self.iob_strings()[self.c.ent_iob] property ent_id: """RETURNS (uint64): ID of the entity the token is an instance of, From 880dccf93e11be93bdd75c660617a551f589a82c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Jun 2020 15:47:20 +0200 Subject: [PATCH 45/56] entities on doc_annotation, parse links and check their offsets against the entities. unit test works --- spacy/errors.py | 4 ++ spacy/gold/new_example.pyx | 76 +++++++++++++++++++++------- spacy/tests/test_new_example.py | 88 ++++++++++++++++++++------------- 3 files changed, 115 insertions(+), 53 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 8efef8333..e4f6610ee 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -581,6 +581,10 @@ class Errors(object): # TODO: fix numbering after merging develop into master + E983 = ("Each link annotation should refer to a dictionary with at most one " + "identifier mapping to 1.0, and all others to 0.0.") + E984 = ("The offsets of the annotations for 'links' need to refer exactly " + "to the offsets of the 'entities' annotations.") E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing " "into {values}, but found {value}.") E986 = ("Could not create any training batches: check your input. " diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 51007e8c3..d2492a29f 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -85,12 +85,28 @@ cdef class NewExample: return self.x.text -def _annot2array(strings, tok_annot, doc_annot): +def _annot2array(vocab, tok_annot, doc_annot): attrs = [] values = [] + + for key, value in doc_annot.items(): + if key == "entities": + words = tok_annot["ORTH"] + ent_iobs, ent_types = _parse_ner_tags(vocab, words, value) + tok_annot["ENT_IOB"] = ent_iobs + tok_annot["ENT_TYPE"] = ent_types + elif key == "links": + entities = doc_annot.get("entities", {}) + if value and not entities: + raise ValueError(Errors.E984) + ent_kb_ids = _parse_links(vocab, words, value, entities) + tok_annot["ENT_KB_ID"] = ent_kb_ids + else: + raise ValueError(f"Unknown doc attribute: {key}") + for key, value in tok_annot.items(): if key not in IDS: - raise ValueError(f"Unknown attr: {key}") + raise ValueError(f"Unknown token attribute: {key}") elif key == "ORTH": pass elif key == "HEAD": @@ -108,10 +124,8 @@ def _annot2array(strings, tok_annot, doc_annot): raise ValueError(Errors.E985.format(values=iob_strings, value=values)) else: attrs.append(key) - values.append([strings.add(v) for v in value]) - # TODO: Calculate token.ent_kb_id from doc_annot["links"]. - # We need to fix this and the doc.ents thing, both should be doc - # annotations. 
+ values.append([vocab.strings.add(v) for v in value]) + array = numpy.asarray(values, dtype="uint64") return attrs, array.T @@ -129,8 +143,10 @@ def _fix_legacy_dict_data(predicted, example_dict): for key, value in example_dict.items(): if key in ("token_annotation", "doc_annotation"): pass - elif key in ("cats", "links"): + elif key in ("cats", "links") and value: doc_dict[key] = value + elif key in ("ner", "entities") and value: + doc_dict["entities"] = value else: token_dict[key] = value # Remap keys @@ -149,12 +165,6 @@ def _fix_legacy_dict_data(predicted, example_dict): for key, value in old_token_dict.items(): if key in remapping: token_dict[remapping[key]] = value - elif key in ("ner", "entities") and value: - # Arguably it would be smarter to put this in the doc annotation? - words = token_dict.get("words", [t.text for t in predicted]) - ent_iobs, ent_types = _parse_ner_tags(predicted, words, value) - token_dict["ENT_IOB"] = ent_iobs - token_dict["ENT_TYPE"] = ent_types else: raise ValueError(f"Unknown attr: {key}") return { @@ -163,16 +173,13 @@ def _fix_legacy_dict_data(predicted, example_dict): } -def _parse_ner_tags(predicted, words, biluo_or_offsets): +def _parse_ner_tags(vocab, words, biluo_or_offsets): if isinstance(biluo_or_offsets[0], (list, tuple)): # Convert to biluo if necessary # This is annoying but to convert the offsets we need a Doc # that has the target tokenization. - reference = Doc( - predicted.vocab, - words=words - ) - biluo = biluo_tags_from_offsets(predicted, biluo_or_offsets) + reference = Doc(vocab, words=words) + biluo = biluo_tags_from_offsets(reference, biluo_or_offsets) else: biluo = biluo_or_offsets ent_iobs = [] @@ -185,6 +192,37 @@ def _parse_ner_tags(predicted, words, biluo_or_offsets): ent_types.append("") return ent_iobs, ent_types +def _parse_links(vocab, words, links, entities): + reference = Doc(vocab, words=words) + + starts = {token.idx: token.i for token in reference} + ends = {token.idx + len(token): token.i for token in reference} + ent_kb_ids = ["" for _ in reference] + entity_map = [(ent[0], ent[1]) for ent in entities] + + # links annotations need to refer 1-1 to entity annotations - throw error otherwise + for index, annot_dict in links.items(): + start_char, end_char = index + if (start_char, end_char) not in entity_map: + raise ValueError(Errors.E984) + + for index, annot_dict in links.items(): + true_kb_ids = [] + for key, value in annot_dict.items(): + if value == 1.0: + true_kb_ids.append(key) + if len(true_kb_ids) > 1: + raise ValueError(Errors.E983) + + if len(true_kb_ids) == 1: + start_char, end_char = index + start_token = starts.get(start_char) + end_token = ends.get(end_char) + for i in range(start_token, end_token+1): + ent_kb_ids[i] = true_kb_ids[0] + + return ent_kb_ids + class Example: def get_aligned(self, field): diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 7a43cd9a6..4ebafb6bb 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -41,33 +41,6 @@ def test_Example_from_dict_with_tags(annots): assert token.tag_ == annots["tags"][i] -@pytest.mark.xfail(reason="TODO - fix") -@pytest.mark.parametrize( - "annots", - [ - { - "words": ["I", "like", "New", "York", "and", "Berlin", "."], - "entities": [(7, 15, "LOC"), (20, 26, "LOC")], - } - ], -) -def test_Example_from_dict_with_entities(annots): - vocab = Vocab() - predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - assert len(list(eg.reference.ents)) == 2 - 
assert eg.reference[0].ent_iob_ == "O" - assert eg.reference[1].ent_iob_ == "O" - assert eg.reference[2].ent_iob_ == "B" - assert eg.reference[3].ent_iob_ == "I" - assert eg.reference[4].ent_iob_ == "O" - assert eg.reference[5].ent_iob_ == "B" - assert eg.reference[6].ent_iob_ == "O" - assert eg.reference[2].ent_type_ == "LOC" - assert eg.reference[3].ent_type_ == "LOC" - assert eg.reference[5].ent_type_ == "LOC" - - @pytest.mark.parametrize( "annots", [ @@ -147,13 +120,39 @@ def test_Example_from_dict_with_cats(annots): assert eg.reference.cats["cat3"] == 0.5 -@pytest.mark.xfail(reason="TODO - fix") @pytest.mark.parametrize( "annots", [ { - "words": ["Russ", "Cochran", "made", "reprints"], - "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + } + ], +) +def test_Example_from_dict_with_entities(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert len(list(eg.reference.ents)) == 2 + assert eg.reference[0].ent_iob_ == "O" + assert eg.reference[1].ent_iob_ == "O" + assert eg.reference[2].ent_iob_ == "B" + assert eg.reference[3].ent_iob_ == "I" + assert eg.reference[4].ent_iob_ == "O" + assert eg.reference[5].ent_iob_ == "B" + assert eg.reference[6].ent_iob_ == "O" + assert eg.reference[2].ent_type_ == "LOC" + assert eg.reference[3].ent_type_ == "LOC" + assert eg.reference[5].ent_type_ == "LOC" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + "links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}}, } ], ) @@ -161,7 +160,28 @@ def test_Example_from_dict_with_links(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) eg = Example.from_dict(predicted, annots) - assert eg.reference[0].ent_kb_id_ == "Q7381115" - assert eg.reference[1].ent_kb_id_ == "Q7381115" - assert eg.reference[2].ent_kb_id_ == "" - assert eg.reference[3].ent_kb_id_ == "" + assert eg.reference[0].ent_kb_id_ == "" + assert eg.reference[1].ent_kb_id_ == "" + assert eg.reference[2].ent_kb_id_ == "Q60" + assert eg.reference[3].ent_kb_id_ == "Q60" + assert eg.reference[4].ent_kb_id_ == "" + assert eg.reference[5].ent_kb_id_ == "Q64" + assert eg.reference[6].ent_kb_id_ == "" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}}, + } + ], +) +def test_Example_from_dict_with_links_invalid(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + Example.from_dict(predicted, annots) + From a5ee082da1c1f4c01af2dc84d6bfe8195012c5f7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Jun 2020 15:49:38 +0200 Subject: [PATCH 46/56] cats bugfix --- spacy/gold/new_example.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index d2492a29f..e7506d697 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -101,6 +101,8 @@ def _annot2array(vocab, tok_annot, doc_annot): raise ValueError(Errors.E984) ent_kb_ids = _parse_links(vocab, words, value, entities) tok_annot["ENT_KB_ID"] = ent_kb_ids + elif key == "cats": + pass else: raise ValueError(f"Unknown doc attribute: {key}") From 
face0de74f716a318be1db4be56f503985025407 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Jun 2020 16:29:09 +0200 Subject: [PATCH 47/56] fix MORPH conversion + enable unit test --- spacy/gold/new_example.pyx | 5 ++++- spacy/tests/test_new_example.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index e7506d697..46b8ed423 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -13,7 +13,7 @@ from ..errors import Errors, AlignmentError cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot): # TODO: Improve and test this words = tok_annot.get("ORTH", [tok.text for tok in predicted]) - attrs, array = _annot2array(predicted.vocab.strings, tok_annot, doc_annot) + attrs, array = _annot2array(predicted.vocab, tok_annot, doc_annot) output = Doc(predicted.vocab, words=words) if array.size: output = output.from_array(attrs, array) @@ -117,6 +117,9 @@ def _annot2array(vocab, tok_annot, doc_annot): elif key == "SENT_START": attrs.append(key) values.append(value) + elif key == "MORPH": + attrs.append(key) + values.append([vocab.morphology.add(v) for v in value]) elif key == "ENT_IOB": iob_strings = Token.iob_strings() attrs.append(key) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 4ebafb6bb..0be78624a 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -60,7 +60,6 @@ def test_Example_from_dict_with_parse(annots): assert token.head.i == annots["heads"][i] -@pytest.mark.xfail(reason="TODO - fix") @pytest.mark.parametrize( "annots", [ From b078b05ecd7d1d78a1f67f1f178db18ae6c7280f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 15:30:12 +0200 Subject: [PATCH 48/56] Handle various data better in NewExample --- spacy/gold/new_example.pyx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 46b8ed423..eb796eb83 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -148,6 +148,8 @@ def _fix_legacy_dict_data(predicted, example_dict): for key, value in example_dict.items(): if key in ("token_annotation", "doc_annotation"): pass + elif key == "ids": + pass elif key in ("cats", "links") and value: doc_dict[key] = value elif key in ("ner", "entities") and value: @@ -168,10 +170,15 @@ def _fix_legacy_dict_data(predicted, example_dict): old_token_dict = token_dict token_dict = {} for key, value in old_token_dict.items(): - if key in remapping: + if key in ("text", "ids", "entities", "ner", "brackets"): + pass + elif key in remapping: token_dict[remapping[key]] = value else: raise ValueError(f"Unknown attr: {key}") + if "HEAD" in token_dict and "SENT_START" in token_dict: + # If heads are set, we don't also redundantly specify SENT_START. 
+ token_dict.pop("SENT_START") return { "token_annotation": token_dict, "doc_annotation": doc_dict From 5564314d323f746a180a81888e76166a3687ff11 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 15:43:35 +0200 Subject: [PATCH 49/56] Suggest approach for GoldParse --- spacy/gold/new_example.pyx | 13 +++++++--- spacy/syntax/gold_parse.pyx | 50 +++++++++++-------------------------- 2 files changed, 24 insertions(+), 39 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index eb796eb83..d9a712e38 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -76,10 +76,15 @@ cdef class NewExample: raise NotImplementedError def to_dict(self): - """ Note that this method does NOT export the doc, only the annotations ! """ - token_dict = self._token_annotation - doc_dict = self._doc_annotation - return {"token_annotation": token_dict, "doc_annotation": doc_dict} + # We should probably implement this? We could return the + # doc_annotation and token_annotation, and this would allow us to + # easily implement the `get_parses_from_example` in + # spacy.syntax.gold_parse + raise NotImplementedError + + def split_sents(self): + # Unclear whether we should really implement this. I guess? + raise NotImplementedError def text(self): return self.x.text diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index 05361fd82..9712f6e94 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -25,54 +25,34 @@ def is_punct_label(label): def get_parses_from_example( - eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False + example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False ): """Return a list of (doc, GoldParse) objects. If merge is set to True, keep all Token annotations as one big list.""" - d = eg.doc_annotation # merge == do not modify Example if merge: - t = eg.token_annotation - doc = eg.doc - if doc is None or not isinstance(doc, Doc): - if not vocab: - raise ValueError(Errors.E998) - doc = Doc(vocab, words=t.words) + examples = [example] + else: + # not merging: one GoldParse per sentence, defining docs with the words + # from each sentence + examples = eg.split_sents() + outputs = [] + for eg in examples: + eg_dict = eg.to_dict() try: gp = GoldParse.from_annotation( - doc, d, t, make_projective=make_projective + eg.predicted, + eg_dict["doc_annotation"], + eg_dict["token_annotation"], + make_projective=make_projective ) except AlignmentError: if ignore_misaligned: gp = None else: raise - return [(doc, gp)] - # not merging: one GoldParse per sentence, defining docs with the words - # from each sentence - else: - parses = [] - split_examples = eg.split_sents() - for split_example in split_examples: - if not vocab: - raise ValueError(Errors.E998) - split_doc = Doc(vocab, words=split_example.token_annotation.words) - try: - gp = GoldParse.from_annotation( - split_doc, - d, - split_example.token_annotation, - make_projective=make_projective, - ) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - if gp is not None: - parses.append((split_doc, gp)) - return parses - + outputs.append((eg.predicted, gp)) + return outputs cdef class GoldParse: From 3eb8f3867e03e6d4c4017c081189f9505c2f7567 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 23:05:16 +0200 Subject: [PATCH 50/56] Update test --- spacy/tests/test_gold.py | 44 ++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff 
--git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index f60f52e6e..cc9224ae1 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -6,6 +6,7 @@ from spacy.gold.new_example import NewExample as Example from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree from spacy.syntax.gold_parse import GoldParse, get_parses_from_example +from spacy.syntax.gold_parse import get_parses_from_example from spacy.tokens import Doc from spacy.util import get_words_and_spaces, compounding, minibatch import pytest @@ -279,22 +280,21 @@ def test_roundtrip_docs_to_json(doc): goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) - goldparse = reloaded_example._deprecated_get_gold() assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert pos == goldparse.pos - assert morphs == goldparse.morphs - assert lemmas == goldparse.lemmas - assert deps == goldparse.labels - assert heads == goldparse.heads - assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == goldparse.cats["BAKING"] + assert text == reloaded_example.predicted.text + assert tags == [t.tag_ for t in reloaded_example.reference] + assert pos == [t.pos_ for t in reloaded_example.reference] + assert morphs == [t.morph_ for t in reloaded_example.reference] + assert lemmas == [t.lemma_ for t in reloaded_example.reference] + assert deps == [t.dep_ for t in reloaded_example.reference] + assert heads == [t.head.i for t in reloaded_example.reference] + assert "TRAVEL" in reloaded_example.reference.cats + assert "BAKING" in reloaded_example.reference.cats + assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] + assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] +@pytest.mark.xfail # TODO do we need to do the projectivity differently? 
def test_projective_train_vs_nonprojective_dev(doc): nlp = English() deps = [t.dep_ for t in doc] @@ -310,7 +310,7 @@ def test_projective_train_vs_nonprojective_dev(doc): train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) - dev_goldparse = dev_reloaded_example._deprecated_get_gold() + dev_goldparse = get_parses_from_example(dev_reloaded_example)[0][1] assert is_nonproj_tree([t.head.i for t in doc]) is True assert is_nonproj_tree(train_goldparse.heads) is False @@ -365,7 +365,7 @@ def test_make_orth_variants(doc): # due to randomness, test only that this runs with no errors for now train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) - train_goldparse = train_reloaded_example._deprecated_get_gold() + train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] @pytest.mark.parametrize( @@ -419,20 +419,6 @@ def test_gold_constructor(): assert gold.words == ["This", "is", "a", "sentence"] -def test_gold_orig_annot(): - nlp = English() - doc = nlp("This is a sentence") - gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0}) - - assert gold.orig.words == ["This", "is", "a", "sentence"] - assert gold.cats["cat1"] - - doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0}) - gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig) - assert gold2.orig.words == ["This", "is", "a", "sentence"] - assert not gold2.cats["cat1"] - - def test_tuple_format_implicit(): """Test tuple format with implicit GoldParse creation""" From caa75087252649e527923f31ee88fe01e4694f7d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 23:10:21 +0200 Subject: [PATCH 51/56] Draft missing NewExample stuff --- spacy/gold/new_example.pyx | 70 +++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index d9a712e38..5b66d0cae 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -5,7 +5,7 @@ from ..tokens.doc cimport Doc from ..attrs import IDS from .align cimport Alignment from .annotation import TokenAnnotation, DocAnnotation -from .iob_utils import biluo_to_iob, biluo_tags_from_offsets +from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .align import Alignment from ..errors import Errors, AlignmentError @@ -73,18 +73,70 @@ cdef class NewExample: return self._alignment def get_aligned(self, field): - raise NotImplementedError + """Return an aligned array for a token attribute.""" + # TODO: This is probably wrong. I just bashed this out and there's probably + # all sorts of edge-cases. + alignment = self.alignment + i2j_multi = alignment.i2j_multi + gold_to_cand = alignment.gold_to_cand + cand_to_gold = alignment.cand_to_gold + + gold_values = self.reference.to_array([field]) + output = [] + for i, gold_i in enumerate(cand_to_gold): + if self.predicted[i].text.isspace(): + output.append(None) + elif gold_i is None: + if i in i2j_multi: + output.append(gold_values[i2j_multi[i]]) + else: + output.append(None) + else: + output.append(gold_values[gold_i]) + return output def to_dict(self): - # We should probably implement this? 
We could return the
-        # doc_annotation and token_annotation, and this would allow us to
-        # easily implement the `get_parses_from_example` in
-        # spacy.syntax.gold_parse
-        raise NotImplementedError
+        return {
+            "doc_annotation": {
+                "cats": dict(self.reference.cats),
+                "links": [], # TODO
+            },
+            "token_annotation": {
+                "ids": [t.i+1 for t in self.reference],
+                "words": [t.text for t in self.reference],
+                "tags": [t.tag_ for t in self.reference],
+                "lemmas": [t.lemma_ for t in self.reference],
+                "pos": [t.pos_ for t in self.reference],
+                "morphs": [t.morph_ for t in self.reference],
+                "heads": [t.head.i for t in self.reference],
+                "deps": [t.dep_ for t in self.reference],
+                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference],
+                "entities": biluo_tags_from_doc(self.reference)
+            }
+        }

     def split_sents(self):
-        # Unclear whether we should really implement this. I guess?
-        raise NotImplementedError
+        """ Split the token annotations into multiple Examples based on
+        sent_starts and return a list of the new Examples"""
+        if not self.reference.is_sentenced:
+            return [self]
+        # TODO: Do this for misaligned somehow?
+        predicted_words = [t.text for t in self.predicted]
+        reference_words = [t.text for t in self.reference]
+        if predicted_words != reference_words:
+            raise NotImplementedError("TODO: Implement this")
+        # Implement the easy case.
+        output = []
+        cls = self.__class__
+        for sent in self.reference.sents:
+            # I guess for misaligned we just need to use the gold_to_cand?
+            output.append(
+                cls(
+                    self.predicted[sent.start : sent.end].as_doc(),
+                    sent.as_doc()
+                )
+            )
+        return output

     def text(self):
         return self.x.text

From 3a0bbcfb4ca31c89a8235e91d454ae5ceb6da424 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 13 Jun 2020 23:10:54 +0200
Subject: [PATCH 52/56] Add biluo_tags_from_doc function

---
 spacy/gold/iob_utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py
index 2f0f116a1..6d16cf1a5 100644
--- a/spacy/gold/iob_utils.py
+++ b/spacy/gold/iob_utils.py
@@ -47,6 +47,14 @@ def _consume_ent(tags):
     return [start] + middle + [end]
 
 
+def biluo_tags_from_doc(doc, missing="O"):
+    return biluo_tags_from_offsets(
+        doc,
+        [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
+        missing=missing
+    )
+
+
 def biluo_tags_from_offsets(doc, entities, missing="O"):
     """Encode labelled spans into per-token tags, using the
     Begin/In/Last/Unit/Out scheme (BILUO).
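To make the new helper's behaviour concrete, here is a minimal sketch of `biluo_tags_from_doc` on a hand-built `Doc` with a single entity span. The words, span indices and expected tags are illustrative only; the import path assumes this branch's `spacy/gold/iob_utils.py` layout:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc, Span
    from spacy.gold.iob_utils import biluo_tags_from_doc

    doc = Doc(Vocab(), words=["I", "like", "New", "York", "."])
    # Mark the two-token span "New York" (tokens 2-4) as a LOC entity.
    doc.ents = [Span(doc, 2, 4, label="LOC")]
    # The helper reads (start_char, end_char, label) triples off doc.ents
    # and re-encodes them as per-token tags via biluo_tags_from_offsets:
    assert biluo_tags_from_doc(doc) == ["O", "O", "B-LOC", "L-LOC", "O"]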
From 8f941ef527794ca7b7102b69c73e562731248b4d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 23:11:29 +0200 Subject: [PATCH 53/56] Update GoldParse --- spacy/syntax/gold_parse.pyx | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index 9712f6e94..d547de821 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -35,7 +35,7 @@ def get_parses_from_example( else: # not merging: one GoldParse per sentence, defining docs with the words # from each sentence - examples = eg.split_sents() + examples = example.split_sents() outputs = [] for eg in examples: eg_dict = eg.to_dict() @@ -62,18 +62,21 @@ cdef class GoldParse: """ @classmethod def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): - return cls(doc, words=token_annotation.words, - tags=token_annotation.tags, - pos=token_annotation.pos, - morphs=token_annotation.morphs, - lemmas=token_annotation.lemmas, - heads=token_annotation.heads, - deps=token_annotation.deps, - entities=token_annotation.entities, - sent_starts=token_annotation.sent_starts, - cats=doc_annotation.cats, - links=doc_annotation.links, - make_projective=make_projective) + return cls( + doc, + words=token_annotation["words"], + tags=token_annotation["tags"], + pos=token_annotation["pos"], + morphs=token_annotation["morphs"], + lemmas=token_annotation["lemmas"], + heads=token_annotation["heads"], + deps=token_annotation["deps"], + entities=token_annotation["entities"], + sent_starts=token_annotation["sent_starts"], + cats=doc_annotation["cats"], + links=doc_annotation["links"], + make_projective=make_projective + ) def get_token_annotation(self): ids = None From 7de997c0a53adaf7ed8881c44593946d772a5081 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 23:11:45 +0200 Subject: [PATCH 54/56] Update test --- spacy/tests/test_gold.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index cc9224ae1..6e3f7b2ba 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -488,17 +488,15 @@ def test_split_sents(merged_dict): split_examples = example.split_sents() assert len(split_examples) == 2 - token_annotation_1 = split_examples[0].token_annotation - assert token_annotation_1.ids == [1, 2, 3] - assert token_annotation_1.words == ["Hi", "there", "everyone"] - assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"] - assert token_annotation_1.sent_starts == [1, 0, 0] + token_annotation_1 = split_examples[0].to_dict()["token_annotation"] + assert token_annotation_1["words"] == ["Hi", "there", "everyone"] + assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"] + assert token_annotation_1["sent_starts"] == [1, 0, 0] - token_annotation_2 = split_examples[1].token_annotation - assert token_annotation_2.ids == [4, 5, 6, 7] - assert token_annotation_2.words == ["It", "is", "just", "me"] - assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"] - assert token_annotation_2.sent_starts == [1, 0, 0, 0] + token_annotation_2 = split_examples[1].to_dict()["token_annotation"] + assert token_annotation_2["words"] == ["It", "is", "just", "me"] + assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"] + assert token_annotation_2["sent_starts"] == [1, 0, 0, 0] # This fails on some None value? Need to look into that. 
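Taken together, patches 51-54 give the round trip that `test_split_sents` exercises: build an Example from a legacy-style dict, split it on the reference sentence boundaries, and serialize each piece back out with `to_dict()`. A condensed sketch, mirroring the test above; it assumes the draft `NewExample` implementation as it stands in these patches, including the legacy key remapping of `words` and `sent_starts`:

    from spacy.lang.en import English
    from spacy.tokens import Doc
    from spacy.gold.new_example import NewExample as Example

    nlp = English()
    annots = {
        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
        "sent_starts": [1, 0, 0, 1, 0, 0, 0],
    }
    eg = Example.from_dict(Doc(nlp.vocab, words=annots["words"]), annots)
    # split_sents slices one new Example out of each reference sentence,
    # and to_dict() re-serializes the per-sentence annotations.
    first, second = eg.split_sents()
    assert first.to_dict()["token_annotation"]["words"] == ["Hi", "there", "everyone"]
    assert second.to_dict()["token_annotation"]["sent_starts"] == [1, 0, 0, 0]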
From 4362ec7084597f90919c5d9e33523c955b96d472 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 13 Jun 2020 23:37:42 +0200
Subject: [PATCH 55/56] Hack Language.evaluate

---
 spacy/language.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 57664ec17..4ab9bed5a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -723,24 +723,26 @@ class Language(object):
 
         DOCS: https://spacy.io/api/language#evaluate
         """
-        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
+        examples = Example.to_example_objects(examples)
         if scorer is None:
             scorer = Scorer(pipeline=self.pipeline)
         if component_cfg is None:
             component_cfg = {}
+        docs = (eg.predicted for eg in examples)
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
             if not hasattr(pipe, "pipe"):
-                examples = _pipe(examples, pipe, kwargs)
+                docs = _pipe(docs, pipe, kwargs)
             else:
-                examples = pipe.pipe(examples, as_example=True, **kwargs)
-        for ex in examples:
+                docs = pipe.pipe(docs, **kwargs)
+        for doc, eg in zip(docs, examples):
             if verbose:
-                print(ex.doc)
+                print(doc)
+            eg.predicted = doc
             kwargs = component_cfg.get("scorer", {})
             kwargs.setdefault("verbose", verbose)
-            scorer.score(ex, **kwargs)
+            scorer.score(eg, **kwargs)
         return scorer
 
     @contextmanager

From 380cce9d8b3b1a90c3b25b5187a3d666ee416d71 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 14 Jun 2020 17:40:05 +0200
Subject: [PATCH 56/56] Update errors

---
 spacy/errors.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 459301315..9c7bf9e50 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -580,14 +580,6 @@ class Errors(object):
             "table, which contains {n_rows} vectors.")
 
     # TODO: fix numbering after merging develop into master
-
-    E983 = ("Each link annotation should refer to a dictionary with at most one "
-            "identifier mapping to 1.0, and all others to 0.0.")
-    E984 = ("The offsets of the annotations for 'links' need to refer exactly "
-            "to the offsets of the 'entities' annotations.")
-    E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
-            "into {values}, but found {value}.")
-
     E983 = ("Invalid key for '{dict_name}': {key}. Available keys: "
             "{keys}")
     E984 = ("Could not parse the {input} - double check the data is written "
@@ -628,6 +620,14 @@ class Errors(object):
     E999 = ("Encountered an unexpected format for the dictionary holding "
             "gold annotations: {gold_dict}")
 
+    # TODO: These were left over after a merge, but I couldn't find them?
+    #E983 = ("Each link annotation should refer to a dictionary with at most one "
+    #        "identifier mapping to 1.0, and all others to 0.0.")
+    #E984 = ("The offsets of the annotations for 'links' need to refer exactly "
+    #        "to the offsets of the 'entities' annotations.")
+    #E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+    #        "into {values}, but found {value}.")
+
 
 @add_codes
 class TempErrors(object):
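The core of the evaluate hack in patch 55 is worth spelling out: only the predicted side of each example flows through the (lazy) pipeline, and the processed docs are zipped back onto their examples before scoring, so the gold reference docs are never touched. A self-contained sketch of that data flow, using plain-Python stand-ins for Doc, Example and a pipeline component (stand-ins only, not spaCy's actual classes):

    class Example:
        # Stand-in: pairs a predicted doc with an untouched reference doc.
        def __init__(self, predicted, reference):
            self.predicted = predicted
            self.reference = reference

    def pipe(docs):
        # Stand-in for a pipeline component: annotates docs lazily.
        return (doc + " [annotated]" for doc in docs)

    examples = [Example("I like New York", "I like New York"),
                Example("Berlin is big", "Berlin is big")]
    # Only the predicted side flows through the pipeline...
    docs = (eg.predicted for eg in examples)
    docs = pipe(docs)
    # ...and each processed doc is re-attached before scoring, so a scorer
    # can compare eg.predicted against the unchanged gold eg.reference.
    for doc, eg in zip(docs, examples):
        eg.predicted = doc
    assert all(eg.predicted.endswith("[annotated]") for eg in examples)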