diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 43216c943..ff720f4bf 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -24,17 +24,16 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     checked_for_ner = False
     has_ner_tags = False
     for i, example in enumerate(conll_data):
-        for token_annotation in example.token_annotations:
-            if not checked_for_ner:
-                has_ner_tags = is_ner(token_annotation.entities[0])
-                checked_for_ner = True
-            sentences.append(generate_sentence(token_annotation, has_ner_tags))
-            # Real-sized documents could be extracted using the comments on the
-            # conluu document
-            if len(sentences) % n_sents == 0:
-                doc = create_doc(sentences, i)
-                docs.append(doc)
-                sentences = []
+        if not checked_for_ner:
+            has_ner_tags = is_ner(example.token_annotation.entities[0])
+            checked_for_ner = True
+        sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
+        # Real-sized documents could be extracted using the comments on the
+        # conllu document
+        if len(sentences) % n_sents == 0:
+            doc = create_doc(sentences, i)
+            docs.append(doc)
+            sentences = []
     return docs

@@ -84,7 +83,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
                 print(line)
                 raise
             example = Example(doc=None)
-            example.add_token_annotation(ids=ids, words=words, tags=tags,
+            example.set_token_annotation(ids=ids, words=words, tags=tags,
                                          heads=heads, deps=deps, entities=ents)
             yield example
             i += 1
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 8527ba2b6..247ff8aa1 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -25,7 +25,7 @@ cdef class GoldParse:
     cdef public int loss
     cdef public list words
    cdef public list tags
-    cdef public list morphology
+    cdef public list morphs
     cdef public list heads
     cdef public list labels
     cdef public dict orths
@@ -45,7 +45,8 @@ cdef class TokenAnnotation:
     cdef public list heads
     cdef public list deps
     cdef public list entities
-    cdef public list morphology
+    cdef public list morphs
+    cdef public list sent_starts
     cdef public list brackets

@@ -56,7 +57,7 @@ cdef class DocAnnotation:

 cdef class Example:
     cdef public object doc
-    cdef public list token_annotations
+    cdef public TokenAnnotation token_annotation
     cdef public DocAnnotation doc_annotation
     cdef public object goldparse
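Note: the gold.pxd change above is the crux of this patch — an `Example` now owns exactly one `TokenAnnotation` instead of a list of them. A minimal sketch of what that means for calling code (the surrounding setup is illustrative, not taken from the patch):

```python
from spacy.gold import Example

example = Example(doc=None)

# Before this patch: one annotation was appended per sentence.
# example.add_token_annotation(words=["Hi", "there"], tags=["INTJ", "ADV"])
# example.add_token_annotation(words=["Bye"], tags=["INTJ"])

# After this patch: a single flat annotation is set once, with sentence
# boundaries carried in `sent_starts` instead of in separate objects.
example.set_token_annotation(
    words=["Hi", "there", "Bye"],
    tags=["INTJ", "ADV", "INTJ"],
    sent_starts=[True, False, True],
)
assert len(example.token_annotation.words) == 3
```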
+ """Returns count of words in train examples""" n = 0 i = 0 for example in self.train_examples: - for token_annotation in example.token_annotations: - n += len(token_annotation.words) - if self.limit and i >= self.limit: - break - i += 1 + n += len(example.token_annotation.words) + if self.limit and i >= self.limit: + break + i += 1 return n def train_dataset(self, nlp, gold_preproc=False, max_length=None, @@ -328,18 +327,27 @@ class GoldCorpus(object): def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, noise_level=0.0, orth_variant_level=0.0, make_projective=False, ignore_misaligned=False): - """ Setting gold_preproc will result in creating a doc per 'sentence' """ + """ Setting gold_preproc will result in creating a doc per sentence """ for example in examples: if gold_preproc: example.doc = None + split_examples = example.split_sents() + example_golds = [] + for split_example in split_examples: + split_example_docs = cls._make_docs(nlp, split_example, + gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) + split_example_golds = cls._make_golds(split_example_docs, + vocab=nlp.vocab, make_projective=make_projective, + ignore_misaligned=ignore_misaligned) + example_golds.extend(split_example_golds) else: - example = example.merge_sents() - example_docs = cls._make_docs(nlp, example, - gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned) + example_docs = cls._make_docs(nlp, example, + gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) + example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned) for ex in example_golds: if ex.goldparse is not None: if (not max_length) or len(ex.doc) < max_length: @@ -353,35 +361,28 @@ class GoldCorpus(object): var_text = add_noise(var_example.text, noise_level) var_doc = nlp.make_doc(var_text) var_example.doc = var_doc - return [var_example] else: - doc_examples = [] - for token_annotation in var_example.token_annotations: - t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level)) - doc_example = Example(doc_annotation=example.doc_annotation, - token_annotations=[token_annotation], - doc=t_doc) - doc_examples.append(doc_example) - return doc_examples + var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level)) + var_example.doc = var_doc + return [var_example] @classmethod def _make_golds(cls, examples, vocab=None, make_projective=False, ignore_misaligned=False): - gold_examples = [] for example in examples: gold_parses = example.get_gold_parses(vocab=vocab, make_projective=make_projective, ignore_misaligned=ignore_misaligned) - for (doc, gold) in gold_parses: - ex = Example(doc=doc) - ex.goldparse = gold - gold_examples.append(ex) - return gold_examples + assert len(gold_parses) == 1 + assert gold_parses[0][0] == example.doc + example.goldparse = gold_parses[0][1] + return examples + def make_orth_variants(nlp, example, orth_variant_level=0.0): if random.random() >= orth_variant_level: return example - if not example.token_annotations: + if not example.token_annotation: return example raw = example.text if random.random() >= 0.5: @@ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): ndpv = nlp.Defaults.paired_orth_variants # modify words in paragraph_tuples 
@@ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
     variant_example = Example(doc=raw)
-    for token_annotation in example.token_annotations:
-        words = token_annotation.words
-        tags = token_annotation.tags
-        if not words or not tags:
-            # add the unmodified annotation
-            token_dict = token_annotation.to_dict()
-            variant_example.add_token_annotation(**token_dict)
-        else:
-            if lower:
-                words = [w.lower() for w in words]
-            # single variants
-            punct_choices = [random.choice(x["variants"]) for x in ndsv]
-            for word_idx in range(len(words)):
-                for punct_idx in range(len(ndsv)):
-                    if tags[word_idx] in ndsv[punct_idx]["tags"] \
-                            and words[word_idx] in ndsv[punct_idx]["variants"]:
-                        words[word_idx] = punct_choices[punct_idx]
-            # paired variants
-            punct_choices = [random.choice(x["variants"]) for x in ndpv]
-            for word_idx in range(len(words)):
-                for punct_idx in range(len(ndpv)):
-                    if tags[word_idx] in ndpv[punct_idx]["tags"] \
-                            and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
-                        # backup option: random left vs. right from pair
-                        pair_idx = random.choice([0, 1])
-                        # best option: rely on paired POS tags like `` / ''
-                        if len(ndpv[punct_idx]["tags"]) == 2:
-                            pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
-                        # next best option: rely on position in variants
-                        # (may not be unambiguous, so order of variants matters)
-                        else:
-                            for pair in ndpv[punct_idx]["variants"]:
-                                if words[word_idx] in pair:
-                                    pair_idx = pair.index(words[word_idx])
-                        words[word_idx] = punct_choices[punct_idx][pair_idx]
+    token_annotation = example.token_annotation
+    words = token_annotation.words
+    tags = token_annotation.tags
+    if not words or not tags:
+        # add the unmodified annotation
+        token_dict = token_annotation.to_dict()
+        variant_example.set_token_annotation(**token_dict)
+    else:
+        if lower:
+            words = [w.lower() for w in words]
+        # single variants
+        punct_choices = [random.choice(x["variants"]) for x in ndsv]
+        for word_idx in range(len(words)):
+            for punct_idx in range(len(ndsv)):
+                if tags[word_idx] in ndsv[punct_idx]["tags"] \
+                        and words[word_idx] in ndsv[punct_idx]["variants"]:
+                    words[word_idx] = punct_choices[punct_idx]
+        # paired variants
+        punct_choices = [random.choice(x["variants"]) for x in ndpv]
+        for word_idx in range(len(words)):
+            for punct_idx in range(len(ndpv)):
+                if tags[word_idx] in ndpv[punct_idx]["tags"] \
+                        and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
+                    # backup option: random left vs. right from pair
+                    pair_idx = random.choice([0, 1])
+                    # best option: rely on paired POS tags like `` / ''
+                    if len(ndpv[punct_idx]["tags"]) == 2:
+                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
+                    # next best option: rely on position in variants
+                    # (may not be unambiguous, so order of variants matters)
+                    else:
+                        for pair in ndpv[punct_idx]["variants"]:
+                            if words[word_idx] in pair:
+                                pair_idx = pair.index(words[word_idx])
+                    words[word_idx] = punct_choices[punct_idx][pair_idx]

-        token_dict = token_annotation.to_dict()
-        token_dict["words"] = words
-        token_dict["tags"] = tags
-        variant_example.add_token_annotation(**token_dict)
+        token_dict = token_annotation.to_dict()
+        token_dict["words"] = words
+        token_dict["tags"] = tags
+        variant_example.set_token_annotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []
@@ -449,30 +450,29 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
             variant_raw += raw[raw_idx]
             raw_idx += 1
-        for token_annotation in variant_example.token_annotations:
-            for word in token_annotation.words:
-                match_found = False
-                # add identical word
-                if word not in variants and raw[raw_idx:].startswith(word):
-                    variant_raw += word
-                    raw_idx += len(word)
-                    match_found = True
-                # add variant word
-                else:
-                    for variant in variants:
-                        if not match_found and \
-                                raw[raw_idx:].startswith(variant):
-                            raw_idx += len(variant)
-                            variant_raw += word
-                            match_found = True
-                # something went wrong, abort
-                # (add a warning message?)
-                if not match_found:
-                    return example
-                # add following whitespace
-                while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
-                    variant_raw += raw[raw_idx]
-                    raw_idx += 1
+        for word in variant_example.token_annotation.words:
+            match_found = False
+            # add identical word
+            if word not in variants and raw[raw_idx:].startswith(word):
+                variant_raw += word
+                raw_idx += len(word)
+                match_found = True
+            # add variant word
+            else:
+                for variant in variants:
+                    if not match_found and \
+                            raw[raw_idx:].startswith(variant):
+                        raw_idx += len(variant)
+                        variant_raw += word
+                        match_found = True
+            # something went wrong, abort
+            # (add a warning message?)
+            if not match_found:
+                return example
+            # add following whitespace
+            while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
+                variant_raw += raw[raw_idx]
+                raw_idx += 1
         variant_example.doc = variant_raw
         return variant_example
     return variant_example
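Note: the single/paired variant tables (`ndsv`/`ndpv`) consulted above come from the language defaults. A hedged illustration of the shape the code expects — these particular entries are invented for the example, not copied from spaCy's shipped defaults:

```python
# Shape assumed by make_orth_variants; entries are illustrative only.
single_orth_variants = [
    {"tags": ["NFP"], "variants": ["...", "…"]},
]
paired_orth_variants = [
    # Two tags -> the pair index can be derived from the tag (`` opens, '' closes).
    {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
]
```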
@@ -521,30 +521,43 @@ def json_to_examples(doc):
     paragraphs = []
     for paragraph in doc["paragraphs"]:
         example = Example(doc=paragraph.get("raw", None))
+        words = []
+        ids = []
+        tags = []
+        heads = []
+        labels = []
+        ner = []
+        morphs = []
+        sent_starts = []
+        brackets = []
         for sent in paragraph["sentences"]:
-            words = []
-            ids = []
-            tags = []
-            heads = []
-            labels = []
-            ner = []
+            sent_start_i = len(words)
             for i, token in enumerate(sent["tokens"]):
                 words.append(token["orth"])
-                ids.append(i)
+                ids.append(token.get('id', sent_start_i + i))
                 tags.append(token.get('tag', "-"))
-                heads.append(token.get("head", 0) + i)
+                heads.append(token.get("head", 0) + sent_start_i + i)
                 labels.append(token.get("dep", ""))
                 # Ensure ROOT label is case-insensitive
                 if labels[-1].lower() == "root":
                     labels[-1] = "ROOT"
                 ner.append(token.get("ner", "-"))
-            example.add_token_annotation(ids=ids, words=words, tags=tags,
-                                         heads=heads, deps=labels, entities=ner,
-                                         brackets=sent.get("brackets", []))
+                morphs.append(token.get("morph", {}))
+                if i == 0:
+                    sent_starts.append(True)
+                else:
+                    sent_starts.append(False)
+            if "brackets" in sent:
+                brackets.extend((b["first"] + sent_start_i,
+                                 b["last"] + sent_start_i, b["label"])
+                                 for b in sent["brackets"])
         cats = {}
         for cat in paragraph.get("cats", {}):
             cats[cat["label"]] = cat["value"]
-        example.add_doc_annotation(cats=cats)
+        example.set_token_annotation(ids=ids, words=words, tags=tags,
+                heads=heads, deps=labels, entities=ner, morphs=morphs,
+                sent_starts=sent_starts, brackets=brackets)
+        example.set_doc_annotation(cats=cats)
         yield example
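Note: for reference, a minimal paragraph in the JSON training format consumed by `json_to_examples` above (field names per the code; values invented):

```python
# Heads are relative offsets in this format; json_to_examples shifts them
# by sent_start_i so the flattened annotation stays consistent.
paragraph = {
    "raw": "Hi there. Bye.",
    "sentences": [
        {"tokens": [{"orth": "Hi", "tag": "INTJ", "head": 0, "dep": "ROOT"},
                    {"orth": "there", "tag": "ADV", "head": -1, "dep": "advmod"},
                    {"orth": ".", "tag": ".", "head": -2, "dep": "punct"}]},
        {"tokens": [{"orth": "Bye", "tag": "INTJ", "head": 0, "dep": "ROOT"},
                    {"orth": ".", "tag": ".", "head": -1, "dep": "punct"}]},
    ],
}
# The resulting token_annotation gets
# sent_starts == [True, False, False, True, False],
# so sentence boundaries survive the merge into one annotation.
```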
@@ -652,15 +665,16 @@ def _consume_ent(tags):

 cdef class TokenAnnotation:
-    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
+    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None):
         self.ids = ids if ids else []
         self.words = words if words else []
         self.tags = tags if tags else []
         self.heads = heads if heads else []
         self.deps = deps if deps else []
         self.entities = entities if entities else []
+        self.morphs = morphs if morphs else []
+        self.sent_starts = sent_starts if sent_starts else []
         self.brackets = brackets if brackets else []
-        self.morphology = morphology if morphology else []

     @classmethod
     def from_dict(cls, token_dict):
@@ -670,7 +684,8 @@ cdef class TokenAnnotation:
                    heads=token_dict.get("heads", None),
                    deps=token_dict.get("deps", None),
                    entities=token_dict.get("entities", None),
-                   morphology=token_dict.get("morphology", None),
+                   morphs=token_dict.get("morphs", None),
+                   sent_starts=token_dict.get("sent_starts", None),
                    brackets=token_dict.get("brackets", None))

     def to_dict(self):
@@ -680,9 +695,34 @@ cdef class TokenAnnotation:
                 "heads": self.heads,
                 "deps": self.deps,
                 "entities": self.entities,
-                "morphology": self.morphology,
+                "morphs": self.morphs,
+                "sent_starts": self.sent_starts,
                 "brackets": self.brackets}

+    def get_id(self, i):
+        return self.ids[i] if i < len(self.ids) else i
+
+    def get_word(self, i):
+        return self.words[i] if i < len(self.words) else ""
+
+    def get_tag(self, i):
+        return self.tags[i] if i < len(self.tags) else "-"
+
+    def get_head(self, i):
+        return self.heads[i] if i < len(self.heads) else i
+
+    def get_dep(self, i):
+        return self.deps[i] if i < len(self.deps) else ""
+
+    def get_entity(self, i):
+        return self.entities[i] if i < len(self.entities) else "-"
+
+    def get_morph(self, i):
+        return self.morphs[i] if i < len(self.morphs) else set()
+
+    def get_sent_start(self, i):
+        return self.sent_starts[i] if i < len(self.sent_starts) else None
+
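Note: these getters give `split_sents` a uniform way to read possibly-missing fields; out-of-range indices fall back to the same defaults the constructor uses. A quick illustration (constructed values, not from the patch):

```python
from spacy.gold import TokenAnnotation

t = TokenAnnotation(words=["Hello"])  # no tags/heads provided
assert t.get_word(0) == "Hello"
assert t.get_tag(0) == "-"        # missing field -> per-field default
assert t.get_head(5) == 5         # out of range -> token is its own head
assert t.get_sent_start(0) is None
```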
""" - token_dicts = [t.to_dict() for t in self.token_annotations] + token_dict = self.token_annotation.to_dict() doc_dict = self.doc_annotation.to_dict() - return {"token_annotations": token_dicts, "doc_annotation": doc_dict} + return {"token_annotation": token_dict, "doc_annotation": doc_dict} @property def text(self): @@ -737,96 +777,108 @@ cdef class Example: @property def gold(self): if self.goldparse is None: - doc, gold = self.get_gold_parses(merge=True)[0] + doc, gold = self.get_gold_parses()[0] self.goldparse = gold return self.goldparse - def add_token_annotation(self, ids=None, words=None, tags=None, heads=None, - deps=None, entities=None, morphology=None, brackets=None): - t = TokenAnnotation(ids=ids, words=words, tags=tags, + def set_token_annotation(self, ids=None, words=None, tags=None, heads=None, + deps=None, entities=None, morphs=None, + sent_starts=None, brackets=None): + self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=entities, - morphology=morphology, brackets=brackets) - self.token_annotations.append(t) + morphs=morphs, sent_starts=sent_starts, + brackets=brackets) - def add_doc_annotation(self, cats=None, links=None): + def set_doc_annotation(self, cats=None, links=None): if cats: - self.doc_annotation.cats.update(cats) + self.doc_annotation.cats = cats if links: - self.doc_annotation.links.update(links) + self.doc_annotation.links = links - def merge_sents(self): - """ Merge the list of token annotations into one object and return this new object """ - m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation) - m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], [] - m_brackets = [] - i = 0 - for t in self.token_annotations: - m_ids.extend(id_ + i for id_ in t.ids) - m_words.extend(t.words) - m_tags.extend(t.tags) - m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads)) - m_deps.extend(t.deps) - m_ents.extend(t.entities) - m_morph.extend(t.morphology) - m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) - for b in t.brackets) - i += len(t.ids) - m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags, - heads=m_heads, deps=m_deps, entities=m_ents, - morphology=m_morph, brackets=m_brackets) - return m_example + def split_sents(self): + """ Split the token annotations into multiple Examples based on + sent_starts and return a list of the new Examples""" + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_heads = [], [], [], [] + s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], [] + s_brackets = [] + sent_start_i = 0 + t = self.token_annotation + split_examples = [] + for i in range(len(t.words)): + if i > 0 and t.sent_starts[i] == True: + s_example.set_token_annotation(ids=s_ids, + words=s_words, tags=s_tags, heads=s_heads, deps=s_deps, + entities=s_ents, morphs=s_morphs, + sent_starts=s_sent_starts, brackets=s_brackets) + split_examples.append(s_example) + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_heads = [], [], [], [] + s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], [] + s_brackets = [] + sent_start_i = i + s_ids.append(t.get_id(i)) + s_words.append(t.get_word(i)) + s_tags.append(t.get_tag(i)) + s_heads.append(t.get_head(i) - sent_start_i) + s_deps.append(t.get_dep(i)) + s_ents.append(t.get_entity(i)) + s_morphs.append(t.get_morph(i)) + s_sent_starts.append(t.get_sent_start(i)) + 

-    def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
+    def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
                         ignore_misaligned=False):
         """Return a list of (doc, GoldParse) objects.
-        If merge is set to True, add all Token annotations to one big list."""
+        If merge is set to True, keep all Token annotations as one big list."""
         d = self.doc_annotation
-        # merging different sentences
+        # merge == do not modify Example
         if merge:
-            merged_example = self.merge_sents()
-            assert(len(merged_example.token_annotations)) == 1
-            t = merged_example.token_annotations[0]
-            m_doc = merged_example.doc
-            if not m_doc:
+            t = self.token_annotation
+            doc = self.doc
+            if not self.doc:
                 if not vocab:
                     raise ValueError(Errors.E998)
-                m_doc = Doc(vocab, words=t.words)
+                doc = Doc(vocab, words=t.words)
             try:
-                gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
+                gp = GoldParse.from_annotation(doc, d, t,
+                        make_projective=make_projective)
             except AlignmentError:
                 if ignore_misaligned:
                     gp = None
                 else:
                     raise
-            return [(self.doc, gp)]
-        # we only have one sentence and an appropriate doc
-        elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
-            t = self.token_annotations[0]
-            try:
-                gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
-            except AlignmentError:
-                if ignore_misaligned:
-                    gp = None
-                else:
-                    raise
-            return [(self.doc, gp)]
-        # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
+            return [(doc, gp)]
+        # not merging: one GoldParse per sentence, defining docs with the words
+        # from each sentence
         else:
             parses = []
-            for t in self.token_annotations:
+            split_examples = self.split_sents()
+            for split_example in split_examples:
                 if not vocab:
                     raise ValueError(Errors.E998)
-                t_doc = Doc(vocab, words=t.words)
+                split_doc = Doc(vocab, words=split_example.token_annotation.words)
                 try:
-                    gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
+                    gp = GoldParse.from_annotation(split_doc, d,
+                            split_example.token_annotation,
+                            make_projective=make_projective)
                 except AlignmentError:
                     if ignore_misaligned:
                         gp = None
                     else:
                         raise
                 if gp is not None:
-                    parses.append((t_doc, gp))
+                    parses.append((split_doc, gp))
             return parses

     @classmethod
@@ -881,9 +933,14 @@ cdef class GoldParse:
     """
     @classmethod
     def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
-        return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
-                   heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
-                   morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
+        return cls(doc, words=token_annotation.words,
+                   tags=token_annotation.tags,
+                   heads=token_annotation.heads,
+                   deps=token_annotation.deps,
+                   entities=token_annotation.entities,
+                   morphs=token_annotation.morphs,
+                   cats=doc_annotation.cats,
+                   links=doc_annotation.links,
                    make_projective=make_projective)

     def get_token_annotation(self):
@@ -893,9 +950,9 @@ cdef class GoldParse:
         return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
                                heads=self.heads, deps=self.labels,
                                entities=self.ner,
-                               morphology=self.morphology)
+                               morphs=self.morphs)
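Note: `get_gold_parses` now defaults to `merge=True` and leaves the `Example` untouched; `merge=False` routes through `split_sents`. A self-contained sketch (toy values; the vocab is required on both paths when no doc is attached):

```python
from spacy.gold import Example
from spacy.lang.en import English

nlp = English()
example = Example()
example.set_token_annotation(
    words=["Hi", "there", "It", "works"],
    sent_starts=[1, 0, 1, 0],
)
merged = example.get_gold_parses(merge=True, vocab=nlp.vocab)     # whole doc
per_sent = example.get_gold_parses(merge=False, vocab=nlp.vocab)  # via split_sents
assert len(merged) == 1
assert len(per_sent) == 2
```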

-    def __init__(self, doc, words=None, tags=None, morphology=None,
+    def __init__(self, doc, words=None, tags=None, morphs=None,
                  heads=None, deps=None, entities=None, make_projective=False,
                  cats=None, links=None):
         """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
@@ -944,8 +1001,8 @@ cdef class GoldParse:
             heads = [None for _ in words]
         if not deps:
             deps = [None for _ in words]
-        if not morphology:
-            morphology = [None for _ in words]
+        if not morphs:
+            morphs = [None for _ in words]
         if entities is None:
             entities = ["-" for _ in words]
         elif len(entities) == 0:
@@ -971,7 +1028,7 @@ cdef class GoldParse:
             self.heads = [None] * len(doc)
             self.labels = [None] * len(doc)
             self.ner = [None] * len(doc)
-            self.morphology = [None] * len(doc)
+            self.morphs = [None] * len(doc)

             # This needs to be done before we align the words
             if make_projective and heads is not None and deps is not None:
@@ -990,7 +1047,7 @@ cdef class GoldParse:
             self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

             self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
-                    heads=heads, deps=deps, entities=entities, morphology=morphology,
+                    heads=heads, deps=deps, entities=entities, morphs=morphs,
                     brackets=[])

             for i, gold_i in enumerate(self.cand_to_gold):
@@ -1000,12 +1057,12 @@ cdef class GoldParse:
                     self.heads[i] = None
                     self.labels[i] = None
                     self.ner[i] = None
-                    self.morphology[i] = set()
+                    self.morphs[i] = set()
                 if gold_i is None:
                     if i in i2j_multi:
                         self.words[i] = words[i2j_multi[i]]
                         self.tags[i] = tags[i2j_multi[i]]
-                        self.morphology[i] = morphology[i2j_multi[i]]
+                        self.morphs[i] = morphs[i2j_multi[i]]
                         is_last = i2j_multi[i] != i2j_multi.get(i+1)
                         is_first = i2j_multi[i] != i2j_multi.get(i-1)
                         # Set next word in multi-token span as head, until last
@@ -1044,7 +1101,7 @@ cdef class GoldParse:
                 else:
                     self.words[i] = words[gold_i]
                     self.tags[i] = tags[gold_i]
-                    self.morphology[i] = morphology[gold_i]
+                    self.morphs[i] = morphs[gold_i]
                     if heads[gold_i] is None:
                         self.heads[i] = None
                     else:
diff --git a/spacy/language.py b/spacy/language.py
index c84f597d9..8ec602ed7 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -574,9 +574,8 @@ class Language(object):
         # Populate vocab
         else:
             for example in get_examples():
-                for token_annotation in example.token_annotations:
-                    for word in token_annotation.words:
-                        _ = self.vocab[word]  # noqa: F841
+                for word in example.token_annotation.words:
+                    _ = self.vocab[word]  # noqa: F841
         if cfg.get("device", -1) >= 0:
             util.use_gpu(cfg["device"])
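Note: the `morphology` → `morphs` rename is API-visible wherever `GoldParse` is constructed directly. A hedged before/after with toy values:

```python
from spacy.gold import GoldParse
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("London calling")
# Before: GoldParse(doc, words=..., morphology=[...])
# After this patch, the keyword (and the attribute) is `morphs`:
gold = GoldParse(doc, words=["London", "calling"], tags=["NNP", "VBG"],
                 morphs=[set(), set()])
assert gold.morphs == [set(), set()]
```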
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 04a769b27..56a00e33b 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -565,12 +565,11 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for example in get_examples():
-            for token_annotation in example.token_annotations:
-                for tag in token_annotation.tags:
-                    if tag in orig_tag_map:
-                        new_tag_map[tag] = orig_tag_map[tag]
-                    else:
-                        new_tag_map[tag] = {POS: X}
+            for tag in example.token_annotation.tags:
+                if tag in orig_tag_map:
+                    new_tag_map[tag] = orig_tag_map[tag]
+                else:
+                    new_tag_map[tag] = {POS: X}
         cdef Vocab vocab = self.vocab
         if new_tag_map:
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
@@ -750,11 +749,10 @@ class MultitaskObjective(Tagger):
         gold_examples = nonproj.preprocess_training_data(get_examples())
         # for raw_text, doc_annot in gold_tuples:
         for example in gold_examples:
-            for token_annotation in example.token_annotations:
-                for i in range(len(token_annotation.ids)):
-                    label = self.make_label(i, token_annotation)
-                    if label is not None and label not in self.labels:
-                        self.labels[label] = len(self.labels)
+            for i in range(len(example.token_annotation.ids)):
+                label = self.make_label(i, example.token_annotation)
+                if label is not None and label not in self.labels:
+                    self.labels[label] = len(self.labels)
         if self.model is True:
             token_vector_width = util.env_opt("token_vector_width")
             self.model = self.Model(len(self.labels), tok2vec=tok2vec)
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 25c6935f3..723259acd 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -237,7 +237,7 @@ class Scorer(object):
         if len(doc) != len(gold):
             doc_annotation = DocAnnotation(cats=gold.cats)
             token_annotation = gold.orig
-            gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation])
+            gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation)
         orig = gold.orig
         gold_deps = set()
         gold_deps_per_dep = {}
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 0a99609a8..d358c1277 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -342,19 +342,19 @@ cdef class ArcEager(TransitionSystem):
             actions[RIGHT][label] = 1
             actions[REDUCE][label] = 1
         for example in kwargs.get('gold_parses', []):
-            for token_annotation in example.token_annotations:
-                heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps)
-                for child, head, label in zip(token_annotation.ids, heads, labels):
-                    if label.upper() == 'ROOT' :
-                        label = 'ROOT'
-                    if head == child:
-                        actions[BREAK][label] += 1
-                    elif head < child:
-                        actions[RIGHT][label] += 1
-                        actions[REDUCE][''] += 1
-                    elif head > child:
-                        actions[LEFT][label] += 1
-                        actions[SHIFT][''] += 1
+            heads, labels = nonproj.projectivize(example.token_annotation.heads,
+                                                 example.token_annotation.deps)
+            for child, head, label in zip(example.token_annotation.ids, heads, labels):
+                if label.upper() == 'ROOT' :
+                    label = 'ROOT'
+                if head == child:
+                    actions[BREAK][label] += 1
+                elif head < child:
+                    actions[RIGHT][label] += 1
+                    actions[REDUCE][''] += 1
+                elif head > child:
+                    actions[LEFT][label] += 1
+                    actions[SHIFT][''] += 1
         if min_freq is not None:
             for action, label_freqs in actions.items():
                 for label, freq in list(label_freqs.items()):
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index d791534ee..7467aa342 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -73,12 +73,11 @@ cdef class BiluoPushDown(TransitionSystem):
                 actions[action][entity_type] = 1
         moves = ('M', 'B', 'I', 'L', 'U')
         for example in kwargs.get('gold_parses', []):
-            for token_annotation in example.token_annotations:
-                for i, ner_tag in enumerate(token_annotation.entities):
-                    if ner_tag != 'O' and ner_tag != '-':
-                        _, label = ner_tag.split('-', 1)
-                        for action in (BEGIN, IN, LAST, UNIT):
-                            actions[action][label] += 1
+            for i, ner_tag in enumerate(example.token_annotation.entities):
+                if ner_tag != 'O' and ner_tag != '-':
+                    _, label = ner_tag.split('-', 1)
+                    for action in (BEGIN, IN, LAST, UNIT):
+                        actions[action][label] += 1
         return actions

     @property
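Note: both transition systems now read one annotation per example. For reference, the entities consumed above are BILUO strings; a toy illustration of how labels are extracted, mirroring the loop in `BiluoPushDown.get_actions`:

```python
# BILUO tags as stored in token_annotation.entities; toy sentence.
entities = ["O", "B-ORG", "L-ORG", "U-GPE", "-"]
labels = {tag.split("-", 1)[1] for tag in entities if tag not in ("O", "-")}
assert labels == {"ORG", "GPE"}
```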
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index c7ed25948..2ec6b61ac 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -81,15 +81,15 @@ def is_decorated(label):
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
-        for token_annotation in example.token_annotations:
-            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
-            # set the label to ROOT for each root dependent
-            deco_deps = ['ROOT' if head == i else deco_deps[i]
-                         for i, head in enumerate(proj_heads)]
-            # count label frequencies
-            for label in deco_deps:
-                if is_decorated(label):
-                    freqs[label] = freqs.get(label, 0) + 1
+        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+                                             example.token_annotation.deps)
+        # set the label to ROOT for each root dependent
+        deco_deps = ['ROOT' if head == i else deco_deps[i]
+                     for i, head in enumerate(proj_heads)]
+        # count label frequencies
+        for label in deco_deps:
+            if is_decorated(label):
+                freqs[label] = freqs.get(label, 0) + 1
     return freqs

@@ -98,21 +98,20 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
     freqs = {}
     for example in gold_data:
         new_example = Example(doc=example.doc)
-        for token_annotation in example.token_annotations:
-            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
-            # set the label to ROOT for each root dependent
-            deco_deps = ['ROOT' if head == i else deco_deps[i]
-                         for i, head in enumerate(proj_heads)]
-            # count label frequencies
-            if label_freq_cutoff > 0:
-                for label in deco_deps:
-                    if is_decorated(label):
-                        freqs[label] = freqs.get(label, 0) + 1
-            # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ?
-            proj_token_dict = token_annotation.to_dict()
-            proj_token_dict["heads"] = proj_heads
-            proj_token_dict["deps"] = deco_deps
-            new_example.add_token_annotation(**proj_token_dict)
+        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+                                             example.token_annotation.deps)
+        # set the label to ROOT for each root dependent
+        deco_deps = ['ROOT' if head == i else deco_deps[i]
+                     for i, head in enumerate(proj_heads)]
+        # count label frequencies
+        if label_freq_cutoff > 0:
+            for label in deco_deps:
+                if is_decorated(label):
+                    freqs[label] = freqs.get(label, 0) + 1
+        proj_token_dict = example.token_annotation.to_dict()
+        proj_token_dict["heads"] = proj_heads
+        proj_token_dict["deps"] = deco_deps
+        new_example.set_token_annotation(**proj_token_dict)
         preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -213,15 +212,14 @@ def _filter_labels(examples, cutoff, freqs):
     filtered = []
     for example in examples:
         new_example = Example(doc=example.doc)
-        for token_annotation in example.token_annotations:
-            filtered_labels = []
-            for label in token_annotation.deps:
-                if is_decorated(label) and freqs.get(label, 0) < cutoff:
-                    filtered_labels.append(decompose(label)[0])
-                else:
-                    filtered_labels.append(label)
-            filtered_token_dict = token_annotation.to_dict()
-            filtered_token_dict["deps"] = filtered_labels
-            new_example.add_token_annotation(**filtered_token_dict)
+        filtered_labels = []
+        for label in example.token_annotation.deps:
+            if is_decorated(label) and freqs.get(label, 0) < cutoff:
+                filtered_labels.append(decompose(label)[0])
+            else:
+                filtered_labels.append(label)
+        filtered_token_dict = example.token_annotation.to_dict()
+        filtered_token_dict["deps"] = filtered_labels
+        new_example.set_token_annotation(**filtered_token_dict)
         filtered.append(new_example)
     return filtered
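Note: `preprocess_training_data` decorates nonprojective labels before counting them. A quick sketch of the `projectivize` contract it relies on (toy tree; for already-projective input nothing changes):

```python
from spacy.syntax import nonproj

# Toy dependency tree: token 1 is the root (its head is its own index).
heads = [1, 1, 1]
deps = ["nsubj", "ROOT", "dobj"]
proj_heads, deco_deps = nonproj.projectivize(heads, deps)
# Projective already, so heads are unchanged and no label gets decorated.
assert proj_heads == heads and deco_deps == deps
```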
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index ace25f8cc..4b27901ad 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -273,7 +273,7 @@ def test_issue1963(en_tokenizer):
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab())
     example = Example(doc=None)
-    example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
+    example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
     ner.moves.get_actions(gold_parses=[example])
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index b43eb3431..d1255c176 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -36,6 +36,16 @@ def doc():
     return doc

+@pytest.fixture()
+def merged_dict():
+    return {
+        "ids": [1, 2, 3, 4, 5, 6, 7],
+        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
+        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
+        "sent_starts": [1, 0, 0, 1, 0, 0, 0],
+    }
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -231,7 +241,7 @@ def test_ignore_misaligned(doc):
     deps = [t.dep_ for t in doc]
     heads = [t.head.i for t in doc]

-    use_new_align = spacy.gold.USE_NEW_ALIGN
+    saved_use_new_align = spacy.gold.USE_NEW_ALIGN
     spacy.gold.USE_NEW_ALIGN = False
     with make_tempdir() as tmpdir:
@@ -270,7 +280,25 @@ def test_ignore_misaligned(doc):
                                                        ignore_misaligned=True))
         assert len(train_reloaded_example) == 0

-    spacy.gold.USE_NEW_ALIGN = use_new_align
+    spacy.gold.USE_NEW_ALIGN = saved_use_new_align
+
+
+def test_make_orth_variants(doc):
+    nlp = English()
+    text = doc.text
+    deps = [t.dep_ for t in doc]
+    heads = [t.head.i for t in doc]
+
+    with make_tempdir() as tmpdir:
+        jsonl_file = tmpdir / "test.jsonl"
+        # write to JSONL train dicts
+        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
+        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+        # due to randomness, test only that this runs with no errors for now
+        train_reloaded_example = next(goldcorpus.train_dataset(nlp,
+                                      orth_variant_level=0.2))
+        train_goldparse = train_reloaded_example.gold

 # xfail while we have backwards-compatible alignment
@@ -386,71 +414,38 @@ def _train(train_data):
         nlp.update(batch, sgd=optimizer, losses=losses)

-tokens_1 = {
-    "ids": [1, 2, 3],
-    "words": ["Hi", "there", "everyone"],
-    "tags": ["INTJ", "ADV", "PRON"],
-}
-
-tokens_2 = {
-    "ids": [1, 2, 3, 4],
-    "words": ["It", "is", "just", "me"],
-    "tags": ["PRON", "AUX", "ADV", "PRON"],
-}
-
-text0 = "Hi there everyone It is just me"
-
-
-def test_merge_sents():
+def test_split_sents(merged_dict):
     nlp = English()
     example = Example()
-    example.add_token_annotation(**tokens_1)
-    example.add_token_annotation(**tokens_2)
+    example.set_token_annotation(**merged_dict)
     assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
-    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1  # this shouldn't change the original object
+    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1

-    merged_example = example.merge_sents()
+    split_examples = example.split_sents()
+    assert len(split_examples) == 2

-    token_annotation_1 = example.token_annotations[0]
+    token_annotation_1 = split_examples[0].token_annotation
     assert token_annotation_1.ids == [1, 2, 3]
     assert token_annotation_1.words == ["Hi", "there", "everyone"]
     assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
+    assert token_annotation_1.sent_starts == [1, 0, 0]

-    token_annotation_m = merged_example.token_annotations[0]
-    assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7]
-    assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"]
-    assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"]
+    token_annotation_2 = split_examples[1].token_annotation
+    assert token_annotation_2.ids == [4, 5, 6, 7]
+    assert token_annotation_2.words == ["It", "is", "just", "me"]
+    assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
+    assert token_annotation_2.sent_starts == [1, 0, 0, 0]
"everyone", "It", "is", "just", "me"] - assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"] + token_annotation_2 = split_examples[1].token_annotation + assert token_annotation_2.ids == [4, 5, 6, 7] + assert token_annotation_2.words == ["It", "is", "just", "me"] + assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"] + assert token_annotation_2.sent_starts == [1, 0, 0, 0] -def test_tuples_to_example(): +def test_tuples_to_example(merged_dict): ex = Example() - ex.add_token_annotation(**tokens_1) - ex.add_token_annotation(**tokens_2) - ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0}) + ex.set_token_annotation(**merged_dict) + cats = {"TRAVEL": 1.0, "BAKING": 0.0} + ex.set_doc_annotation(cats=cats) ex_dict = ex.to_dict() - token_dicts = [ - { - "ids": [1, 2, 3], - "words": ["Hi", "there", "everyone"], - "tags": ["INTJ", "ADV", "PRON"], - "heads": [], - "deps": [], - "entities": [], - "morphology": [], - "brackets": [], - }, - { - "ids": [1, 2, 3, 4], - "words": ["It", "is", "just", "me"], - "tags": ["PRON", "AUX", "ADV", "PRON"], - "heads": [], - "deps": [], - "entities": [], - "morphology": [], - "brackets": [], - }, - ] - doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}} - - assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict} + assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] + assert ex_dict["token_annotation"]["words"] == merged_dict["words"] + assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] + assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"] + assert ex_dict["doc_annotation"]["cats"] == cats diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index e8d74c405..92a607e5b 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -86,7 +86,7 @@ def test_ner_per_type(en_vocab): ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) ex = Example(doc=doc) - ex.add_token_annotation(entities=annot["entities"]) + ex.set_token_annotation(entities=annot["entities"]) scorer.score(ex) results = scorer.scores @@ -107,7 +107,7 @@ def test_ner_per_type(en_vocab): ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) ex = Example(doc=doc) - ex.add_token_annotation(entities=annot["entities"]) + ex.set_token_annotation(entities=annot["entities"]) scorer.score(ex) results = scorer.scores