various fixes in scripts - needs to be further tested

svlandeg 2020-06-17 12:05:58 +02:00
parent 3c4f9e4cc4
commit f7ad8e8c83
9 changed files with 63 additions and 126 deletions
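
The diffs below all apply the same refactor: instead of building GoldParse objects, the scripts keep their annotations as plain dicts and wrap them in Example objects via Example.from_dict before passing them to nlp.update. A rough sketch of that pattern follows (annotation values are illustrative, and nlp, optimizer and losses are assumed to come from the surrounding training loop, as in the scripts below):

    from spacy.gold import Example
    from spacy.tokens import Doc

    # plain-dict annotations replace the old GoldParse object
    annotations = {"words": ["I", "like", "London"],
                   "entities": ["O", "O", "U-LOC"]}
    doc = Doc(nlp.vocab, words=annotations["words"])
    example = Example.from_dict(doc, annotations)
    nlp.update(examples=[example], sgd=optimizer, losses=losses)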

View File

@@ -14,7 +14,7 @@ import spacy
 import spacy.util
 from bin.ud import conll17_ud_eval
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse, Example
+from spacy.gold import Example
 from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
@@ -83,11 +83,11 @@ def read_data(
             sent["heads"].append(head)
             sent["deps"].append("ROOT" if dep == "root" else dep)
             sent["spaces"].append(space_after == "_")
-        sent["entities"] = ["-"] * len(sent["words"])
+        sent["entities"] = ["-"] * len(sent["words"])  # TODO: doc-level format
         sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
         if oracle_segments:
             docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
-            golds.append(GoldParse(docs[-1], **sent))
+            golds.append(sent)
             assert golds[-1].morphology is not None
         sent_annots.append(sent)
@@ -151,28 +151,27 @@ def read_conllu(file_):

 def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
     # Flatten the conll annotations, and adjust the head indices
-    flat = defaultdict(list)
+    gold = defaultdict(list)
     sent_starts = []
     for sent in sent_annots:
-        flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
+        gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
         for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
-            flat[field].extend(sent[field])
+            gold[field].extend(sent[field])
         sent_starts.append(True)
         sent_starts.extend([False] * (len(sent["words"]) - 1))
     # Construct text if necessary
-    assert len(flat["words"]) == len(flat["spaces"])
+    assert len(gold["words"]) == len(gold["spaces"])
     if text is None:
         text = "".join(
-            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
         )
     doc = nlp.make_doc(text)
-    flat.pop("spaces")
-    gold = GoldParse(doc, **flat)
-    gold.sent_starts = sent_starts
+    gold.pop("spaces")
+    gold["sent_starts"] = sent_starts
     for i in range(len(gold.heads)):
         if random.random() < drop_deps:
-            gold.heads[i] = None
-            gold.labels[i] = None
+            gold["heads"][i] = None
+            gold["labels"][i] = None
     return doc, gold
@@ -183,15 +182,10 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):

 def golds_to_gold_data(docs, golds):
-    """Get out the training data format used by begin_training, given the
-    GoldParse objects."""
+    """Get out the training data format used by begin_training"""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example(doc=doc)
-        example.add_doc_annotation(cats=gold.cats)
-        token_annotation_dict = gold.orig.to_dict()
-        example.add_token_annotation(**token_annotation_dict)
-        example.goldparse = gold
+        example = Example.from_dict(doc, gold)
         data.append(example)
     return data
@@ -359,8 +353,8 @@ def initialize_pipeline(nlp, examples, config, device):
         nlp.parser.add_multitask_objective("tag")
     if config.multitask_sent:
         nlp.parser.add_multitask_objective("sent_start")
-    for ex in examples:
-        gold = ex.gold
+    for eg in examples:
+        gold = eg.gold
         for tag in gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
@@ -541,7 +535,7 @@ def main(
         else:
             batches = minibatch(examples, size=batch_sizes)
         losses = {}
-        n_train_words = sum(len(ex.doc) for ex in examples)
+        n_train_words = sum(len(eg.doc) for eg in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
                 pbar.update(sum(len(ex.doc) for ex in batch))

View File

@@ -12,7 +12,7 @@ import tqdm
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse, Example
+from spacy.gold import Example
 from spacy.syntax.nonproj import projectivize
 from collections import defaultdict
 from spacy.matcher import Matcher
@@ -33,31 +33,6 @@ random.seed(0)
 numpy.random.seed(0)

-def minibatch_by_words(examples, size=5000):
-    random.shuffle(examples)
-    if isinstance(size, int):
-        size_ = itertools.repeat(size)
-    else:
-        size_ = size
-    examples = iter(examples)
-    while True:
-        batch_size = next(size_)
-        batch = []
-        while batch_size >= 0:
-            try:
-                example = next(examples)
-            except StopIteration:
-                if batch:
-                    yield batch
-                return
-            batch_size -= len(example.doc)
-            batch.append(example)
-        if batch:
-            yield batch
-        else:
-            break
-
 ################
 # Data reading #
 ################
@@ -110,7 +85,7 @@ def read_data(
         sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
         if oracle_segments:
             docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
-            golds.append(GoldParse(docs[-1], **sent))
+            golds.append(sent)
         sent_annots.append(sent)

         if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
@@ -159,20 +134,19 @@ def read_conllu(file_):

 def _make_gold(nlp, text, sent_annots):
     # Flatten the conll annotations, and adjust the head indices
-    flat = defaultdict(list)
+    gold = defaultdict(list)
     for sent in sent_annots:
-        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
+        gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"])
         for field in ["words", "tags", "deps", "entities", "spaces"]:
-            flat[field].extend(sent[field])
+            gold[field].extend(sent[field])
     # Construct text if necessary
-    assert len(flat["words"]) == len(flat["spaces"])
+    assert len(gold["words"]) == len(gold["spaces"])
     if text is None:
         text = "".join(
-            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
         )
     doc = nlp.make_doc(text)
-    flat.pop("spaces")
-    gold = GoldParse(doc, **flat)
+    gold.pop("spaces")
     return doc, gold
@@ -182,15 +156,10 @@ def _make_gold(nlp, text, sent_annots):

 def golds_to_gold_data(docs, golds):
-    """Get out the training data format used by begin_training, given the
-    GoldParse objects."""
+    """Get out the training data format used by begin_training."""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example(doc=doc)
-        example.add_doc_annotation(cats=gold.cats)
-        token_annotation_dict = gold.orig.to_dict()
-        example.add_token_annotation(**token_annotation_dict)
-        example.goldparse = gold
+        example = Example.from_dict(doc, gold)
         data.append(example)
     return data
@@ -313,15 +282,15 @@ def initialize_pipeline(nlp, examples, config):
         nlp.parser.add_multitask_objective("sent_start")
     nlp.parser.moves.add_action(2, "subtok")
     nlp.add_pipe(nlp.create_pipe("tagger"))
-    for ex in examples:
-        for tag in ex.gold.tags:
+    for eg in examples:
+        for tag in eg.gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
     # Replace labels that didn't make the frequency cutoff
     actions = set(nlp.parser.labels)
     label_set = set([act.split("-")[1] for act in actions if "-" in act])
-    for ex in examples:
-        gold = ex.gold
+    for eg in examples:
+        gold = eg.gold
         for i, label in enumerate(gold.labels):
             if label is not None and label not in label_set:
                 gold.labels[i] = label.split("||")[0]
@@ -415,13 +384,12 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
     optimizer = initialize_pipeline(nlp, examples, config)

    for i in range(config.nr_epoch):
-        docs = [nlp.make_doc(example.doc.text) for example in examples]
-        batches = minibatch_by_words(examples, size=config.batch_size)
+        batches = spacy.minibatch_by_words(examples, size=config.batch_size)
         losses = {}
-        n_train_words = sum(len(doc) for doc in docs)
+        n_train_words = sum(len(eg.reference.doc) for eg in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                pbar.update(sum(len(ex.doc) for ex in batch))
+                pbar.update(sum(len(eg.reference.doc) for eg in batch))
                 nlp.update(
                     examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
                 )

View File

@@ -24,8 +24,10 @@ import random
 import plac
 import spacy
 import os.path
+from spacy.gold.example import Example
 from spacy.tokens import Doc
-from spacy.gold import read_json_file, GoldParse
+from spacy.gold import read_json_file
 random.seed(0)
@@ -59,17 +61,15 @@ def main(n_iter=10):
     print(nlp.pipeline)
     print("Create data", len(TRAIN_DATA))
-    optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
+    optimizer = nlp.begin_training()
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for example in TRAIN_DATA:
-            for token_annotation in example.token_annotations:
-                doc = Doc(nlp.vocab, words=token_annotation.words)
-                gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
+        for example_dict in TRAIN_DATA:
+            doc = Doc(nlp.vocab, words=example_dict["words"])
+            example = Example.from_dict(doc, example_dict)
             nlp.update(
-                examples=[(doc, gold)],  # 1 example
+                examples=[example],  # 1 example
                 drop=0.2,  # dropout - make it harder to memorise data
                 sgd=optimizer,  # callable to update weights
                 losses=losses,
@@ -77,9 +77,9 @@ def main(n_iter=10):
         print(losses.get("nn_labeller", 0.0), losses["ner"])

     # test the trained model
-    for example in TRAIN_DATA:
-        if example.text is not None:
-            doc = nlp(example.text)
+    for example_dict in TRAIN_DATA:
+        if "text" in example_dict:
+            doc = nlp(example_dict["text"])
             print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
             print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

View File

@@ -1,8 +1,7 @@
 import re

 from ...gold import Example
-from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
-from ...gold import TokenAnnotation
+from ...gold import iob_to_biluo, spans_from_biluo_tags
 from ...language import Language
 from ...tokens import Doc, Token
 from .conll_ner2json import n_sents_info
@@ -42,10 +41,10 @@ def conllu2json(
     )
     has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
     for i, example in enumerate(conll_data):
-        raw += example.text
+        raw += example.predicted.text
         sentences.append(
             generate_sentence(
-                example.token_annotation,
+                example,
                 has_ner_tags,
                 MISC_NER_PATTERN,
                 ner_map=ner_map,
@@ -268,36 +267,14 @@ def example_from_conllu_sentence(
     doc = merge_conllu_subtokens(lines, doc)

     # create Example from custom Doc annotation
-    ids, words, tags, heads, deps = [], [], [], [], []
-    pos, lemmas, morphs, spaces = [], [], [], []
+    words, spaces = [], []
     for i, t in enumerate(doc):
-        ids.append(i)
         words.append(t._.merged_orth)
-        if append_morphology and t._.merged_morph:
-            tags.append(t.tag_ + "__" + t._.merged_morph)
-        else:
-            tags.append(t.tag_)
-        pos.append(t.pos_)
-        morphs.append(t._.merged_morph)
-        lemmas.append(t._.merged_lemma)
-        heads.append(t.head.i)
-        deps.append(t.dep_)
         spaces.append(t._.merged_spaceafter)
-    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
-    ents = biluo_tags_from_offsets(doc, ent_offsets)
-    example = Example(doc=Doc(vocab, words=words, spaces=spaces))
-    example.token_annotation = TokenAnnotation(
-        ids=ids,
-        words=words,
-        tags=tags,
-        pos=pos,
-        morphs=morphs,
-        lemmas=lemmas,
-        heads=heads,
-        deps=deps,
-        entities=ents,
-    )
-    return example
+        if append_morphology and t._.merged_morph:
+            t.tag_ = t.tag_ + "__" + t._.merged_morph
+    return Example(predicted=Doc(vocab, words=words, spaces=spaces), reference=doc)


 def merge_conllu_subtokens(lines, doc):

View File

@@ -69,6 +69,7 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"):
 def read_json_file(loc, docs_filter=None, limit=None):
+    """Read Example dictionaries from a json file or directory."""
     loc = util.ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
@@ -105,7 +106,7 @@ def json_to_annotations(doc):
         sent_start_i = len(words)
         for i, token in enumerate(sent["tokens"]):
             words.append(token["orth"])
-            spaces.append(token["space"])
+            spaces.append(token.get("space", True))
             ids.append(token.get('id', sent_start_i + i))
             tags.append(token.get('tag', "-"))
             pos.append(token.get("pos", ""))

View File

@@ -804,7 +804,6 @@ class Language(object):
         cleanup=False,
         component_cfg=None,
         n_process=1,
-        as_example=False,
     ):
         """Process texts as a stream, and yield `Doc` objects in order.
@@ -837,8 +836,7 @@
                 batch_size=batch_size,
                 disable=disable,
                 n_process=n_process,
-                component_cfg=component_cfg,
-                as_example=as_example,
+                component_cfg=component_cfg
             )
             for doc, context in zip(docs, contexts):
                 yield (doc, context)

View File

@@ -26,7 +26,7 @@ def test_sentencizer_pipe():
         sent_starts = [t.is_sent_start for t in doc]
         assert sent_starts == [True, False, True, False, False, False, False]
         assert len(list(doc.sents)) == 2
-    for ex in nlp.pipe(texts, as_example=True):
+    for ex in nlp.pipe(texts):
         doc = ex.doc
         assert doc.is_sentenced
         sent_starts = [t.is_sent_start for t in doc]

View File

@@ -1,5 +1,4 @@
 import pytest
-from spacy.gold import Example

 from .util import get_random_doc

View File

@@ -205,7 +205,7 @@ cdef class Tokenizer:
         doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc

-    def pipe(self, texts, batch_size=1000, n_threads=-1, as_example=False):
+    def pipe(self, texts, batch_size=1000, n_threads=-1):
         """Tokenize a stream of texts.

         texts: A sequence of unicode texts.