fix test checking for variants

This commit is contained in:
svlandeg 2020-06-19 14:05:35 +02:00 committed by Matthew Honnibal
parent a427ca9355
commit 6d5bfd6f6a
9 changed files with 27 additions and 59 deletions

View File

@ -9,7 +9,6 @@ max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
noise_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600

View File

@ -9,7 +9,6 @@ max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
noise_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600

View File

@ -6,7 +6,6 @@ init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
noise_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = 0

View File

@ -6,7 +6,6 @@ init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
noise_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = -1

View File

@ -4,7 +4,7 @@ from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train_from_config import train_cli # noqa: F401
from .train import train_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401

View File

@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg):
train_examples = list(
corpus.train_dataset(
nlp,
noise_level=cfg["noise_level"], # I think this is deprecated?
orth_variant_level=cfg["orth_variant_level"],
gold_preproc=cfg["gold_preproc"],
max_length=cfg["max_length"],

View File

@ -2,6 +2,15 @@ import random
import itertools
def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
    """Build a new Example whose text and token annotation have been passed
    through make_orth_variants at the given augmentation level.

    The original example is not modified; its serialized dict is patched with
    the variant token annotation and re-parsed against a doc made from the
    variant text.
    """
    orig_dict = example.to_dict()
    variant_text, variant_token_annot = make_orth_variants(
        nlp, example.text, orig_dict["token_annotation"], orth_variant_level
    )
    variant_doc = nlp.make_doc(variant_text)
    orig_dict["token_annotation"] = variant_token_annot
    return example.from_dict(variant_doc, orig_dict)
def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
if random.random() >= orth_variant_level:
return raw_text, orig_token_dict
@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
raw_idx += 1
raw = variant_raw
return raw, token_dict
def add_noise(orig, noise_level):
    """Randomly corrupt *orig* with probability ``noise_level``.

    *orig* may be a list of words (each word is corrupted independently and
    words corrupted to an empty value are dropped) or a string (corrupted
    character by character). With probability ``1 - noise_level`` the input
    is returned untouched.
    """
    if random.random() >= noise_level:
        return orig
    if type(orig) == list:
        # Corrupt word-by-word, then drop any word that became falsy.
        return [w for w in (_corrupt(word, noise_level) for word in orig) if w]
    # String input: corrupt each character and reassemble.
    return "".join(_corrupt(char, noise_level) for char in orig)
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
elif c in [".", "'", "!", "?", ","]:
return "\n"
else:
return c.lower()

View File

@ -8,7 +8,7 @@ from ..tokens import Doc
from .. import util
from ..errors import Errors, AlignmentError
from .gold_io import read_json_file, json_to_annotations
from .augment import make_orth_variants, add_noise
from .augment import make_orth_variants
from .example import Example
@ -148,7 +148,6 @@ class GoldCorpus(object):
nlp,
gold_preproc=False,
max_length=None,
noise_level=0.0,
orth_variant_level=0.0,
ignore_misaligned=False,
):
@ -160,7 +159,6 @@ class GoldCorpus(object):
train_annotations,
gold_preproc,
max_length=max_length,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True,
ignore_misaligned=ignore_misaligned,
@ -194,33 +192,31 @@ class GoldCorpus(object):
annotations,
gold_preproc,
max_length=None,
noise_level=0.0,
orth_variant_level=0.0,
make_projective=False,
ignore_misaligned=False,
):
""" Setting gold_preproc will result in creating a doc per sentence """
for eg_dict in annotations:
token_annot = eg_dict.get("token_annotation", {})
if eg_dict["text"]:
example = Example.from_dict(
nlp.make_doc(eg_dict["text"]),
eg_dict
)
doc = nlp.make_doc(eg_dict["text"])
elif "words" in token_annot:
doc = Doc(nlp.vocab, words=token_annot["words"])
else:
example = Example.from_dict(
Doc(nlp.vocab, words=eg_dict["words"]),
eg_dict
)
raise ValueError("Expecting either 'text' or token_annotation.words annotation")
if gold_preproc:
# TODO: Data augmentation
variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level)
doc = nlp.make_doc(variant_text)
eg_dict["token_annotation"] = variant_token_annot
example = Example.from_dict(doc, eg_dict)
examples = example.split_sents()
else:
example = Example.from_dict(doc, eg_dict)
examples = [example]
for eg in examples:
if (not max_length) or len(eg.predicted) < max_length:
if ignore_misaligned:
try:
_ = eg._deprecated_get_gold()
except AlignmentError:
continue
yield eg

View File

@ -11,6 +11,7 @@ import pytest
import srsly
from .util import make_tempdir
from ..gold.augment import make_orth_variants_example
@pytest.fixture
@ -387,8 +388,8 @@ def test_make_orth_variants(doc):
goldcorpus = GoldCorpus(str(json_file), str(json_file))
# due to randomness, test only that this runs with no errors for now
train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
train_example = next(goldcorpus.train_dataset(nlp))
variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
@pytest.mark.parametrize(
@ -499,19 +500,6 @@ def test_split_sents(merged_dict):
)
assert example.text == "Hi there everyone It is just me"
assert len(get_parses_from_example(
example,
merge=False,
vocab=nlp.vocab,
make_projective=False)
) == 2
assert len(get_parses_from_example(
example,
merge=True,
vocab=nlp.vocab,
make_projective=False
)) == 1
split_examples = example.split_sents()
assert len(split_examples) == 2
assert split_examples[0].text == "Hi there everyone "