diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index f76336d84..337fe0379 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -9,7 +9,6 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 -noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 40885b6e8..83991f888 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -9,7 +9,6 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 -noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 905b5b4e0..f1b702a4e 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -6,7 +6,6 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 -noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = 0 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 7383116e7..1c946ac60 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -6,7 +6,6 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 -noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = -1 diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2ffbe2d0c..6f09c6884 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,7 +4,7 @@ from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train_from_config import train_cli # noqa: F401 +from .train import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train.py similarity index 99% rename from spacy/cli/train_from_config.py rename to spacy/cli/train.py index 3a4d28356..fb4347158 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train.py @@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list( corpus.train_dataset( nlp, - noise_level=cfg["noise_level"], # I think this is deprecated? orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py index a129793c8..dda51cda6 100644 --- a/spacy/gold/augment.py +++ b/spacy/gold/augment.py @@ -2,6 +2,15 @@ import random import itertools +def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming + raw_text = example.text + orig_dict = example.to_dict() + variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level) + doc = nlp.make_doc(variant_text) + orig_dict["token_annotation"] = variant_token_annot + return example.from_dict(doc, orig_dict) + + def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): if random.random() >= orth_variant_level: return raw_text, orig_token_dict @@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): raw_idx += 1 raw = variant_raw return raw, token_dict - - -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return "".join(_corrupt(c, noise_level) for c in orig) - - -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c in [".", "'", "!", "?", ","]: - return "\n" - else: - return c.lower() diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index d55845fb8..c84f8355f 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -8,7 +8,7 @@ from ..tokens import Doc from .. import util from ..errors import Errors, AlignmentError from .gold_io import read_json_file, json_to_annotations -from .augment import make_orth_variants, add_noise +from .augment import make_orth_variants from .example import Example @@ -148,7 +148,6 @@ class GoldCorpus(object): nlp, gold_preproc=False, max_length=None, - noise_level=0.0, orth_variant_level=0.0, ignore_misaligned=False, ): @@ -160,7 +159,6 @@ class GoldCorpus(object): train_annotations, gold_preproc, max_length=max_length, - noise_level=noise_level, orth_variant_level=orth_variant_level, make_projective=True, ignore_misaligned=ignore_misaligned, @@ -194,33 +192,31 @@ class GoldCorpus(object): annotations, gold_preproc, max_length=None, - noise_level=0.0, orth_variant_level=0.0, make_projective=False, ignore_misaligned=False, ): """ Setting gold_preproc will result in creating a doc per sentence """ for eg_dict in annotations: + token_annot = eg_dict.get("token_annotation", {}) if eg_dict["text"]: - example = Example.from_dict( - nlp.make_doc(eg_dict["text"]), - eg_dict - ) + doc = nlp.make_doc(eg_dict["text"]) + elif "words" in token_annot: + doc = Doc(nlp.vocab, words=token_annot["words"]) else: - example = Example.from_dict( - Doc(nlp.vocab, words=eg_dict["words"]), - eg_dict - ) + raise ValueError("Expecting either 'text' or token_annotation.words annotation") + if gold_preproc: - # TODO: Data augmentation + variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level) + doc = nlp.make_doc(variant_text) + eg_dict["token_annotation"] = variant_token_annot + example = Example.from_dict(doc, eg_dict) examples = example.split_sents() + else: + example = Example.from_dict(doc, eg_dict) examples = [example] + for eg in examples: if (not max_length) or len(eg.predicted) < max_length: - if ignore_misaligned: - try: - _ = eg._deprecated_get_gold() - except AlignmentError: - continue yield eg diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index f76b0c1e1..726492138 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -11,6 +11,7 @@ import pytest import srsly from .util import make_tempdir +from ..gold.augment import make_orth_variants_example @pytest.fixture @@ -387,8 +388,8 @@ def test_make_orth_variants(doc): goldcorpus = GoldCorpus(str(json_file), str(json_file)) # due to randomness, test only that this runs with no errors for now - train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) - train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] + train_example = next(goldcorpus.train_dataset(nlp)) + variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) @pytest.mark.parametrize( @@ -499,19 +500,6 @@ def test_split_sents(merged_dict): ) assert example.text == "Hi there everyone It is just me" - assert len(get_parses_from_example( - example, - merge=False, - vocab=nlp.vocab, - make_projective=False) - ) == 2 - assert len(get_parses_from_example( - example, - merge=True, - vocab=nlp.vocab, - make_projective=False - )) == 1 - split_examples = example.split_sents() assert len(split_examples) == 2 assert split_examples[0].text == "Hi there everyone "