mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-05 22:20:34 +03:00
fix test checking for variants
This commit is contained in:
parent
a427ca9355
commit
6d5bfd6f6a
|
@ -9,7 +9,6 @@ max_length = 0
|
|||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
noise_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
|
|
|
@ -9,7 +9,6 @@ max_length = 0
|
|||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
noise_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
|
|
|
@ -6,7 +6,6 @@ init_tok2vec = null
|
|||
vectors = null
|
||||
max_epochs = 100
|
||||
orth_variant_level = 0.0
|
||||
noise_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
use_gpu = 0
|
||||
|
|
|
@ -6,7 +6,6 @@ init_tok2vec = null
|
|||
vectors = null
|
||||
max_epochs = 100
|
||||
orth_variant_level = 0.0
|
||||
noise_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
use_gpu = -1
|
||||
|
|
|
@ -4,7 +4,7 @@ from .download import download # noqa: F401
|
|||
from .info import info # noqa: F401
|
||||
from .package import package # noqa: F401
|
||||
from .profile import profile # noqa: F401
|
||||
from .train_from_config import train_cli # noqa: F401
|
||||
from .train import train_cli # noqa: F401
|
||||
from .pretrain import pretrain # noqa: F401
|
||||
from .debug_data import debug_data # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
|
|
|
@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg):
|
|||
train_examples = list(
|
||||
corpus.train_dataset(
|
||||
nlp,
|
||||
noise_level=cfg["noise_level"], # I think this is deprecated?
|
||||
orth_variant_level=cfg["orth_variant_level"],
|
||||
gold_preproc=cfg["gold_preproc"],
|
||||
max_length=cfg["max_length"],
|
|
@ -2,6 +2,15 @@ import random
|
|||
import itertools
|
||||
|
||||
|
||||
def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
    """Return a new example built from `example` after orth-variant augmentation.

    The example's text and its "token_annotation" entry are handed to
    `make_orth_variants`; the text that comes back is re-tokenized with
    `nlp.make_doc` and paired with the updated annotation dict.

    nlp: Object providing `make_doc` (also forwarded to `make_orth_variants`).
    example: Object providing `text`, `to_dict()` and `from_dict()`.
    orth_variant_level: Forwarded unchanged to `make_orth_variants`.
    RETURNS: Whatever `example.from_dict(doc, annotations)` produces.
    """
    source_text = example.text
    annotations = example.to_dict()
    new_text, new_token_annot = make_orth_variants(
        nlp,
        source_text,
        annotations["token_annotation"],
        orth_variant_level,
    )
    variant_doc = nlp.make_doc(new_text)
    # Swap in the token annotation that matches the (possibly) altered text.
    annotations["token_annotation"] = new_token_annot
    return example.from_dict(variant_doc, annotations)
|
||||
|
||||
|
||||
def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
|
||||
if random.random() >= orth_variant_level:
|
||||
return raw_text, orig_token_dict
|
||||
|
@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
|
|||
raw_idx += 1
|
||||
raw = variant_raw
|
||||
return raw, token_dict
|
||||
|
||||
|
||||
def add_noise(orig, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
return orig
|
||||
elif type(orig) == list:
|
||||
corrupted = [_corrupt(word, noise_level) for word in orig]
|
||||
corrupted = [w for w in corrupted if w]
|
||||
return corrupted
|
||||
else:
|
||||
return "".join(_corrupt(c, noise_level) for c in orig)
|
||||
|
||||
|
||||
def _corrupt(c, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
return c
|
||||
elif c in [".", "'", "!", "?", ","]:
|
||||
return "\n"
|
||||
else:
|
||||
return c.lower()
|
||||
|
|
|
@ -8,7 +8,7 @@ from ..tokens import Doc
|
|||
from .. import util
|
||||
from ..errors import Errors, AlignmentError
|
||||
from .gold_io import read_json_file, json_to_annotations
|
||||
from .augment import make_orth_variants, add_noise
|
||||
from .augment import make_orth_variants
|
||||
from .example import Example
|
||||
|
||||
|
||||
|
@ -148,7 +148,6 @@ class GoldCorpus(object):
|
|||
nlp,
|
||||
gold_preproc=False,
|
||||
max_length=None,
|
||||
noise_level=0.0,
|
||||
orth_variant_level=0.0,
|
||||
ignore_misaligned=False,
|
||||
):
|
||||
|
@ -160,7 +159,6 @@ class GoldCorpus(object):
|
|||
train_annotations,
|
||||
gold_preproc,
|
||||
max_length=max_length,
|
||||
noise_level=noise_level,
|
||||
orth_variant_level=orth_variant_level,
|
||||
make_projective=True,
|
||||
ignore_misaligned=ignore_misaligned,
|
||||
|
@ -194,33 +192,31 @@ class GoldCorpus(object):
|
|||
annotations,
|
||||
gold_preproc,
|
||||
max_length=None,
|
||||
noise_level=0.0,
|
||||
orth_variant_level=0.0,
|
||||
make_projective=False,
|
||||
ignore_misaligned=False,
|
||||
):
|
||||
""" Setting gold_preproc will result in creating a doc per sentence """
|
||||
for eg_dict in annotations:
|
||||
token_annot = eg_dict.get("token_annotation", {})
|
||||
if eg_dict["text"]:
|
||||
example = Example.from_dict(
|
||||
nlp.make_doc(eg_dict["text"]),
|
||||
eg_dict
|
||||
)
|
||||
doc = nlp.make_doc(eg_dict["text"])
|
||||
elif "words" in token_annot:
|
||||
doc = Doc(nlp.vocab, words=token_annot["words"])
|
||||
else:
|
||||
example = Example.from_dict(
|
||||
Doc(nlp.vocab, words=eg_dict["words"]),
|
||||
eg_dict
|
||||
)
|
||||
raise ValueError("Expecting either 'text' or token_annotation.words annotation")
|
||||
|
||||
if gold_preproc:
|
||||
# TODO: Data augmentation
|
||||
variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level)
|
||||
doc = nlp.make_doc(variant_text)
|
||||
eg_dict["token_annotation"] = variant_token_annot
|
||||
example = Example.from_dict(doc, eg_dict)
|
||||
examples = example.split_sents()
|
||||
|
||||
else:
|
||||
example = Example.from_dict(doc, eg_dict)
|
||||
examples = [example]
|
||||
|
||||
for eg in examples:
|
||||
if (not max_length) or len(eg.predicted) < max_length:
|
||||
if ignore_misaligned:
|
||||
try:
|
||||
_ = eg._deprecated_get_gold()
|
||||
except AlignmentError:
|
||||
continue
|
||||
yield eg
|
||||
|
|
|
@ -11,6 +11,7 @@ import pytest
|
|||
import srsly
|
||||
|
||||
from .util import make_tempdir
|
||||
from ..gold.augment import make_orth_variants_example
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -387,8 +388,8 @@ def test_make_orth_variants(doc):
|
|||
goldcorpus = GoldCorpus(str(json_file), str(json_file))
|
||||
|
||||
# due to randomness, test only that this runs with no errors for now
|
||||
train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
|
||||
train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
|
||||
train_example = next(goldcorpus.train_dataset(nlp))
|
||||
variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
@ -499,19 +500,6 @@ def test_split_sents(merged_dict):
|
|||
)
|
||||
assert example.text == "Hi there everyone It is just me"
|
||||
|
||||
assert len(get_parses_from_example(
|
||||
example,
|
||||
merge=False,
|
||||
vocab=nlp.vocab,
|
||||
make_projective=False)
|
||||
) == 2
|
||||
assert len(get_parses_from_example(
|
||||
example,
|
||||
merge=True,
|
||||
vocab=nlp.vocab,
|
||||
make_projective=False
|
||||
)) == 1
|
||||
|
||||
split_examples = example.split_sents()
|
||||
assert len(split_examples) == 2
|
||||
assert split_examples[0].text == "Hi there everyone "
|
||||
|
|
Loading…
Reference in New Issue
Block a user