fix augment (needs further testing)

This commit is contained in:
svlandeg 2020-06-17 10:46:29 +02:00
parent 4ed399c848
commit 3c4f9e4cc4

View File

@@ -1,14 +1,14 @@
import random import random
import itertools import itertools
from .example import Example
def make_orth_variants(nlp, example, orth_variant_level=0.0): def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
if random.random() >= orth_variant_level: if random.random() >= orth_variant_level:
return example return raw_text, orig_token_dict
if not example.token_annotation: if not orig_token_dict:
return example return raw_text, orig_token_dict
raw = example.text raw = raw_text
token_dict = orig_token_dict
lower = False lower = False
if random.random() >= 0.5: if random.random() >= 0.5:
lower = True lower = True
@@ -16,16 +16,10 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
raw = raw.lower() raw = raw.lower()
ndsv = nlp.Defaults.single_orth_variants ndsv = nlp.Defaults.single_orth_variants
ndpv = nlp.Defaults.paired_orth_variants ndpv = nlp.Defaults.paired_orth_variants
# modify words in paragraph_tuples words = token_dict.get("words", [])
variant_example = Example(doc=nlp.make_doc(raw)) tags = token_dict.get("tags", [])
token_annotation = example.token_annotation # keep unmodified if words or tags are not defined
words = token_annotation.words if words and tags:
tags = token_annotation.tags
if not words or not tags:
# add the unmodified annotation
token_dict = token_annotation.to_dict()
variant_example.token_annotation = TokenAnnotation(**token_dict)
else:
if lower: if lower:
words = [w.lower() for w in words] words = [w.lower() for w in words]
# single variants # single variants
@@ -56,12 +50,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
if words[word_idx] in pair: if words[word_idx] in pair:
pair_idx = pair.index(words[word_idx]) pair_idx = pair.index(words[word_idx])
words[word_idx] = punct_choices[punct_idx][pair_idx] words[word_idx] = punct_choices[punct_idx][pair_idx]
token_dict = token_annotation.to_dict()
token_dict["words"] = words token_dict["words"] = words
token_dict["tags"] = tags token_dict["tags"] = tags
variant_example.token_annotation = TokenAnnotation(**token_dict) # modify raw
# modify raw to match variant_paragraph_tuples
if raw is not None: if raw is not None:
variants = [] variants = []
for single_variants in ndsv: for single_variants in ndsv:
@@ -80,7 +71,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
while raw_idx < len(raw) and raw[raw_idx].isspace(): while raw_idx < len(raw) and raw[raw_idx].isspace():
variant_raw += raw[raw_idx] variant_raw += raw[raw_idx]
raw_idx += 1 raw_idx += 1
for word in variant_example.token_annotation.words: for word in words:
match_found = False match_found = False
# skip whitespace words # skip whitespace words
if word.isspace(): if word.isspace():
@@ -100,14 +91,13 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
# something went wrong, abort # something went wrong, abort
# (add a warning message?) # (add a warning message?)
if not match_found: if not match_found:
return example return raw_text, orig_token_dict
# add following whitespace # add following whitespace
while raw_idx < len(raw) and raw[raw_idx].isspace(): while raw_idx < len(raw) and raw[raw_idx].isspace():
variant_raw += raw[raw_idx] variant_raw += raw[raw_idx]
raw_idx += 1 raw_idx += 1
variant_example.doc = variant_raw raw = variant_raw
return variant_example return raw, token_dict
return variant_example
def add_noise(orig, noise_level): def add_noise(orig, noise_level):