mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 17:10:36 +03:00
fix augment (needs further testing)
This commit is contained in:
parent
4ed399c848
commit
3c4f9e4cc4
|
@ -1,14 +1,14 @@
|
||||||
import random
|
import random
|
||||||
import itertools
|
import itertools
|
||||||
from .example import Example
|
|
||||||
|
|
||||||
|
|
||||||
def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
|
||||||
if random.random() >= orth_variant_level:
|
if random.random() >= orth_variant_level:
|
||||||
return example
|
return raw_text, orig_token_dict
|
||||||
if not example.token_annotation:
|
if not orig_token_dict:
|
||||||
return example
|
return raw_text, orig_token_dict
|
||||||
raw = example.text
|
raw = raw_text
|
||||||
|
token_dict = orig_token_dict
|
||||||
lower = False
|
lower = False
|
||||||
if random.random() >= 0.5:
|
if random.random() >= 0.5:
|
||||||
lower = True
|
lower = True
|
||||||
|
@ -16,16 +16,10 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
||||||
raw = raw.lower()
|
raw = raw.lower()
|
||||||
ndsv = nlp.Defaults.single_orth_variants
|
ndsv = nlp.Defaults.single_orth_variants
|
||||||
ndpv = nlp.Defaults.paired_orth_variants
|
ndpv = nlp.Defaults.paired_orth_variants
|
||||||
# modify words in paragraph_tuples
|
words = token_dict.get("words", [])
|
||||||
variant_example = Example(doc=nlp.make_doc(raw))
|
tags = token_dict.get("tags", [])
|
||||||
token_annotation = example.token_annotation
|
# keep unmodified if words or tags are not defined
|
||||||
words = token_annotation.words
|
if words and tags:
|
||||||
tags = token_annotation.tags
|
|
||||||
if not words or not tags:
|
|
||||||
# add the unmodified annotation
|
|
||||||
token_dict = token_annotation.to_dict()
|
|
||||||
variant_example.token_annotation = TokenAnnotation(**token_dict)
|
|
||||||
else:
|
|
||||||
if lower:
|
if lower:
|
||||||
words = [w.lower() for w in words]
|
words = [w.lower() for w in words]
|
||||||
# single variants
|
# single variants
|
||||||
|
@ -56,12 +50,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
||||||
if words[word_idx] in pair:
|
if words[word_idx] in pair:
|
||||||
pair_idx = pair.index(words[word_idx])
|
pair_idx = pair.index(words[word_idx])
|
||||||
words[word_idx] = punct_choices[punct_idx][pair_idx]
|
words[word_idx] = punct_choices[punct_idx][pair_idx]
|
||||||
|
|
||||||
token_dict = token_annotation.to_dict()
|
|
||||||
token_dict["words"] = words
|
token_dict["words"] = words
|
||||||
token_dict["tags"] = tags
|
token_dict["tags"] = tags
|
||||||
variant_example.token_annotation = TokenAnnotation(**token_dict)
|
# modify raw
|
||||||
# modify raw to match variant_paragraph_tuples
|
|
||||||
if raw is not None:
|
if raw is not None:
|
||||||
variants = []
|
variants = []
|
||||||
for single_variants in ndsv:
|
for single_variants in ndsv:
|
||||||
|
@ -80,7 +71,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
||||||
while raw_idx < len(raw) and raw[raw_idx].isspace():
|
while raw_idx < len(raw) and raw[raw_idx].isspace():
|
||||||
variant_raw += raw[raw_idx]
|
variant_raw += raw[raw_idx]
|
||||||
raw_idx += 1
|
raw_idx += 1
|
||||||
for word in variant_example.token_annotation.words:
|
for word in words:
|
||||||
match_found = False
|
match_found = False
|
||||||
# skip whitespace words
|
# skip whitespace words
|
||||||
if word.isspace():
|
if word.isspace():
|
||||||
|
@ -100,14 +91,13 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
||||||
# something went wrong, abort
|
# something went wrong, abort
|
||||||
# (add a warning message?)
|
# (add a warning message?)
|
||||||
if not match_found:
|
if not match_found:
|
||||||
return example
|
return raw_text, orig_token_dict
|
||||||
# add following whitespace
|
# add following whitespace
|
||||||
while raw_idx < len(raw) and raw[raw_idx].isspace():
|
while raw_idx < len(raw) and raw[raw_idx].isspace():
|
||||||
variant_raw += raw[raw_idx]
|
variant_raw += raw[raw_idx]
|
||||||
raw_idx += 1
|
raw_idx += 1
|
||||||
variant_example.doc = variant_raw
|
raw = variant_raw
|
||||||
return variant_example
|
return raw, token_dict
|
||||||
return variant_example
|
|
||||||
|
|
||||||
|
|
||||||
def add_noise(orig, noise_level):
|
def add_noise(orig, noise_level):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user