Fix problems with the lower flag and whitespace handling in orth variants (#5361)

* Initialize lower flag explicitly

* Handle whitespace words from GoldParse correctly when creating raw
text with orth variants

* Return the text with original casing if anything goes wrong
This commit is contained in:
adrianeboyd 2020-04-29 13:01:25 +02:00 committed by GitHub
parent 3f43c73d37
commit 74da669326
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -331,6 +331,8 @@ class GoldCorpus(object):
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
if random.random() >= orth_variant_level:
return raw, paragraph_tuples
raw_orig = str(raw)
lower = False
if random.random() >= 0.5:
lower = True
if raw is not None:
@ -391,8 +393,11 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
ids, words, tags, heads, labels, ner = sent_tuples
for word in words:
match_found = False
# skip whitespace words
if word.isspace():
match_found = True
# add identical word
if word not in variants and raw[raw_idx:].startswith(word):
elif word not in variants and raw[raw_idx:].startswith(word):
variant_raw += word
raw_idx += len(word)
match_found = True
@ -407,7 +412,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
# something went wrong, abort
# (add a warning message?)
if not match_found:
return raw, paragraph_tuples
return raw_orig, paragraph_tuples
# add following whitespace
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]