Fix problems with the `lower` flag and whitespace handling in orth variants

Port relevant changes from #5361:

* Initialize the `lower` flag explicitly (to `False`)

* Handle whitespace words from GoldParse correctly (by skipping them) when
creating raw text with orth variants
This commit is contained in:
Adriane Boyd 2020-06-03 14:15:58 +02:00
parent 10d938f221
commit 1d8168d1fd

View File

@ -362,6 +362,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
if not example.token_annotation: if not example.token_annotation:
return example return example
raw = example.text raw = example.text
lower = False
if random.random() >= 0.5: if random.random() >= 0.5:
lower = True lower = True
if raw is not None: if raw is not None:
@ -429,8 +430,11 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
raw_idx += 1 raw_idx += 1
for word in variant_example.token_annotation.words: for word in variant_example.token_annotation.words:
match_found = False match_found = False
# skip whitespace words
if word.isspace():
match_found = True
# add identical word # add identical word
if word not in variants and raw[raw_idx:].startswith(word): elif word not in variants and raw[raw_idx:].startswith(word):
variant_raw += word variant_raw += word
raw_idx += len(word) raw_idx += len(word)
match_found = True match_found = True
@ -445,6 +449,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
# something went wrong, abort # something went wrong, abort
# (add a warning message?) # (add a warning message?)
if not match_found: if not match_found:
print("aborting")
return example return example
# add following whitespace # add following whitespace
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):