mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Fix problems with lower and whitespace in variants
Port relevant changes from #5361:
* Initialize lower flag explicitly
* Handle whitespace words from GoldParse correctly when creating raw text with orth variants
This commit is contained in:
parent
10d938f221
commit
1d8168d1fd
|
@ -362,6 +362,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
|||
if not example.token_annotation:
|
||||
return example
|
||||
raw = example.text
|
||||
lower = False
|
||||
if random.random() >= 0.5:
|
||||
lower = True
|
||||
if raw is not None:
|
||||
|
@ -429,8 +430,11 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
|||
raw_idx += 1
|
||||
for word in variant_example.token_annotation.words:
|
||||
match_found = False
|
||||
# skip whitespace words
|
||||
if word.isspace():
|
||||
match_found = True
|
||||
# add identical word
|
||||
if word not in variants and raw[raw_idx:].startswith(word):
|
||||
elif word not in variants and raw[raw_idx:].startswith(word):
|
||||
variant_raw += word
|
||||
raw_idx += len(word)
|
||||
match_found = True
|
||||
|
@ -445,6 +449,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
|||
# something went wrong, abort
|
||||
# (add a warning message?)
|
||||
if not match_found:
|
||||
print("aborting")
|
||||
return example
|
||||
# add following whitespace
|
||||
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
|
||||
|
|
Loading…
Reference in New Issue
Block a user