mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Fix problems with lower and whitespace in variants (#5361)
* Initialize lower flag explicitly
* Handle whitespace words from GoldParse correctly when creating raw text with orth variants
* Return the text with original casing if anything goes wrong
This commit is contained in:
parent
3f43c73d37
commit
74da669326
|
@ -331,6 +331,8 @@ class GoldCorpus(object):
|
||||||
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
if random.random() >= orth_variant_level:
|
if random.random() >= orth_variant_level:
|
||||||
return raw, paragraph_tuples
|
return raw, paragraph_tuples
|
||||||
|
raw_orig = str(raw)
|
||||||
|
lower = False
|
||||||
if random.random() >= 0.5:
|
if random.random() >= 0.5:
|
||||||
lower = True
|
lower = True
|
||||||
if raw is not None:
|
if raw is not None:
|
||||||
|
@ -391,8 +393,11 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
ids, words, tags, heads, labels, ner = sent_tuples
|
ids, words, tags, heads, labels, ner = sent_tuples
|
||||||
for word in words:
|
for word in words:
|
||||||
match_found = False
|
match_found = False
|
||||||
|
# skip whitespace words
|
||||||
|
if word.isspace():
|
||||||
|
match_found = True
|
||||||
# add identical word
|
# add identical word
|
||||||
if word not in variants and raw[raw_idx:].startswith(word):
|
elif word not in variants and raw[raw_idx:].startswith(word):
|
||||||
variant_raw += word
|
variant_raw += word
|
||||||
raw_idx += len(word)
|
raw_idx += len(word)
|
||||||
match_found = True
|
match_found = True
|
||||||
|
@ -407,7 +412,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
|
||||||
# something went wrong, abort
|
# something went wrong, abort
|
||||||
# (add a warning message?)
|
# (add a warning message?)
|
||||||
if not match_found:
|
if not match_found:
|
||||||
return raw, paragraph_tuples
|
return raw_orig, paragraph_tuples
|
||||||
# add following whitespace
|
# add following whitespace
|
||||||
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
|
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
|
||||||
variant_raw += raw[raw_idx]
|
variant_raw += raw[raw_idx]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user