From 74da669326eaa45d878d303643abe88cf4c84d60 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 29 Apr 2020 13:01:25 +0200
Subject: [PATCH] Fix problems with lower and whitespace in variants (#5361)

* Initialize lower flag explicitly

* Handle whitespace words from GoldParse correctly when creating raw text with orth variants

* Return the text with original casing if anything goes wrong
---
 spacy/gold.pyx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index e8274563f..034bba08f 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -331,6 +331,8 @@ class GoldCorpus(object):
 def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
         return raw, paragraph_tuples
+    raw_orig = str(raw)
+    lower = False
     if random.random() >= 0.5:
         lower = True
         if raw is not None:
@@ -391,8 +393,11 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
             ids, words, tags, heads, labels, ner = sent_tuples
             for word in words:
                 match_found = False
+                # skip whitespace words
+                if word.isspace():
+                    match_found = True
                 # add identical word
-                if word not in variants and raw[raw_idx:].startswith(word):
+                elif word not in variants and raw[raw_idx:].startswith(word):
                     variant_raw += word
                     raw_idx += len(word)
                     match_found = True
@@ -407,7 +412,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
                 # something went wrong, abort
                 # (add a warning message?)
                 if not match_found:
-                    return raw, paragraph_tuples
+                    return raw_orig, paragraph_tuples
                 # add following whitespace
                 while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
                     variant_raw += raw[raw_idx]