Fix orth replacement

This commit is contained in:
Matthew Honnibal 2019-09-19 00:03:24 +02:00
parent 42df49133d
commit 7d510c833e

View File

@@ -274,13 +274,14 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
return raw, paragraph_tuples return raw, paragraph_tuples
if random.random() >= 0.5: if random.random() >= 0.5:
lower = True lower = True
if raw is not None:
raw = raw.lower() raw = raw.lower()
ndsv = nlp.Defaults.single_orth_variants ndsv = nlp.Defaults.single_orth_variants
ndpv = nlp.Defaults.paired_orth_variants ndpv = nlp.Defaults.paired_orth_variants
# modify words in paragraph_tuples # modify words in paragraph_tuples
variant_paragraph_tuples = [] variant_paragraph_tuples = []
for sent_tuples, brackets in paragraph_tuples: for sent_tuples, brackets in paragraph_tuples:
ids, words, tags, heads, labels, ner = sent_tuples ids, words, tags, heads, labels, ner, cats = sent_tuples
if lower: if lower:
words = [w.lower() for w in words] words = [w.lower() for w in words]
# single variants # single variants
@@ -309,7 +310,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
pair_idx = pair.index(words[word_idx]) pair_idx = pair.index(words[word_idx])
words[word_idx] = punct_choices[punct_idx][pair_idx] words[word_idx] = punct_choices[punct_idx][pair_idx]
variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets)) variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner, cats), brackets))
# modify raw to match variant_paragraph_tuples # modify raw to match variant_paragraph_tuples
if raw is not None: if raw is not None:
variants = [] variants = []
@@ -328,7 +329,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
variant_raw += raw[raw_idx] variant_raw += raw[raw_idx]
raw_idx += 1 raw_idx += 1
for sent_tuples, brackets in variant_paragraph_tuples: for sent_tuples, brackets in variant_paragraph_tuples:
ids, words, tags, heads, labels, ner = sent_tuples ids, words, tags, heads, labels, ner, cats = sent_tuples
for word in words: for word in words:
match_found = False match_found = False
# add identical word # add identical word