Add train_docs() option to add orth variants

Create variants of training docs by replacing tokens, selected by orth and
tag, with alternate orth variants, e.g., Unicode quotes, dashes, and ellipses.

The variants can be single tokens (dashes) or paired tokens (quotes)
with left and right versions.

Currently, variants are only added to training documents that have no raw
text, because in that case only gold.words needs to be modified.
This commit is contained in:
Adriane Boyd 2019-08-28 09:14:20 +02:00
parent bae0455f91
commit aae05ff16b
2 changed files with 56 additions and 7 deletions

View File

@ -7,6 +7,7 @@ import random
import numpy import numpy
import tempfile import tempfile
import shutil import shutil
import itertools
from pathlib import Path from pathlib import Path
import srsly import srsly
@ -206,13 +207,14 @@ class GoldCorpus(object):
return n return n
def train_docs(self, nlp, gold_preproc=False, max_length=None, def train_docs(self, nlp, gold_preproc=False, max_length=None,
noise_level=0.0): noise_level=0.0, orth_variant_level=0.0):
locs = list((self.tmp_dir / 'train').iterdir()) locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs) random.shuffle(locs)
train_tuples = self.read_tuples(locs, limit=self.limit) train_tuples = self.read_tuples(locs, limit=self.limit)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length, max_length=max_length,
noise_level=noise_level, noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True) make_projective=True)
yield from gold_docs yield from gold_docs
@ -226,27 +228,31 @@ class GoldCorpus(object):
@classmethod @classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None, def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0, make_projective=False): noise_level=0.0, orth_variant_level=0.0, make_projective=False):
for raw_text, paragraph_tuples in tuples: for raw_text, paragraph_tuples in tuples:
if gold_preproc: if gold_preproc:
raw_text = None raw_text = None
else: else:
paragraph_tuples = merge_sents(paragraph_tuples) paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
noise_level=noise_level) paragraph_tuples, gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
golds = cls._make_golds(docs, paragraph_tuples, make_projective) golds = cls._make_golds(docs, paragraph_tuples, make_projective)
for doc, gold in zip(docs, golds): for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length: if (not max_length) or len(doc) < max_length:
yield doc, gold yield doc, gold
@classmethod @classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0): def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
if raw_text is not None: if raw_text is not None:
raw_text = add_noise(raw_text, noise_level) raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)] return [nlp.make_doc(raw_text)], paragraph_tuples
else: else:
docs = []
raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level)
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples] for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
@classmethod @classmethod
def _make_golds(cls, docs, paragraph_tuples, make_projective): def _make_golds(cls, docs, paragraph_tuples, make_projective):
@ -263,6 +269,47 @@ class GoldCorpus(object):
in zip(docs, paragraph_tuples)] in zip(docs, paragraph_tuples)]
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
    """Randomly swap punctuation tokens for alternate orthographic variants.

    With probability `orth_variant_level`, every token whose tag and text
    match an entry in `nlp.Defaults.single_orth_variants` or
    `nlp.Defaults.paired_orth_variants` is replaced by a variant drawn from
    that entry. One variant per entry is drawn per sentence, so all matching
    tokens in a sentence are replaced consistently.

    nlp: Object whose `Defaults` exposes `single_orth_variants` and
        `paired_orth_variants`. Each is a list of dicts with a "tags" list
        and a "variants" list; for paired entries every variant is a
        (left, right) pair.
    raw: Raw text of the paragraph, or None. Rewriting raw text is not
        implemented, so if raw text is given the input is returned unchanged.
    paragraph_tuples: List of `(sent_tuples, brackets)`, where `sent_tuples`
        is `(ids, words, tags, heads, labels, ner)`.
    orth_variant_level: Probability of creating variants for this paragraph.
    RETURNS: Tuple `(raw, paragraph_tuples)`. The input `paragraph_tuples`
        and its word lists are never modified; variants are built on copies.
    """
    if random.random() >= orth_variant_level:
        return raw, paragraph_tuples
    if raw is not None:
        # TODO: modify raw text accordingly
        # Until then, bail out *before* touching the words so that the raw
        # text and the gold tokens cannot get out of sync.
        return raw, paragraph_tuples
    single_variants = nlp.Defaults.single_orth_variants
    paired_variants = nlp.Defaults.paired_orth_variants
    variant_paragraph_tuples = []
    for sent_tuples, brackets in paragraph_tuples:
        ids, words, tags, heads, labels, ner = sent_tuples
        # Work on a copy: the caller's word list must stay untouched (this
        # also allows `words` to be an immutable sequence).
        words = list(words)
        # single variants
        single_choices = [random.choice(entry["variants"])
                          for entry in single_variants]
        for word_idx, tag in enumerate(tags):
            for entry, choice in zip(single_variants, single_choices):
                # Re-read words[word_idx] each time: an earlier entry may
                # already have replaced this token.
                if tag in entry["tags"] \
                        and words[word_idx] in entry["variants"]:
                    words[word_idx] = choice
        # paired variants
        paired_choices = [random.choice(entry["variants"])
                          for entry in paired_variants]
        for word_idx, tag in enumerate(tags):
            for entry, choice in zip(paired_variants, paired_choices):
                word = words[word_idx]
                if tag not in entry["tags"]:
                    continue
                if not any(word in pair for pair in entry["variants"]):
                    continue
                # backup option: random left vs. right from pair
                pair_idx = random.choice([0, 1])
                # best option: rely on paired POS tags like `` / ''
                if len(entry["tags"]) == 2:
                    pair_idx = entry["tags"].index(tag)
                # next best option: rely on position in variants
                # (may not be unambiguous, so order of variants matters:
                # the last pair containing the word wins)
                else:
                    for pair in entry["variants"]:
                        if word in pair:
                            pair_idx = pair.index(word)
                words[word_idx] = choice[pair_idx]
        variant_paragraph_tuples.append(
            ((ids, words, tags, heads, labels, ner), brackets))
    return raw, variant_paragraph_tuples
def add_noise(orig, noise_level): def add_noise(orig, noise_level):
if random.random() >= noise_level: if random.random() >= noise_level:
return orig return orig

View File

@ -108,6 +108,8 @@ class BaseDefaults(object):
syntax_iterators = {} syntax_iterators = {}
resources = {} resources = {}
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
single_orth_variants = []
paired_orth_variants = []
class Language(object): class Language(object):