From aae05ff16bcf12e4f60bb5936c3bf5728a96b78c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 28 Aug 2019 09:14:20 +0200 Subject: [PATCH] Add train_docs() option to add orth variants Filtering by orth and tag, create variants of training docs with alternate orth variants, e.g., Unicode quotes, dashes, and ellipses. The variants can be single tokens (dashes) or paired tokens (quotes) with left and right versions. Currently, variants are only added to training documents that have no raw text provided, because in that case only gold.words needs to be modified. --- spacy/gold.pyx | 61 +++++++++++++++++++++++++++++++++++++++++------ spacy/language.py | 2 ++ 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index f6ec8d3fa..1cd49814a 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -7,6 +7,7 @@ import random import numpy import tempfile import shutil +import itertools from pathlib import Path import srsly @@ -206,13 +207,14 @@ class GoldCorpus(object): return n def train_docs(self, nlp, gold_preproc=False, max_length=None, - noise_level=0.0): + noise_level=0.0, orth_variant_level=0.0): locs = list((self.tmp_dir / 'train').iterdir()) random.shuffle(locs) train_tuples = self.read_tuples(locs, limit=self.limit) gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, max_length=max_length, noise_level=noise_level, + orth_variant_level=orth_variant_level, make_projective=True) yield from gold_docs @@ -226,27 +228,31 @@ class GoldCorpus(object): @classmethod def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None, - noise_level=0.0, make_projective=False): + noise_level=0.0, orth_variant_level=0.0, make_projective=False): for raw_text, paragraph_tuples in tuples: if gold_preproc: raw_text = None else: paragraph_tuples = merge_sents(paragraph_tuples) - docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, - noise_level=noise_level) + docs, paragraph_tuples = cls._make_docs(nlp, raw_text, +
paragraph_tuples, gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) golds = cls._make_golds(docs, paragraph_tuples, make_projective) for doc, gold in zip(docs, golds): if (not max_length) or len(doc) < max_length: yield doc, gold @classmethod - def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0): + def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0): if raw_text is not None: raw_text = add_noise(raw_text, noise_level) - return [nlp.make_doc(raw_text)] + return [nlp.make_doc(raw_text)], paragraph_tuples else: + docs = [] + raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level) return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) - for (sent_tuples, brackets) in paragraph_tuples] + for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples + @classmethod def _make_golds(cls, docs, paragraph_tuples, make_projective): @@ -263,6 +269,47 @@ class GoldCorpus(object): in zip(docs, paragraph_tuples)] +def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): + if random.random() >= orth_variant_level: + return raw, paragraph_tuples + variant_paragraph_tuples = [] + for sent_tuples, brackets in paragraph_tuples: + ids, words, tags, heads, labels, ner = sent_tuples + # single variants + ndsv = nlp.Defaults.single_orth_variants + punct_choices = [random.choice(x["variants"]) for x in ndsv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndsv)): + if tags[word_idx] in ndsv[punct_idx]["tags"] \ + and words[word_idx] in ndsv[punct_idx]["variants"]: + words[word_idx] = punct_choices[punct_idx] + # paired variants + ndpv = nlp.Defaults.paired_orth_variants + punct_choices = [random.choice(x["variants"]) for x in ndpv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndpv)): + if tags[word_idx] in ndpv[punct_idx]["tags"] \ + and words[word_idx] in 
itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + # backup option: random left vs. right from pair + pair_idx = random.choice([0, 1]) + # best option: rely on paired POS tags like `` / '' + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + # next best option: rely on position in variants + # (may not be unambiguous, so order of variants matters) + else: + for pair in ndpv[punct_idx]["variants"]: + if words[word_idx] in pair: + pair_idx = pair.index(words[word_idx]) + words[word_idx] = punct_choices[punct_idx][pair_idx] + + variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets)) + if raw is not None: + # TODO: modify raw text accordingly + return raw, paragraph_tuples + return raw, variant_paragraph_tuples + + def add_noise(orig, noise_level): if random.random() >= noise_level: return orig diff --git a/spacy/language.py b/spacy/language.py index 86acf0257..0cf3528a2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -108,6 +108,8 @@ class BaseDefaults(object): syntax_iterators = {} resources = {} writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} + single_orth_variants = [] + paired_orth_variants = [] class Language(object):