Add train_docs() option to add orth variants

Create variants of training docs by replacing tokens, matched by orth and tag, with alternate orthographic forms, e.g., unicode quotes, dashes, and ellipses. The variants can be single tokens (dashes) or paired tokens (quotes) with left and right versions. Currently, variants are only added to training documents that have no raw text provided, because in that case only gold.words needs to be modified.
2025-02-15 02:50:33 +03:00 · 2019-08-28 09:14:20 +02:00 · 2019-08-28 09:14:20 +02:00 · aae05ff16b
commit aae05ff16b
parent bae0455f91
2 changed files with 56 additions and 7 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -7,6 +7,7 @@ import random
 import numpy
 import tempfile
 import shutil
+import itertools
 from pathlib import Path
 import srsly

@ -206,13 +207,14 @@ class GoldCorpus(object):
        return n

    def train_docs(self, nlp, gold_preproc=False, max_length=None,
-                    noise_level=0.0):
+                    noise_level=0.0, orth_variant_level=0.0):
+        """Yield (doc, gold) pairs for the training portion of the corpus.
+
+        The entries of `self.tmp_dir / 'train'` are listed and shuffled, read
+        with `read_tuples` (capped by `self.limit`) and handed to
+        `iter_gold_docs` with `make_projective=True`.
+
+        nlp: Passed through to `iter_gold_docs`.
+        gold_preproc: Passed through to `iter_gold_docs`.
+        max_length: Passed through to `iter_gold_docs`.
+        noise_level: Passed through to `iter_gold_docs`.
+        orth_variant_level: Passed through to `iter_gold_docs`.
+        YIELDS: Whatever `iter_gold_docs` yields.
+        """
        locs = list((self.tmp_dir / 'train').iterdir())
        random.shuffle(locs)
        train_tuples = self.read_tuples(locs, limit=self.limit)
        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                        max_length=max_length,
                                        noise_level=noise_level,
+                                        orth_variant_level=orth_variant_level,
                                        make_projective=True)
        yield from gold_docs

@ -226,27 +228,31 @@ class GoldCorpus(object):

    @classmethod
    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
-                       noise_level=0.0, make_projective=False):
+                       noise_level=0.0, orth_variant_level=0.0, make_projective=False):
+        """Yield (doc, gold) pairs for each `(raw_text, paragraph_tuples)` item.
+
+        If `gold_preproc` is truthy the raw text is discarded; otherwise the
+        paragraph's sentences are combined with `merge_sents`. Docs come from
+        `_make_docs`, which may hand back modified paragraph tuples, and the
+        golds are then built from those returned tuples by `_make_golds`.
+
+        nlp: Passed through to `_make_docs`.
+        tuples: Iterable of `(raw_text, paragraph_tuples)` pairs.
+        gold_preproc: Whether to drop the raw text (see above).
+        max_length: If truthy, only docs with `len(doc) < max_length` are
+            yielded.
+        noise_level: Passed through to `_make_docs`.
+        orth_variant_level: Passed through to `_make_docs`.
+        make_projective: Passed through to `_make_golds`.
+        YIELDS (tuple): `(doc, gold)` pairs.
+        """
        for raw_text, paragraph_tuples in tuples:
            if gold_preproc:
                raw_text = None
            else:
                paragraph_tuples = merge_sents(paragraph_tuples)
-            docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc,
-                                  noise_level=noise_level)
+            docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
+                    paragraph_tuples, gold_preproc, noise_level=noise_level,
+                    orth_variant_level=orth_variant_level)
            golds = cls._make_golds(docs, paragraph_tuples, make_projective)
            for doc, gold in zip(docs, golds):
                if (not max_length) or len(doc) < max_length:
                    yield doc, gold

    @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0):
+    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
+        """Build the docs for one paragraph and return them with its tuples.
+
+        With raw text, noise is applied to the text and a single doc is
+        created with `nlp.make_doc`; the paragraph tuples are returned as
+        given. Without raw text, `make_orth_variants` is applied to the
+        tuples first and one `Doc` per sentence is built from the (noised)
+        gold words.
+
+        RETURNS (tuple): `(docs, paragraph_tuples)`, so that the caller
+            builds the golds from the same tuples the docs were made from.
+        """
        if raw_text is not None:
            raw_text = add_noise(raw_text, noise_level)
-            return [nlp.make_doc(raw_text)]
+            return [nlp.make_doc(raw_text)], paragraph_tuples
        else:
+            # Orth variants are only created here: without raw text, only the
+            # gold words have to be rewritten.
+            raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level)
            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
-                    for (sent_tuples, brackets) in paragraph_tuples]
+                    for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
+

    @classmethod
    def _make_golds(cls, docs, paragraph_tuples, make_projective):
@ -263,6 +269,47 @@ class GoldCorpus(object):
                    in zip(docs, paragraph_tuples)]


+def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
+    """Randomly swap punctuation tokens for alternate orthographic variants.
+
+    With probability `orth_variant_level`, the words of every sentence in
+    `paragraph_tuples` are rewritten according to the rules in
+    `nlp.Defaults.single_orth_variants` and
+    `nlp.Defaults.paired_orth_variants`. Each rule is a dict with a "tags"
+    list and a "variants" list; a token is replaced only if both its tag and
+    its current text are covered by the rule. For paired rules every variant
+    is a (left, right) pair. One replacement per rule is drawn per sentence,
+    so all matching tokens of a sentence change consistently.
+
+    nlp: Object exposing the rules via `nlp.Defaults`.
+    raw (unicode or None): Raw text of the paragraph. Rewriting raw text is
+        not supported yet, so the input is returned unchanged if it is set.
+    paragraph_tuples (list): `(sent_tuples, brackets)` pairs, where
+        `sent_tuples` is `(ids, words, tags, heads, labels, ner)`.
+    orth_variant_level (float): Probability of creating variants.
+    RETURNS (tuple): `(raw, paragraph_tuples)`. The input tuples are never
+        modified; rewritten sentences carry a fresh `words` list.
+    """
+    if random.random() >= orth_variant_level:
+        return raw, paragraph_tuples
+    if raw is not None:
+        # TODO: modify raw text accordingly
+        return raw, paragraph_tuples
+    ndsv = nlp.Defaults.single_orth_variants
+    ndpv = nlp.Defaults.paired_orth_variants
+    # Flatten the pairs of each paired rule once instead of once per token.
+    paired_members = [list(itertools.chain.from_iterable(rule["variants"]))
+                      for rule in ndpv]
+    variant_paragraph_tuples = []
+    for sent_tuples, brackets in paragraph_tuples:
+        ids, words, tags, heads, labels, ner = sent_tuples
+        # Work on a copy so that the caller's gold data is left untouched.
+        words = list(words)
+        # single variants
+        single_choices = [random.choice(rule["variants"]) for rule in ndsv]
+        for word_idx in range(len(words)):
+            for rule, choice in zip(ndsv, single_choices):
+                if tags[word_idx] in rule["tags"] \
+                        and words[word_idx] in rule["variants"]:
+                    words[word_idx] = choice
+        # paired variants
+        paired_choices = [random.choice(rule["variants"]) for rule in ndpv]
+        for word_idx in range(len(words)):
+            for rule, members, choice in zip(ndpv, paired_members, paired_choices):
+                if tags[word_idx] not in rule["tags"] \
+                        or words[word_idx] not in members:
+                    continue
+                # backup option: random left vs. right from pair
+                pair_idx = random.choice([0, 1])
+                # best option: rely on paired POS tags like `` / ''
+                if len(rule["tags"]) == 2:
+                    pair_idx = rule["tags"].index(tags[word_idx])
+                # next best option: rely on position in variants
+                # (may not be unambiguous, so order of variants matters)
+                else:
+                    for pair in rule["variants"]:
+                        if words[word_idx] in pair:
+                            pair_idx = pair.index(words[word_idx])
+                words[word_idx] = choice[pair_idx]
+
+        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets))
+    # Return only after every sentence has been processed; an empty paragraph
+    # yields an empty list rather than falling through to None.
+    return raw, variant_paragraph_tuples
+
+
 def add_noise(orig, noise_level):
    if random.random() >= noise_level:
        return orig
--- a/spacy/language.py
+++ b/spacy/language.py
@ -108,6 +108,8 @@ class BaseDefaults(object):
    syntax_iterators = {}
    resources = {}
    writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
+    single_orth_variants = []
+    paired_orth_variants = []


 class Language(object):