From aae05ff16bcf12e4f60bb5936c3bf5728a96b78c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 28 Aug 2019 09:14:20 +0200
Subject: [PATCH] Add train_docs() option to add orth variants

Create variants of training docs in which tokens, filtered by orth and
tag, are replaced with alternate orths, e.g., Unicode quotes, dashes,
and ellipses.

The variants can be single tokens (dashes) or paired tokens (quotes)
with left and right versions.

Currently restricted to only add variants to training documents without
raw text provided, where only gold.words needs to be modified.
---
 spacy/gold.pyx    | 61 +++++++++++++++++++++++++++++++++++++++++------
 spacy/language.py |  2 ++
 2 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index f6ec8d3fa..1cd49814a 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -7,6 +7,7 @@ import random
 import numpy
 import tempfile
 import shutil
+import itertools
 from pathlib import Path
 import srsly
 
@@ -206,13 +207,14 @@ class GoldCorpus(object):
         return n
 
     def train_docs(self, nlp, gold_preproc=False, max_length=None,
-                    noise_level=0.0):
+                    noise_level=0.0, orth_variant_level=0.0):
         locs = list((self.tmp_dir / 'train').iterdir())
         random.shuffle(locs)
         train_tuples = self.read_tuples(locs, limit=self.limit)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                         max_length=max_length,
                                         noise_level=noise_level,
+                                        orth_variant_level=orth_variant_level,
                                         make_projective=True)
         yield from gold_docs
 
@@ -226,27 +228,31 @@ class GoldCorpus(object):
 
     @classmethod
     def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
-                       noise_level=0.0, make_projective=False):
+                       noise_level=0.0, orth_variant_level=0.0, make_projective=False):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
             else:
                 paragraph_tuples = merge_sents(paragraph_tuples)
-            docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc,
-                                  noise_level=noise_level)
+            docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
+                    paragraph_tuples, gold_preproc, noise_level=noise_level,
+                    orth_variant_level=orth_variant_level)
             golds = cls._make_golds(docs, paragraph_tuples, make_projective)
             for doc, gold in zip(docs, golds):
                 if (not max_length) or len(doc) < max_length:
                     yield doc, gold
 
     @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0):
+    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
         if raw_text is not None:
             raw_text = add_noise(raw_text, noise_level)
-            return [nlp.make_doc(raw_text)]
+            return [nlp.make_doc(raw_text)], paragraph_tuples
         else:
+            # orth variants are only applied here, where there is no raw text to keep in sync
+            raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level)
             return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
-                    for (sent_tuples, brackets) in paragraph_tuples]
+                    for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
+
 
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples, make_projective):
@@ -263,6 +269,47 @@ class GoldCorpus(object):
                     in zip(docs, paragraph_tuples)]
 
 
+def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
+    """Swap punctuation tokens in gold words for alternate orth variants.
+
+    A token is replaced only if both its orth and its tag are listed in an
+    entry of nlp.Defaults.single_orth_variants or paired_orth_variants.
+    Raw text can't be modified yet, so nothing is changed if raw is given.
+
+    orth_variant_level (float): Probability of creating variants.
+    RETURNS (tuple): (raw, paragraph_tuples); input lists are not mutated.
+    """
+    # TODO: modify raw text accordingly instead of bailing out
+    if random.random() >= orth_variant_level or raw is not None:
+        return raw, paragraph_tuples
+    ndsv = nlp.Defaults.single_orth_variants
+    ndpv = nlp.Defaults.paired_orth_variants
+    variant_paragraph_tuples = []
+    for sent_tuples, brackets in paragraph_tuples:
+        ids, words, tags, heads, labels, ner = sent_tuples
+        words = list(words)  # copy so the caller's gold words stay intact
+        # draw one variant (or left/right pair) per entry for this sentence
+        single_choices = [random.choice(x["variants"]) for x in ndsv]
+        paired_choices = [random.choice(x["variants"]) for x in ndpv]
+        for i, tag in enumerate(tags):
+            for entry, choice in zip(ndsv, single_choices):
+                if tag in entry["tags"] and words[i] in entry["variants"]:
+                    words[i] = choice
+            for entry, choice in zip(ndpv, paired_choices):
+                known = itertools.chain.from_iterable(entry["variants"])
+                if tag not in entry["tags"] or words[i] not in known:
+                    continue
+                if len(entry["tags"]) == 2:
+                    # best option: rely on paired POS tags like `` / ''
+                    side = entry["tags"].index(tag)
+                else:
+                    # next best option: position in the last matching pair
+                    side = [p.index(words[i]) for p in entry["variants"] if words[i] in p][-1]
+                words[i] = choice[side]
+        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets))
+    return raw, variant_paragraph_tuples
+
+
 def add_noise(orig, noise_level):
     if random.random() >= noise_level:
         return orig
diff --git a/spacy/language.py b/spacy/language.py
index 86acf0257..0cf3528a2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -108,6 +108,8 @@ class BaseDefaults(object):
     syntax_iterators = {}
     resources = {}
     writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
+    single_orth_variants = []
+    paired_orth_variants = []
 
 
 class Language(object):