From 3d2c308906e2bde7ca57d2e8213252530b944502 Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Tue, 14 Apr 2020 19:15:52 +0200
Subject: [PATCH] Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text

Add an option to initialize a `Doc` from a text and list of words where
the words may or may not include all whitespace tokens. If the text and
words are mismatched, raise an error.

* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
---
 spacy/errors.py                  |  1 +
 spacy/tests/doc/test_creation.py | 39 ++++++++++++++++++++++++++++++++
 spacy/util.py                    | 30 ++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index e0ddc86c5..ce26e63a4 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -555,6 +555,7 @@ class Errors(object):
     E193 = ("Unable to resize vectors in place if the resized vector dimension "
             "({new_dim}) is not the same as the current vector dimension "
             "({curr_dim}).")
+    E194 = ("Unable to align mismatched text '{text}' and words '{words}'.")
 
 
 @add_codes
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index 120fb6e28..8f543e86a 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -6,6 +6,7 @@ from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
+from spacy import util
 
 
 @pytest.fixture
@@ -38,3 +39,41 @@ def test_lookup_lemmatization(vocab):
     assert doc[0].lemma_ == "dog"
     assert doc[1].text == "dogses"
     assert doc[1].lemma_ == "dogses"
+
+
+def test_create_from_words_and_text(vocab):
+    """Check that util.get_words_and_spaces aligns word lists with a text.
+
+    Every word list below tokenizes the same text but carries a different
+    amount of whitespace; each one must produce the same Doc.
+    """
+    text = "  'dogs'\n\nrun  "
+    # token texts and trailing whitespace that every word list must yield
+    expected_words = ["  ", "'", "dogs", "'", "\n\n", "run", " "]
+    expected_spaces = ["", "", "", "", "", " ", ""]
+    word_lists = [
+        # no whitespace in words
+        ["'", "dogs", "'", "run"],
+        # partial whitespace in words
+        ["  ", "'", "dogs", "'", "\n\n", "run", " "],
+        # non-standard whitespace tokens
+        [" ", " ", "'", "dogs", "'", "\n\n", "run"],
+    ]
+
+    for input_words in word_lists:
+        # Keep the input list under its own name so the final assert compares
+        # the Doc against the caller's words, not the function's own output.
+        (doc_words, spaces) = util.get_words_and_spaces(input_words, text)
+        doc = Doc(vocab, words=doc_words, spaces=spaces)
+        assert [t.text for t in doc] == expected_words
+        assert [t.whitespace_ for t in doc] == expected_spaces
+        assert doc.text == text
+        # the non-whitespace tokens are exactly the non-whitespace input words
+        tokens = [t.text for t in doc if not t.text.isspace()]
+        assert tokens == [w for w in input_words if not w.isspace()]
+
+    # mismatch between words and text: only the failing call goes inside the
+    # raises block so that an error during setup cannot satisfy it
+    mismatched_words = word_lists[-1] + ["away"]
+    with pytest.raises(ValueError):
+        util.get_words_and_spaces(mismatched_words, text)
diff --git a/spacy/util.py b/spacy/util.py
index 9b96b2f5e..706fe303d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -755,6 +755,36 @@ def get_serialization_exclude(serializers, exclude, kwargs):
     return exclude
 
 
+def get_words_and_spaces(words, text):
+    """Align `words` with `text` and return `(words, spaces)` for a `Doc`.
+
+    Whitespace-only items in `words` are skipped. Text not covered by a word
+    becomes its own token, except that a single space directly after a word
+    is recorded as that word's trailing space instead.
+    Raises ValueError (E194) if `words` and `text` cannot be aligned.
+    """
+    if "".join("".join(words).split()) != "".join(text.split()):
+        raise ValueError(Errors.E194.format(text=text, words=words))
+    text_words, text_spaces, text_pos = [], [], 0
+    for word in (w for w in words if not w.isspace()):
+        word_start = text.find(word, text_pos)  # search in place, no tail copy
+        if word_start < 0:
+            raise ValueError(Errors.E194.format(text=text, words=words))
+        if word_start > text_pos:
+            text_words.append(text[text_pos:word_start])
+            text_spaces.append(False)
+        text_pos = word_start + len(word)
+        text_words.append(word)
+        text_spaces.append(False)
+        if text[text_pos:text_pos + 1] == " ":
+            text_spaces[-1] = True
+            text_pos += 1
+    if text_pos < len(text):
+        text_words.append(text[text_pos:])
+        text_spaces.append(False)
+    return (text_words, text_spaces)
+
+
 class SimpleFrozenDict(dict):
     """Simplified implementation of a frozen dict, mainly used as default
     function or method argument (for arguments that should default to empty