From 3d2c308906e2bde7ca57d2e8213252530b944502 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 14 Apr 2020 19:15:52 +0200 Subject: [PATCH] Add Doc init from list of words and text (#5251) * Add Doc init from list of words and text Add an option to initialize a `Doc` from a text and list of words where the words may or may not include all whitespace tokens. If the text and words are mismatched, raise an error. * Fix error code * Remove all whitespace before aligning words/text * Move words/text init to util function * Update error message * Rename to get_words_and_spaces * Fix formatting --- spacy/errors.py | 1 + spacy/tests/doc/test_creation.py | 39 ++++++++++++++++++++++++++++++++ spacy/util.py | 30 ++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index e0ddc86c5..ce26e63a4 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -555,6 +555,7 @@ class Errors(object): E193 = ("Unable to resize vectors in place if the resized vector dimension " "({new_dim}) is not the same as the current vector dimension " "({curr_dim}).") + E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.") @add_codes diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 120fb6e28..8f543e86a 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -6,6 +6,7 @@ from spacy.vocab import Vocab from spacy.tokens import Doc from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups +from spacy import util @pytest.fixture @@ -38,3 +39,41 @@ def test_lookup_lemmatization(vocab): assert doc[0].lemma_ == "dog" assert doc[1].text == "dogses" assert doc[1].lemma_ == "dogses" + + +def test_create_from_words_and_text(vocab): + # no whitespace in words + words = ["'", "dogs", "'", "run"] + text = " 'dogs'\n\nrun " + (words, spaces) = util.get_words_and_spaces(words, text) + doc = Doc(vocab, words=words, spaces=spaces) + assert [t.text for t in doc] == 
def get_words_and_spaces(words, text):
    """Align a list of words with the text they were taken from.

    Whitespace-only entries in `words` are ignored; whitespace tokens are
    rebuilt from `text` instead. A single " " directly after a word is
    recorded as that word's trailing-space flag. Any other text found
    between two words, before the first word or after the last word is
    emitted as a token of its own with a False flag.

    words (list): The words, with or without whitespace tokens.
    text (unicode): The text the words should be aligned with.
    RETURNS (tuple): A `(words, spaces)` pair of equal-length lists: the
        aligned token texts and, for each one, whether it is followed by a
        single space.
    RAISES (ValueError): If `words` and `text` differ once all whitespace
        is removed, or if a word cannot be found in the remaining text.
    """
    # Cheap up-front check: ignoring whitespace, both inputs must spell out
    # exactly the same character sequence.
    if "".join("".join(words).split()) != "".join(text.split()):
        raise ValueError(Errors.E194.format(text=text, words=words))
    text_words = []
    text_spaces = []
    text_pos = 0
    text_len = len(text)
    # normalize words to remove all whitespace tokens
    norm_words = [word for word in words if not word.isspace()]
    # align words with text
    for word in norm_words:
        try:
            # Search in place from the current offset. Slicing the text
            # first would copy the remaining text once per word.
            word_start = text.index(word, text_pos)
        except ValueError:
            # Reached e.g. when a word contains internal whitespace that
            # differs from the whitespace in the text.
            raise ValueError(Errors.E194.format(text=text, words=words))
        if word_start > text_pos:
            # Text skipped before the word becomes its own token.
            text_words.append(text[text_pos:word_start])
            text_spaces.append(False)
        text_words.append(word)
        text_pos = word_start + len(word)
        if text_pos < text_len and text[text_pos] == " ":
            # Exactly one following space is folded into the word's flag.
            text_spaces.append(True)
            text_pos += 1
        else:
            text_spaces.append(False)
    if text_pos < text_len:
        # Whatever is left after the last word becomes a final token.
        text_words.append(text[text_pos:])
        text_spaces.append(False)
    return (text_words, text_spaces)