Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text Add an option to initialize a `Doc` from a text and list of words where the words may or may not include all whitespace tokens. If the text and words are mismatched, raise an error. * Fix error code * Remove all whitespace before aligning words/text * Move words/text init to util function * Update error message * Rename to get_words_and_spaces * Fix formatting
2025-07-15 02:32:37 +03:00 · 2020-04-14 19:15:52 +02:00 · 2020-04-14 19:15:52 +02:00 · 3d2c308906
commit 3d2c308906
parent 8ce408d2e1
3 changed files with 70 additions and 0 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -555,6 +555,7 @@ class Errors(object):
    E193 = ("Unable to resize vectors in place if the resized vector dimension "
            "({new_dim}) is not the same as the current vector dimension "
            "({curr_dim}).")
    # Raised when a list of words cannot be aligned with the given text.
    E194 = ("Unable to align mismatched text '{text}' and words '{words}'.")
@add_codes
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@ -6,6 +6,7 @@ from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
 from spacy import util
@pytest.fixture
@ -38,3 +39,41 @@ def test_lookup_lemmatization(vocab):
    assert doc[0].lemma_ == "dog"
    assert doc[1].text == "dogses"
    assert doc[1].lemma_ == "dogses"
 def test_create_from_words_and_text(vocab):
     """Check that words aligned against a text yield a Doc matching the text.

     Every word list below is aligned with the same text through
     util.get_words_and_spaces and must produce the same tokens, the same
     trailing-whitespace flags and a Doc whose text equals the input text.
     A word list that does not match the text must raise ValueError.
     """
     text = "  'dogs'\n\nrun  "
     expected_tokens = ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     expected_whitespace = ["", "", "", "", "", " ", ""]
     word_variants = [
         # no whitespace in words
         ["'", "dogs", "'", "run"],
         # partial whitespace in words
         ["  ", "'", "dogs", "'", "\n\n", "run", " "],
         # non-standard whitespace tokens
         [" ", " ", "'", "dogs", "'", "\n\n", "run"],
     ]
     for variant in word_variants:
         aligned_words, aligned_spaces = util.get_words_and_spaces(variant, text)
         doc = Doc(vocab, words=aligned_words, spaces=aligned_spaces)
         assert [token.text for token in doc] == expected_tokens
         assert [token.whitespace_ for token in doc] == expected_whitespace
         assert doc.text == text
         # the non-whitespace tokens must be exactly the non-whitespace
         # entries of the aligned word list
         content_tokens = [token.text for token in doc if not token.text.isspace()]
         content_words = [word for word in aligned_words if not word.isspace()]
         assert content_tokens == content_words
     # mismatch between words and text
     mismatched = [" ", " ", "'", "dogs", "'", "\n\n", "run"] + ["away"]
     with pytest.raises(ValueError):
         util.get_words_and_spaces(mismatched, text)
--- a/spacy/util.py
+++ b/spacy/util.py
@ -755,6 +755,36 @@ def get_serialization_exclude(serializers, exclude, kwargs):
    return exclude
 def get_words_and_spaces(words, text):
     """Align a list of words with a text, recovering whitespace tokens.

     Whitespace-only entries in `words` are ignored; the remaining words are
     located in `text` from left to right. Any text skipped between two words
     (or left over after the last word) becomes its own token. A single space
     character directly after a word is recorded as that word's trailing
     space rather than as a separate token.

     words (list): The words, which may or may not include whitespace tokens.
     text (str): The text the words should be aligned with.
     RETURNS (tuple): A `(words, spaces)` pair of equal-length lists: the
         token strings and, for each token, whether it is followed by a
         single space.
     RAISES (ValueError): If words and text differ once all whitespace is
         removed, or if a word cannot be found in the remaining text.
     """
     # cheap up-front check: ignoring all whitespace, both must be identical
     if "".join("".join(words).split()) != "".join(text.split()):
         raise ValueError(Errors.E194.format(text=text, words=words))
     text_words = []
     text_spaces = []
     text_pos = 0
     text_len = len(text)
     for word in words:
         # whitespace tokens are reconstructed from the text itself
         if word.isspace():
             continue
         # search from the current offset instead of slicing the text, which
         # would copy the remaining text for every word
         word_start = text.find(word, text_pos)
         if word_start < 0:
             raise ValueError(Errors.E194.format(text=text, words=words))
         if word_start > text_pos:
             # skipped text between the previous token and this word
             text_words.append(text[text_pos:word_start])
             text_spaces.append(False)
         text_words.append(word)
         text_pos = word_start + len(word)
         # one directly following space is attached to the word
         has_space = text_pos < text_len and text[text_pos] == " "
         text_spaces.append(has_space)
         if has_space:
             text_pos += 1
     if text_pos < text_len:
         # trailing text after the last word
         text_words.append(text[text_pos:])
         text_spaces.append(False)
     return (text_words, text_spaces)
 class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty