Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text

Add an option to initialize a `Doc` from a text and list of words where
the words may or may not include all whitespace tokens. If the text and
words are mismatched, raise an error.

* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
This commit is contained in:
adrianeboyd 2020-04-14 19:15:52 +02:00 committed by GitHub
parent 8ce408d2e1
commit 3d2c308906
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 70 additions and 0 deletions

View File

@ -555,6 +555,7 @@ class Errors(object):
E193 = ("Unable to resize vectors in place if the resized vector dimension " E193 = ("Unable to resize vectors in place if the resized vector dimension "
"({new_dim}) is not the same as the current vector dimension " "({new_dim}) is not the same as the current vector dimension "
"({curr_dim}).") "({curr_dim}).")
E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
@add_codes @add_codes

View File

@ -6,6 +6,7 @@ from spacy.vocab import Vocab
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups from spacy.lookups import Lookups
from spacy import util
@pytest.fixture @pytest.fixture
@ -38,3 +39,41 @@ def test_lookup_lemmatization(vocab):
assert doc[0].lemma_ == "dog" assert doc[0].lemma_ == "dog"
assert doc[1].text == "dogses" assert doc[1].text == "dogses"
assert doc[1].lemma_ == "dogses" assert doc[1].lemma_ == "dogses"
def test_create_from_words_and_text(vocab):
    """Build a ``Doc`` from the output of ``util.get_words_and_spaces``.

    The same text is aligned against several word lists that differ only in
    which whitespace tokens they contain; each must produce the same tokens
    and trailing-space flags and round-trip to the original text. A word
    list that does not match the text must raise ``ValueError``.
    """
    # Two leading and two trailing spaces: the leading run becomes its own
    # whitespace token, the first trailing space is attached to "run" and
    # the second one becomes a final whitespace token.
    text = "  'dogs'\n\nrun  "
    expected_words = ["  ", "'", "dogs", "'", "\n\n", "run", " "]
    expected_whitespace = ["", "", "", "", "", " ", ""]
    word_lists = [
        # no whitespace in words
        ["'", "dogs", "'", "run"],
        # partial whitespace in words
        ["  ", "'", "dogs", "'", "\n\n", "run", " "],
        # non-standard whitespace tokens
        [" ", " ", "'", "dogs", "'", "\n\n", "run"],
    ]
    for input_words in word_lists:
        # Keep the input under its own name so the last assertion compares
        # the Doc against what the caller passed in, not the aligned output.
        (words, spaces) = util.get_words_and_spaces(input_words, text)
        doc = Doc(vocab, words=words, spaces=spaces)
        assert [t.text for t in doc] == expected_words
        assert [t.whitespace_ for t in doc] == expected_whitespace
        assert doc.text == text
        assert [t.text for t in doc if not t.text.isspace()] == [
            word for word in input_words if not word.isspace()
        ]
    # mismatch between words and text
    with pytest.raises(ValueError):
        util.get_words_and_spaces(word_lists[-1] + ["away"], text)

View File

@ -755,6 +755,36 @@ def get_serialization_exclude(serializers, exclude, kwargs):
return exclude return exclude
def get_words_and_spaces(words, text):
    """Align a list of words with a text and recover the whitespace.

    Whitespace-only entries in `words` are ignored. The remaining words are
    located in `text` from left to right; any text found between two words
    is emitted as a separate token, and a single " " directly following a
    word is recorded as that word's trailing space instead of a token.

    words (list): The words, with or without whitespace tokens.
    text (str): The text the words were taken from.
    RETURNS (tuple): `(text_words, text_spaces)`, two lists of equal length:
        the token strings and, for each token, a bool that is True if the
        token is followed by a single space.
    RAISES (ValueError): If `words` and `text` differ once all whitespace
        is removed, or if a word cannot be found in the remaining text.
    """
    if "".join("".join(words).split()) != "".join(text.split()):
        raise ValueError(Errors.E194.format(text=text, words=words))
    text_words = []
    text_spaces = []
    text_pos = 0
    # normalize words to remove all whitespace tokens
    norm_words = [word for word in words if not word.isspace()]
    # align words with text
    for word in norm_words:
        try:
            # Search from the current offset rather than slicing the text,
            # which would copy the whole remainder on every iteration.
            word_start = text.index(word, text_pos)
        except ValueError:
            raise ValueError(Errors.E194.format(text=text, words=words))
        if word_start > text_pos:
            # Text skipped before the word becomes its own token.
            text_words.append(text[text_pos:word_start])
            text_spaces.append(False)
        text_words.append(word)
        text_spaces.append(False)
        text_pos = word_start + len(word)
        # Only a single plain space is folded into the preceding token.
        if text_pos < len(text) and text[text_pos] == " ":
            text_spaces[-1] = True
            text_pos += 1
    # Whatever is left after the last word becomes a final token.
    if text_pos < len(text):
        text_words.append(text[text_pos:])
        text_spaces.append(False)
    return (text_words, text_spaces)
class SimpleFrozenDict(dict): class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default """Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty function or method argument (for arguments that should default to empty