mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 13:40:34 +03:00
Add Doc init from list of words and text (#5251)
* Add Doc init from list of words and text: add an option to initialize a `Doc` from a text and a list of words, where the words may or may not include all whitespace tokens. If the text and words are mismatched, raise an error.
* Fix error code
* Remove all whitespace before aligning words/text
* Move words/text init to util function
* Update error message
* Rename to get_words_and_spaces
* Fix formatting
This commit is contained in:
parent
8ce408d2e1
commit
3d2c308906
|
@ -555,6 +555,7 @@ class Errors(object):
|
||||||
E193 = ("Unable to resize vectors in place if the resized vector dimension "
|
E193 = ("Unable to resize vectors in place if the resized vector dimension "
|
||||||
"({new_dim}) is not the same as the current vector dimension "
|
"({new_dim}) is not the same as the current vector dimension "
|
||||||
"({curr_dim}).")
|
"({curr_dim}).")
|
||||||
|
# Fixed grammar: "Unable to aligned" -> "Unable to align".
E194 = ("Unable to align mismatched text '{text}' and words '{words}'.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -6,6 +6,7 @@ from spacy.vocab import Vocab
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.lemmatizer import Lemmatizer
|
from spacy.lemmatizer import Lemmatizer
|
||||||
from spacy.lookups import Lookups
|
from spacy.lookups import Lookups
|
||||||
|
from spacy import util
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -38,3 +39,41 @@ def test_lookup_lemmatization(vocab):
|
||||||
assert doc[0].lemma_ == "dog"
|
assert doc[0].lemma_ == "dog"
|
||||||
assert doc[1].text == "dogses"
|
assert doc[1].text == "dogses"
|
||||||
assert doc[1].lemma_ == "dogses"
|
assert doc[1].lemma_ == "dogses"
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_from_words_and_text(vocab):
    """Creating a Doc via util.get_words_and_spaces: word lists with no,
    partial, or non-standard whitespace tokens must all align to the same
    text; mismatched words/text must raise a ValueError.
    """
    # NOTE: `text` has one leading and TWO trailing spaces. The trailing pair
    # is required for the assertions below to be consistent: "run" absorbs one
    # space into its `whitespace_` flag and the second remains as a final " "
    # pseudo-token.
    text = " 'dogs'\n\nrun  "
    word_cases = [
        # no whitespace in words
        ["'", "dogs", "'", "run"],
        # partial whitespace in words
        [" ", "'", "dogs", "'", "\n\n", "run", " "],
        # non-standard whitespace tokens
        [" ", " ", "'", "dogs", "'", "\n\n", "run"],
    ]
    for words in word_cases:
        (words, spaces) = util.get_words_and_spaces(words, text)
        doc = Doc(vocab, words=words, spaces=spaces)
        assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
        assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
        assert doc.text == text
        # Non-whitespace tokens round-trip through the alignment unchanged.
        assert [t.text for t in doc if not t.text.isspace()] == [
            word for word in words if not word.isspace()
        ]
    # mismatch between words and text
    with pytest.raises(ValueError):
        words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
        util.get_words_and_spaces(words + ["away"], text)
|
||||||
|
|
|
@ -755,6 +755,36 @@ def get_serialization_exclude(serializers, exclude, kwargs):
|
||||||
return exclude
|
return exclude
|
||||||
|
|
||||||
|
|
||||||
|
def get_words_and_spaces(words, text):
    """Align a list of words with a raw text, reconstructing whitespace.

    The words may or may not include the whitespace tokens present in the
    text. Returns a (words, spaces) pair suitable for Doc(words=..., spaces=...)
    such that the reconstructed doc text equals `text` exactly: a single space
    after a word is folded into its space flag, while any other gap (multiple
    spaces, newlines, tabs) becomes a whitespace pseudo-token.

    words (iterable): The words, with or without whitespace tokens.
    text (unicode): The original text the words came from.
    RETURNS (tuple): A (text_words, text_spaces) pair.
    RAISES (ValueError): If the non-whitespace content of `words` and `text`
        cannot be aligned.
    """
    # Quick consistency check: compare non-whitespace content only.
    if "".join("".join(words).split()) != "".join(text.split()):
        raise ValueError(Errors.E194.format(text=text, words=words))
    text_words = []
    text_spaces = []
    text_pos = 0
    # normalize words to remove all whitespace tokens
    norm_words = [word for word in words if not word.isspace()]
    # align words with text
    for word in norm_words:
        try:
            word_start = text[text_pos:].index(word)
        except ValueError:
            raise ValueError(Errors.E194.format(text=text, words=words))
        if word_start > 0:
            # The gap before the word becomes its own whitespace pseudo-token.
            text_words.append(text[text_pos:text_pos + word_start])
            text_spaces.append(False)
            text_pos += word_start
        text_words.append(word)
        text_spaces.append(False)
        text_pos += len(word)
        # A single trailing space is folded into the token's space flag.
        if text_pos < len(text) and text[text_pos] == " ":
            text_spaces[-1] = True
            text_pos += 1
    if text_pos < len(text):
        # Remaining trailing whitespace becomes a final pseudo-token.
        text_words.append(text[text_pos:])
        text_spaces.append(False)
    return (text_words, text_spaces)
|
||||||
|
|
||||||
|
|
||||||
class SimpleFrozenDict(dict):
|
class SimpleFrozenDict(dict):
|
||||||
"""Simplified implementation of a frozen dict, mainly used as default
|
"""Simplified implementation of a frozen dict, mainly used as default
|
||||||
function or method argument (for arguments that should default to empty
|
function or method argument (for arguments that should default to empty
|
||||||
|
|
Loading…
Reference in New Issue
Block a user