Add morph to morphology in Doc.from_array (#5762)

* Add morph to morphology in Doc.from_array

Add morphological analyses to the morphology table in `Doc.from_array`.

* Use separate vocab in DocBin roundtrip test
Adriane Boyd 2020-07-14 14:07:35 +02:00 committed by GitHub
parent 872938ec76
commit a7a7e0d2a6
3 changed files with 26 additions and 3 deletions
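
As a quick illustration of the first change, here is a minimal sketch of the roundtrip the patch fixes, modeled on the new test below (it assumes this development branch's API, where `Token.morph_` is writable and `MORPH` is a serializable attribute):

    from spacy.attrs import MORPH
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    words = ["I", "live", "in", "New", "York", "."]
    doc = Doc(vocab, words=words)
    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
    for i, morph in enumerate(morphs):
        doc[i].morph_ = morph

    # MORPH survives the array roundtrip because Doc.from_array now registers
    # each analysis in the vocab's morphology table
    new_doc = Doc(vocab, words=words)
    new_doc.from_array([MORPH], doc.to_array([MORPH]))
    assert [t.morph_ for t in new_doc] == [t.morph_ for t in doc]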


@@ -2,7 +2,7 @@ import pytest
 import numpy
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
-from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP
+from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH

 from ..util import get_doc
@@ -290,6 +290,24 @@ def test_doc_from_array_sent_starts(en_vocab):
     assert new_doc.is_parsed


+def test_doc_from_array_morph(en_vocab):
+    words = ["I", "live", "in", "New", "York", "."]
+    # fmt: off
+    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
+    for i, morph in enumerate(morphs):
+        doc[i].morph_ = morph
+    attrs = [MORPH]
+    arr = doc.to_array(attrs)
+    new_doc = Doc(en_vocab, words=words)
+    new_doc.from_array(attrs, arr)
+    assert [t.morph_ for t in new_doc] == morphs
+    assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
+
+
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_texts = ["Merging the docs is fun.", "They don't think alike."]
     de_text = "Wie war die Frage?"


@@ -515,6 +515,8 @@ def test_roundtrip_docs_to_docbin(doc):
     # roundtrip to DocBin
     with make_tempdir() as tmpdir:
+        # use a separate vocab to test that all labels are added
+        reloaded_nlp = English()
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
         goldcorpus = Corpus(str(json_file), str(json_file))
@@ -523,8 +525,8 @@ def test_roundtrip_docs_to_docbin(doc):
         with output_file.open("wb") as file_:
             file_.write(data)
         goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
-        reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
-        assert len(doc) == goldcorpus.count_train(nlp)
+        reloaded_example = next(goldcorpus.dev_dataset(nlp=reloaded_nlp))
+        assert len(doc) == goldcorpus.count_train(reloaded_nlp)
         assert text == reloaded_example.reference.text
         assert idx == [t.idx for t in reloaded_example.reference]
         assert tags == [t.tag_ for t in reloaded_example.reference]
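
The separate-vocab change above guards against the reload side of the roundtrip silently reusing the original pipeline's vocab. A rough sketch of the same idea using the `DocBin` API directly (illustrative only, not the test itself; it assumes `morph_` values are carried over when `MORPH` is listed in the `DocBin` attrs):

    from spacy.lang.en import English
    from spacy.tokens import DocBin

    nlp = English()
    doc = nlp("I live in New York.")
    for token in doc:
        token.morph_ = "Feat1=A"

    # serialize with the source vocab ...
    doc_bin = DocBin(attrs=["MORPH"])
    doc_bin.add(doc)
    data = doc_bin.to_bytes()

    # ... and reload into a separate, freshly created vocab; get_docs() goes
    # through Doc.from_array, which now fills the morphology table
    reloaded_nlp = English()
    reloaded_docs = list(DocBin().from_bytes(data).get_docs(reloaded_nlp.vocab))
    assert [t.morph_ for t in reloaded_docs[0]] == [t.morph_ for t in doc]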


@@ -870,6 +870,9 @@ cdef class Doc:
             for j in range(n_attrs):
                 if attr_ids[j] != TAG:
                     value = values[j * stride + i]
+                    if attr_ids[j] == MORPH:
+                        # add morph to morphology table
+                        self.vocab.morphology.add(self.vocab.strings[value])
                     Token.set_struct_attr(token, attr_ids[j], value)
         # Set flags
         self.is_parsed = bool(self.is_parsed or HEAD in attrs)
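
For context: the MORPH column of the array holds a StringStore hash of the morphological analysis (which is why the patch resolves it with `self.vocab.strings[value]`), and the new branch makes sure that analysis is also registered in the receiving vocab's morphology table. Roughly, in plain Python (a sketch of the idea, not the Cython internals):

    from spacy.vocab import Vocab

    vocab = Vocab()
    feats = "Feat1=A|Feat2=D"
    value = vocab.strings.add(feats)            # the kind of hash stored in the MORPH column
    vocab.morphology.add(vocab.strings[value])  # what Doc.from_array now does for MORPH values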