From a7a7e0d2a681012354904df63d212ff381a0f6c7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 14 Jul 2020 14:07:35 +0200 Subject: [PATCH] Add morph to morphology in Doc.from_array (#5762) * Add morph to morphology in Doc.from_array Add morphological analyses to morphology table in `Doc.from_array`. * Use separate vocab in DocBin roundtrip test --- spacy/tests/doc/test_doc_api.py | 20 +++++++++++++++++++- spacy/tests/test_gold.py | 6 ++++-- spacy/tokens/doc.pyx | 3 +++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index e2b6adf43..29828e0e5 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,7 +2,7 @@ import pytest import numpy from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP +from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH from ..util import get_doc @@ -290,6 +290,24 @@ def test_doc_from_array_sent_starts(en_vocab): assert new_doc.is_parsed +def test_doc_from_array_morph(en_vocab): + words = ["I", "live", "in", "New", "York", "."] + # fmt: off + morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"] + # fmt: on + doc = Doc(en_vocab, words=words) + for i, morph in enumerate(morphs): + doc[i].morph_ = morph + + attrs = [MORPH] + arr = doc.to_array(attrs) + new_doc = Doc(en_vocab, words=words) + new_doc.from_array(attrs, arr) + + assert [t.morph_ for t in new_doc] == morphs + assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc] + + def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_texts = ["Merging the docs is fun.", "They don't think alike."] de_text = "Wie war die Frage?" diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 0b0ba5cad..efad7f465 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -515,6 +515,8 @@ def test_roundtrip_docs_to_docbin(doc): # roundtrip to DocBin with make_tempdir() as tmpdir: + # use a separate vocab to test that all labels are added + reloaded_nlp = English() json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) goldcorpus = Corpus(str(json_file), str(json_file)) @@ -523,8 +525,8 @@ def test_roundtrip_docs_to_docbin(doc): with output_file.open("wb") as file_: file_.write(data) goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) - assert len(doc) == goldcorpus.count_train(nlp) + reloaded_example = next(goldcorpus.dev_dataset(nlp=reloaded_nlp)) + assert len(doc) == goldcorpus.count_train(reloaded_nlp) assert text == reloaded_example.reference.text assert idx == [t.idx for t in reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f28bd3374..3c144d8f8 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -870,6 +870,9 @@ cdef class Doc: for j in range(n_attrs): if attr_ids[j] != TAG: value = values[j * stride + i] + if attr_ids[j] == MORPH: + # add morph to morphology table + self.vocab.morphology.add(self.vocab.strings[value]) Token.set_struct_attr(token, attr_ids[j], value) # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs)