mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Add morph to morphology in Doc.from_array (#5762)
* Add morph to morphology in Doc.from_array: add morphological analyses to the morphology table in `Doc.from_array`. * Use a separate vocab in the DocBin roundtrip test.
This commit is contained in:
parent
872938ec76
commit
a7a7e0d2a6
|
@ -2,7 +2,7 @@ import pytest
|
|||
import numpy
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP
|
||||
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
@ -290,6 +290,24 @@ def test_doc_from_array_sent_starts(en_vocab):
|
|||
assert new_doc.is_parsed
|
||||
|
||||
|
||||
def test_doc_from_array_morph(en_vocab):
    """Morphological analyses must survive a to_array/from_array roundtrip."""
    words = ["I", "live", "in", "New", "York", "."]
    # fmt: off
    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
    # fmt: on
    # Build a doc and attach one analysis per token.
    doc = Doc(en_vocab, words=words)
    for token, analysis in zip(doc, morphs):
        token.morph_ = analysis

    # Serialize only the MORPH attribute and load it into a fresh doc.
    attrs = [MORPH]
    array = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, array)

    # The reloaded doc carries the same analyses as both the literal
    # list and the original doc.
    assert morphs == [t.morph_ for t in new_doc]
    assert [t.morph_ for t in new_doc] == [t.morph_ for t in doc]
|
||||
|
||||
|
||||
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||
en_texts = ["Merging the docs is fun.", "They don't think alike."]
|
||||
de_text = "Wie war die Frage?"
|
||||
|
|
|
@ -515,6 +515,8 @@ def test_roundtrip_docs_to_docbin(doc):
|
|||
|
||||
# roundtrip to DocBin
|
||||
with make_tempdir() as tmpdir:
|
||||
# use a separate vocab to test that all labels are added
|
||||
reloaded_nlp = English()
|
||||
json_file = tmpdir / "roundtrip.json"
|
||||
srsly.write_json(json_file, [docs_to_json(doc)])
|
||||
goldcorpus = Corpus(str(json_file), str(json_file))
|
||||
|
@ -523,8 +525,8 @@ def test_roundtrip_docs_to_docbin(doc):
|
|||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||
reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
|
||||
assert len(doc) == goldcorpus.count_train(nlp)
|
||||
reloaded_example = next(goldcorpus.dev_dataset(nlp=reloaded_nlp))
|
||||
assert len(doc) == goldcorpus.count_train(reloaded_nlp)
|
||||
assert text == reloaded_example.reference.text
|
||||
assert idx == [t.idx for t in reloaded_example.reference]
|
||||
assert tags == [t.tag_ for t in reloaded_example.reference]
|
||||
|
|
|
@ -870,6 +870,9 @@ cdef class Doc:
|
|||
for j in range(n_attrs):
|
||||
if attr_ids[j] != TAG:
|
||||
value = values[j * stride + i]
|
||||
if attr_ids[j] == MORPH:
|
||||
# add morph to morphology table
|
||||
self.vocab.morphology.add(self.vocab.strings[value])
|
||||
Token.set_struct_attr(token, attr_ids[j], value)
|
||||
# Set flags
|
||||
self.is_parsed = bool(self.is_parsed or HEAD in attrs)
|
||||
|
|
Loading…
Reference in New Issue
Block a user