mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Add morph to morphology in Doc.from_array (#5762)
* Add morph to morphology in Doc.from_array: add morphological analyses to the morphology table in `Doc.from_array`. * Use a separate vocab in the DocBin roundtrip test.
This commit is contained in:
parent
872938ec76
commit
a7a7e0d2a6
|
@ -2,7 +2,7 @@ import pytest
|
||||||
import numpy
|
import numpy
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP
|
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
|
||||||
|
@ -290,6 +290,24 @@ def test_doc_from_array_sent_starts(en_vocab):
|
||||||
assert new_doc.is_parsed
|
assert new_doc.is_parsed
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_from_array_morph(en_vocab):
    """Check that MORPH values survive a to_array / from_array round trip."""
    words = ["I", "live", "in", "New", "York", "."]
    # fmt: off
    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
    # fmt: on

    # Source doc: one morphological analysis string per token.
    doc = Doc(en_vocab, words=words)
    for token, morph in zip(doc, morphs):
        token.morph_ = morph

    # Export only the MORPH column and load it into a second doc that is
    # built from the same words and the same vocab.
    attrs = [MORPH]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)

    # The restored doc must match both the expected strings and the source doc.
    restored = [t.morph_ for t in new_doc]
    assert restored == morphs
    original = [t.morph_ for t in doc]
    assert original == [t.morph_ for t in new_doc]
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
en_texts = ["Merging the docs is fun.", "They don't think alike."]
|
en_texts = ["Merging the docs is fun.", "They don't think alike."]
|
||||||
de_text = "Wie war die Frage?"
|
de_text = "Wie war die Frage?"
|
||||||
|
|
|
@ -515,6 +515,8 @@ def test_roundtrip_docs_to_docbin(doc):
|
||||||
|
|
||||||
# roundtrip to DocBin
|
# roundtrip to DocBin
|
||||||
with make_tempdir() as tmpdir:
|
with make_tempdir() as tmpdir:
|
||||||
|
# use a separate vocab to test that all labels are added
|
||||||
|
reloaded_nlp = English()
|
||||||
json_file = tmpdir / "roundtrip.json"
|
json_file = tmpdir / "roundtrip.json"
|
||||||
srsly.write_json(json_file, [docs_to_json(doc)])
|
srsly.write_json(json_file, [docs_to_json(doc)])
|
||||||
goldcorpus = Corpus(str(json_file), str(json_file))
|
goldcorpus = Corpus(str(json_file), str(json_file))
|
||||||
|
@ -523,8 +525,8 @@ def test_roundtrip_docs_to_docbin(doc):
|
||||||
with output_file.open("wb") as file_:
|
with output_file.open("wb") as file_:
|
||||||
file_.write(data)
|
file_.write(data)
|
||||||
goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||||
reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
|
reloaded_example = next(goldcorpus.dev_dataset(nlp=reloaded_nlp))
|
||||||
assert len(doc) == goldcorpus.count_train(nlp)
|
assert len(doc) == goldcorpus.count_train(reloaded_nlp)
|
||||||
assert text == reloaded_example.reference.text
|
assert text == reloaded_example.reference.text
|
||||||
assert idx == [t.idx for t in reloaded_example.reference]
|
assert idx == [t.idx for t in reloaded_example.reference]
|
||||||
assert tags == [t.tag_ for t in reloaded_example.reference]
|
assert tags == [t.tag_ for t in reloaded_example.reference]
|
||||||
|
|
|
@ -870,6 +870,9 @@ cdef class Doc:
|
||||||
for j in range(n_attrs):
|
for j in range(n_attrs):
|
||||||
if attr_ids[j] != TAG:
|
if attr_ids[j] != TAG:
|
||||||
value = values[j * stride + i]
|
value = values[j * stride + i]
|
||||||
|
if attr_ids[j] == MORPH:
|
||||||
|
# add morph to morphology table
|
||||||
|
self.vocab.morphology.add(self.vocab.strings[value])
|
||||||
Token.set_struct_attr(token, attr_ids[j], value)
|
Token.set_struct_attr(token, attr_ids[j], value)
|
||||||
# Set flags
|
# Set flags
|
||||||
self.is_parsed = bool(self.is_parsed or HEAD in attrs)
|
self.is_parsed = bool(self.is_parsed or HEAD in attrs)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user