Add ent_id_ to strings serialized with Doc (#6353)
This commit is contained in:
parent d490428089
commit 320a8b1481
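
For context (not part of the commit): a minimal sketch of the round trip this change is meant to cover. An ent_id_ set on a token should survive Doc.to_bytes()/from_bytes(), including when the bytes are loaded into a Doc backed by a fresh Vocab, as happens when docs are passed between processes. The blank pipeline, sentence, and id value below are illustrative only.

# Sketch only, assuming the v2-era API used elsewhere in this diff.
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.blank("en")
doc = nlp("I enjoy eating Pizza Hut pizza.")
doc[0].ent_id_ = "1234"          # illustrative id, mirroring the tests below

data = doc.to_bytes()
# Loading into a Doc backed by a fresh Vocab relies on the strings bundled
# with the serialized Doc; before this commit, ent_id_ was missing from that
# set, so its hash could not be resolved in the receiving StringStore.
new_doc = Doc(Vocab()).from_bytes(data)
assert new_doc[0].ent_id_ == "1234"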

@@ -109,6 +109,7 @@ def test_doc_api_serialize(en_tokenizer, text):
     tokens[0].lemma_ = "lemma"
     tokens[0].norm_ = "norm"
     tokens[0].ent_kb_id_ = "ent_kb_id"
+    tokens[0].ent_id_ = "ent_id"
     new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]

@@ -116,6 +117,7 @@ def test_doc_api_serialize(en_tokenizer, text):
     assert new_tokens[0].lemma_ == "lemma"
     assert new_tokens[0].norm_ == "norm"
     assert new_tokens[0].ent_kb_id_ == "ent_kb_id"
+    assert new_tokens[0].ent_id_ == "ent_id"

     new_tokens = Doc(tokens.vocab).from_bytes(
         tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"]

@@ -166,3 +166,22 @@ def test_entity_ruler_overlapping_spans(nlp):
     doc = ruler(nlp.make_doc("foo bar baz"))
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "FOOBAR"
+
+
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_entity_ruler_multiprocessing(nlp, n_process):
+    ruler = EntityRuler(nlp)
+    texts = [
+        "I enjoy eating Pizza Hut pizza."
+    ]
+
+    patterns = [
+        {"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}
+    ]
+
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    for doc in nlp.pipe(texts, n_process=2):
+        for ent in doc.ents:
+            assert ent.ent_id_ == "1234"
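
Why the new test uses multiprocessing (a reading of the diff, not text from the commit): nlp.pipe(..., n_process=2) sends each Doc between processes as bytes, so any string that is not bundled with the serialized Doc, such as the ruler's pattern id, cannot be looked up after deserialization. Below is a hedged single-process sketch of the same pattern-id behaviour; the blank pipeline and values are illustrative.

# Sketch assuming the v2-style EntityRuler API used in the test above.
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp)
# A pattern's "id" surfaces on the matched entity as ent_id_.
ruler.add_patterns([{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}])
nlp.add_pipe(ruler)

doc = nlp("I enjoy eating Pizza Hut pizza.")
assert [(e.label_, e.ent_id_) for e in doc.ents] == [("FASTFOOD", "1234")]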

@@ -933,6 +933,7 @@ cdef class Doc:
             strings.add(token.dep_)
             strings.add(token.ent_type_)
             strings.add(token.ent_kb_id_)
+            strings.add(token.ent_id_)
             strings.add(token.norm_)
         # Msgpack doesn't distinguish between lists and tuples, which is
         # vexing for user data. As a best guess, we *know* that within
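
For context on this hunk (an explanatory sketch, not the commit's code): Doc serialization stores string-valued token attributes as hashes, and the strings collected here travel with the Doc so that a receiving StringStore can resolve those hashes. A minimal illustration of that contract:

# Sketch: resolving a hash in a different StringStore only works if the
# original string is re-added there, which is what bundling the strings
# collected in Doc.to_bytes() enables.
from spacy.strings import StringStore

sender = StringStore()
ent_id_hash = sender.add("1234")   # hash as stored in the serialized Doc

receiver = StringStore()           # e.g. the vocab on the other end of pipe()
receiver.add("1234")               # what shipping ent_id_ with the Doc does
assert receiver[ent_id_hash] == "1234"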