serialize ENT_ID (#4852)

* expand serialization test for custom token attribute

* add failing test for issue 4849

* define ENT_ID as attr and use in doc serialization

* fix a few typos
Authored by Sofie Van Landeghem on 2020-01-06 14:57:34 +01:00; committed by Matthew Honnibal
parent 53929138d7
commit a1b22e90cd
10 changed files with 62 additions and 8 deletions
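
The change closes the gap reported in issue #4849: entity IDs assigned by the EntityRuler were lost whenever a Doc was round-tripped through its byte representation, which is exactly what Language.pipe does with n_process > 1. A minimal sketch of the round trip this commit fixes (spaCy v2 API; the pattern and ID strings are illustrative):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc

nlp = English()
patterns = [{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}]
nlp.add_pipe(EntityRuler(nlp, patterns=patterns))

doc = nlp("joe biden")
restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
# Before this commit the entity ID was dropped at this point.
assert restored.ents[0].ent_id_ == "joe-biden"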

spacy/attrs.pxd

@@ -91,3 +91,4 @@ cdef enum attr_id_t:
     LANG
     ENT_KB_ID = symbols.ENT_KB_ID
+    ENT_ID = symbols.ENT_ID

spacy/attrs.pyx

@@ -84,6 +84,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,

spacy/language.py

@@ -780,7 +780,7 @@ class Language(object):
         pipes = (
             []
-        )  # contains functools.partial objects so that easily create multiprocess worker.
+        )  # contains functools.partial objects to easily create multiprocess worker.
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -837,7 +837,7 @@ class Language(object):
         texts, raw_texts = itertools.tee(texts)
         # for sending texts to worker
         texts_q = [mp.Queue() for _ in range(n_process)]
-        # for receiving byte encoded docs from worker
+        # for receiving byte-encoded docs from worker
         bytedocs_recv_ch, bytedocs_send_ch = zip(
             *[mp.Pipe(False) for _ in range(n_process)]
         )
@@ -847,7 +847,7 @@ class Language(object):
         # This is necessary to properly handle infinite length of texts.
         # (In this case, all data cannot be sent to the workers at once)
         sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
-        # send twice so that make process busy
+        # send twice to make process busy
         sender.send()
         sender.send()
@@ -859,7 +859,7 @@ class Language(object):
             proc.start()
         # Cycle channels not to break the order of docs.
-        # The received object is batch of byte encoded docs, so flatten them with chain.from_iterable.
+        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
         byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
         docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
         try:
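
The comments above describe why serialization matters here: with n_process > 1, each worker returns byte-encoded docs and the parent process rebuilds them, so any attribute missing from Doc.to_bytes is silently dropped. A condensed sketch of that path (the texts are illustrative):

from spacy.lang.en import English

nlp = English()
texts = ["first text", "second text"]
# With n_process=2 each doc is built in a worker process, sent back
# byte-encoded, and rebuilt here with Doc(vocab).from_bytes.
for doc in nlp.pipe(texts, n_process=2):
    print([token.text for token in doc])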

spacy/symbols.pxd

@@ -462,3 +462,4 @@ cdef enum symbol_t:
     acl
     ENT_KB_ID
+    ENT_ID

spacy/symbols.pyx

@@ -86,6 +86,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,

spacy/tests/regression/test_issue4849.py

@@ -0,0 +1,36 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+
+
+def test_issue4849():
+    nlp = English()
+
+    ruler = EntityRuler(
+        nlp, patterns=[
+            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
+            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
+        ],
+        phrase_matcher_attr="LOWER"
+    )
+
+    nlp.add_pipe(ruler)
+
+    text = """
+    The left is starting to take aim at Democratic front-runner Joe Biden.
+    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
+    """
+
+    # USING 1 PROCESS
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=1):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert count_ents == 2
+
+    # USING 2 PROCESSES
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=2):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert count_ents == 2

spacy/tests/serialize/test_serialize_extension_attrs.py

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
 from spacy.vocab import Vocab
@@ -15,6 +15,10 @@ def doc_w_attrs(en_tokenizer):
     )
     doc = en_tokenizer("This is a test.")
     doc._._test_attr = "test"
+    Token.set_extension("_test_token", default="t0")
+    doc[1]._._test_token = "t1"
     return doc
@@ -25,3 +29,7 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
     assert doc._._test_attr == "test"
     assert doc._._test_prop == len(doc.text)
     assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
+    assert doc[0]._._test_token == "t0"
+    assert doc[1]._._test_token == "t1"
+    assert doc[2]._._test_token == "t0"
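
Custom extension attributes take a different path from ENT_ID: token underscore values live in doc.user_data, which Doc.to_bytes already serializes, so the new assertions confirm existing behavior for token-level extensions rather than exercising the new attribute column. A brief sketch under that assumption ("note" is a hypothetical extension name):

from spacy.lang.en import English
from spacy.tokens import Doc, Token

nlp = English()
Token.set_extension("note", default=None)  # hypothetical extension
doc = nlp("This is a test.")
doc[1]._.note = "flagged"
restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
assert restored[1]._.note == "flagged"  # explicitly set value survives
assert restored[0]._.note is None       # other tokens keep the default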

spacy/tokens/doc.pyx

@@ -23,7 +23,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..attrs import intify_attrs, IDS
@@ -69,6 +69,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
         return token.ent_iob
     elif feat_name == ENT_TYPE:
         return token.ent_type
+    elif feat_name == ENT_ID:
+        return token.ent_id
     elif feat_name == ENT_KB_ID:
         return token.ent_kb_id
     else:
@@ -868,7 +870,7 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
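
array_head lists the per-token columns that to_bytes packs into a single attribute array; appending ENT_ID adds one column, which from_bytes then writes back onto the tokens. The same column mechanism is reachable through the public to_array/from_array API, as in this sketch:

from spacy.attrs import ENT_TYPE, ENT_ID
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("joe biden")
cols = [ENT_TYPE, ENT_ID]   # a subset of the array_head columns above
arr = doc.to_array(cols)
doc2 = Doc(nlp.vocab, words=[t.text for t in doc])
doc2.from_array(cols, arr)  # writes each column back onto the tokens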

spacy/tokens/span.pyx

@@ -212,7 +212,7 @@ cdef class Span:
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID]
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
         if self.doc.is_tagged:
             array_head.append(TAG)
         # If doc parsed add head and dep attribute
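
This hunk sits inside Span.as_doc, which copies a span into a standalone Doc through the same array mechanism, so entity IDs should now carry over there as well. A sketch under that assumption (illustrative pattern):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
patterns = [{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}]
nlp.add_pipe(EntityRuler(nlp, patterns=patterns))

doc = nlp("joe biden spoke")
standalone = doc[0:2].as_doc()  # copies ENT_ID along with the other columns
assert standalone.ents[0].ent_id_ == "joe-biden"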

spacy/tokens/token.pxd

@@ -53,6 +53,8 @@ cdef class Token:
             return token.ent_iob
         elif feat_name == ENT_TYPE:
             return token.ent_type
+        elif feat_name == ENT_ID:
+            return token.ent_id
         elif feat_name == ENT_KB_ID:
             return token.ent_kb_id
         elif feat_name == SENT_START:
@@ -81,6 +83,8 @@ cdef class Token:
             token.ent_iob = value
         elif feat_name == ENT_TYPE:
             token.ent_type = value
+        elif feat_name == ENT_ID:
+            token.ent_id = value
         elif feat_name == ENT_KB_ID:
             token.ent_kb_id = value
         elif feat_name == SENT_START:
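
These get_struct_attr/set_struct_attr branches are the low-level accessors behind to_array and from_array. At the Python level the same value is exposed as ent_id (the integer hash) and ent_id_ (the string), which is what the regression test's `ent.ent_id > 0` check relies on. A sketch of the two views (illustrative pattern):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
patterns = [{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}]
nlp.add_pipe(EntityRuler(nlp, patterns=patterns))

ent = nlp("joe biden").ents[0]
assert ent.ent_id == nlp.vocab.strings["joe-biden"]  # integer hash, as checked in the test
assert ent.ent_id_ == "joe-biden"                    # string view of the same value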