serialize ENT_ID (#4852)

* expand serialization test for custom token attribute

* add failing test for issue 4849

* define ENT_ID as attr and use in doc serialization

* fix a few typos
Sofie Van Landeghem, 2020-01-06 14:57:34 +01:00 (committed by Matthew Honnibal)
parent 53929138d7, commit a1b22e90cd
10 changed files with 62 additions and 8 deletions
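For context, a minimal sketch of the bug this fixes (issue #4849), assuming spaCy v2.2.x: entity ids assigned by the EntityRuler lived only on the token struct and were dropped whenever a Doc made a bytes round trip — which is exactly what `nlp.pipe(..., n_process=2)` does when passing docs between worker processes.

```python
# Sketch of the failure mode fixed here (spaCy v2.2.x assumed).
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc

nlp = English()
ruler = EntityRuler(
    nlp,
    patterns=[{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}],
    phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)

doc = nlp("Joe Biden spoke.")
assert doc.ents[0].ent_id_ == "joe-biden"

# The same round trip nlp.pipe(..., n_process=2) performs internally:
doc2 = Doc(nlp.vocab).from_bytes(doc.to_bytes())
# Before this commit ENT_ID was not part of the serialized attributes, so
# doc2.ents[0].ent_id_ came back empty; with this change the id survives.
assert doc2.ents[0].ent_id_ == "joe-biden"
```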

spacy/attrs.pxd

@@ -91,3 +91,4 @@ cdef enum attr_id_t:
     LANG
     ENT_KB_ID = symbols.ENT_KB_ID
+    ENT_ID = symbols.ENT_ID

spacy/attrs.pyx

@@ -84,6 +84,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,

spacy/language.py

@@ -780,7 +780,7 @@ class Language(object):
         pipes = (
             []
-        )  # contains functools.partial objects so that easily create multiprocess worker.
+        )  # contains functools.partial objects to easily create multiprocess worker.
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -837,7 +837,7 @@ class Language(object):
         texts, raw_texts = itertools.tee(texts)
         # for sending texts to worker
         texts_q = [mp.Queue() for _ in range(n_process)]
-        # for receiving byte encoded docs from worker
+        # for receiving byte-encoded docs from worker
         bytedocs_recv_ch, bytedocs_send_ch = zip(
             *[mp.Pipe(False) for _ in range(n_process)]
         )
@@ -847,7 +847,7 @@ class Language(object):
         # This is necessary to properly handle infinite length of texts.
         # (In this case, all data cannot be sent to the workers at once)
         sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
-        # send twice so that make process busy
+        # send twice to make process busy
         sender.send()
         sender.send()
@@ -859,7 +859,7 @@ class Language(object):
             proc.start()
         # Cycle channels not to break the order of docs.
-        # The received object is batch of byte encoded docs, so flatten them with chain.from_iterable.
+        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
         byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
         docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
         try:
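The fixes above are comment-only, but the surrounding machinery is why ENT_ID serialization matters: worker processes return docs as bytes, and anything `to_bytes` omits is silently lost. A stdlib-only sketch of the queue/pipe pattern used here (illustrative, not spaCy's actual implementation):

```python
# Each worker pulls batches from its own queue and sends byte-encoded results
# back through its own pipe; the parent cycles over the receive ends so the
# output order matches the input order.
import multiprocessing as mp
from itertools import chain, cycle

def worker(texts_q, send_ch):
    while True:
        batch = texts_q.get()
        if batch is None:  # sentinel: no more input
            break
        # stand-in for "process each text and encode the Doc to bytes"
        send_ch.send([text.upper().encode("utf8") for text in batch])

if __name__ == "__main__":
    n_process = 2
    batches = [["a", "b"], ["c", "d"], ["e"]]
    texts_q = [mp.Queue() for _ in range(n_process)]
    recv_chs, send_chs = zip(*[mp.Pipe(False) for _ in range(n_process)])
    procs = [mp.Process(target=worker, args=(texts_q[i], send_chs[i]))
             for i in range(n_process)]
    for p in procs:
        p.start()
    # round-robin the batches onto the per-worker queues, then send sentinels
    for i, batch in enumerate(batches):
        texts_q[i % n_process].put(batch)
    for q in texts_q:
        q.put(None)
    # cycle the receive ends so batches come back in the order they were sent
    channels = cycle(recv_chs)
    byte_results = chain.from_iterable(next(channels).recv()
                                       for _ in range(len(batches)))
    print([b.decode("utf8") for b in byte_results])  # ['A', 'B', 'C', 'D', 'E']
    for p in procs:
        p.join()
```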

spacy/symbols.pxd

@@ -462,3 +462,4 @@ cdef enum symbol_t:
     acl
     ENT_KB_ID
+    ENT_ID

spacy/symbols.pyx

@@ -86,6 +86,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,

spacy/tests/regression/test_issue4849.py

@@ -0,0 +1,36 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+
+
+def test_issue4849():
+    nlp = English()
+
+    ruler = EntityRuler(
+        nlp, patterns=[
+            {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
+            {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
+        ],
+        phrase_matcher_attr="LOWER"
+    )
+
+    nlp.add_pipe(ruler)
+
+    text = """
+    The left is starting to take aim at Democratic front-runner Joe Biden.
+    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
+    """
+
+    # USING 1 PROCESS
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=1):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert count_ents == 2
+
+    # USING 2 PROCESSES
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=2):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert count_ents == 2

spacy/tests/serialize/test_serialize_extension_attrs.py

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 import pytest
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
 from spacy.vocab import Vocab
@@ -15,6 +15,10 @@ def doc_w_attrs(en_tokenizer):
     )
     doc = en_tokenizer("This is a test.")
     doc._._test_attr = "test"
+
+    Token.set_extension("_test_token", default="t0")
+    doc[1]._._test_token = "t1"
+
     return doc
@@ -25,3 +29,7 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
     assert doc._._test_attr == "test"
     assert doc._._test_prop == len(doc.text)
     assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
+
+    assert doc[0]._._test_token == "t0"
+    assert doc[1]._._test_token == "t1"
+    assert doc[2]._._test_token == "t0"
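Token extension values live in the Doc's user_data, which `to_bytes` already serializes; only non-default values are stored, so untouched tokens fall back to the extension default after deserialization. A sketch of the round trip this test covers (the extension name here is illustrative):

```python
from spacy.lang.en import English
from spacy.tokens import Doc, Token

# "note" is an illustrative extension name, not part of the PR
Token.set_extension("note", default="none")

nlp = English()
doc = nlp("This is a test.")
doc[1]._.note = "flagged"

doc2 = Doc(nlp.vocab).from_bytes(doc.to_bytes())
assert doc2[0]._.note == "none"     # default comes from the extension itself
assert doc2[1]._.note == "flagged"  # stored value travels in user_data
```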

spacy/tokens/doc.pyx

@@ -23,7 +23,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..attrs import intify_attrs, IDS
@@ -69,6 +69,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
         return token.ent_iob
     elif feat_name == ENT_TYPE:
         return token.ent_type
+    elif feat_name == ENT_ID:
+        return token.ent_id
     elif feat_name == ENT_KB_ID:
         return token.ent_kb_id
     else:
@@ -868,7 +870,7 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
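`to_bytes` exports each attribute in `array_head` column-wise via `to_array`, and `from_bytes` restores them with `from_array`, so adding ENT_ID to the head list is what carries the per-token ids across the byte boundary. The same mechanism used directly (a sketch, spaCy v2.2.x assumed):

```python
from spacy.attrs import ENT_ID
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("Joe Biden spoke.")
doc[0].ent_id_ = "joe-biden"  # set a per-token entity id by hand

# export the ENT_ID column and rebuild it on a fresh Doc over the same vocab
arr = doc.to_array([ENT_ID])
doc2 = Doc(nlp.vocab, words=[t.text for t in doc])
doc2.from_array([ENT_ID], arr)
assert doc2[0].ent_id_ == "joe-biden"
```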

spacy/tokens/span.pyx

@@ -212,7 +212,7 @@ cdef class Span:
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID]
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
         if self.doc.is_tagged:
             array_head.append(TAG)
         # If doc parsed add head and dep attribute
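This hunk is in `Span.as_doc`, which copies the same attribute columns into a standalone Doc, so entity ids now survive extracting a span. A sketch (spaCy v2.2.x assumed, reusing the ruler setup from the issue):

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(
    nlp,
    patterns=[{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}],
    phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)

doc = nlp("Reporters asked Joe Biden to comment.")
span_doc = doc.ents[0].as_doc()  # standalone Doc built from the entity span
assert span_doc.ents[0].ent_id_ == "joe-biden"  # id copied via the ENT_ID column
```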

spacy/tokens/token.pxd

@@ -53,6 +53,8 @@ cdef class Token:
             return token.ent_iob
         elif feat_name == ENT_TYPE:
             return token.ent_type
+        elif feat_name == ENT_ID:
+            return token.ent_id
         elif feat_name == ENT_KB_ID:
             return token.ent_kb_id
         elif feat_name == SENT_START:
@@ -81,6 +83,8 @@ cdef class Token:
             token.ent_iob = value
         elif feat_name == ENT_TYPE:
             token.ent_type = value
+        elif feat_name == ENT_ID:
+            token.ent_id = value
         elif feat_name == ENT_KB_ID:
             token.ent_kb_id = value
         elif feat_name == SENT_START: