Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
serialize ENT_ID (#4852)

* expand serialization test for custom token attribute
* add failing test for issue 4849
* define ENT_ID as attr and use in doc serialization
* fix few typos
This commit is contained in:
parent 53929138d7
commit a1b22e90cd
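The gist of the change: entity IDs assigned by the EntityRuler (the "id" field of a pattern) were dropped whenever a Doc was serialized with to_bytes() and rebuilt with from_bytes(), which is exactly what nlp.pipe(..., n_process=2) does to pass docs between worker processes. A minimal sketch of the round trip this commit fixes (not part of the commit; assumes spaCy v2.2.x with this patch applied):

    # Sketch: with ENT_ID serialized, EntityRuler entity IDs survive a byte round trip.
    from spacy.lang.en import English
    from spacy.pipeline import EntityRuler
    from spacy.tokens import Doc

    nlp = English()
    ruler = EntityRuler(nlp, patterns=[{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}])
    nlp.add_pipe(ruler)

    doc = nlp("joe biden")
    restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
    assert [ent.ent_id_ for ent in restored.ents] == ["joe-biden"]  # the ID was lost here before this patch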
@@ -91,3 +91,4 @@ cdef enum attr_id_t:
     LANG
     ENT_KB_ID = symbols.ENT_KB_ID
+    ENT_ID = symbols.ENT_ID
@@ -84,6 +84,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,
@@ -780,7 +780,7 @@ class Language(object):
         pipes = (
             []
-        )  # contains functools.partial objects so that easily create multiprocess worker.
+        )  # contains functools.partial objects to easily create multiprocess worker.
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -837,7 +837,7 @@ class Language(object):
         texts, raw_texts = itertools.tee(texts)
         # for sending texts to worker
         texts_q = [mp.Queue() for _ in range(n_process)]
-        # for receiving byte encoded docs from worker
+        # for receiving byte-encoded docs from worker
         bytedocs_recv_ch, bytedocs_send_ch = zip(
             *[mp.Pipe(False) for _ in range(n_process)]
         )
@@ -847,7 +847,7 @@ class Language(object):
         # This is necessary to properly handle infinite length of texts.
         # (In this case, all data cannot be sent to the workers at once)
         sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
-        # send twice so that make process busy
+        # send twice to make process busy
         sender.send()
         sender.send()
@@ -859,7 +859,7 @@ class Language(object):
             proc.start()

         # Cycle channels not to break the order of docs.
-        # The received object is batch of byte encoded docs, so flatten them with chain.from_iterable.
+        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
         byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
         docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
         try:
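The comment fixes above describe the multiprocessing hand-off: each worker sends batches of byte-encoded docs down its own pipe, the parent reads the receive ends in round-robin order with cycle(), and chain.from_iterable() flattens the batches back into a single stream. A tiny standalone sketch of that pattern (plain Python, not spaCy code; FakePipe is a hypothetical stand-in for the receive end of a multiprocessing.Pipe()):

    from itertools import chain, cycle

    class FakePipe:
        """Hypothetical stand-in for the receive end of a multiprocessing.Pipe()."""
        def __init__(self, batches):
            self._batches = iter(batches)

        def recv(self):
            return next(self._batches)

    # Two "workers", each of which sent back one batch of byte-encoded docs.
    channels = [FakePipe([[b"doc0", b"doc1"]]), FakePipe([[b"doc2", b"doc3"]])]
    # cycle() visits the channels in submission order; chain.from_iterable() flattens
    # the per-worker batches back into one stream of byte docs.
    byte_docs = chain.from_iterable(ch.recv() for ch in cycle(channels))
    print([next(byte_docs) for _ in range(4)])  # [b'doc0', b'doc1', b'doc2', b'doc3']

Because the _Sender feeds the worker queues in the same round-robin order, reading the channels cyclically keeps the docs in the order the texts were submitted, which is what the "Cycle channels not to break the order of docs" comment refers to.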
@@ -462,3 +462,4 @@ cdef enum symbol_t:
     acl

     ENT_KB_ID
+    ENT_ID
@@ -86,6 +86,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,
spacy/tests/regression/test_issue4849.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+
+
+def test_issue4849():
+    nlp = English()
+
+    ruler = EntityRuler(
+        nlp, patterns=[
+            {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
+            {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
+        ],
+        phrase_matcher_attr="LOWER"
+    )
+
+    nlp.add_pipe(ruler)
+
+    text = """
+    The left is starting to take aim at Democratic front-runner Joe Biden.
+    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
+    """
+
+    # USING 1 PROCESS
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=1):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert(count_ents == 2)
+
+    # USING 2 PROCESSES
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=2):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert (count_ents == 2)
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 import pytest
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
 from spacy.vocab import Vocab
@@ -15,6 +15,10 @@ def doc_w_attrs(en_tokenizer):
     )
     doc = en_tokenizer("This is a test.")
     doc._._test_attr = "test"
+
+    Token.set_extension("_test_token", default="t0")
+    doc[1]._._test_token = "t1"
+
     return doc
@@ -25,3 +29,7 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
     assert doc._._test_attr == "test"
     assert doc._._test_prop == len(doc.text)
     assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
+
+    assert doc[0]._._test_token == "t0"
+    assert doc[1]._._test_token == "t1"
+    assert doc[2]._._test_token == "t0"
@@ -23,7 +23,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t

 from ..attrs import intify_attrs, IDS
@@ -69,6 +69,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
         return token.ent_iob
     elif feat_name == ENT_TYPE:
         return token.ent_type
+    elif feat_name == ENT_ID:
+        return token.ent_id
     elif feat_name == ENT_KB_ID:
         return token.ent_kb_id
     else:
@@ -868,7 +870,7 @@ cdef class Doc:

         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
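array_head lists the per-token attributes that to_bytes() exports via Doc.to_array() and later restores with Doc.from_array(); an attribute not in this list simply does not survive serialization, hence the addition of ENT_ID. A rough sketch of that mechanism used directly (not part of the diff; assumes this patch is applied, since ENT_ID only becomes a registered attribute with it):

    from spacy.attrs import ENT_IOB, ENT_TYPE, ENT_ID
    from spacy.lang.en import English
    from spacy.pipeline import EntityRuler

    nlp = English()
    ruler = EntityRuler(nlp, patterns=[{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}])
    nlp.add_pipe(ruler)

    src = nlp("joe biden")
    cols = [ENT_IOB, ENT_TYPE, ENT_ID]
    arr = src.to_array(cols)            # one row per token; the ENT_ID column holds the hash of "joe-biden"

    dst = nlp.make_doc("joe biden")     # fresh Doc over the same vocab, with no entities yet
    dst.from_array(cols, arr)
    assert [ent.ent_id_ for ent in dst.ents] == ["joe-biden"]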
@@ -212,7 +212,7 @@ cdef class Span:
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID]
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
         if self.doc.is_tagged:
             array_head.append(TAG)
         # If doc parsed add head and dep attribute
@@ -53,6 +53,8 @@ cdef class Token:
             return token.ent_iob
         elif feat_name == ENT_TYPE:
             return token.ent_type
+        elif feat_name == ENT_ID:
+            return token.ent_id
         elif feat_name == ENT_KB_ID:
             return token.ent_kb_id
         elif feat_name == SENT_START:
@@ -81,6 +83,8 @@ cdef class Token:
             token.ent_iob = value
         elif feat_name == ENT_TYPE:
             token.ent_type = value
+        elif feat_name == ENT_ID:
+            token.ent_id = value
         elif feat_name == ENT_KB_ID:
             token.ent_kb_id = value
         elif feat_name == SENT_START: