mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 15:54:13 +03:00
serialize ENT_ID (#4852)
* expand serialization test for custom token attribute
* add failing test for issue 4849
* define ENT_ID as attr and use in doc serialization
* fix a few typos
This commit is contained in:
parent
53929138d7
commit
a1b22e90cd
|
@ -91,3 +91,4 @@ cdef enum attr_id_t:
|
|||
|
||||
LANG
|
||||
ENT_KB_ID = symbols.ENT_KB_ID
|
||||
ENT_ID = symbols.ENT_ID
|
||||
|
|
|
@ -84,6 +84,7 @@ IDS = {
|
|||
"DEP": DEP,
|
||||
"ENT_IOB": ENT_IOB,
|
||||
"ENT_TYPE": ENT_TYPE,
|
||||
"ENT_ID": ENT_ID,
|
||||
"ENT_KB_ID": ENT_KB_ID,
|
||||
"HEAD": HEAD,
|
||||
"SENT_START": SENT_START,
|
||||
|
|
|
@ -780,7 +780,7 @@ class Language(object):
|
|||
|
||||
pipes = (
|
||||
[]
|
||||
) # contains functools.partial objects so that easily create multiprocess worker.
|
||||
) # contains functools.partial objects to easily create multiprocess worker.
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
|
@ -837,7 +837,7 @@ class Language(object):
|
|||
texts, raw_texts = itertools.tee(texts)
|
||||
# for sending texts to worker
|
||||
texts_q = [mp.Queue() for _ in range(n_process)]
|
||||
# for receiving byte encoded docs from worker
|
||||
# for receiving byte-encoded docs from worker
|
||||
bytedocs_recv_ch, bytedocs_send_ch = zip(
|
||||
*[mp.Pipe(False) for _ in range(n_process)]
|
||||
)
|
||||
|
@ -847,7 +847,7 @@ class Language(object):
|
|||
# This is necessary to properly handle infinite length of texts.
|
||||
# (In this case, all data cannot be sent to the workers at once)
|
||||
sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
|
||||
# send twice so that make process busy
|
||||
# send twice to make process busy
|
||||
sender.send()
|
||||
sender.send()
|
||||
|
||||
|
@ -859,7 +859,7 @@ class Language(object):
|
|||
proc.start()
|
||||
|
||||
# Cycle channels not to break the order of docs.
|
||||
# The received object is batch of byte encoded docs, so flatten them with chain.from_iterable.
|
||||
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
|
||||
byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
|
||||
docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
|
||||
try:
|
||||
|
|
|
@ -462,3 +462,4 @@ cdef enum symbol_t:
|
|||
acl
|
||||
|
||||
ENT_KB_ID
|
||||
ENT_ID
|
||||
|
|
|
@ -86,6 +86,7 @@ IDS = {
|
|||
"DEP": DEP,
|
||||
"ENT_IOB": ENT_IOB,
|
||||
"ENT_TYPE": ENT_TYPE,
|
||||
"ENT_ID": ENT_ID,
|
||||
"ENT_KB_ID": ENT_KB_ID,
|
||||
"HEAD": HEAD,
|
||||
"SENT_START": SENT_START,
|
||||
|
|
36
spacy/tests/regression/test_issue4849.py
Normal file
36
spacy/tests/regression/test_issue4849.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
||||
|
||||
def test_issue4849():
    """Regression test for issue 4849: entity IDs must be kept by nlp.pipe.

    An EntityRuler with two PERSON patterns (each carrying an "id") is added
    to a blank English pipeline. The same text is then run through
    ``nlp.pipe`` with one process and with two processes, and in both cases
    exactly two entities with a non-zero ``ent_id`` are expected.
    """
    nlp = English()

    person_patterns = [
        {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
        {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
    ]
    ruler = EntityRuler(nlp, patterns=person_patterns, phrase_matcher_attr="LOWER")
    nlp.add_pipe(ruler)

    text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""

    # Single process: count the entities that carry an entity ID.
    ents_with_id = 0
    for doc in nlp.pipe([text], n_process=1):
        ents_with_id += sum(1 for ent in doc.ents if ent.ent_id > 0)
    assert ents_with_id == 2

    # Two processes: the count must be the same as in the single-process run.
    ents_with_id = 0
    for doc in nlp.pipe([text], n_process=2):
        ents_with_id += sum(1 for ent in doc.ents if ent.ent_id > 0)
    assert ents_with_id == 2
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
from spacy.tokens import Doc, Token
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
|
@ -15,6 +15,10 @@ def doc_w_attrs(en_tokenizer):
|
|||
)
|
||||
doc = en_tokenizer("This is a test.")
|
||||
doc._._test_attr = "test"
|
||||
|
||||
Token.set_extension("_test_token", default="t0")
|
||||
doc[1]._._test_token = "t1"
|
||||
|
||||
return doc
|
||||
|
||||
|
||||
|
@ -25,3 +29,7 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
|
|||
assert doc._._test_attr == "test"
|
||||
assert doc._._test_prop == len(doc.text)
|
||||
assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
|
||||
|
||||
assert doc[0]._._test_token == "t0"
|
||||
assert doc[1]._._test_token == "t1"
|
||||
assert doc[2]._._test_token == "t0"
|
||||
|
|
|
@ -23,7 +23,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
|||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
|
||||
from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t
|
||||
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, attr_id_t
|
||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||
|
||||
from ..attrs import intify_attrs, IDS
|
||||
|
@ -69,6 +69,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
|||
return token.ent_iob
|
||||
elif feat_name == ENT_TYPE:
|
||||
return token.ent_type
|
||||
elif feat_name == ENT_ID:
|
||||
return token.ent_id
|
||||
elif feat_name == ENT_KB_ID:
|
||||
return token.ent_kb_id
|
||||
else:
|
||||
|
@ -868,7 +870,7 @@ cdef class Doc:
|
|||
|
||||
DOCS: https://spacy.io/api/doc#to_bytes
|
||||
"""
|
||||
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] # TODO: ENT_KB_ID ?
|
||||
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID] # TODO: ENT_KB_ID ?
|
||||
if self.is_tagged:
|
||||
array_head.extend([TAG, POS])
|
||||
# If doc parsed add head and dep attribute
|
||||
|
|
|
@ -212,7 +212,7 @@ cdef class Span:
|
|||
words = [t.text for t in self]
|
||||
spaces = [bool(t.whitespace_) for t in self]
|
||||
cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
|
||||
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID]
|
||||
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
|
||||
if self.doc.is_tagged:
|
||||
array_head.append(TAG)
|
||||
# If doc parsed add head and dep attribute
|
||||
|
|
|
@ -53,6 +53,8 @@ cdef class Token:
|
|||
return token.ent_iob
|
||||
elif feat_name == ENT_TYPE:
|
||||
return token.ent_type
|
||||
elif feat_name == ENT_ID:
|
||||
return token.ent_id
|
||||
elif feat_name == ENT_KB_ID:
|
||||
return token.ent_kb_id
|
||||
elif feat_name == SENT_START:
|
||||
|
@ -81,6 +83,8 @@ cdef class Token:
|
|||
token.ent_iob = value
|
||||
elif feat_name == ENT_TYPE:
|
||||
token.ent_type = value
|
||||
elif feat_name == ENT_ID:
|
||||
token.ent_id = value
|
||||
elif feat_name == ENT_KB_ID:
|
||||
token.ent_kb_id = value
|
||||
elif feat_name == SENT_START:
|
||||
|
|
Loading…
Reference in New Issue
Block a user