serialize ENT_ID (#4852)

* expand serialization test for custom token attribute

* add failing test for issue 4849

* define ENT_ID as attr and use in doc serialization

* fix a few typos
Authored by Sofie Van Landeghem on 2020-01-06 14:57:34 +01:00; committed by Matthew Honnibal
parent 53929138d7
commit a1b22e90cd
10 changed files with 62 additions and 8 deletions
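
The change closes the gap reported in issue #4849: entity IDs assigned by the EntityRuler were lost whenever a Doc was round-tripped through its byte representation, which is exactly what Language.pipe does with n_process > 1. A minimal sketch of the round trip this commit fixes (spaCy v2 API; the pattern and ID strings are illustrative):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc

nlp = English()
patterns = [{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}]
nlp.add_pipe(EntityRuler(nlp, patterns=patterns))

doc = nlp("joe biden")
restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
# Before this commit the entity ID was dropped at this point.
assert restored.ents[0].ent_id_ == "joe-biden"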

spacy/attrs.pxd

@@ -91,3 +91,4 @@ cdef enum attr_id_t:
     LANG
     ENT_KB_ID = symbols.ENT_KB_ID
+    ENT_ID = symbols.ENT_ID

spacy/attrs.pyx

@@ -84,6 +84,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,

spacy/language.py

@@ -780,7 +780,7 @@ class Language(object):
         pipes = (
             []
-        )  # contains functools.partial objects so that easily create multiprocess worker.
+        )  # contains functools.partial objects to easily create multiprocess worker.
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -837,7 +837,7 @@ class Language(object):
         texts, raw_texts = itertools.tee(texts)
         # for sending texts to worker
         texts_q = [mp.Queue() for _ in range(n_process)]
-        # for receiving byte encoded docs from worker
+        # for receiving byte-encoded docs from worker
         bytedocs_recv_ch, bytedocs_send_ch = zip(
             *[mp.Pipe(False) for _ in range(n_process)]
         )
@@ -847,7 +847,7 @@ class Language(object):
         # This is necessary to properly handle infinite length of texts.
         # (In this case, all data cannot be sent to the workers at once)
         sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
-        # send twice so that make process busy
+        # send twice to make process busy
         sender.send()
         sender.send()
@@ -859,7 +859,7 @@ class Language(object):
             proc.start()
         # Cycle channels not to break the order of docs.
-        # The received object is batch of byte encoded docs, so flatten them with chain.from_iterable.
+        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
         byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
         docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
         try:
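
The comments above describe why serialization matters here: with n_process > 1, each worker returns byte-encoded docs and the parent process rebuilds them, so any attribute missing from Doc.to_bytes is silently dropped. A condensed sketch of that path (the texts are illustrative):

from spacy.lang.en import English

nlp = English()
texts = ["first text", "second text"]
# With n_process=2 each doc is built in a worker process, sent back
# byte-encoded, and rebuilt here with Doc(vocab).from_bytes.
for doc in nlp.pipe(texts, n_process=2):
    print([token.text for token in doc])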

spacy/symbols.pxd

@@ -462,3 +462,4 @@ cdef enum symbol_t:
     acl
     ENT_KB_ID
+    ENT_ID

spacy/symbols.pyx

@@ -86,6 +86,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,

spacy/tests/regression/test_issue4849.py

@@ -0,0 +1,36 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+
+
+def test_issue4849():
+    nlp = English()
+
+    ruler = EntityRuler(
+        nlp, patterns=[
+            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
+            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
+        ],
+        phrase_matcher_attr="LOWER"
+    )
+
+    nlp.add_pipe(ruler)
+
+    text = """
+    The left is starting to take aim at Democratic front-runner Joe Biden.
+    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
+    """
+
+    # USING 1 PROCESS
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=1):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert count_ents == 2
+
+    # USING 2 PROCESSES
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=2):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert count_ents == 2

spacy/tests/serialize/test_serialize_extension_attrs.py

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
 from spacy.vocab import Vocab
@@ -15,6 +15,10 @@ def doc_w_attrs(en_tokenizer):
     )
     doc = en_tokenizer("This is a test.")
     doc._._test_attr = "test"
+    Token.set_extension("_test_token", default="t0")
+    doc[1]._._test_token = "t1"
     return doc
@@ -25,3 +29,7 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
     assert doc._._test_attr == "test"
     assert doc._._test_prop == len(doc.text)
     assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
+    assert doc[0]._._test_token == "t0"
+    assert doc[1]._._test_token == "t1"
+    assert doc[2]._._test_token == "t0"
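
Custom extension attributes take a different path from ENT_ID: token underscore values live in doc.user_data, which Doc.to_bytes already serializes, so the new assertions confirm existing behavior for token-level extensions rather than exercising the new attribute column. A brief sketch under that assumption ("note" is a hypothetical extension name):

from spacy.lang.en import English
from spacy.tokens import Doc, Token

nlp = English()
Token.set_extension("note", default=None)  # hypothetical extension
doc = nlp("This is a test.")
doc[1]._.note = "flagged"
restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
assert restored[1]._.note == "flagged"  # explicitly set value survives
assert restored[0]._.note is None       # other tokens keep the default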

spacy/tokens/doc.pyx

@@ -23,7 +23,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..attrs import intify_attrs, IDS
@@ -69,6 +69,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
         return token.ent_iob
     elif feat_name == ENT_TYPE:
         return token.ent_type
+    elif feat_name == ENT_ID:
+        return token.ent_id
     elif feat_name == ENT_KB_ID:
         return token.ent_kb_id
     else:
@@ -868,7 +870,7 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
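
array_head lists the per-token columns that to_bytes packs into a single attribute array; appending ENT_ID adds one column, which from_bytes then writes back onto the tokens. The same column mechanism is reachable through the public to_array/from_array API, as in this sketch:

from spacy.attrs import ENT_TYPE, ENT_ID
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("joe biden")
cols = [ENT_TYPE, ENT_ID]   # a subset of the array_head columns above
arr = doc.to_array(cols)
doc2 = Doc(nlp.vocab, words=[t.text for t in doc])
doc2.from_array(cols, arr)  # writes each column back onto the tokens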

spacy/tokens/span.pyx

@@ -212,7 +212,7 @@ cdef class Span:
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID]
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
         if self.doc.is_tagged:
             array_head.append(TAG)
         # If doc parsed add head and dep attribute
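
This hunk sits inside Span.as_doc, which copies a span into a standalone Doc through the same array mechanism, so entity IDs should now carry over there as well. A sketch under that assumption (illustrative pattern):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
patterns = [{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}]
nlp.add_pipe(EntityRuler(nlp, patterns=patterns))

doc = nlp("joe biden spoke")
standalone = doc[0:2].as_doc()  # copies ENT_ID along with the other columns
assert standalone.ents[0].ent_id_ == "joe-biden"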

spacy/tokens/token.pxd

@@ -53,6 +53,8 @@ cdef class Token:
             return token.ent_iob
         elif feat_name == ENT_TYPE:
             return token.ent_type
+        elif feat_name == ENT_ID:
+            return token.ent_id
         elif feat_name == ENT_KB_ID:
             return token.ent_kb_id
         elif feat_name == SENT_START:
@@ -81,6 +83,8 @@ cdef class Token:
             token.ent_iob = value
         elif feat_name == ENT_TYPE:
             token.ent_type = value
+        elif feat_name == ENT_ID:
+            token.ent_id = value
         elif feat_name == ENT_KB_ID:
             token.ent_kb_id = value
         elif feat_name == SENT_START:
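
These get_struct_attr/set_struct_attr branches are the low-level accessors behind to_array and from_array. At the Python level the same value is exposed as ent_id (the integer hash) and ent_id_ (the string), which is what the regression test's `ent.ent_id > 0` check relies on. A sketch of the two views (illustrative pattern):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
patterns = [{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}]
nlp.add_pipe(EntityRuler(nlp, patterns=patterns))

ent = nlp("joe biden").ents[0]
assert ent.ent_id == nlp.vocab.strings["joe-biden"]  # integer hash, as checked in the test
assert ent.ent_id_ == "joe-biden"                    # string view of the same value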