mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
serialize ENT_ID (#4852)

* expand serialization test for custom token attributes
* add a failing test for issue #4849
* define ENT_ID as an attr and use it in doc serialization
* fix a few typos

parent 53929138d7
commit a1b22e90cd
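The gist of the fix: entity IDs assigned by the EntityRuler live in each token's ent_id field, but that field was not part of Doc serialization, so the IDs were silently dropped on every to_bytes()/from_bytes() round trip, including the one multiprocess nlp.pipe() performs between workers. A minimal sketch of the fixed behaviour (spaCy v2.2.x with this patch applied; the pattern and ID values are illustrative):

    from spacy.lang.en import English
    from spacy.pipeline import EntityRuler
    from spacy.tokens import Doc

    nlp = English()
    ruler = EntityRuler(
        nlp,
        patterns=[{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}],
        phrase_matcher_attr="LOWER",  # match case-insensitively
    )
    nlp.add_pipe(ruler)

    doc = nlp("Joe Biden spoke today.")
    # Round-trip through bytes, as multiprocess pipe() does internally.
    restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
    assert [ent.ent_id_ for ent in restored.ents] == ["joe-biden"]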
spacy/attrs.pxd
@@ -91,3 +91,4 @@ cdef enum attr_id_t:
 
     LANG
     ENT_KB_ID = symbols.ENT_KB_ID
+    ENT_ID = symbols.ENT_ID
spacy/attrs.pyx
@@ -84,6 +84,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,
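Registering "ENT_ID" in the IDS mapping is what lets the attribute be referenced by name, e.g. in Doc.to_array()/from_array(), which the serialization code below relies on. A small sketch of the effect (spaCy v2.2.x with this patch; the "greeting" ID is a made-up example):

    from spacy.attrs import ENT_ID
    from spacy.lang.en import English

    nlp = English()
    doc = nlp("hello world")
    doc[0].ent_id_ = "greeting"      # interned in the shared StringStore
    arr = doc.to_array(["ENT_ID"])   # the string name resolves through IDS
    doc2 = nlp("hello world")
    doc2.from_array([ENT_ID], arr)   # integer attr IDs are accepted directly
    assert doc2[0].ent_id_ == "greeting"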
spacy/language.py
@@ -780,7 +780,7 @@ class Language(object):
 
         pipes = (
             []
-        )  # contains functools.partial objects so that easily create multiprocess worker.
+        )  # contains functools.partial objects to easily create multiprocess worker.
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -837,7 +837,7 @@ class Language(object):
         texts, raw_texts = itertools.tee(texts)
         # for sending texts to worker
         texts_q = [mp.Queue() for _ in range(n_process)]
-        # for receiving byte encoded docs from worker
+        # for receiving byte-encoded docs from worker
         bytedocs_recv_ch, bytedocs_send_ch = zip(
             *[mp.Pipe(False) for _ in range(n_process)]
         )
@@ -847,7 +847,7 @@ class Language(object):
         # This is necessary to properly handle infinite length of texts.
         # (In this case, all data cannot be sent to the workers at once)
         sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
-        # send twice so that make process busy
+        # send twice to make process busy
         sender.send()
         sender.send()
 
@@ -859,7 +859,7 @@ class Language(object):
             proc.start()
 
         # Cycle channels not to break the order of docs.
-        # The received object is batch of byte encoded docs, so flatten them with chain.from_iterable.
+        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
         byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
         docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
         try:
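For context on the comment fixes above: multiprocess nlp.pipe() forks worker processes, sends them batches of texts over queues, and receives the processed docs back as bytes, so any token attribute missing from Doc.to_bytes() never reaches the parent process. A minimal usage sketch (spaCy v2.2.x):

    from spacy.lang.en import English

    nlp = English()
    texts = ["first text", "second text", "third text"]
    # Each worker encodes its docs to bytes; the parent decodes them in order.
    for doc in nlp.pipe(texts, n_process=2):
        print(doc.text)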
spacy/symbols.pxd
@@ -462,3 +462,4 @@ cdef enum symbol_t:
     acl
 
     ENT_KB_ID
+    ENT_ID
spacy/symbols.pyx
@@ -86,6 +86,7 @@ IDS = {
     "DEP": DEP,
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
+    "ENT_ID": ENT_ID,
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,
spacy/tests/regression/test_issue4849.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+
+
+def test_issue4849():
+    nlp = English()
+
+    ruler = EntityRuler(
+        nlp, patterns=[
+            {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
+            {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
+        ],
+        phrase_matcher_attr="LOWER"
+    )
+
+    nlp.add_pipe(ruler)
+
+    text = """
+    The left is starting to take aim at Democratic front-runner Joe Biden.
+    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
+    """
+
+    # USING 1 PROCESS
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=1):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert count_ents == 2
+
+    # USING 2 PROCESSES
+    count_ents = 0
+    for doc in nlp.pipe([text], n_process=2):
+        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+    assert count_ents == 2
spacy/tests/serialize/test_serialize_extension_attrs.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 import pytest
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
 from spacy.vocab import Vocab
 
 
@@ -15,6 +15,10 @@ def doc_w_attrs(en_tokenizer):
     )
     doc = en_tokenizer("This is a test.")
     doc._._test_attr = "test"
+
+    Token.set_extension("_test_token", default="t0")
+    doc[1]._._test_token = "t1"
+
     return doc
 
 
@@ -25,3 +29,7 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
     assert doc._._test_attr == "test"
     assert doc._._test_prop == len(doc.text)
     assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
+
+    assert doc[0]._._test_token == "t0"
+    assert doc[1]._._test_token == "t1"
+    assert doc[2]._._test_token == "t0"
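Why the extension-attribute test passes without further serialization changes: values assigned through a Token.set_extension() default are stored as per-token overrides in doc.user_data, which Doc.to_bytes() already pickles; tokens left at the default need no stored entry. A standalone sketch (spaCy v2.2.x; the "marker" extension name is made up for illustration):

    from spacy.lang.en import English
    from spacy.tokens import Doc, Token

    Token.set_extension("marker", default="t0")
    nlp = English()
    doc = nlp("This is a test.")
    doc[1]._.marker = "t1"                  # override stored in doc.user_data
    doc2 = Doc(nlp.vocab).from_bytes(doc.to_bytes())
    assert [t._.marker for t in doc2] == ["t0", "t1", "t0", "t0", "t0"]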
spacy/tokens/doc.pyx
@@ -23,7 +23,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 
 from ..attrs import intify_attrs, IDS
@@ -69,6 +69,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
         return token.ent_iob
     elif feat_name == ENT_TYPE:
         return token.ent_type
+    elif feat_name == ENT_ID:
+        return token.ent_id
     elif feat_name == ENT_KB_ID:
         return token.ent_kb_id
     else:
@@ -868,7 +870,7 @@ cdef class Doc:
 
         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
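Effect of extending array_head: Doc.to_bytes() serializes exactly the columns listed there via to_array(), so with ENT_ID included the per-token entity ID now survives the byte round trip. A quick sketch (spaCy v2.2.x with this patch; the ID is set manually here):

    from spacy.lang.en import English
    from spacy.tokens import Doc

    nlp = English()
    doc = nlp("Joe Biden")
    for token in doc:
        token.ent_id_ = "joe-biden"   # example ID
    restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
    assert all(t.ent_id_ == "joe-biden" for t in restored)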
spacy/tokens/span.pyx
@@ -212,7 +212,7 @@ cdef class Span:
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID]
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
         if self.doc.is_tagged:
             array_head.append(TAG)
         # If doc parsed add head and dep attribute
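This hunk sits in Span.as_doc(), which copies the span's tokens into a fresh Doc using the same column-list approach, so ENT_ID (and ENT_KB_ID, already listed here) are carried over as well. A sketch (spaCy v2.2.x with this patch; IDs set manually for illustration):

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("I saw Joe Biden today.")
    for token in doc[2:4]:            # "Joe Biden"
        token.ent_id_ = "joe-biden"
    span_doc = doc[1:5].as_doc()      # ENT_ID is one of the copied columns
    assert [t.ent_id_ for t in span_doc if t.ent_id] == ["joe-biden", "joe-biden"]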
spacy/tokens/token.pxd
@@ -53,6 +53,8 @@ cdef class Token:
             return token.ent_iob
         elif feat_name == ENT_TYPE:
             return token.ent_type
+        elif feat_name == ENT_ID:
+            return token.ent_id
         elif feat_name == ENT_KB_ID:
             return token.ent_kb_id
         elif feat_name == SENT_START:
@@ -81,6 +83,8 @@ cdef class Token:
             token.ent_iob = value
         elif feat_name == ENT_TYPE:
             token.ent_type = value
+        elif feat_name == ENT_ID:
+            token.ent_id = value
         elif feat_name == ENT_KB_ID:
             token.ent_kb_id = value
         elif feat_name == SENT_START: