mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Add logger warning when serializing user hooks (#6595)
Add a warning that user hooks are lost on serialization. Add a `user_hooks` exclude to skip the warning with pickle.
This commit is contained in:
		
							parent
							
								
									cabd4ae5b1
								
							
						
					
					
						commit
						5ca57d8221
					
				| 
						 | 
				
			
			@ -123,6 +123,8 @@ class Warnings:
 | 
			
		|||
            "token '{text}'. Check that your pipeline includes components that "
 | 
			
		||||
            "assign token.pos, typically 'tagger'+'attribute_ruler' or "
 | 
			
		||||
            "'morphologizer'.")
 | 
			
		||||
    W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
 | 
			
		||||
            "required user hooks to the doc after processing.")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@add_codes
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,5 +1,7 @@
 | 
			
		|||
import pytest
 | 
			
		||||
import numpy
 | 
			
		||||
import logging
 | 
			
		||||
import mock
 | 
			
		||||
from spacy.tokens import Doc, Span
 | 
			
		||||
from spacy.vocab import Vocab
 | 
			
		||||
from spacy.lexeme import Lexeme
 | 
			
		||||
| 
						 | 
				
			
			@ -147,6 +149,17 @@ def test_doc_api_serialize(en_tokenizer, text):
 | 
			
		|||
    assert [t.text for t in tokens] == [t.text for t in new_tokens]
 | 
			
		||||
    assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
 | 
			
		||||
 | 
			
		||||
    def inner_func(d1, d2):
 | 
			
		||||
        return "hello!"
 | 
			
		||||
 | 
			
		||||
    logger = logging.getLogger("spacy")
 | 
			
		||||
    with mock.patch.object(logger, "warning") as mock_warning:
 | 
			
		||||
        _ = tokens.to_bytes()
 | 
			
		||||
        mock_warning.assert_not_called()
 | 
			
		||||
        tokens.user_hooks["similarity"] = inner_func
 | 
			
		||||
        _ = tokens.to_bytes()
 | 
			
		||||
        mock_warning.assert_called_once()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_doc_api_set_ents(en_tokenizer):
 | 
			
		||||
    text = "I use goggle chrone to surf the web"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1273,6 +1273,8 @@ cdef class Doc:
 | 
			
		|||
                serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
 | 
			
		||||
            if "user_data_values" not in exclude:
 | 
			
		||||
                serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
 | 
			
		||||
        if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)):
 | 
			
		||||
            util.logger.warning(Warnings.W109)
 | 
			
		||||
        return util.to_dict(serializers, exclude)
 | 
			
		||||
 | 
			
		||||
    def from_dict(self, msg, *, exclude=tuple()):
 | 
			
		||||
| 
						 | 
				
			
			@ -1649,7 +1651,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def pickle_doc(doc):
 | 
			
		||||
    bytes_data = doc.to_bytes(exclude=["vocab", "user_data"])
 | 
			
		||||
    bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
 | 
			
		||||
    hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
 | 
			
		||||
                      doc.user_token_hooks)
 | 
			
		||||
    return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user