Fix PhraseMatcher pickling and length (resolves #3248) (#3252)

2025-07-01 02:13:07 +03:00 · 2019-02-12 18:27:54 +01:00 · 2019-02-12 18:27:54 +01:00 · b589b945db
commit b589b945db
parent 483dddc9bc
2 changed files with 42 additions and 2 deletions
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -33,6 +33,7 @@ cdef class PhraseMatcher:
    cdef attr_id_t attr
    cdef public object _callbacks
    cdef public object _patterns
    cdef public object _docs
    cdef public object _validate
    def __init__(self, Vocab vocab, max_length=0, attr='ORTH', validate=False):
@ -55,6 +56,7 @@ cdef class PhraseMatcher:
        ]
        self.matcher.add('Candidate', None, *abstract_patterns)
        self._callbacks = {}
        self._docs = {}
        self._validate = validate
    def __len__(self):
@ -64,7 +66,7 @@ cdef class PhraseMatcher:
        RETURNS (int): The number of rules.
        """
-        return len(self.phrase_ids)
+        return len(self._docs)
    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.
@ -76,7 +78,8 @@ cdef class PhraseMatcher:
        return ent_id in self._callbacks
    def __reduce__(self):
-        return (self.__class__, (self.vocab,), None, None)
+        data = (self.vocab, self._docs, self._callbacks)
        return (unpickle_matcher, data, None, None)
    def add(self, key, on_match, *docs):
        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
@ -89,6 +92,7 @@ cdef class PhraseMatcher:
        cdef Doc doc
        cdef hash_t ent_id = self.matcher._normalize_key(key)
        self._callbacks[ent_id] = on_match
        self._docs[ent_id] = docs
        cdef int length
        cdef int i
        cdef hash_t phrase_hash
@ -213,3 +217,11 @@ def get_bilou(length):
        return [B3_ENT, I3_ENT, L3_ENT]
    else:
        return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]
 def unpickle_matcher(vocab, docs, callbacks):
    matcher = PhraseMatcher(vocab)
    for key, specs in docs.items():
        callback = callbacks.get(key, None)
        matcher.add(key, callback, *specs)
    return matcher
--- a/spacy/tests/regression/test_issue3248.py
+++ b/spacy/tests/regression/test_issue3248.py
@ -0,0 +1,28 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import pytest
 from spacy.matcher import PhraseMatcher
 from spacy.lang.en import English
 from spacy.compat import pickle
 def test_issue3248_1():
    """Test that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
    matcher.add("TEST2", None, nlp("d"))
    assert len(matcher) == 2
 def test_issue3248_2():
    """Test that the PhraseMatcher can be pickled correctly."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
    matcher.add("TEST2", None, nlp("d"))
    data = pickle.dumps(matcher)
    new_matcher = pickle.loads(data)
    assert len(new_matcher) == len(matcher)