💫 Update matcher engine for regex and extensions (#3173)
* Update matcher engine for regex and extensions. Add support for matching
  over arbitrary Python predicate functions, and arbitrary Python attribute
  getters. This allows matching over regex patterns and supports extension
  attributes. The results of the Python predicate functions are cached, so
  that we don't call the same predicate function twice for the same token.
  The extension attributes are fetched into an array for each token in the
  doc, which should minimise the performance impact of the new features. We
  still need to wire up these features to the patterns, and test it all.
* Work on wiring up extra attributes in matcher
* Work on tests for extra matcher attrs
* Add support for extension attrs to matcher
* Test extension attribute matching
* Work on implementing predicate-based match patterns
* Get predicates working for set membership
* Add test for set membership
* Make extensions+predicates work
* Test matcher extensions
* Cache predicate results better in Matcher
* Remove print statement in matcher test
* Use srsly to get key for predicates
This commit is contained in:
parent f407954b27 · commit 77ddcf7381
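As a quick orientation before the diff: after this change, a token pattern value may be either a plain value or a dict of operators ('REGEX', 'IN'/'NOT_IN', and comparisons such as '>='), and the '_' key matches custom extension attributes. A minimal sketch of the new API, assembled from the tests added in this commit (the is_fruit extension is illustrative, taken from those tests):

    from spacy.matcher import Matcher
    from spacy.tokens import Doc, Token
    from spacy.vocab import Vocab

    vocab = Vocab()
    # Extension attribute, as in test_matcher_extension_attribute below.
    Token.set_extension('is_fruit',
        getter=lambda token: token.text in ('apple', 'banana'), force=True)

    matcher = Matcher(vocab)
    # 'IN' tests set membership; '_' matches extension attributes.
    matcher.add('AN_FRUIT', None,
                [{'ORTH': {'IN': ['an', 'a']}}, {'_': {'is_fruit': True}}])
    # 'REGEX' runs a regular expression over the attribute's string value.
    matcher.add('ARTICLE', None, [{'ORTH': {'REGEX': r'^(?:a|an)$'}}])
    # Comparison operators apply to numeric attributes such as LENGTH.
    matcher.add('LONG_WORD', None, [{'LENGTH': {'>=': 7}}])

    doc = Doc(vocab, words=['an', 'apple'])
    for match_id, start, end in matcher(doc):
        print(vocab.strings[match_id], doc[start:end].text)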
spacy/matcher.pyx
@@ -1,6 +1,8 @@
 # cython: infer_types=True
 # cython: profile=True
 from __future__ import unicode_literals
+import re
+import srsly
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, uint64_t, uint16_t
 from preshed.maps cimport PreshMap
@@ -11,9 +13,11 @@ from .structs cimport TokenC
 from .lexeme cimport attr_id_t
 from .vocab cimport Vocab
 from .tokens.doc cimport Doc
+from .tokens.token cimport Token
 from .tokens.doc cimport get_token_attr
 from .attrs cimport ID, attr_id_t, NULL_ATTR, ORTH
 from .errors import Errors, TempErrors, Warnings, deprecation_warning
+from .strings import get_string_id

 from .attrs import IDS
 from .attrs import FLAG61 as U_ENT
@@ -56,9 +60,17 @@ cdef struct AttrValueC:
     attr_id_t attr
     attr_t value

+cdef struct IndexValueC:
+    int32_t index
+    attr_t value
+
 cdef struct TokenPatternC:
     AttrValueC* attrs
+    int32_t* py_predicates
+    IndexValueC* extra_attrs
     int32_t nr_attr
+    int32_t nr_extra_attr
+    int32_t nr_py
     quantifier_t quantifier
     hash_t key

@@ -75,19 +87,46 @@ cdef struct MatchC:
     int32_t length


-cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
+cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
+                  predicates=tuple()):
+    '''Find matches in a doc, with a compiled array of patterns. Matches are
+    returned as a list of (id, start, end) tuples.
+
+    To augment the compiled patterns, we optionally also take two Python lists.
+
+    The "predicates" list contains functions that take a Python list and return a
+    boolean value. It's mostly used for regular expressions.
+
+    The "extra_getters" list contains functions that take a Python list and return
+    an attr ID. It's mostly used for extension attributes.
+    '''
     cdef vector[PatternStateC] states
     cdef vector[MatchC] matches
     cdef PatternStateC state
+    cdef int i, j, nr_extra_attr
     cdef Pool mem = Pool()
-    # TODO: Prefill this with the extra attribute values.
-    extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
+    predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
+    if extensions is not None and len(extensions) >= 1:
+        nr_extra_attr = max(extensions.values())
+        extra_attr_values = <attr_t*>mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t))
+    else:
+        nr_extra_attr = 0
+        extra_attr_values = <attr_t*>mem.alloc(doc.length, sizeof(attr_t))
+    for i, token in enumerate(doc):
+        for name, index in extensions.items():
+            value = token._.get(name)
+            if isinstance(value, basestring):
+                value = token.vocab.strings[value]
+            extra_attr_values[i * nr_extra_attr + index] = value
    # Main loop
-    cdef int i, j
+    cdef int nr_predicate = len(predicates)
     for i in range(doc.length):
         for j in range(n):
             states.push_back(PatternStateC(patterns[j], i, 0))
-        transition_states(states, matches, &doc.c[i], extra_attrs[i])
+        transition_states(states, matches, predicate_cache,
+                          doc[i], extra_attr_values, predicates)
+        predicate_cache += nr_predicate
+        extra_attr_values += nr_extra_attr
     # Handle matches that end in 0-width patterns
     finish_states(matches, states)
     output = []
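For readers tracing the prefetch above: the extension values are flattened into a single C array of doc.length * nr_extra_attr slots, laid out row-major per token, and the pointer is advanced by nr_extra_attr after each token. A pure-Python sketch of the same indexing scheme (all names and values here are illustrative, not part of the commit; the sketch allocates max(index) + 1 columns so the highest column index is always in range):

    # Pure-Python sketch of the row-major prefetch layout used by find_matches.
    words = ['an', 'apple']
    extensions = {'is_fruit': 0, 'colour': 1}   # extension name -> column index
    nr_extra_attr = max(extensions.values()) + 1

    extra_attr_values = [0] * (len(words) * nr_extra_attr)
    for i, word in enumerate(words):
        for name, index in extensions.items():
            # Stand-in for token._.get(name), hashed into an attr_t:
            value = 1 if (name == 'is_fruit' and word == 'apple') else 0
            extra_attr_values[i * nr_extra_attr + index] = value

    # Token i's value for extension column j lives at i * nr_extra_attr + j:
    assert extra_attr_values[1 * nr_extra_attr + 0] == 1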
@@ -119,11 +158,17 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:


 cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
-                            const TokenC* token, const attr_t* extra_attrs) except *:
+                            char* cached_py_predicates,
+                            Token token, const attr_t* extra_attrs, py_predicates) except *:
     cdef int q = 0
     cdef vector[PatternStateC] new_states
+    cdef int nr_predicate = len(py_predicates)
     for i in range(states.size()):
-        action = get_action(states[i], token, extra_attrs)
+        if states[i].pattern.nr_py != 0:
+            update_predicate_cache(cached_py_predicates,
+                                   states[i].pattern, token, py_predicates)
+        action = get_action(states[i], token.c, extra_attrs,
+                            cached_py_predicates, nr_predicate)
         if action == REJECT:
             continue
         state = states[i]
@@ -140,7 +185,11 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
                     PatternStateC(pattern=state.pattern+1, start=state.start,
                                   length=state.length+1))
             states[q].pattern += 1
-            action = get_action(states[q], token, extra_attrs)
+            if states[q].pattern.nr_py != 0:
+                update_predicate_cache(cached_py_predicates,
+                                       states[q].pattern, token, py_predicates)
+            action = get_action(states[q], token.c, extra_attrs,
+                                cached_py_predicates, nr_predicate)
         if action == REJECT:
             pass
         elif action == ADVANCE:
@@ -168,6 +217,26 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
         states.push_back(new_states[i])


+cdef void update_predicate_cache(char* cache,
+        const TokenPatternC* pattern, Token token, predicates):
+    # If the state references any extra predicates, check whether they match.
+    # These are cached, so that we don't call these potentially expensive
+    # Python functions more than we need to.
+    for i in range(pattern.nr_py):
+        index = pattern.py_predicates[i]
+        if cache[index] == 0:
+            predicate = predicates[index]
+            result = predicate(token)
+            if result is True:
+                cache[index] = 1
+            elif result is False:
+                cache[index] = -1
+            elif result is None:
+                pass
+            else:
+                raise ValueError("Unexpected value: %s" % result)
+
+
 cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
     '''Handle states that end in zero-width patterns.'''
     cdef PatternStateC state
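The cache above stores one signed byte per predicate slot for the current token: 0 means not yet evaluated, 1 means the predicate returned True, and -1 means it returned False, so each Python predicate runs at most once per token even when several patterns query it. A minimal pure-Python sketch of this memoisation (illustrative names; the None case is omitted for brevity):

    # Tri-state cache: 0 = not yet evaluated, 1 = returned True, -1 = returned False.
    calls = []

    def is_long(token_text):
        calls.append(token_text)          # count how often the predicate runs
        return len(token_text) >= 7

    predicates = [is_long]
    cache = [0] * len(predicates)         # one slot per predicate, for one token

    def check(cache, index, token_text):
        if cache[index] == 0:             # only call the Python function once
            cache[index] = 1 if predicates[index](token_text) else -1
        return cache[index] == 1

    # Two patterns querying the same predicate on the same token:
    assert check(cache, 0, 'aardvark') is True
    assert check(cache, 0, 'aardvark') is True
    assert len(calls) == 1                # second query was served from the cache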
@@ -184,7 +253,9 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states)
         state.pattern += 1


-cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
+cdef action_t get_action(PatternStateC state,
+        const TokenC* token, const attr_t* extra_attrs,
+        const char* predicate_matches, int nr_predicate) nogil:
     '''We need to consider:

     a) Does the token match the specification? [Yes, No]
@@ -244,7 +315,7 @@ cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
     Problem: If a quantifier is matching, we're adding a lot of open partials
     '''
     cdef char is_match
-    is_match = get_is_match(state, token, extra_attrs)
+    is_match = get_is_match(state, token, extra_attrs, predicate_matches, nr_predicate)
     quantifier = get_quantifier(state)
     is_final = get_is_final(state)
     if quantifier == ZERO:
@@ -293,13 +364,20 @@ cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
         return RETRY


-cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
+cdef char get_is_match(PatternStateC state,
+        const TokenC* token, const attr_t* extra_attrs,
+        const char* predicate_matches, int nr_predicate) nogil:
+    for i in range(nr_predicate):
+        if predicate_matches[i] == -1:
+            return 0
     spec = state.pattern
     for attr in spec.attrs[:spec.nr_attr]:
         if get_token_attr(token, attr.attr) != attr.value:
             return 0
-    else:
-        return 1
+    for i in range(spec.nr_extra_attr):
+        if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]:
+            return 0
+    return True


 cdef char get_is_final(PatternStateC state) nogil:
@@ -316,17 +394,25 @@ cdef char get_quantifier(PatternStateC state) nogil:
 DEF PADDING = 5


-cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
-                                 object token_specs) except NULL:
+cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
     pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
     cdef int i
-    for i, (quantifier, spec) in enumerate(token_specs):
+    for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs):
         pattern[i].quantifier = quantifier
         pattern[i].attrs = <AttrValueC*>mem.alloc(len(spec), sizeof(AttrValueC))
         pattern[i].nr_attr = len(spec)
         for j, (attr, value) in enumerate(spec):
             pattern[i].attrs[j].attr = attr
             pattern[i].attrs[j].value = value
+        pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
+        for j, (index, value) in enumerate(extensions):
+            pattern[i].extra_attrs[j].index = index
+            pattern[i].extra_attrs[j].value = value
+        pattern[i].nr_extra_attr = len(extensions)
+        pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
+        for j, index in enumerate(predicates):
+            pattern[i].py_predicates[j] = index
+        pattern[i].nr_py = len(predicates)
         pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
     i = len(token_specs)
     pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
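init_pattern consumes the output of _preprocess_pattern (below): one 4-tuple per token position, which it copies field by field into the C struct. An illustrative entry, with placeholder attr IDs and value hashes that are not taken from the commit:

    # Shape of one preprocessed token spec, as consumed by init_pattern:
    quantifier = 0                  # e.g. ONE
    attr_values = [(65, 1234)]      # (attr ID, value hash) pairs -> TokenPatternC.attrs
    extensions = [(0, 5678)]        # (extension column, value hash) -> TokenPatternC.extra_attrs
    predicates = [0, 2]             # indices into the shared predicate list -> py_predicates
    token_spec = (quantifier, attr_values, extensions, predicates)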
@@ -345,41 +431,216 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
         raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr))
     return id_attr.value


-def _convert_strings(token_specs, string_store):
-    # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
-    operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
-                 '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
+def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+    """This function interprets the pattern, converting the various bits of
+    syntactic sugar before we compile it into a struct with init_pattern.
+
+    We need to split the pattern up into three parts:
+    * Normal attribute/value pairs, which are stored on either the token or lexeme,
+      can be handled directly.
+    * Extension attributes are handled specially, as we need to prefetch the
+      values from Python for the doc before we begin matching.
+    * Extra predicates also call Python functions, so we have to create the
+      functions and store them. So we store these specially as well.
+    * Extension attributes that have extra predicates are stored within the
+      extra_predicates.
+    """
     tokens = []
-    op = ONE
+    seen_predicates = {}
     for spec in token_specs:
         if not spec:
             # Signifier for 'any token'
-            tokens.append((ONE, [(NULL_ATTR, 0)]))
+            tokens.append((ONE, [(NULL_ATTR, 0)], [], []))
             continue
-        token = []
-        ops = (ONE,)
-        for attr, value in spec.items():
-            if isinstance(attr, basestring) and attr.upper() == 'OP':
-                if value in operators:
-                    ops = operators[value]
-                else:
-                    keys = ', '.join(operators.keys())
-                    raise KeyError(Errors.E011.format(op=value, opts=keys))
-            if isinstance(attr, basestring):
-                if attr.upper() == 'TEXT':
-                    attr = 'ORTH'
-                attr = IDS.get(attr.upper())
-            if isinstance(value, basestring):
-                value = string_store.add(value)
-            if isinstance(value, bool):
-                value = int(value)
-            if attr is not None:
-                token.append((attr, value))
+        ops = _get_operators(spec)
+        attr_values = _get_attr_values(spec, string_store)
+        extensions = _get_extensions(spec, string_store, extensions_table)
+        predicates = _get_extra_predicates(spec, extra_predicates, seen_predicates)
         for op in ops:
-            tokens.append((op, token))
+            tokens.append((op, list(attr_values), list(extensions), list(predicates)))
     return tokens
+
+
+def _get_attr_values(spec, string_store):
+    attr_values = []
+    for attr, value in spec.items():
+        if isinstance(attr, basestring):
+            if attr == '_':
+                continue
+            elif attr.upper() == 'OP':
+                continue
+            if attr.upper() == 'TEXT':
+                attr = 'ORTH'
+            attr = IDS.get(attr.upper())
+        if isinstance(value, basestring):
+            value = string_store.add(value)
+        elif isinstance(value, bool):
+            value = int(value)
+        elif isinstance(value, dict):
+            continue
+        if attr is not None:
+            attr_values.append((attr, value))
+    return attr_values
+
+
+# These predicate helper classes are used to match the REGEX, IN, >= etc
+# extensions to the matcher introduced in #3173.
+class _RegexPredicate(object):
+    def __init__(self, i, attr, value, predicate, is_extension=False):
+        self.i = i
+        self.attr = attr
+        self.value = re.compile(value)
+        self.predicate = predicate
+        self.is_extension = is_extension
+        assert self.predicate == 'REGEX'
+
+    def __call__(self, Token token):
+        if self.is_extension:
+            value = token._.get(self.attr)
+        else:
+            value = token.vocab.strings[get_token_attr(token.c, self.attr)]
+        return bool(self.value.search(value))
+
+
+class _SetMemberPredicate(object):
+    def __init__(self, i, attr, value, predicate, is_extension=False):
+        self.i = i
+        self.attr = attr
+        self.value = set(get_string_id(v) for v in value)
+        self.predicate = predicate
+        self.is_extension = is_extension
+        assert self.predicate in ('IN', 'NOT_IN')
+
+    def __call__(self, Token token):
+        if self.is_extension:
+            value = get_string_id(token._.get(self.attr))
+        else:
+            value = get_token_attr(token.c, self.attr)
+        if self.predicate == 'IN':
+            return value in self.value
+        else:
+            return value not in self.value
+
+
+class _ComparisonPredicate(object):
+    def __init__(self, i, attr, value, predicate, is_extension=False):
+        self.i = i
+        self.attr = attr
+        self.value = value
+        self.predicate = predicate
+        self.is_extension = is_extension
+        assert self.predicate in ('==', '!=', '>=', '<=', '>', '<')
+
+    def __call__(self, Token token):
+        if self.is_extension:
+            value = token._.get(self.attr)
+        else:
+            value = get_token_attr(token.c, self.attr)
+        if self.predicate == '==':
+            return value == self.value
+        if self.predicate == '!=':
+            return value != self.value
+        elif self.predicate == '>=':
+            return value >= self.value
+        elif self.predicate == '<=':
+            return value <= self.value
+        elif self.predicate == '>':
+            return value > self.value
+        elif self.predicate == '<':
+            return value < self.value
+
+
+def _get_extra_predicates(spec, extra_predicates, seen_predicates):
+    predicate_types = {
+        'REGEX': _RegexPredicate,
+        'IN': _SetMemberPredicate,
+        'NOT_IN': _SetMemberPredicate,
+        '==': _ComparisonPredicate,
+        '>=': _ComparisonPredicate,
+        '<=': _ComparisonPredicate,
+        '>': _ComparisonPredicate,
+        '<': _ComparisonPredicate,
+    }
+    output = []
+    for attr, value in spec.items():
+        if isinstance(attr, basestring):
+            if attr == '_':
+                output.extend(
+                    _get_extension_extra_predicates(
+                        value, extra_predicates, predicate_types,
+                        seen_predicates))
+                continue
+            elif attr.upper() == 'OP':
+                continue
+            if attr.upper() == 'TEXT':
+                attr = 'ORTH'
+            attr = IDS.get(attr.upper())
+        if isinstance(value, dict):
+            for type_, cls in predicate_types.items():
+                if type_ in value:
+                    key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
+                    # Don't create redundant predicates.
+                    # This helps with efficiency, as we're caching the results.
+                    if key in seen_predicates:
+                        output.append(seen_predicates[key])
+                    else:
+                        predicate = cls(len(extra_predicates), attr, value[type_], type_)
+                        extra_predicates.append(predicate)
+                        output.append(predicate.i)
+                        seen_predicates[key] = predicate.i
+    return output
+
+
+def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
+                                    seen_predicates):
+    output = []
+    for attr, value in spec.items():
+        if isinstance(value, dict):
+            for type_, cls in predicate_types.items():
+                if type_ in value:
+                    key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
+                    if key in seen_predicates:
+                        output.append(seen_predicates[key])
+                    else:
+                        predicate = cls(len(extra_predicates), attr, value[type_], type_,
+                                        is_extension=True)
+                        extra_predicates.append(predicate)
+                        output.append(predicate.i)
+                        seen_predicates[key] = predicate.i
+    return output
+
+
+def _get_operators(spec):
+    # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
+    lookup = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
+              '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
+    # Fix casing
+    spec = {key.upper(): values for key, values in spec.items()
+            if isinstance(key, basestring)}
+    if 'OP' not in spec:
+        return (ONE,)
+    elif spec['OP'] in lookup:
+        return lookup[spec['OP']]
+    else:
+        keys = ', '.join(lookup.keys())
+        raise KeyError(Errors.E011.format(op=spec['OP'], opts=keys))
+
+
+def _get_extensions(spec, string_store, name2index):
+    attr_values = []
+    for name, value in spec.get('_', {}).items():
+        if isinstance(value, dict):
+            # Handle predicates (e.g. "IN") in the extra_predicates, not here.
+            continue
+        if isinstance(value, basestring):
+            value = string_store.add(value)
+        if name not in name2index:
+            name2index[name] = len(name2index)
+        attr_values.append((name2index[name], value))
+    return attr_values


 cdef class Matcher:
     """Match sequences of tokens, based on pattern rules."""
     cdef Pool mem
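A note on the deduplication in _get_extra_predicates above: each predicate is keyed by (attr, operator, canonical JSON of its argument), so two patterns applying the same test share one predicate index, and hence one slot in the per-token result cache. A standalone sketch of the key construction (the attr ID 65 is an illustrative placeholder):

    import srsly

    seen_predicates = {}
    attr, type_ = 65, 'IN'               # illustrative attr ID
    spec_value = {'IN': ['an', 'a']}
    key = (attr, type_, srsly.json_dumps(spec_value[type_], sort_keys=True))
    if key not in seen_predicates:
        seen_predicates[key] = len(seen_predicates)  # index of the shared predicate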
@@ -388,6 +649,8 @@ cdef class Matcher:
     cdef public object _patterns
     cdef public object _entities
     cdef public object _callbacks
+    cdef public object _extensions
+    cdef public object _extra_predicates

     def __init__(self, vocab):
         """Create the Matcher.
@@ -396,9 +659,12 @@ cdef class Matcher:
             documents the matcher will operate on.
         RETURNS (Matcher): The newly constructed object.
         """
+        self._extra_predicates = []
         self._patterns = {}
         self._entities = {}
         self._callbacks = {}
+        self._extensions = {}
+        self._extra_predicates = []
         self.vocab = vocab
         self.mem = Pool()

@@ -456,7 +722,8 @@ cdef class Matcher:
             raise ValueError(Errors.E012.format(key=key))
         key = self._normalize_key(key)
         for pattern in patterns:
-            specs = _convert_strings(pattern, self.vocab.strings)
+            specs = _preprocess_pattern(pattern, self.vocab.strings,
+                                        self._extensions, self._extra_predicates)
             self.patterns.push_back(init_pattern(self.mem, key, specs))
         self._patterns.setdefault(key, [])
         self._callbacks[key] = on_match
@@ -520,7 +787,9 @@ cdef class Matcher:
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `label_id` and `key` are both integers.
         """
-        matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
+        matches = find_matches(&self.patterns[0], self.patterns.size(), doc,
+                               extensions=self._extensions,
+                               predicates=self._extra_predicates)
         for i, (key, start, end) in enumerate(matches):
             on_match = self._callbacks.get(key, None)
             if on_match is not None:
spacy/tests/matcher/test_matcher.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest
 import re
 from spacy.matcher import Matcher, DependencyTreeMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
 from ..util import get_doc


@@ -179,6 +179,80 @@ def test_matcher_any_token_operator(en_vocab):
     assert matches[2] == "test hello world"


+def test_matcher_extension_attribute(en_vocab):
+    matcher = Matcher(en_vocab)
+    Token.set_extension('is_fruit',
+        getter=lambda token: token.text in ('apple', 'banana'), force=True)
+    pattern = [{'ORTH': 'an'}, {'_': {'is_fruit': True}}]
+    matcher.add('HAVING_FRUIT', None, pattern)
+    doc = Doc(en_vocab, words=['an', 'apple'])
+    matches = matcher(doc)
+    assert len(matches) == 1
+    doc = Doc(en_vocab, words=['an', 'aardvark'])
+    matches = matcher(doc)
+    assert len(matches) == 0
+
+
+def test_matcher_set_value(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{'ORTH': {'IN': ['an', 'a']}}]
+    matcher.add('A_OR_AN', None, pattern)
+    doc = Doc(en_vocab, words=['an', 'a', 'apple'])
+    matches = matcher(doc)
+    assert len(matches) == 2
+    doc = Doc(en_vocab, words=['aardvark'])
+    matches = matcher(doc)
+    assert len(matches) == 0
+
+
+def test_matcher_regex(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{'ORTH': {'REGEX': r'(?:a|an)'}}]
+    matcher.add('A_OR_AN', None, pattern)
+    doc = Doc(en_vocab, words=['an', 'a', 'hi'])
+    matches = matcher(doc)
+    assert len(matches) == 2
+    doc = Doc(en_vocab, words=['bye'])
+    matches = matcher(doc)
+    assert len(matches) == 0
+
+
+def test_matcher_regex_shape(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{'SHAPE': {'REGEX': r'^[^x]+$'}}]
+    matcher.add('NON_ALPHA', None, pattern)
+    doc = Doc(en_vocab, words=['99', 'problems', '!'])
+    matches = matcher(doc)
+    assert len(matches) == 2
+    doc = Doc(en_vocab, words=['bye'])
+    matches = matcher(doc)
+    assert len(matches) == 0
+
+
+def test_matcher_compare_length(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{'LENGTH': {'>=': 2}}]
+    matcher.add('LENGTH_COMPARE', None, pattern)
+    doc = Doc(en_vocab, words=['a', 'aa', 'aaa'])
+    matches = matcher(doc)
+    assert len(matches) == 2
+    doc = Doc(en_vocab, words=['a'])
+    matches = matcher(doc)
+    assert len(matches) == 0
+
+
+def test_matcher_extension_set_membership(en_vocab):
+    matcher = Matcher(en_vocab)
+    Token.set_extension('reversed',
+        getter=lambda token: ''.join(reversed(token.text)), force=True)
+    pattern = [{'_': {'reversed': {"IN": ["eyb", "ih"]}}}]
+    matcher.add('REVERSED', None, pattern)
+    doc = Doc(en_vocab, words=['hi', 'bye', 'hello'])
+    matches = matcher(doc)
+    assert len(matches) == 2
+    doc = Doc(en_vocab, words=['aardvark'])
+    matches = matcher(doc)
+    assert len(matches) == 0
+
+
 @pytest.fixture
 def text():
     return "The quick brown fox jumped over the lazy fox"