Allow empty dictionaries to match any token in Matcher

Patterns often need to match "any token". A clean way to denote this is the empty dict {}: it sets no constraints on the token, so it should always match. The problem was that an attribute list of length 0 was used as the end-of-array signal, so the matcher didn't handle this case correctly. This patch compiles empty token spec dicts into the constraint NULL_ATTR==0. The NULL_ATTR attribute (whose ID is 0) is always set to 0 on the lexeme, so this constraint always matches.
2025-12-12 04:34:31 +03:00 · 2017-10-07 03:36:15 +02:00 · 2017-10-07 03:36:15 +02:00 · 3b67eabfea
commit 3b67eabfea
parent 3468d535ad
2 changed files with 20 additions and 2 deletions
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -17,7 +17,7 @@ from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t

-from .attrs cimport ID, ENT_TYPE
+from .attrs cimport ID, NULL_ATTR, ENT_TYPE
 from . import attrs
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
@ -142,6 +142,10 @@ def _convert_strings(token_specs, string_store):
    tokens = []
    op = ONE
    for spec in token_specs:
+        if not spec:
+            # Signifier for 'any token'
+            tokens.append((ONE, [(NULL_ATTR, 0)]))
+            continue
        token = []
        ops = (ONE,)
        for attr, value in spec.items():
@ -295,7 +299,7 @@ cdef class Matcher:
        """Find all token sequences matching the supplied patterns on the `Doc`.

        doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, label_id, start, end)` tuples,
+        RETURNS (list): A list of `(key, start, end)` tuples,
            describing the matches. A match tuple describes a span
            `doc[start:end]`. The `label_id` and `key` are both integers.
        """
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@ -98,6 +98,20 @@ def test_matcher_match_multi(matcher):
                            (doc.vocab.strings['Java'], 5, 6)]


+def test_matcher_empty_dict(en_vocab):
+    '''Test matcher allows empty token specs, meaning match on any token.'''
+    matcher = Matcher(en_vocab)
+    abc = ["a", "b", "c"]
+    doc = get_doc(matcher.vocab, abc)
+    matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
+    matches = matcher(doc)
+    assert len(matches) == 1
+    assert matches[0][1:] == (0, 3)
+    matcher.add('A.', None, [{'ORTH': 'a'}, {}])
+    matches = matcher(doc)
+    assert matches[0][1:] == (0, 2)
+ 
+
 def test_matcher_phrase_matcher(en_vocab):
    words = ["Google", "Now"]
    doc = get_doc(en_vocab, words)