From 3b67eabfea28f817f646a892ed4aec4644a46aee Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 7 Oct 2017 03:36:15 +0200
Subject: [PATCH] Allow empty dictionaries to match any token in Matcher

Often patterns need to match "any token". A clean way to denote this is
with the empty dict {}: this sets no constraints on the token, so should
always match. The problem was that having attributes length==0 was used
as an end-of-array signal, so the matcher didn't handle this case
correctly. This patch compiles empty token spec dicts into a constraint
NULL_ATTR==0. The NULL_ATTR attribute, 0, is always set to 0 on the
lexeme -- so this always matches.
---
 spacy/matcher.pyx           |  8 ++++++--
 spacy/tests/test_matcher.py | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 3bc6f859c..8893b2fed 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -17,7 +17,7 @@ from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
-from .attrs cimport ID, ENT_TYPE
+from .attrs cimport ID, NULL_ATTR, ENT_TYPE
 from . import attrs
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
@@ -142,6 +142,10 @@ def _convert_strings(token_specs, string_store):
     tokens = []
     op = ONE
     for spec in token_specs:
+        if not spec:
+            # Signifier for 'any token'
+            tokens.append((ONE, [(NULL_ATTR, 0)]))
+            continue
         token = []
         ops = (ONE,)
         for attr, value in spec.items():
@@ -295,7 +299,7 @@ cdef class Matcher:
         """Find all token sequences matching the supplied patterns on the `Doc`.
 
         doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, label_id, start, end)` tuples,
+        RETURNS (list): A list of `(key, start, end)` tuples,
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `label_id` and `key` are both integers.
""" diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 1b9f92519..b36c67d8c 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -98,6 +98,20 @@ def test_matcher_match_multi(matcher): (doc.vocab.strings['Java'], 5, 6)] +def test_matcher_empty_dict(en_vocab): + '''Test matcher allows empty token specs, meaning match on any token.''' + matcher = Matcher(en_vocab) + abc = ["a", "b", "c"] + doc = get_doc(matcher.vocab, abc) + matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}]) + matches = matcher(doc) + assert len(matches) == 1 + assert matches[0][1:] == (0, 3) + matcher.add('A.', None, [{'ORTH': 'a'}, {}]) + matches = matcher(doc) + assert matches[0][1:] == (0, 2) + + def test_matcher_phrase_matcher(en_vocab): words = ["Google", "Now"] doc = get_doc(en_vocab, words)