From 3b67eabfea28f817f646a892ed4aec4644a46aee Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 7 Oct 2017 03:36:15 +0200
Subject: [PATCH] Allow empty dictionaries to match any token in Matcher

Often patterns need to match "any token". A clean way to denote this is
with the empty dict {}: this sets no constraints on the token, so should
always match. The problem was that having attributes length==0 was used
as an end-of-array signal, so the matcher didn't handle this case
correctly. This patch compiles empty token spec dicts into a constraint
NULL_ATTR==0. The NULL_ATTR attribute, 0, is always set to 0 on the
lexeme -- so this always matches.
---
 spacy/matcher.pyx           |  8 ++++++--
 spacy/tests/test_matcher.py | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 3bc6f859c..8893b2fed 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -17,7 +17,7 @@ from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
-from .attrs cimport ID, ENT_TYPE
+from .attrs cimport ID, NULL_ATTR, ENT_TYPE
 from . import attrs
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
@@ -142,6 +142,10 @@ def _convert_strings(token_specs, string_store):
     tokens = []
     op = ONE
     for spec in token_specs:
+        if not spec:
+            # Signifier for 'any token'
+            tokens.append((ONE, [(NULL_ATTR, 0)]))
+            continue
         token = []
         ops = (ONE,)
         for attr, value in spec.items():
@@ -295,7 +299,7 @@ cdef class Matcher:
         """Find all token sequences matching the supplied patterns on the `Doc`.
 
         doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, label_id, start, end)` tuples,
+        RETURNS (list): A list of `(key, start, end)` tuples,
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `label_id` and `key` are both integers.
""" diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 1b9f92519..b36c67d8c 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -98,6 +98,20 @@ def test_matcher_match_multi(matcher): (doc.vocab.strings['Java'], 5, 6)] +def test_matcher_empty_dict(en_vocab): + '''Test matcher allows empty token specs, meaning match on any token.''' + matcher = Matcher(en_vocab) + abc = ["a", "b", "c"] + doc = get_doc(matcher.vocab, abc) + matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}]) + matches = matcher(doc) + assert len(matches) == 1 + assert matches[0][1:] == (0, 3) + matcher.add('A.', None, [{'ORTH': 'a'}, {}]) + matches = matcher(doc) + assert matches[0][1:] == (0, 2) + + def test_matcher_phrase_matcher(en_vocab): words = ["Google", "Now"] doc = get_doc(en_vocab, words)