Allow empty dictionaries to match any token in Matcher

Often patterns need to match "any token". A clean way to denote this
is with the empty dict {}: this sets no constraints on the token,
so should always match.

The problem was that an attribute list of length 0 was used as an
end-of-array signal, so the matcher didn't handle an empty token spec
correctly.

This patch compiles an empty token spec dict into the constraint
NULL_ATTR==0. NULL_ATTR (attribute ID 0) always has the value 0 on the
lexeme, so this constraint always matches.
This commit is contained in:
Matthew Honnibal 2017-10-07 03:36:15 +02:00
parent 3468d535ad
commit 3b67eabfea
2 changed files with 20 additions and 2 deletions

View File

@ -17,7 +17,7 @@ from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t from libc.stdint cimport int32_t
from .attrs cimport ID, ENT_TYPE from .attrs cimport ID, NULL_ATTR, ENT_TYPE
from . import attrs from . import attrs
from .tokens.doc cimport get_token_attr from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
@ -142,6 +142,10 @@ def _convert_strings(token_specs, string_store):
tokens = [] tokens = []
op = ONE op = ONE
for spec in token_specs: for spec in token_specs:
if not spec:
# Signifier for 'any token'
tokens.append((ONE, [(NULL_ATTR, 0)]))
continue
token = [] token = []
ops = (ONE,) ops = (ONE,)
for attr, value in spec.items(): for attr, value in spec.items():
@ -295,7 +299,7 @@ cdef class Matcher:
"""Find all token sequences matching the supplied patterns on the `Doc`. """Find all token sequences matching the supplied patterns on the `Doc`.
doc (Doc): The document to match over. doc (Doc): The document to match over.
RETURNS (list): A list of `(key, label_id, start, end)` tuples, RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers. `doc[start:end]`. The `label_id` and `key` are both integers.
""" """

View File

@ -98,6 +98,20 @@ def test_matcher_match_multi(matcher):
(doc.vocab.strings['Java'], 5, 6)] (doc.vocab.strings['Java'], 5, 6)]
def test_matcher_empty_dict(en_vocab):
    """Check that an empty token spec (``{}``) acts as a wildcard.

    An empty dict places no constraint on the token, so a pattern
    containing one should match whatever token sits in that position.
    """
    matcher = Matcher(en_vocab)
    words = ["a", "b", "c"]
    doc = get_doc(matcher.vocab, words)

    # Wildcard in the middle: 'a', <any token>, 'c' covers the whole doc.
    wildcard_middle = [{'ORTH': 'a'}, {}, {'ORTH': 'c'}]
    matcher.add('A.C', None, wildcard_middle)
    found = matcher(doc)
    assert len(found) == 1
    # Drop the leading key; only the (start, end) span is compared.
    assert found[0][1:] == (0, 3)

    # Wildcard at the end: 'a', <any token> covers the first two tokens.
    wildcard_last = [{'ORTH': 'a'}, {}]
    matcher.add('A.', None, wildcard_last)
    found = matcher(doc)
    assert found[0][1:] == (0, 2)
def test_matcher_phrase_matcher(en_vocab): def test_matcher_phrase_matcher(en_vocab):
words = ["Google", "Now"] words = ["Google", "Now"]
doc = get_doc(en_vocab, words) doc = get_doc(en_vocab, words)