mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Allow empty dictionaries to match any token in Matcher
Often patterns need to match "any token". A clean way to denote this is with the empty dict {}: this sets no constraints on the token, so should always match. The problem was that having attributes length==0 was used as an end-of-array signal, so the matcher didn't handle this case correctly. This patch compiles empty token spec dicts into a constraint NULL_ATTR==0. The NULL_ATTR attribute, 0, is always set to 0 on the lexeme -- so this always matches.
This commit is contained in:
parent
3468d535ad
commit
3b67eabfea
|
@@ -17,7 +17,7 @@ from libcpp.pair cimport pair
|
|||
from murmurhash.mrmr cimport hash64
|
||||
from libc.stdint cimport int32_t
|
||||
|
||||
from .attrs cimport ID, ENT_TYPE
|
||||
from .attrs cimport ID, NULL_ATTR, ENT_TYPE
|
||||
from . import attrs
|
||||
from .tokens.doc cimport get_token_attr
|
||||
from .tokens.doc cimport Doc
|
||||
|
@@ -142,6 +142,10 @@ def _convert_strings(token_specs, string_store):
|
|||
tokens = []
|
||||
op = ONE
|
||||
for spec in token_specs:
|
||||
if not spec:
|
||||
# Signifier for 'any token'
|
||||
tokens.append((ONE, [(NULL_ATTR, 0)]))
|
||||
continue
|
||||
token = []
|
||||
ops = (ONE,)
|
||||
for attr, value in spec.items():
|
||||
|
@@ -295,7 +299,7 @@ cdef class Matcher:
|
|||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||
|
||||
doc (Doc): The document to match over.
|
||||
RETURNS (list): A list of `(key, label_id, start, end)` tuples,
|
||||
RETURNS (list): A list of `(key, start, end)` tuples,
|
||||
describing the matches. A match tuple describes a span
|
||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||
"""
|
||||
|
|
|
@@ -98,6 +98,20 @@ def test_matcher_match_multi(matcher):
|
|||
(doc.vocab.strings['Java'], 5, 6)]
|
||||
|
||||
|
||||
def test_matcher_empty_dict(en_vocab):
    """An empty token spec ({}) acts as a wildcard and matches any single token."""
    matcher = Matcher(en_vocab)
    words = ["a", "b", "c"]
    doc = get_doc(matcher.vocab, words)
    # Pattern: literal 'a', then any one token, then literal 'c'.
    matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
    results = matcher(doc)
    assert len(results) == 1
    assert results[0][1:] == (0, 3)
    # Pattern: literal 'a' followed by any one token.
    matcher.add('A.', None, [{'ORTH': 'a'}, {}])
    results = matcher(doc)
    assert results[0][1:] == (0, 2)
|
||||
|
||||
|
||||
def test_matcher_phrase_matcher(en_vocab):
|
||||
words = ["Google", "Now"]
|
||||
doc = get_doc(en_vocab, words)
|
||||
|
|
Loading…
Reference in New Issue
Block a user