mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Allow empty dictionaries to match any token in Matcher
Patterns often need to match "any token". A clean way to denote this is with the empty dict {}: it sets no constraints on the token, so it should always match. The problem was that a token spec with zero attributes (length==0) was used as an end-of-array signal, so the matcher didn't handle this case correctly. This patch compiles empty token spec dicts into the constraint NULL_ATTR==0. The NULL_ATTR attribute (ID 0) is always set to 0 on the lexeme, so this constraint always matches.
This commit is contained in:
parent
3468d535ad
commit
3b67eabfea
|
@ -17,7 +17,7 @@ from libcpp.pair cimport pair
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
from .attrs cimport ID, ENT_TYPE
|
from .attrs cimport ID, NULL_ATTR, ENT_TYPE
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from .tokens.doc cimport get_token_attr
|
from .tokens.doc cimport get_token_attr
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
|
@ -142,6 +142,10 @@ def _convert_strings(token_specs, string_store):
|
||||||
tokens = []
|
tokens = []
|
||||||
op = ONE
|
op = ONE
|
||||||
for spec in token_specs:
|
for spec in token_specs:
|
||||||
|
if not spec:
|
||||||
|
# Signifier for 'any token'
|
||||||
|
tokens.append((ONE, [(NULL_ATTR, 0)]))
|
||||||
|
continue
|
||||||
token = []
|
token = []
|
||||||
ops = (ONE,)
|
ops = (ONE,)
|
||||||
for attr, value in spec.items():
|
for attr, value in spec.items():
|
||||||
|
@ -295,7 +299,7 @@ cdef class Matcher:
|
||||||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||||
|
|
||||||
doc (Doc): The document to match over.
|
doc (Doc): The document to match over.
|
||||||
RETURNS (list): A list of `(key, label_id, start, end)` tuples,
|
RETURNS (list): A list of `(key, start, end)` tuples,
|
||||||
describing the matches. A match tuple describes a span
|
describing the matches. A match tuple describes a span
|
||||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -98,6 +98,20 @@ def test_matcher_match_multi(matcher):
|
||||||
(doc.vocab.strings['Java'], 5, 6)]
|
(doc.vocab.strings['Java'], 5, 6)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_empty_dict(en_vocab):
|
||||||
|
'''Test matcher allows empty token specs, meaning match on any token.'''
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
abc = ["a", "b", "c"]
|
||||||
|
doc = get_doc(matcher.vocab, abc)
|
||||||
|
matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
assert matches[0][1:] == (0, 3)
|
||||||
|
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches[0][1:] == (0, 2)
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_phrase_matcher(en_vocab):
|
def test_matcher_phrase_matcher(en_vocab):
|
||||||
words = ["Google", "Now"]
|
words = ["Google", "Now"]
|
||||||
doc = get_doc(en_vocab, words)
|
doc = get_doc(en_vocab, words)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user