add FUZZY predicate
parent 9600fe1d99
commit 3dc5b9c7be
@@ -131,7 +131,7 @@ cdef class Matcher:
         for pattern in patterns:
             try:
                 specs = _preprocess_pattern(pattern, self.vocab,
-                                            self._extensions, self._extra_predicates)
+                                            self._extensions, self._extra_predicates, self.fuzzy)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
                     for attr, _ in spec[1]:
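Note: the hunk above threads a new matcher-level `fuzzy` threshold into pattern
preprocessing. Based on the test added at the end of this commit, intended usage
looks roughly like the following (WIP API introduced by this commit, so the
exact surface may change):

    from spacy.matcher import Matcher
    from spacy.vocab import Vocab

    vocab = Vocab()
    # matcher-wide similarity threshold (0-100), per this commit
    matcher = Matcher(vocab, fuzzy=80)
    # per-attribute FUZZY predicate checked against that threshold
    matcher.add("JS", [[{"ORTH": {"FUZZY": "JavaScript"}}]])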
@@ -766,7 +766,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value


-def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.
@@ -793,7 +793,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
     return tokens
@@ -838,10 +838,32 @@ def _get_attr_values(spec, string_store):
 # These predicate helper classes are used to match the REGEX, IN, >= etc
 # extensions to the matcher introduced in #3173.

+class _FuzzyPredicate:
+    operators = ("FUZZY",)
+
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None):
+        self.i = i
+        self.attr = attr
+        self.value = value
+        self.predicate = predicate
+        self.is_extension = is_extension
+        self.fuzzy = fuzzy
+        self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        if self.predicate not in self.operators:
+            raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
+
+    def __call__(self, Token token):
+        if self.is_extension:
+            value = token._.get(self.attr)
+        else:
+            value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
+        return bool(fuzz_cpp.ratio(self.value, value) >= self.fuzzy)
+
+
 class _RegexPredicate:
     operators = ("REGEX",)

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
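Note: _FuzzyPredicate accepts a token when fuzz_cpp.ratio(pattern_value,
token_value) reaches the matcher's threshold. fuzz_cpp is this commit's C++
fuzzy-matching binding; assuming it scores like rapidfuzz's fuzz.ratio (a
normalized 0-100 indel similarity), the thresholding behaves roughly like:

    from rapidfuzz import fuzz  # stand-in for fuzz_cpp; equivalent scoring assumed

    fuzz.ratio("JavaScript", "JavaScrpt")  # ~94.7, passes a threshold of 80
    fuzz.ratio("Google", "Goggle")         # ~83.3, passes
    fuzz.ratio("Java", "Now")              # well below 80, fails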
@@ -862,7 +884,7 @@ class _RegexPredicate:
 class _SetPredicate:
     operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None):
         self.i = i
         self.attr = attr
         self.vocab = vocab
@@ -894,9 +916,9 @@ class _SetPredicate:
         else:
             value = set(get_string_id(v) for v in value)
         if self.predicate == "IN":
-            return value in self.value
+            return value in self.value  # handle fuzzy
         elif self.predicate == "NOT_IN":
-            return value not in self.value
+            return value not in self.value  # handle fuzzy
         elif self.predicate == "IS_SUBSET":
             return value <= self.value
         elif self.predicate == "IS_SUPERSET":
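Note: the "# handle fuzzy" comments mark set membership as not yet fuzzy-aware.
One plausible reading of that TODO, sketched with rapidfuzz as a stand-in
scorer (hypothetical; not what this commit implements):

    from rapidfuzz import fuzz

    def fuzzy_in(value: str, allowed: set, threshold: float) -> bool:
        # treat `value` as "in" the set if any member is similar enough
        return any(fuzz.ratio(value, v) >= threshold for v in allowed)

    fuzzy_in("Goggle", {"Google", "Bing"}, 80)  # True; NOT_IN would negate this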
@@ -940,8 +962,9 @@ class _ComparisonPredicate:
             return value < self.value


-def _get_extra_predicates(spec, extra_predicates, vocab):
+def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy):
     predicate_types = {
+        "FUZZY": _FuzzyPredicate,
         "REGEX": _RegexPredicate,
         "IN": _SetPredicate,
         "NOT_IN": _SetPredicate,
@@ -975,7 +998,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
             value_with_upper_keys = {k.upper(): v for k, v in value.items()}
             for type_, cls in predicate_types.items():
                 if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab, fuzzy=fuzzy)
                     # Don't create redundant predicates.
                     # This helps with efficiency, as we're caching the results.
                     if predicate.key in seen_predicates:
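Note: every predicate class, including the new _FuzzyPredicate, exposes a key
of (attr, operator, JSON-serialized value), which is what the seen_predicates
check above deduplicates on. A minimal standalone sketch of that caching idea
(names hypothetical):

    import json

    seen_predicates = {}   # key -> index into extra_predicates
    extra_predicates = []

    def get_or_create(factory, attr, operator, value):
        key = (attr, operator, json.dumps(value, sort_keys=True))
        if key in seen_predicates:
            return extra_predicates[seen_predicates[key]]
        pred = factory(len(extra_predicates), attr, value, operator)
        seen_predicates[key] = len(extra_predicates)
        extra_predicates.append(pred)
        return pred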
@@ -157,6 +157,7 @@ def validate_token_pattern(obj: list) -> List[str]:

 class TokenPatternString(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
+    FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy")
     IN: Optional[List[StrictStr]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
@@ -176,6 +177,7 @@ class TokenPatternString(BaseModel):

 class TokenPatternNumber(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
+    FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy")
     IN: Optional[List[StrictInt]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
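Note: the schema additions above let "fuzzy" pass token-pattern validation. A
minimal standalone check of the new field, assuming the pydantic v1-style
models that spacy.schemas uses:

    from typing import Optional
    from pydantic import BaseModel, Field, StrictStr

    class TokenPatternString(BaseModel):
        REGEX: Optional[StrictStr] = Field(None, alias="regex")
        FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy")

    # pydantic v1 populates by alias, so {"fuzzy": "JavaScript"} in a token
    # pattern validates against the FUZZY field
    TokenPatternString(fuzzy="JavaScript")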
@@ -166,6 +166,22 @@ def test_matcher_match_fuzz_none(en_vocab):
     assert matcher(doc) == []


+def test_matcher_match_fuzz_pred(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": "java"}]],
+    }
+    matcher = Matcher(en_vocab, fuzzy=80)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+
+    words = ["I", "like", "Goggle", "Now", "and", "JavaScrpt", "best"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == []
+
+
 def test_matcher_empty_dict(en_vocab):
     """Test matcher allows empty token specs, meaning match on any token."""
     matcher = Matcher(en_vocab)
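Note for reading the assertions above: Matcher calls return a list of
(match_id, start, end) tuples, so a non-empty result can be inspected with:

    for match_id, start, end in matcher(doc):
        print(doc.vocab.strings[match_id], doc[start:end].text)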