handle fuzzy sets

Mirror of https://github.com/explosion/spaCy.git
Commit: 9c0f9368a9
Parent: 0859e391c6
@@ -816,7 +816,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates,
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
     return tokens
@@ -915,9 +915,14 @@ class _SetPredicate:
             # normalize morph strings
             self.value = set(self.vocab.morphology.add(v) for v in value)
         else:
-            self.value = set(get_string_id(v) for v in value)
+            if fuzzy:
+                # add to string store
+                self.value = set(self.vocab.strings.add(v) for v in value)
+            else:
+                self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
+        self.fuzzy = fuzzy
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
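
A plausible reading of the new branch: get_string_id only hashes its argument, so a hash seen at match time could not be turned back into text for fuzzy comparison, while StringStore.add keeps the original string retrievable. A minimal sketch of that distinction, assuming spaCy's public StringStore API:

    from spacy.strings import StringStore, get_string_id

    store = StringStore()
    h = get_string_id("Google")    # 64-bit hash only; the text is not stored
    key = store.add("Google")      # same hash value, but the string is kept
    assert h == key
    assert store[key] == "Google"  # retrievable later for fuzzy comparison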
@@ -939,9 +944,23 @@ class _SetPredicate:
         else:
             value = set(get_string_id(v) for v in value)
         if self.predicate == "IN":
-            return value in self.value # TODO: handle fuzzy
+            if value in self.value:
+                return True
+            elif self.fuzzy:
+                for v in self.value:
+                    if fuzz_cpp.ratio(self.vocab.strings[value],
+                                      self.vocab.strings[v]) >= self.fuzzy:
+                        return True
+            return False
         elif self.predicate == "NOT_IN":
-            return value not in self.value # TODO: handle fuzzy
+            if value in self.value:
+                return False
+            elif self.fuzzy:
+                for v in self.value:
+                    if fuzz_cpp.ratio(self.vocab.strings[value],
+                                      self.vocab.strings[v]) >= self.fuzzy:
+                        return False
+            return True
         elif self.predicate == "IS_SUBSET":
             return value <= self.value
         elif self.predicate == "IS_SUPERSET":
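
The new IN/NOT_IN branches check exact membership first and only fall back to pairwise fuzzy comparison; fuzz_cpp.ratio is an internal binding returning a 0-100 similarity score. A self-contained sketch of the same membership logic, with difflib standing in for the real scorer:

    from difflib import SequenceMatcher

    def ratio(a: str, b: str) -> float:
        # stand-in for fuzz_cpp.ratio: 0-100 similarity score
        return 100 * SequenceMatcher(None, a, b).ratio()

    def fuzzy_in(candidate: str, values: set, threshold: float) -> bool:
        # exact membership short-circuits; otherwise compare against each member
        if candidate in values:
            return True
        return any(ratio(candidate, v) >= threshold for v in values)

    def fuzzy_not_in(candidate: str, values: set, threshold: float) -> bool:
        # NOT_IN is the exact negation: any exact or fuzzy hit excludes the token
        return not fuzzy_in(candidate, values, threshold)

    assert fuzzy_in("Goggle", {"Google", "No"}, 80)       # one edit from "Google"
    assert fuzzy_not_in("Telescope", {"Google", "No"}, 80)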
@@ -985,7 +1004,7 @@ class _ComparisonPredicate:
             return value < self.value


-def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy):
+def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs):
     predicate_types = {
         "FUZZY": _FuzzyPredicate,
         "REGEX": _RegexPredicate,
@@ -1016,23 +1035,41 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy):
         if attr.upper() == "TEXT":
             attr = "ORTH"
         attr = IDS.get(attr.upper())

         if isinstance(value, dict):
-            processed = False
-            value_with_upper_keys = {k.upper(): v for k, v in value.items()}
-            for type_, cls in predicate_types.items():
-                if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab, fuzzy=fuzzy)
-                    # Don't create a redundant predicates.
-                    # This helps with efficiency, as we're caching the results.
-                    if predicate.key in seen_predicates:
-                        output.append(seen_predicates[predicate.key])
-                    else:
-                        extra_predicates.append(predicate)
-                        output.append(predicate.i)
-                        seen_predicates[predicate.key] = predicate.i
-                    processed = True
-            if not processed:
-                warnings.warn(Warnings.W035.format(pattern=value))
+            output.extend(_get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs,
+                                                       predicate_types,
+                                                       extra_predicates, seen_predicates))
+    return output
+
+
+def _get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs,
+                                 predicate_types, extra_predicates, seen_predicates):
+    output = []
+    processed = False  # TODO: not working as intended
+    value_with_upper_keys = {k.upper(): v for k, v in value.items()}
+    for type_, cls in predicate_types.items():  # TODO: switch this loop
+        if type_ in value_with_upper_keys:
+            if type_ == 'FUZZY' and isinstance(value_with_upper_keys[type_], dict):
+                # add predicates inside fuzzy operator
+                output.extend(_get_extra_predicates_helper(attr, value_with_upper_keys[type_],
+                                                           vocab, fuzzy, fuzzy_attrs,
+                                                           predicate_types,
+                                                           extra_predicates, seen_predicates))
+            else:
+                predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_,
+                                vocab=vocab, fuzzy=fuzzy)  # ??? if attr in fuzzy_attrs else 0
+                # Don't create a redundant predicates.
+                # This helps with efficiency, as we're caching the results.
+                if predicate.key in seen_predicates:
+                    output.append(seen_predicates[predicate.key])
+                else:
+                    extra_predicates.append(predicate)
+                    output.append(predicate.i)
+                    seen_predicates[predicate.key] = predicate.i
+            processed = True
+    if not processed:
+        warnings.warn(Warnings.W035.format(pattern=value))
     return output
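
When a FUZZY key wraps another dict, the helper recurses so that set predicates nested under the fuzzy operator are registered like any others. A stand-alone sketch of that traversal (collect and its tuple layout are illustrative, not spaCy internals; the depth test shows one way the threshold could be scoped to nested predicates, a question the ???-comment above leaves open):

    def collect(attr, value, fuzzy, out, depth=0):
        # walk a pattern dict, descending into dicts under the FUZZY operator
        for key, sub in value.items():
            if key.upper() == "FUZZY" and isinstance(sub, dict):
                collect(attr, sub, fuzzy, out, depth + 1)
            else:
                # apply the fuzzy threshold only beneath a FUZZY operator
                out.append((attr, key.upper(), sub, fuzzy if depth > 0 else 0))

    preds = []
    collect("ORTH", {"FUZZY": {"IN": ["Google", "No"]}, "NOT_IN": ["Goggle"]}, 80, preds)
    # preds == [("ORTH", "IN", ["Google", "No"], 80),
    #           ("ORTH", "NOT_IN", ["Goggle"], 0)]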
@@ -157,7 +157,7 @@ def validate_token_pattern(obj: list) -> List[str]:

 class TokenPatternString(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
-    FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy")
+    FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy")
     IN: Optional[List[StrictStr]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
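
With FUZZY widened to Union[StrictStr, "TokenPatternString"], the fuzzy operator can wrap a nested string pattern as well as a plain string. A quick check, assuming this branch's schema (validate_token_pattern returns a list of error messages, empty when the pattern is valid):

    from spacy.schemas import validate_token_pattern

    flat = [{"ORTH": {"FUZZY": "Google"}}]
    nested = [{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}}}]
    assert validate_token_pattern(flat) == []    # plain string still validates
    assert validate_token_pattern(nested) == []  # nested set predicate now allowed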
@@ -218,6 +218,50 @@ def test_matcher_match_fuzz_preds(en_vocab):
         (doc.vocab.strings["JS"], 8, 9),
     ]

+
+def test_matcher_match_fuzz_pred_in_set(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}}, "OP": "+"}]]
+    }
+    matcher = Matcher(en_vocab, fuzzy=80)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+
+    words = ["I", "like", "Goggle", "Now"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 2, 4),
+    ]
+
+def test_matcher_match_fuzz_pred_not_in_set(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]],
+    }
+    matcher = Matcher(en_vocab, fuzzy=80)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+
+    words = ["I", "like", "Goggle", "Now"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 0, 2),
+    ]
+
+def test_matcher_match_fuzz_pred_in_set_with_exclude(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]},
+                                 "NOT_IN": ["Goggle"]},
+                        "OP": "+"}]]
+    }
+    matcher = Matcher(en_vocab, fuzzy=80)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+
+    words = ["I", "like", "Goggle", "Now"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 3, 4),
+    ]
+

 def test_matcher_empty_dict(en_vocab):
     """Test matcher allows empty token specs, meaning match on any token."""
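
The expected spans follow from the 0-100 ratio arithmetic at a threshold of 80, assuming fuzz_cpp.ratio is a normalized indel-style scorer (rapidfuzz's fuzz.ratio and difflib's SequenceMatcher agree on these inputs): "Goggle" vs. "Google" scores about 83 and "Now" vs. "No" scores exactly 80, so both tokens pass the fuzzy IN test in the first test (span 2-4). In the NOT_IN test the same scores exclude those two tokens, leaving "I like" (span 0-2). In the exclude test, "Goggle" is ruled out by the exact NOT_IN entry, so only "Now" survives (span 3-4).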