Fuzzy match only on OOV (out-of-vocabulary) tokens

This commit is contained in:
Kevin Humphreys 2022-09-14 15:54:05 -07:00
parent a6d26a0195
commit b7599dfb2f

View File

@@ -846,7 +846,11 @@ class _FuzzyPredicate:
value = token._.get(self.attr) value = token._.get(self.attr)
else: else:
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
if self.value == value:
return True
elif self.distance and token.is_oov:
return bool(levenshtein(self.value, value) <= self.distance) return bool(levenshtein(self.value, value) <= self.distance)
return False
class _RegexPredicate: class _RegexPredicate:
@@ -912,7 +916,7 @@ class _SetPredicate:
if self.predicate == "IN": if self.predicate == "IN":
if value in self.value: if value in self.value:
return True return True
elif self.distance: elif self.distance and token.is_oov:
for v in self.value: for v in self.value:
if levenshtein(self.vocab.strings[value], if levenshtein(self.vocab.strings[value],
self.vocab.strings[v]) <= self.distance: self.vocab.strings[v]) <= self.distance:
@@ -921,7 +925,7 @@ class _SetPredicate:
elif self.predicate == "NOT_IN": elif self.predicate == "NOT_IN":
if value in self.value: if value in self.value:
return False return False
elif self.distance: elif self.distance and token.is_oov:
for v in self.value: for v in self.value:
if levenshtein(self.vocab.strings[value], if levenshtein(self.vocab.strings[value],
self.vocab.strings[v]) <= self.distance: self.vocab.strings[v]) <= self.distance: