remove is_oov check

This commit is contained in:
Kevin Humphreys 2022-10-14 15:51:20 -07:00
parent bf4b353ce5
commit 9c83b804f1

View File

@@ -206,15 +206,14 @@ cdef class Matcher:
                     yield doc

     @staticmethod
-    def fuzzy_match(s1: str, s2: str, distance: int, token: Token) -> bool:
-        if token.is_oov: # (TODO: param?)
-            threshold = min(len(s1), len(s2)) - 1 # max edit distance
-            if distance: # FUZZYn operators
-                threshold = min(distance, threshold)
-            else: # FUZZY operator
-                threshold = min(5, threshold - 1) # default fuzziness (TODO: param?)
-            if threshold > 0:
-                return levenshtein(s1, s2) <= threshold
+    def fuzzy_match(s1: str, s2: str, distance: int) -> bool:
+        min_length = min(len(s1), len(s2))
+        if distance: # FUZZYn operators with explicit distance
+            threshold = min(distance, min_length - 1)
+        else: # FUZZY operator with default distance
+            threshold = min(5, min_length - 2)
+        if threshold > 0:
+            return levenshtein(s1, s2) <= threshold
         return False

     def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
@@ -863,7 +862,7 @@ class _FuzzyPredicate:
         value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        return Matcher.fuzzy_match(value, self.value, self.fuzzy, token)
+        return Matcher.fuzzy_match(value, self.value, self.fuzzy)


 class _RegexPredicate:
@@ -946,7 +945,7 @@ class _SetPredicate:
                 return True
             elif self.fuzzy is not None:
                 value = self.vocab.strings[value]
-                return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy, token)
+                return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
                            for v in self.value)
             else:
                 return False
@@ -958,7 +957,7 @@ class _SetPredicate:
                 return False
             elif self.fuzzy is not None:
                 value = self.vocab.strings[value]
-                return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy, token)
+                return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
                                for v in self.value)
             else:
                 return True