From eab96f7c0353b49ac471f0a143d2a28d31465dd5 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Thu, 22 Sep 2022 15:37:19 -0700 Subject: [PATCH] fix min distance --- spacy/matcher/matcher.pyx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 04ecfa546..8cc022d7e 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -849,8 +849,8 @@ class _FuzzyPredicate: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] if self.value == value: return True - elif self.distance and token.is_oov and not token.is_space: - return bool(levenshtein(self.value, value) <= min(self.distance, len(token.text)-1)) + elif self.distance and token.is_oov: + return bool(levenshtein(value, self.value) <= min(self.distance, min(len(value), len(self.value))-1)) return False @@ -924,19 +924,21 @@ class _SetPredicate: if self.predicate == "IN": if value in self.value: return True - elif self.distance and token.is_oov and not token.is_space: + elif self.distance and token.is_oov: + s1 = self.vocab.strings[value] for v in self.value: - if levenshtein(self.vocab.strings[value], - self.vocab.strings[v]) <= min(self.distance, len(token.text)-1): + s2 = self.vocab.strings[v] + if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1): return True return False elif self.predicate == "NOT_IN": if value in self.value: return False - elif self.distance and token.is_oov and not token.is_space: + elif self.distance and token.is_oov: + s1 = self.vocab.strings[value] for v in self.value: - if levenshtein(self.vocab.strings[value], - self.vocab.strings[v]) <= min(self.distance, len(token.text)-1): + s2 = self.vocab.strings[v] + if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1): return False return True elif self.predicate == "IS_SUBSET":