mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 04:40:20 +03:00
Increase default fuzzy to rounded 30% of pattern length
This commit is contained in:
parent
b690c9120c
commit
e88c724c99
|
@ -38,7 +38,7 @@ cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
|
|||
else:
|
||||
# allow at least two edits (to allow at least one transposition) and up
|
||||
# to 30% of the pattern string length (rounded)
|
||||
max_edits = max(2, int(0.2 * len(pattern_text)))
|
||||
max_edits = max(2, round(0.3 * len(pattern_text)))
|
||||
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||
|
||||
|
||||
|
|
|
@ -145,8 +145,8 @@ def test_matcher_match_multi(matcher):
|
|||
# caching don't collide)
|
||||
(
|
||||
{
|
||||
"A": [[{"ORTH": {"FUZZY": "Javascript"}}]],
|
||||
"B": [[{"ORTH": {"FUZZY5": "Javascript"}}]],
|
||||
"A": [[{"ORTH": {"FUZZY": "Javascripts"}}]],
|
||||
"B": [[{"ORTH": {"FUZZY5": "Javascripts"}}]],
|
||||
},
|
||||
[(8, 9)],
|
||||
),
|
||||
|
|
|
@ -89,7 +89,7 @@ it compares to another value.
|
|||
| Attribute | Description |
|
||||
| -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `REGEX` | Attribute value matches the regular expression at any position in the string. ~~Any~~ |
|
||||
| `FUZZY` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, -1)`. The default method allows a Levenshtein edit distance of at least 2 and up to 20% of the pattern string length. ~~Any~~ |
|
||||
| `FUZZY` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, -1)`. The default method allows a maximum Levenshtein edit distance of 30% of the pattern string length (rounded), but never less than 2. ~~Any~~ |
|
||||
| `FUZZY1`, `FUZZY2`, ... `FUZZY9` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, N)`. The default method allows a Levenshtein edit distance of at most N (1-9). ~~Any~~ |
|
||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||
|
|
|
@ -378,7 +378,7 @@ pattern = [{"TEXT": {"FUZZY": "favorite"}},
|
|||
The `FUZZY` attribute allows fuzzy matches for any attribute string value,
|
||||
including custom attributes. Just like `REGEX`, it always needs to be applied to
|
||||
an attribute like `TEXT` or `LOWER`. By default `FUZZY` allows a Levenshtein
|
||||
edit distance of at least 2 and up to 20% of the pattern string length. Using
|
||||
edit distance of up to 30% of the pattern string length (rounded), with a minimum allowance of 2 edits. Using
|
||||
the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
|
||||
allowed edit distance directly.
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user