switch to FUZZYn predicates

use Levenshtein distance. remove fuzzy param. remove rapidfuzz_capi.

parent 974e5f9902
commit 3591a69d35
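In practice, the commit replaces the Matcher-wide `fuzzy`/`fuzzy_attrs` settings with per-attribute `FUZZY1`–`FUZZY5` predicates, where the digit is the maximum Levenshtein edit distance allowed. A minimal sketch of the new pattern syntax, assuming this branch is installed (the rule names and example words are taken from the tests further down):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# FUZZY1 allows at most one edit against "Google"; FUZZY2 allows up to two
# edits against "JavaScript".
matcher.add("GoogleNow", [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]])
matcher.add("JS", [[{"ORTH": {"FUZZY2": "JavaScript"}}]])

doc = Doc(nlp.vocab, words=["They", "like", "Goggle", "Now", "and", "JvvaScrpt"])
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
# Expected on this branch: "GoogleNow Goggle Now" and "JS JvvaScrpt"
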
@@ -8,6 +8,5 @@ requires = [
     "thinc>=8.1.0,<8.2.0",
     "pathy",
     "numpy>=1.15.0",
-    "rapidfuzz_capi>=1.0.5,<2.0.0",
 ]
 build-backend = "setuptools.build_meta"

@@ -19,7 +19,6 @@ pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 rapidfuzz>=2.4.0,<3.0.0
-rapidfuzz_capi>=1.0.5,<2.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0

@@ -34,7 +34,6 @@ python_requires = >=3.6
 setup_requires =
     cython>=0.25,<3.0
     numpy>=1.15.0
-    rapidfuzz_capi>=1.0.5,<2.0.0
     # We also need our Cython packages here to compile against
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
@@ -60,7 +59,6 @@ install_requires =
     pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
     jinja2
     rapidfuzz>=2.4.0,<3.0.0
-    rapidfuzz_capi>=1.0.5,<2.0.0
     # Official Python utilities
     setuptools
     packaging>=20.0

@@ -71,8 +71,6 @@ cdef class Matcher:
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab
     cdef public object validate
-    cdef public object fuzzy
-    cdef public object fuzzy_attrs
     cdef public object _patterns
     cdef public object _callbacks
     cdef public object _filter

@@ -5,8 +5,7 @@ from ..vocab import Vocab
 from ..tokens import Doc, Span

 class Matcher:
-    def __init__(self, vocab: Vocab, validate: bool = ...,
-                 fuzzy: float = ..., fuzzy_attrs: list = ...) -> None: ...
+    def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ...
     def __reduce__(self) -> Any: ...
     def __len__(self) -> int: ...
     def __contains__(self, key: str) -> bool: ...

@@ -10,7 +10,7 @@ from murmurhash.mrmr cimport hash64
 import re
 import srsly
 import warnings
-from rapidfuzz import fuzz_cpp
+from rapidfuzz.distance import Levenshtein

 from ..typedefs cimport attr_t
 from ..structs cimport TokenC
@@ -37,7 +37,7 @@ cdef class Matcher:
     USAGE: https://spacy.io/usage/rule-based-matching
     """

-    def __init__(self, vocab, validate=True, fuzzy=None, fuzzy_attrs=None):
+    def __init__(self, vocab, validate=True):
         """Create the Matcher.

         vocab (Vocab): The vocabulary object, which must be shared with the
@@ -52,8 +52,6 @@ cdef class Matcher:
         self.vocab = vocab
         self.mem = Pool()
         self.validate = validate
-        self.fuzzy = fuzzy if fuzzy is not None else 0
-        self.fuzzy_attrs = [IDS.get(attr) for attr in fuzzy_attrs] if fuzzy_attrs else []

     def __reduce__(self):
         data = (self.vocab, self._patterns, self._callbacks)
@@ -131,8 +129,7 @@ cdef class Matcher:
         for pattern in patterns:
             try:
                 specs = _preprocess_pattern(pattern, self.vocab,
-                                            self._extensions, self._extra_predicates,
-                                            self.fuzzy, self.fuzzy_attrs)
+                                            self._extensions, self._extra_predicates)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
                     for attr, _ in spec[1]:
@@ -257,8 +254,7 @@ cdef class Matcher:
             matches = []
         else:
             matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                                   extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments,
-                                   fuzzy=self.fuzzy, fuzzy_attrs=self.fuzzy_attrs)
+                                   extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
         final_matches = []
         pairs_by_id = {}
         # For each key, either add all matches, or only the filtered,
@@ -339,8 +335,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
     return matcher


-cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0,
-                  float fuzzy=0, list fuzzy_attrs=[]):
+cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0):
     """Find matches in a doc, with a compiled array of patterns. Matches are
     returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0)

@@ -359,8 +354,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     cdef PatternStateC state
     cdef int i, j, nr_extra_attr
     cdef Pool mem = Pool()
-    cdef int8_t* fuzzy_attrs_array
-    cdef int n_fuzzy_attrs = len(fuzzy_attrs)

     output = []
     if length == 0:
@@ -380,10 +373,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
                 if isinstance(value, str):
                     value = token.vocab.strings[value]
                 extra_attr_values[i * nr_extra_attr + index] = value
-    if n_fuzzy_attrs > 0:
-        fuzzy_attrs_array = <int8_t*>mem.alloc(n_fuzzy_attrs, sizeof(int8_t))
-        for i in range(n_fuzzy_attrs):
-            fuzzy_attrs_array[i] = fuzzy_attrs[i]
     # Main loop
     cdef int nr_predicate = len(predicates)
     for i in range(length):
@@ -392,8 +381,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         if with_alignments != 0:
             align_states.resize(states.size())
         transition_states(states, matches, align_states, align_matches, predicate_cache,
-                          doclike[i], extra_attr_values, predicates, with_alignments,
-                          fuzzy, fuzzy_attrs_array, n_fuzzy_attrs)
+                          doclike[i], extra_attr_values, predicates, with_alignments)
         extra_attr_values += nr_extra_attr
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
@@ -422,8 +410,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
 cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
                             vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
                             int8_t* cached_py_predicates,
-                            Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments,
-                            float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) except *:
+                            Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
     cdef int q = 0
     cdef vector[PatternStateC] new_states
     cdef vector[vector[MatchAlignmentC]] align_new_states
@@ -433,8 +420,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
             update_predicate_cache(cached_py_predicates,
                                    states[i].pattern, token, py_predicates)
         action = get_action(states[i], token, extra_attrs,
-                            cached_py_predicates,
-                            fuzzy, fuzzy_attrs, n_fuzzy_attrs)
+                            cached_py_predicates)
         if action == REJECT:
             continue
         # Keep only a subset of states (the active ones). Index q is the
@@ -471,8 +457,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
             update_predicate_cache(cached_py_predicates,
                                    states[q].pattern, token, py_predicates)
         action = get_action(states[q], token, extra_attrs,
-                            cached_py_predicates,
-                            fuzzy, fuzzy_attrs, n_fuzzy_attrs)
+                            cached_py_predicates)
         # Update alignment before the transition of current state
         if with_alignments != 0:
             align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
@@ -584,8 +569,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,

 cdef action_t get_action(PatternStateC state,
                          Token token, const attr_t* extra_attrs,
-                         const int8_t* predicate_matches,
-                         float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil:
+                         const int8_t* predicate_matches) nogil:
     """We need to consider:
     a) Does the token match the specification? [Yes, No]
     b) What's the quantifier? [1, 0+, ?]
@@ -644,8 +628,7 @@ cdef action_t get_action(PatternStateC state,
     Problem: If a quantifier is matching, we're adding a lot of open partials
     """
     cdef int8_t is_match
-    is_match = get_is_match(state, token, extra_attrs, predicate_matches,
-                            fuzzy, fuzzy_attrs, n_fuzzy_attrs)
+    is_match = get_is_match(state, token, extra_attrs, predicate_matches)
     quantifier = get_quantifier(state)
     is_final = get_is_final(state)
     if quantifier == ZERO:
@@ -698,8 +681,7 @@ cdef action_t get_action(PatternStateC state,

 cdef int8_t get_is_match(PatternStateC state,
                          Token token, const attr_t* extra_attrs,
-                         const int8_t* predicate_matches,
-                         float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil:
+                         const int8_t* predicate_matches) nogil:
     for i in range(state.pattern.nr_py):
         if predicate_matches[state.pattern.py_predicates[i]] == -1:
             return 0
@@ -708,22 +690,9 @@ cdef int8_t get_is_match(PatternStateC state,
         for attr in spec.attrs[:spec.nr_attr]:
             token_attr_value = get_token_attr_for_matcher(token.c, attr.attr)
             if token_attr_value != attr.value:
-                if fuzzy:
-                    fuzzy_match = False
-                    for i in range(n_fuzzy_attrs):
-                        if attr.attr == fuzzy_attrs[i]:
-                            with gil:
-                                if fuzz_cpp.ratio(token.vocab.strings[token_attr_value],
-                                                  token.vocab.strings[attr.value]) >= fuzzy:
-                                    fuzzy_match = True
-                            break
-                    if not fuzzy_match:
-                        return 0
-                else:
-                    return 0
+                return 0
     for i in range(spec.nr_extra_attr):
         if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]:
-            # TODO: fuzzy match
             return 0
     return True

@@ -788,8 +757,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value


-def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates,
-                        fuzzy, fuzzy_attrs):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.

@@ -816,7 +784,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates,
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
     return tokens
@@ -862,31 +830,31 @@ def _get_attr_values(spec, string_store):
 # extensions to the matcher introduced in #3173.

 class _FuzzyPredicate:
-    operators = ("FUZZY",)
+    operators = ("FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None):
         self.i = i
         self.attr = attr
         self.value = value
         self.predicate = predicate
         self.is_extension = is_extension
-        self.fuzzy = fuzzy
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
+        self.distance = int(self.predicate[len('FUZZY'):])  # number after prefix

     def __call__(self, Token token):
         if self.is_extension:
             value = token._.get(self.attr)
         else:
             value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
-        return bool(self.fuzzy and fuzz_cpp.ratio(self.value, value) >= self.fuzzy)
+        return bool(Levenshtein.distance(self.value, value) <= self.distance)


 class _RegexPredicate:
     operators = ("REGEX",)

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
@@ -907,22 +875,22 @@ class _RegexPredicate:
 class _SetPredicate:
     operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None):
         self.i = i
         self.attr = attr
         self.vocab = vocab
+        self.distance = distance
         if self.attr == MORPH:
             # normalize morph strings
             self.value = set(self.vocab.morphology.add(v) for v in value)
         else:
-            if fuzzy:
+            if self.distance:
                 # add to string store
                 self.value = set(self.vocab.strings.add(v) for v in value)
             else:
                 self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
-        self.fuzzy = fuzzy
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@@ -946,19 +914,19 @@ class _SetPredicate:
         if self.predicate == "IN":
             if value in self.value:
                 return True
-            elif self.fuzzy:
+            elif self.distance:
                 for v in self.value:
-                    if fuzz_cpp.ratio(self.vocab.strings[value],
-                                      self.vocab.strings[v]) >= self.fuzzy:
+                    if Levenshtein.distance(self.vocab.strings[value],
+                                            self.vocab.strings[v]) <= self.distance:
                         return True
                 return False
         elif self.predicate == "NOT_IN":
             if value in self.value:
                 return False
-            elif self.fuzzy:
+            elif self.distance:
                 for v in self.value:
-                    if fuzz_cpp.ratio(self.vocab.strings[value],
-                                      self.vocab.strings[v]) >= self.fuzzy:
+                    if Levenshtein.distance(self.vocab.strings[value],
+                                            self.vocab.strings[v]) <= self.distance:
                         return False
                 return True
         elif self.predicate == "IS_SUBSET":
@@ -975,7 +943,7 @@ class _SetPredicate:
 class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None):
         self.i = i
         self.attr = attr
         self.value = value
@@ -1004,7 +972,7 @@ class _ComparisonPredicate:
             return value < self.value


-def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs):
+def _get_extra_predicates(spec, extra_predicates, vocab):
     predicate_types = {
         "REGEX": _RegexPredicate,
         "IN": _SetPredicate,
@@ -1018,7 +986,11 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs):
         "<=": _ComparisonPredicate,
         ">": _ComparisonPredicate,
         "<": _ComparisonPredicate,
-        "FUZZY": _FuzzyPredicate,
+        "FUZZY1": _FuzzyPredicate,
+        "FUZZY2": _FuzzyPredicate,
+        "FUZZY3": _FuzzyPredicate,
+        "FUZZY4": _FuzzyPredicate,
+        "FUZZY5": _FuzzyPredicate,
     }
     seen_predicates = {pred.key: pred.i for pred in extra_predicates}
     output = []
@@ -1037,33 +1009,30 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs):
             attr = IDS.get(attr.upper())

         if isinstance(value, dict):
-            fuzzy_match = attr in fuzzy_attrs  # fuzzy match enabled for this attr
-            output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match,
-                                                     predicate_types,
+            output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
                                                      extra_predicates, seen_predicates))
     return output


-def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match,
-                               predicate_types, extra_predicates, seen_predicates):
+def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
+                               extra_predicates, seen_predicates, distance=None):
     output = []
     for type_, value in value_dict.items():
         type_ = type_.upper()
-        if type_ == 'FUZZY':
-            fuzzy_match = True  # explicit fuzzy match
-            if isinstance(value, dict):
-                # add predicates inside fuzzy operator
-                output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match,
-                                                         predicate_types,
-                                                         extra_predicates, seen_predicates))
-            continue
         cls = predicate_types.get(type_)
         if cls is None:
             warnings.warn(Warnings.W035.format(pattern=value_dict))
             # ignore unrecognized predicate type
             continue
-        predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
-                        fuzzy=fuzzy if fuzzy_match else 0)
+        elif cls == _FuzzyPredicate:
+            distance = int(type_[len("FUZZY"):])  # number after prefix
+            if isinstance(value, dict):
+                # add predicates inside fuzzy operator
+                output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
+                                                         extra_predicates, seen_predicates,
+                                                         distance=distance))
+                continue
+        predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, distance=distance)
        # Don't create a redundant predicates.
        # This helps with efficiency, as we're caching the results.
        if predicate.key in seen_predicates:

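The predicate rewrite above swaps rapidfuzz's normalized `fuzz` ratio (compared against a float threshold) for an absolute edit-distance check via `rapidfuzz.distance.Levenshtein`. A small sketch of the comparison the new `_FuzzyPredicate` performs, using the strings from the tests (values here are illustrative, not part of the diff):

from rapidfuzz.distance import Levenshtein

# FUZZYn matches when the edit distance is at most n.
assert Levenshtein.distance("Google", "Goggle") == 1         # FUZZY1 and up match
assert Levenshtein.distance("JavaScript", "JvvaScrpt") == 2  # FUZZY2 and up match
assert Levenshtein.distance("JavaScript", "JvvaScrpt") > 1   # ...but FUZZY1 would reject
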
@@ -26,7 +26,6 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
         "phrase_matcher_attr": None,
         "validate": False,
         "overwrite_ents": False,
-        "fuzzy": 0.0,
         "ent_id_sep": DEFAULT_ENT_ID_SEP,
         "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
     },
@@ -43,7 +42,6 @@ def make_entity_ruler(
     phrase_matcher_attr: Optional[Union[int, str]],
     validate: bool,
     overwrite_ents: bool,
-    fuzzy: float,
     ent_id_sep: str,
     scorer: Optional[Callable],
 ):
|
@ -53,7 +51,6 @@ def make_entity_ruler(
|
||||||
phrase_matcher_attr=phrase_matcher_attr,
|
phrase_matcher_attr=phrase_matcher_attr,
|
||||||
validate=validate,
|
validate=validate,
|
||||||
overwrite_ents=overwrite_ents,
|
overwrite_ents=overwrite_ents,
|
||||||
fuzzy=fuzzy,
|
|
||||||
ent_id_sep=ent_id_sep,
|
ent_id_sep=ent_id_sep,
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
@@ -87,7 +84,6 @@ class EntityRuler(Pipe):
         phrase_matcher_attr: Optional[Union[int, str]] = None,
         validate: bool = False,
         overwrite_ents: bool = False,
-        fuzzy: float = 0,
         ent_id_sep: str = DEFAULT_ENT_ID_SEP,
         patterns: Optional[List[PatternType]] = None,
         scorer: Optional[Callable] = entity_ruler_score,
@@ -122,8 +118,7 @@ class EntityRuler(Pipe):
         self.token_patterns = defaultdict(list)  # type: ignore
         self.phrase_patterns = defaultdict(list)  # type: ignore
         self._validate = validate
-        self.fuzzy = fuzzy
-        self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=self.fuzzy)
+        self.matcher = Matcher(nlp.vocab, validate=validate)
         self.phrase_matcher_attr = phrase_matcher_attr
         self.phrase_matcher = PhraseMatcher(
             nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
@@ -343,7 +338,7 @@ class EntityRuler(Pipe):
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(tuple)
-        self.matcher = Matcher(self.nlp.vocab, validate=self._validate, fuzzy=self.fuzzy)
+        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
         self.phrase_matcher = PhraseMatcher(
             self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
         )

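With the `fuzzy` setting removed from the entity ruler factory and constructor above, fuzziness is now expressed inside the token patterns themselves. A hedged sketch of what that looks like on this branch (the label and tokens are illustrative):

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
# The FUZZY1 predicate rides along in the ordinary token pattern; there is
# no ruler-level fuzzy threshold any more.
ruler.add_patterns([
    {"label": "ORG", "pattern": [{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]},
])
doc = nlp("They like Goggle Now")
print([(ent.text, ent.label_) for ent in doc.ents])  # expected: [("Goggle Now", "ORG")]
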
@@ -157,12 +157,16 @@ def validate_token_pattern(obj: list) -> List[str]:

 class TokenPatternString(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
-    FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy")
     IN: Optional[List[StrictStr]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
+    FUZZY1: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy1")
+    FUZZY2: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy2")
+    FUZZY3: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy3")
+    FUZZY4: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy4")
+    FUZZY5: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy5")

     class Config:
         extra = "forbid"
@@ -177,7 +181,6 @@ class TokenPatternString(BaseModel):

 class TokenPatternNumber(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
-    FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy")
     IN: Optional[List[StrictInt]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
@@ -189,6 +192,11 @@ class TokenPatternNumber(BaseModel):
     LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
     GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
     LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+    FUZZY1: Optional[StrictStr] = Field(None, alias="fuzzy1")
+    FUZZY2: Optional[StrictStr] = Field(None, alias="fuzzy2")
+    FUZZY3: Optional[StrictStr] = Field(None, alias="fuzzy3")
+    FUZZY4: Optional[StrictStr] = Field(None, alias="fuzzy4")
+    FUZZY5: Optional[StrictStr] = Field(None, alias="fuzzy5")

     class Config:
         extra = "forbid"

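The schema additions above mean `FUZZY1`–`FUZZY5` are accepted either as a bare string or wrapping a nested set predicate, mirroring the removed `FUZZY` field. A quick sketch with spaCy's pattern validator, assuming this branch (both calls should report no errors):

from spacy.schemas import validate_token_pattern

# Bare string form and nested set-predicate form, as allowed by the new
# Union[StrictStr, "TokenPatternString"] fields.
print(validate_token_pattern([{"ORTH": {"FUZZY1": "Google"}}]))
print(validate_token_pattern([{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]))
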
@@ -6,16 +6,15 @@ from spacy.tokens import Doc, Token, Span
 from ..doc.test_underscore import clean_underscore  # noqa: F401


-matcher_rules = {
-    "JS": [[{"ORTH": "JavaScript"}]],
-    "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
-    "Java": [[{"LOWER": "java"}]],
-}
-
 @pytest.fixture
 def matcher(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": "java"}]],
+    }
     matcher = Matcher(en_vocab)
-    for key, patterns in matcher_rules.items():
+    for key, patterns in rules.items():
         matcher.add(key, patterns)
     return matcher

@@ -119,98 +118,51 @@ def test_matcher_match_multi(matcher):
     ]


-# fuzzy matches on specific attributes
-
-def test_matcher_match_fuzz_all(en_vocab):
-    matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["ORTH", "LOWER"])
-    for key, patterns in matcher_rules.items():
-        matcher.add(key, patterns)
-
-    words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 2, 4),
-        (doc.vocab.strings["Java"], 5, 6),
-        (doc.vocab.strings["JS"], 8, 9),
-    ]
-
-def test_matcher_match_fuzz_all_lower(en_vocab):
-    matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["LOWER"])
-    for key, patterns in matcher_rules.items():
-        matcher.add(key, patterns)
-
-    words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["Java"], 5, 6),
-    ]
-
-def test_matcher_match_fuzz_some(en_vocab):
-    matcher = Matcher(en_vocab, fuzzy=85, fuzzy_attrs=["ORTH", "LOWER"])
-    for key, patterns in matcher_rules.items():
-        matcher.add(key, patterns)
-
-    words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["Java"], 5, 6),
-    ]
-
-def test_matcher_match_fuzz_none(en_vocab):
-    matcher = Matcher(en_vocab, fuzzy=90, fuzzy_attrs=["ORTH", "LOWER"])
-    for key, patterns in matcher_rules.items():
-        matcher.add(key, patterns)
-
-    words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == []
-
-
 # fuzzy matches on specific tokens

-def test_matcher_match_fuzz_pred1(en_vocab):
+def test_matcher_match_fuzzy1(en_vocab):
     rules = {
         "JS": [[{"ORTH": "JavaScript"}]],
-        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]],
         "Java": [[{"LOWER": "java"}]],
     }
-    matcher = Matcher(en_vocab, fuzzy=80)
+    matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns)

-    words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 2, 4),
     ]

-def test_matcher_match_fuzz_pred2(en_vocab):
+def test_matcher_match_fuzzy2(en_vocab):
     rules = {
         "JS": [[{"ORTH": "JavaScript"}]],
         "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
+        "Java": [[{"LOWER": {"FUZZY1": "java"}}]],
     }
-    matcher = Matcher(en_vocab, fuzzy=80)
+    matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns)

-    words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["Java"], 5, 6),
     ]

-def test_matcher_match_fuzz_preds(en_vocab):
+def test_matcher_match_fuzzy3(en_vocab):
     rules = {
-        "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
-        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
+        "JS": [[{"ORTH": {"FUZZY2": "JavaScript"}}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": {"FUZZY1": "java"}}]],
     }
-    matcher = Matcher(en_vocab, fuzzy=80)
+    matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns)

-    words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 2, 4),
@@ -218,45 +170,45 @@ def test_matcher_match_fuzz_preds(en_vocab):
         (doc.vocab.strings["JS"], 8, 9),
     ]

-def test_matcher_match_fuzz_pred_in_set(en_vocab):
+def test_matcher_match_fuzzy_set1(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}}, "OP": "+"}]]
+        "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]]
     }
-    matcher = Matcher(en_vocab, fuzzy=80)
+    matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")

-    words = ["I", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Now"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 2, 4),
     ]

-def test_matcher_match_fuzz_pred_not_in_set(en_vocab):
+def test_matcher_match_fuzzy_set2(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]],
     }
-    matcher = Matcher(en_vocab, fuzzy=80)
+    matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")

-    words = ["I", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Now"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 0, 2),
     ]

-def test_matcher_match_fuzz_pred_in_set_with_exclude(en_vocab):
+def test_matcher_match_fuzzy_set3(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]},
+        "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "No"]},
                                  "NOT_IN": ["Goggle"]},
                        "OP": "+"}]]
     }
-    matcher = Matcher(en_vocab, fuzzy=80)
+    matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")

-    words = ["I", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Now"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 3, 4),