Merge branch 'master' into website/migration

This commit is contained in:
svlandeg 2023-01-11 14:13:42 +01:00
commit ddc9f5612c
31 changed files with 831 additions and 129 deletions

View File

@ -74,7 +74,7 @@ console_scripts =
lookups = lookups =
spacy_lookups_data>=1.0.3,<1.1.0 spacy_lookups_data>=1.0.3,<1.1.0
transformers = transformers =
spacy_transformers>=1.1.2,<1.2.0 spacy_transformers>=1.1.2,<1.3.0
ray = ray =
spacy_ray>=0.1.0,<1.0.0 spacy_ray>=0.1.0,<1.0.0
cuda = cuda =

View File

@ -583,6 +583,10 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]: def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
"""Given a directory and a suffix, recursively find all files matching the suffix.
Directories or files with names beginning with a . are ignored, but hidden flags on
filesystems are not checked.
When provided with a suffix `None`, there is no suffix-based filtering."""
if not path.is_dir(): if not path.is_dir():
return [path] return [path]
paths = [path] paths = [path]

View File

@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"json": json_to_docs, "json": json_to_docs,
} }
AUTO = "auto"
# File types that can be written to stdout # File types that can be written to stdout
FILE_TYPES_STDOUT = ("json",) FILE_TYPES_STDOUT = ("json",)
@ -49,7 +51,7 @@ def convert_cli(
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"), model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
@ -70,8 +72,8 @@ def convert_cli(
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-" silent = output_dir == "-"
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
converter = _get_converter(msg, converter, input_path) converter = _get_converter(msg, converter, input_path)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
convert( convert(
input_path, input_path,
output_dir, output_dir,
@ -100,7 +102,7 @@ def convert(
model: Optional[str] = None, model: Optional[str] = None,
morphology: bool = False, morphology: bool = False,
merge_subtokens: bool = False, merge_subtokens: bool = False,
converter: str = "auto", converter: str,
ner_map: Optional[Path] = None, ner_map: Optional[Path] = None,
lang: Optional[str] = None, lang: Optional[str] = None,
concatenate: bool = False, concatenate: bool = False,
@ -212,18 +214,22 @@ def verify_cli_args(
input_locs = walk_directory(input_path, converter) input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0: if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1) msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs])) if converter not in CONVERTERS:
if converter == "auto" and len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
if converter != "auto" and converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1) msg.fail(f"Can't find converter for {converter}", exits=1)
def _get_converter(msg, converter, input_path: Path): def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir(): if input_path.is_dir():
input_path = walk_directory(input_path, converter)[0] if converter == AUTO:
if converter == "auto": input_locs = walk_directory(input_path, suffix=None)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
input_path = input_locs[0]
else:
input_path = walk_directory(input_path, suffix=converter)[0]
if converter == AUTO:
converter = input_path.suffix[1:] converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob": if converter == "ner" or converter == "iob":
with input_path.open(encoding="utf8") as file_: with input_path.open(encoding="utf8") as file_:

View File

@ -11,6 +11,7 @@ from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import is_in_jupyter from ..util import is_in_jupyter
from ..util import find_available_port
_html = {} _html = {}
@ -82,6 +83,7 @@ def serve(
manual: bool = False, manual: bool = False,
port: int = 5000, port: int = 5000,
host: str = "0.0.0.0", host: str = "0.0.0.0",
auto_select_port: bool = False,
) -> None: ) -> None:
"""Serve displaCy visualisation. """Serve displaCy visualisation.
@ -93,15 +95,20 @@ def serve(
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation. port (int): Port to serve visualisation.
host (str): Host to serve visualisation. host (str): Host to serve visualisation.
auto_select_port (bool): Automatically select a port if the specified port is in use.
DOCS: https://spacy.io/api/top-level#displacy.serve DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers USAGE: https://spacy.io/usage/visualizers
""" """
from wsgiref import simple_server from wsgiref import simple_server
port = find_available_port(port, host, auto_select_port)
if is_in_jupyter(): if is_in_jupyter():
warnings.warn(Warnings.W011) warnings.warn(Warnings.W011)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual) render(
docs, style=style, page=page, minify=minify, options=options, manual=manual
)
httpd = simple_server.make_server(host, port, app) httpd = simple_server.make_server(host, port, app)
print(f"\nUsing the '{style}' visualizer") print(f"\nUsing the '{style}' visualizer")
print(f"Serving on http://{host}:{port} ...\n") print(f"Serving on http://{host}:{port} ...\n")

View File

@ -214,6 +214,7 @@ class Warnings(metaclass=ErrorsWithCodes):
"is a Cython extension type.") "is a Cython extension type.")
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option " W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
@ -963,6 +964,10 @@ class Errors(metaclass=ErrorsWithCodes):
"knowledge base, use `InMemoryLookupKB`.") "knowledge base, use `InMemoryLookupKB`.")
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.") E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}") E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
# displaCy port errors. Keep the hints in sync with the `displacy.serve`
# signature: the auto-selection flag is named `auto_select_port`, and the port
# is shown as a keyword argument so the example can be copied verbatim.
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
         "with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
         "or use `auto_select_port=True` to pick an available port automatically.")
# Deprecated model shortcuts, only used in errors and warnings # Deprecated model shortcuts, only used in errors and warnings

View File

@ -4,6 +4,8 @@ from libc.stdint cimport int64_t
from typing import Optional from typing import Optional
from ..util import registry
cdef extern from "polyleven.c": cdef extern from "polyleven.c":
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k) int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
if k is None: if k is None:
k = -1 k = -1
return polyleven(<PyObject*>a, <PyObject*>b, k) return polyleven(<PyObject*>a, <PyObject*>b, k)
cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
if fuzzy >= 0:
max_edits = fuzzy
else:
# allow at least two edits (to allow at least one transposition) and up
# to 30% of the pattern string length
max_edits = max(2, round(0.3 * len(pattern_text)))
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
@registry.misc("spacy.levenshtein_compare.v1")
def make_levenshtein_compare():
return levenshtein_compare

View File

@ -77,3 +77,4 @@ cdef class Matcher:
cdef public object _extensions cdef public object _extensions
cdef public object _extra_predicates cdef public object _extra_predicates
cdef public object _seen_attrs cdef public object _seen_attrs
cdef public object _fuzzy_compare

View File

@ -5,7 +5,8 @@ from ..vocab import Vocab
from ..tokens import Doc, Span from ..tokens import Doc, Span
class Matcher: class Matcher:
def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ... def __init__(self, vocab: Vocab, validate: bool = ...,
fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
def __reduce__(self) -> Any: ... def __reduce__(self) -> Any: ...
def __len__(self) -> int: ... def __len__(self) -> int: ...
def __contains__(self, key: str) -> bool: ... def __contains__(self, key: str) -> bool: ...

View File

@ -1,4 +1,4 @@
# cython: infer_types=True, profile=True # cython: binding=True, infer_types=True, profile=True
from typing import List, Iterable from typing import List, Iterable
from libcpp.vector cimport vector from libcpp.vector cimport vector
@ -20,10 +20,12 @@ from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from .levenshtein import levenshtein_compare
from ..schemas import validate_token_pattern from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings from ..errors import Errors, MatchPatternError, Warnings
from ..strings import get_string_id from ..strings import get_string_id
from ..attrs import IDS from ..attrs import IDS
from ..util import registry
DEF PADDING = 5 DEF PADDING = 5
@ -36,11 +38,13 @@ cdef class Matcher:
USAGE: https://spacy.io/usage/rule-based-matching USAGE: https://spacy.io/usage/rule-based-matching
""" """
def __init__(self, vocab, validate=True): def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
"""Create the Matcher. """Create the Matcher.
vocab (Vocab): The vocabulary object, which must be shared with the vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on. validate (bool): Validate all patterns added to this matcher.
fuzzy_compare (Callable[[str, str, int], bool]): The comparison method
for the FUZZY operators.
""" """
self._extra_predicates = [] self._extra_predicates = []
self._patterns = {} self._patterns = {}
@ -51,9 +55,10 @@ cdef class Matcher:
self.vocab = vocab self.vocab = vocab
self.mem = Pool() self.mem = Pool()
self.validate = validate self.validate = validate
self._fuzzy_compare = fuzzy_compare
def __reduce__(self): def __reduce__(self):
data = (self.vocab, self._patterns, self._callbacks) data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
return (unpickle_matcher, data, None, None) return (unpickle_matcher, data, None, None)
def __len__(self): def __len__(self):
@ -128,7 +133,7 @@ cdef class Matcher:
for pattern in patterns: for pattern in patterns:
try: try:
specs = _preprocess_pattern(pattern, self.vocab, specs = _preprocess_pattern(pattern, self.vocab,
self._extensions, self._extra_predicates) self._extensions, self._extra_predicates, self._fuzzy_compare)
self.patterns.push_back(init_pattern(self.mem, key, specs)) self.patterns.push_back(init_pattern(self.mem, key, specs))
for spec in specs: for spec in specs:
for attr, _ in spec[1]: for attr, _ in spec[1]:
@ -326,8 +331,8 @@ cdef class Matcher:
return key return key
def unpickle_matcher(vocab, patterns, callbacks): def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
matcher = Matcher(vocab) matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
for key, pattern in patterns.items(): for key, pattern in patterns.items():
callback = callbacks.get(key, None) callback = callbacks.get(key, None)
matcher.add(key, pattern, on_match=callback) matcher.add(key, pattern, on_match=callback)
@ -754,7 +759,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
return id_attr.value return id_attr.value
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
"""This function interprets the pattern, converting the various bits of """This function interprets the pattern, converting the various bits of
syntactic sugar before we compile it into a struct with init_pattern. syntactic sugar before we compile it into a struct with init_pattern.
@ -781,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
ops = _get_operators(spec) ops = _get_operators(spec)
attr_values = _get_attr_values(spec, string_store) attr_values = _get_attr_values(spec, string_store)
extensions = _get_extensions(spec, string_store, extensions_table) extensions = _get_extensions(spec, string_store, extensions_table)
predicates = _get_extra_predicates(spec, extra_predicates, vocab) predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
for op in ops: for op in ops:
tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
return tokens return tokens
@ -826,16 +831,45 @@ def _get_attr_values(spec, string_store):
# These predicate helper classes are used to match the REGEX, IN, >= etc # These predicate helper classes are used to match the REGEX, IN, >= etc
# extensions to the matcher introduced in #3173. # extensions to the matcher introduced in #3173.
class _FuzzyPredicate:
operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5",
"FUZZY6", "FUZZY7", "FUZZY8", "FUZZY9")
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.value = value
self.predicate = predicate
self.is_extension = is_extension
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
fuzz = self.predicate[len("FUZZY"):] # number after prefix
self.fuzzy = int(fuzz) if fuzz else -1
self.fuzzy_compare = fuzzy_compare
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
def __call__(self, Token token):
if self.is_extension:
value = token._.get(self.attr)
else:
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
if self.value == value:
return True
return self.fuzzy_compare(value, self.value, self.fuzzy)
class _RegexPredicate: class _RegexPredicate:
operators = ("REGEX",) operators = ("REGEX",)
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i self.i = i
self.attr = attr self.attr = attr
self.value = re.compile(value) self.value = re.compile(value)
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -850,18 +884,28 @@ class _RegexPredicate:
class _SetPredicate: class _SetPredicate:
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i self.i = i
self.attr = attr self.attr = attr
self.vocab = vocab self.vocab = vocab
self.regex = regex
self.fuzzy = fuzzy
self.fuzzy_compare = fuzzy_compare
if self.attr == MORPH: if self.attr == MORPH:
# normalize morph strings # normalize morph strings
self.value = set(self.vocab.morphology.add(v) for v in value) self.value = set(self.vocab.morphology.add(v) for v in value)
else:
if self.regex:
self.value = set(re.compile(v) for v in value)
elif self.fuzzy is not None:
# add to string store
self.value = set(self.vocab.strings.add(v) for v in value)
else: else:
self.value = set(get_string_id(v) for v in value) self.value = set(get_string_id(v) for v in value)
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -889,9 +933,29 @@ class _SetPredicate:
return False return False
if self.predicate == "IN": if self.predicate == "IN":
return value in self.value if self.regex:
value = self.vocab.strings[value]
return any(bool(v.search(value)) for v in self.value)
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
elif value in self.value:
return True
else:
return False
elif self.predicate == "NOT_IN": elif self.predicate == "NOT_IN":
return value not in self.value if self.regex:
value = self.vocab.strings[value]
return not any(bool(v.search(value)) for v in self.value)
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
elif value in self.value:
return False
else:
return True
elif self.predicate == "IS_SUBSET": elif self.predicate == "IS_SUBSET":
return value <= self.value return value <= self.value
elif self.predicate == "IS_SUPERSET": elif self.predicate == "IS_SUPERSET":
@ -906,13 +970,14 @@ class _SetPredicate:
class _ComparisonPredicate: class _ComparisonPredicate:
operators = ("==", "!=", ">=", "<=", ">", "<") operators = ("==", "!=", ">=", "<=", ">", "<")
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i self.i = i
self.attr = attr self.attr = attr
self.value = value self.value = value
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -935,7 +1000,7 @@ class _ComparisonPredicate:
return value < self.value return value < self.value
def _get_extra_predicates(spec, extra_predicates, vocab): def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
predicate_types = { predicate_types = {
"REGEX": _RegexPredicate, "REGEX": _RegexPredicate,
"IN": _SetPredicate, "IN": _SetPredicate,
@ -949,6 +1014,16 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
"<=": _ComparisonPredicate, "<=": _ComparisonPredicate,
">": _ComparisonPredicate, ">": _ComparisonPredicate,
"<": _ComparisonPredicate, "<": _ComparisonPredicate,
"FUZZY": _FuzzyPredicate,
"FUZZY1": _FuzzyPredicate,
"FUZZY2": _FuzzyPredicate,
"FUZZY3": _FuzzyPredicate,
"FUZZY4": _FuzzyPredicate,
"FUZZY5": _FuzzyPredicate,
"FUZZY6": _FuzzyPredicate,
"FUZZY7": _FuzzyPredicate,
"FUZZY8": _FuzzyPredicate,
"FUZZY9": _FuzzyPredicate,
} }
seen_predicates = {pred.key: pred.i for pred in extra_predicates} seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = [] output = []
@ -966,12 +1041,40 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
attr = "ORTH" attr = "ORTH"
attr = IDS.get(attr.upper()) attr = IDS.get(attr.upper())
if isinstance(value, dict): if isinstance(value, dict):
processed = False output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
value_with_upper_keys = {k.upper(): v for k, v in value.items()} extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
for type_, cls in predicate_types.items(): return output
if type_ in value_with_upper_keys:
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
# Don't create a redundant predicates. def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
output = []
for type_, value in value_dict.items():
type_ = type_.upper()
cls = predicate_types.get(type_)
if cls is None:
warnings.warn(Warnings.W035.format(pattern=value_dict))
# ignore unrecognized predicate type
continue
elif cls == _RegexPredicate:
if isinstance(value, dict):
# add predicates inside regex operator
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates,
regex=True))
continue
elif cls == _FuzzyPredicate:
if isinstance(value, dict):
# add predicates inside fuzzy operator
fuzz = type_[len("FUZZY"):] # number after prefix
fuzzy_val = int(fuzz) if fuzz else -1
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates,
fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
continue
predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
# Don't create redundant predicates.
# This helps with efficiency, as we're caching the results. # This helps with efficiency, as we're caching the results.
if predicate.key in seen_predicates: if predicate.key in seen_predicates:
output.append(seen_predicates[predicate.key]) output.append(seen_predicates[predicate.key])
@ -979,9 +1082,6 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
extra_predicates.append(predicate) extra_predicates.append(predicate)
output.append(predicate.i) output.append(predicate.i)
seen_predicates[predicate.key] = predicate.i seen_predicates[predicate.key] = predicate.i
processed = True
if not processed:
warnings.warn(Warnings.W035.format(pattern=value))
return output return output

View File

@ -11,6 +11,7 @@ from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from ..scorer import get_ner_prf from ..scorer import get_ner_prf
@ -23,6 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
assigns=["doc.ents", "token.ent_type", "token.ent_iob"], assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
default_config={ default_config={
"phrase_matcher_attr": None, "phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False, "validate": False,
"overwrite_ents": False, "overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP, "ent_id_sep": DEFAULT_ENT_ID_SEP,
@ -39,6 +41,7 @@ def make_entity_ruler(
nlp: Language, nlp: Language,
name: str, name: str,
phrase_matcher_attr: Optional[Union[int, str]], phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool, validate: bool,
overwrite_ents: bool, overwrite_ents: bool,
ent_id_sep: str, ent_id_sep: str,
@ -48,6 +51,7 @@ def make_entity_ruler(
nlp, nlp,
name, name,
phrase_matcher_attr=phrase_matcher_attr, phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate, validate=validate,
overwrite_ents=overwrite_ents, overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep, ent_id_sep=ent_id_sep,
@ -81,6 +85,7 @@ class EntityRuler(Pipe):
name: str = "entity_ruler", name: str = "entity_ruler",
*, *,
phrase_matcher_attr: Optional[Union[int, str]] = None, phrase_matcher_attr: Optional[Union[int, str]] = None,
matcher_fuzzy_compare: Callable = levenshtein_compare,
validate: bool = False, validate: bool = False,
overwrite_ents: bool = False, overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP, ent_id_sep: str = DEFAULT_ENT_ID_SEP,
@ -99,7 +104,10 @@ class EntityRuler(Pipe):
added. Used to disable the current entity ruler while creating added. Used to disable the current entity ruler while creating
phrase patterns with the nlp object. phrase patterns with the nlp object.
phrase_matcher_attr (int / str): Token attribute to match on, passed phrase_matcher_attr (int / str): Token attribute to match on, passed
to the internal PhraseMatcher as `attr` to the internal PhraseMatcher as `attr`.
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
internal Matcher. Defaults to
spacy.matcher.levenshtein.levenshtein_compare.
validate (bool): Whether patterns should be validated, passed to validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate` Matcher and PhraseMatcher as `validate`
patterns (iterable): Optional patterns to load in. patterns (iterable): Optional patterns to load in.
@ -117,7 +125,10 @@ class EntityRuler(Pipe):
self.token_patterns = defaultdict(list) # type: ignore self.token_patterns = defaultdict(list) # type: ignore
self.phrase_patterns = defaultdict(list) # type: ignore self.phrase_patterns = defaultdict(list) # type: ignore
self._validate = validate self._validate = validate
self.matcher = Matcher(nlp.vocab, validate=validate) self.matcher_fuzzy_compare = matcher_fuzzy_compare
self.matcher = Matcher(
nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
)
self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher( self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
@ -337,7 +348,11 @@ class EntityRuler(Pipe):
self.token_patterns = defaultdict(list) self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list)
self._ent_ids = defaultdict(tuple) self._ent_ids = defaultdict(tuple)
self.matcher = Matcher(self.nlp.vocab, validate=self._validate) self.matcher = Matcher(
self.nlp.vocab,
validate=self._validate,
fuzzy_compare=self.matcher_fuzzy_compare,
)
self.phrase_matcher = PhraseMatcher( self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
) )
@ -431,7 +446,8 @@ class EntityRuler(Pipe):
self.overwrite = cfg.get("overwrite", False) self.overwrite = cfg.get("overwrite", False)
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
self.phrase_matcher = PhraseMatcher( self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr self.nlp.vocab,
attr=self.phrase_matcher_attr,
) )
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
else: else:

View File

@ -13,6 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..scorer import Scorer from ..scorer import Scorer
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from .. import util from .. import util
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
@ -28,6 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
"overwrite_ents": False, "overwrite_ents": False,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
"ent_id_sep": "__unused__", "ent_id_sep": "__unused__",
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
}, },
default_score_weights={ default_score_weights={
"ents_f": 1.0, "ents_f": 1.0,
@ -40,6 +42,7 @@ def make_entity_ruler(
nlp: Language, nlp: Language,
name: str, name: str,
phrase_matcher_attr: Optional[Union[int, str]], phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool, validate: bool,
overwrite_ents: bool, overwrite_ents: bool,
scorer: Optional[Callable], scorer: Optional[Callable],
@ -57,6 +60,7 @@ def make_entity_ruler(
annotate_ents=True, annotate_ents=True,
ents_filter=ents_filter, ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr, phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate, validate=validate,
overwrite=False, overwrite=False,
scorer=scorer, scorer=scorer,
@ -72,6 +76,7 @@ def make_entity_ruler(
"annotate_ents": False, "annotate_ents": False,
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}, "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
"phrase_matcher_attr": None, "phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False, "validate": False,
"overwrite": True, "overwrite": True,
"scorer": { "scorer": {
@ -94,6 +99,7 @@ def make_span_ruler(
annotate_ents: bool, annotate_ents: bool,
ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]], ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
phrase_matcher_attr: Optional[Union[int, str]], phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool, validate: bool,
overwrite: bool, overwrite: bool,
scorer: Optional[Callable], scorer: Optional[Callable],
@ -106,6 +112,7 @@ def make_span_ruler(
annotate_ents=annotate_ents, annotate_ents=annotate_ents,
ents_filter=ents_filter, ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr, phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate, validate=validate,
overwrite=overwrite, overwrite=overwrite,
scorer=scorer, scorer=scorer,
@ -216,6 +223,7 @@ class SpanRuler(Pipe):
[Iterable[Span], Iterable[Span]], Iterable[Span] [Iterable[Span], Iterable[Span]], Iterable[Span]
] = util.filter_chain_spans, ] = util.filter_chain_spans,
phrase_matcher_attr: Optional[Union[int, str]] = None, phrase_matcher_attr: Optional[Union[int, str]] = None,
matcher_fuzzy_compare: Callable = levenshtein_compare,
validate: bool = False, validate: bool = False,
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = partial( scorer: Optional[Callable] = partial(
@ -246,6 +254,9 @@ class SpanRuler(Pipe):
phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to
match on, passed to the internal PhraseMatcher as `attr`. Defaults match on, passed to the internal PhraseMatcher as `attr`. Defaults
to `None`. to `None`.
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
internal Matcher. Defaults to
spacy.matcher.levenshtein.levenshtein_compare.
validate (bool): Whether patterns should be validated, passed to validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`. Matcher and PhraseMatcher as `validate`.
overwrite (bool): Whether to remove any existing spans under this spans overwrite (bool): Whether to remove any existing spans under this spans
@ -266,6 +277,7 @@ class SpanRuler(Pipe):
self.spans_filter = spans_filter self.spans_filter = spans_filter
self.ents_filter = ents_filter self.ents_filter = ents_filter
self.scorer = scorer self.scorer = scorer
self.matcher_fuzzy_compare = matcher_fuzzy_compare
self._match_label_id_map: Dict[int, Dict[str, str]] = {} self._match_label_id_map: Dict[int, Dict[str, str]] = {}
self.clear() self.clear()
@ -451,7 +463,11 @@ class SpanRuler(Pipe):
DOCS: https://spacy.io/api/spanruler#clear DOCS: https://spacy.io/api/spanruler#clear
""" """
self._patterns: List[PatternType] = [] self._patterns: List[PatternType] = []
self.matcher: Matcher = Matcher(self.nlp.vocab, validate=self.validate) self.matcher: Matcher = Matcher(
self.nlp.vocab,
validate=self.validate,
fuzzy_compare=self.matcher_fuzzy_compare,
)
self.phrase_matcher: PhraseMatcher = PhraseMatcher( self.phrase_matcher: PhraseMatcher = PhraseMatcher(
self.nlp.vocab, self.nlp.vocab,
attr=self.phrase_matcher_attr, attr=self.phrase_matcher_attr,

View File

@ -74,7 +74,7 @@ subword_features = true
default_config={ default_config={
"threshold": 0.5, "threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL, "model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
}, },
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
@ -120,7 +120,7 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str,
) )
@registry.scorers("spacy.textcat_multilabel_scorer.v1") @registry.scorers("spacy.textcat_multilabel_scorer.v2")
def make_textcat_multilabel_scorer(): def make_textcat_multilabel_scorer():
return textcat_multilabel_score return textcat_multilabel_score

View File

@ -156,12 +156,22 @@ def validate_token_pattern(obj: list) -> List[str]:
class TokenPatternString(BaseModel): class TokenPatternString(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex") REGEX: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="regex")
IN: Optional[List[StrictStr]] = Field(None, alias="in") IN: Optional[List[StrictStr]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects") INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
class Config: class Config:
extra = "forbid" extra = "forbid"

View File

@ -174,7 +174,7 @@ class Scorer:
prf_score.score_set(pred_spans, gold_spans) prf_score.score_set(pred_spans, gold_spans)
if len(acc_score) > 0: if len(acc_score) > 0:
return { return {
"token_acc": acc_score.fscore, "token_acc": acc_score.precision,
"token_p": prf_score.precision, "token_p": prf_score.precision,
"token_r": prf_score.recall, "token_r": prf_score.recall,
"token_f": prf_score.fscore, "token_f": prf_score.fscore,
@ -476,14 +476,12 @@ class Scorer:
f_per_type = {label: PRFScore() for label in labels} f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels} auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels) labels = set(labels)
if labels:
for eg in examples:
labels.update(eg.predicted.cats.keys())
labels.update(eg.reference.cats.keys())
for example in examples: for example in examples:
# Through this loop, None in the gold_cats indicates missing label. # Through this loop, None in the gold_cats indicates missing label.
pred_cats = getter(example.predicted, attr) pred_cats = getter(example.predicted, attr)
pred_cats = {k: v for k, v in pred_cats.items() if k in labels}
gold_cats = getter(example.reference, attr) gold_cats = getter(example.reference, attr)
gold_cats = {k: v for k, v in gold_cats.items() if k in labels}
for label in labels: for label in labels:
pred_score = pred_cats.get(label, 0.0) pred_score = pred_cats.get(label, 0.0)

View File

@ -1,5 +1,6 @@
import pytest import pytest
from spacy.matcher import levenshtein from spacy.matcher import levenshtein
from spacy.matcher.levenshtein import levenshtein_compare
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@ -42,3 +43,31 @@ from spacy.matcher import levenshtein
) )
def test_levenshtein(dist, a, b): def test_levenshtein(dist, a, b):
assert levenshtein(a, b) == dist assert levenshtein(a, b) == dist
@pytest.mark.parametrize(
    "a,b,fuzzy,expected",
    [
        # identical strings
        ("a", "a", 1, True),
        ("a", "a", 0, True),
        ("a", "a", -1, True),
        # strings of different length
        ("a", "ab", 1, True),
        ("a", "ab", 0, False),
        ("a", "ab", -1, True),
        # same length, different content
        ("ab", "ac", 1, True),
        ("ab", "ac", -1, True),
        ("abc", "cde", 4, True),
        ("abc", "cde", -1, False),
        ("abcdef", "cdefgh", 4, True),
        ("abcdef", "cdefgh", 3, False),
        ("abcdef", "cdefgh", -1, False),  # default (2 for length 6)
        ("abcdefgh", "cdefghijk", 5, True),
        ("abcdefgh", "cdefghijk", 4, False),
        ("abcdefgh", "cdefghijk", -1, False),  # default (2)
        ("abcdefgh", "cdefghijkl", 6, True),
        ("abcdefgh", "cdefghijkl", 5, False),
        ("abcdefgh", "cdefghijkl", -1, False),  # default (2)
    ],
)
def test_levenshtein_compare(a, b, fuzzy, expected):
    """Check the boolean returned by `levenshtein_compare(a, b, fuzzy)`.

    Rows with `fuzzy == -1` exercise what the inline notes call the
    "default" threshold.
    """
    outcome = levenshtein_compare(a, b, fuzzy)
    assert outcome == expected

View File

@ -118,6 +118,155 @@ def test_matcher_match_multi(matcher):
] ]
@pytest.mark.parametrize(
    "rules,match_locs",
    [
        (
            {
                "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
            },
            [(2, 4)],
        ),
        (
            {
                "Java": [[{"LOWER": {"FUZZY": "java"}}]],
            },
            [(5, 6)],
        ),
        (
            {
                "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
                "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
                "Java": [[{"LOWER": {"FUZZY": "java"}}]],
            },
            [(2, 4), (5, 6), (8, 9)],
        ),
        # only the second pattern matches (check that predicate keys used for
        # caching don't collide)
        (
            {
                "A": [[{"ORTH": {"FUZZY": "Javascripts"}}]],
                "B": [[{"ORTH": {"FUZZY5": "Javascripts"}}]],
            },
            [(8, 9)],
        ),
    ],
)
def test_matcher_match_fuzzy(en_vocab, rules, match_locs):
    """FUZZY token patterns should match at the expected (start, end) offsets."""
    doc = Doc(
        en_vocab,
        words=["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"],
    )
    matcher = Matcher(en_vocab)
    for rule_name, rule_patterns in rules.items():
        matcher.add(rule_name, rule_patterns)
    # Only the token offsets are compared; the match id is ignored.
    found_locs = [(start, end) for _, start, end in matcher(doc)]
    assert match_locs == found_locs
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
def test_matcher_match_fuzzy_set_op_longest(en_vocab, set_op):
    """A FUZZY pattern wrapping a set operator yields exactly one LONGEST match."""
    token_spec = {"ORTH": {"FUZZY": {set_op: ["Google", "Now"]}}, "OP": "+"}
    matcher = Matcher(en_vocab)
    matcher.add("GoogleNow", [[token_spec]], greedy="LONGEST")
    doc = Doc(en_vocab, words=["They", "like", "Goggle", "Noo"])
    matches = matcher(doc)
    assert len(matches) == 1
def test_matcher_match_fuzzy_set_multiple(en_vocab):
    """Combine a FUZZY/IN predicate with a plain NOT_IN predicate on one token."""
    token_spec = {
        "ORTH": {"FUZZY": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
        "OP": "+",
    }
    matcher = Matcher(en_vocab)
    matcher.add("GoogleNow", [[token_spec]], greedy="LONGEST")
    doc = Doc(matcher.vocab, words=["They", "like", "Goggle", "Noo"])
    # "Goggle" is excluded by NOT_IN, so only the final token is expected.
    expected = [(doc.vocab.strings["GoogleNow"], 3, 4)]
    assert matcher(doc) == expected
@pytest.mark.parametrize("fuzzyn", range(1, 10))
def test_matcher_match_fuzzyn_all_insertions(en_vocab, fuzzyn):
    """FUZZYn should match the words that have 0..n appended characters."""
    base = "GoogleNow"
    matcher = Matcher(en_vocab)
    matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": base}}]])
    # words with increasing edit distance
    words = [base + "a" * extra for extra in range(0, 10)]
    doc = Doc(en_vocab, words)
    n_matches = len(matcher(doc))
    assert n_matches == fuzzyn + 1
@pytest.mark.parametrize("fuzzyn", range(1, 6))
def test_matcher_match_fuzzyn_various_edits(en_vocab, fuzzyn):
    """FUZZYn should match n + 1 of the progressively more distorted words."""
    pattern = [{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]
    matcher = Matcher(en_vocab)
    matcher.add("GoogleNow", [pattern])
    # words with increasing edit distance of different edit types
    doc = Doc(
        en_vocab,
        [
            "GoogleNow",
            "GoogleNuw",
            "GoogleNuew",
            "GoogleNoweee",
            "GiggleNuw3",
            "gouggle5New",
        ],
    )
    n_matches = len(matcher(doc))
    assert n_matches == fuzzyn + 1
@pytest.mark.parametrize("greedy", ["FIRST", "LONGEST"])
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
def test_matcher_match_fuzzyn_set_op_longest(en_vocab, greedy, set_op):
    """FUZZY2 wrapping IN / NOT_IN gives a single span under both greedy modes."""
    token_spec = {"ORTH": {"FUZZY2": {set_op: ["Google", "Now"]}}, "OP": "+"}
    matcher = Matcher(en_vocab)
    matcher.add("GoogleNow", [[token_spec]], greedy=greedy)
    doc = Doc(matcher.vocab, words=["They", "like", "Goggle", "Noo"])
    spans = matcher(doc, as_spans=True)
    assert len(spans) == 1
    # IN selects the two misspelled tokens, NOT_IN the two tokens before them.
    expected_text = "Goggle Noo" if set_op == "IN" else "They like"
    assert spans[0].text == expected_text
def test_matcher_match_fuzzyn_set_multiple(en_vocab):
    """Combine a FUZZY1/IN predicate with a plain NOT_IN predicate on one token."""
    token_spec = {
        "ORTH": {"FUZZY1": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
        "OP": "+",
    }
    matcher = Matcher(en_vocab)
    matcher.add("GoogleNow", [[token_spec]], greedy="LONGEST")
    doc = Doc(matcher.vocab, words=["They", "like", "Goggle", "Noo"])
    # "Goggle" is excluded by NOT_IN, so only the final token is expected.
    expected = [(doc.vocab.strings["GoogleNow"], 3, 4)]
    assert matcher(doc) == expected
def test_matcher_empty_dict(en_vocab): def test_matcher_empty_dict(en_vocab):
"""Test matcher allows empty token specs, meaning match on any token.""" """Test matcher allows empty token specs, meaning match on any token."""
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
@ -437,6 +586,30 @@ def test_matcher_regex(en_vocab):
assert len(matches) == 0 assert len(matches) == 0
def test_matcher_regex_set_in(en_vocab):
    """REGEX wrapping IN: count the matches for two small documents."""
    matcher = Matcher(en_vocab)
    matcher.add("A_OR_AN", [[{"ORTH": {"REGEX": {"IN": [r"(?:a)", r"(?:an)"]}}}]])
    # (words, expected number of matches)
    cases = [
        (["an", "a", "hi"], 2),
        (["bye"], 0),
    ]
    for words, n_expected in cases:
        matches = matcher(Doc(en_vocab, words=words))
        assert len(matches) == n_expected
def test_matcher_regex_set_not_in(en_vocab):
    """REGEX wrapping NOT_IN: count the matches for two small documents."""
    matcher = Matcher(en_vocab)
    matcher.add(
        "A_OR_AN", [[{"ORTH": {"REGEX": {"NOT_IN": [r"(?:a)", r"(?:an)"]}}}]]
    )
    # (words, expected number of matches)
    cases = [
        (["an", "a", "hi"], 1),
        (["bye"], 1),
    ]
    for words, n_expected in cases:
        matches = matcher(Doc(en_vocab, words=words))
        assert len(matches) == n_expected
def test_matcher_regex_shape(en_vocab): def test_matcher_regex_shape(en_vocab):
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}] pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]

View File

@ -382,6 +382,43 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
assert doc.ents[0].label_ == "FOOBAR" assert doc.ents[0].label_ == "FOOBAR"
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
    """A FUZZY pattern added to the ruler should tag the misspelled "helloo"."""
    # NOTE(review): this body is the same as test_entity_ruler_fuzzy; confirm
    # whether a different code path was meant to be exercised here.
    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
    ruler.add_patterns(
        [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
    )
    ents = nlp("helloo").ents
    assert len(ents) == 1
    assert ents[0].label_ == "HELLO"
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
    """With the default config, a FUZZY "hello" pattern matches "helloo"."""
    fuzzy_hello = {"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}
    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
    ruler.add_patterns([fuzzy_hello])
    doc = nlp("helloo")
    labels = [ent.label_ for ent in doc.ents]
    assert len(doc.ents) == 1
    assert labels[0] == "HELLO"
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):
    """A custom `matcher_fuzzy_compare` that always says "no" disables fuzzy hits."""

    def never_match(x, y, z):
        # Stand-in comparison callable: rejects every candidate.
        return False

    @registry.misc("test_fuzzy_compare_disabled")
    def make_test_fuzzy_compare_disabled():
        return never_match

    ruler_config = {"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}}
    ruler = nlp.add_pipe(
        entity_ruler_factory,
        name="entity_ruler",
        config=ruler_config,
    )
    ruler.add_patterns(
        [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
    )
    doc = nlp("helloo")
    assert len(doc.ents) == 0
@pytest.mark.parametrize("n_process", [1, 2]) @pytest.mark.parametrize("n_process", [1, 2])
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) @pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory): def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):

View File

@ -898,7 +898,11 @@ def test_textcat_multi_threshold():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")] "component_name,scorer",
[
("textcat", "spacy.textcat_scorer.v1"),
("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
],
) )
def test_textcat_legacy_scorers(component_name, scorer): def test_textcat_legacy_scorers(component_name, scorer):
"""Check that legacy scorers are registered and produce the expected score """Check that legacy scorers are registered and produce the expected score

View File

@ -4,6 +4,7 @@ from collections import Counter
from typing import Tuple, List, Dict, Any from typing import Tuple, List, Dict, Any
import pkg_resources import pkg_resources
import time import time
from pathlib import Path
import spacy import spacy
import numpy import numpy
@ -15,7 +16,7 @@ from thinc.api import Config, ConfigValidationError
from spacy import about from spacy import about
from spacy.cli import info from spacy.cli import info
from spacy.cli._util import is_subpath_of, load_project_config from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands from spacy.cli._util import validate_project_commands
@ -1185,3 +1186,26 @@ def test_upload_download_local_file():
download_file(remote_file, local_file) download_file(remote_file, local_file)
with local_file.open(mode="r") as file_: with local_file.open(mode="r") as file_:
assert file_.read() == content assert file_.read() == content
def test_walk_directory():
    """`walk_directory` should count files per suffix in a flat temp directory."""
    filenames = [
        "data1.iob",
        "data2.iob",
        "data3.json",
        "data4.conll",
        "data5.conll",
        "data6.conll",
        "data7.txt",
    ]
    # suffix filter -> number of files expected back
    expected_by_suffix = {"json": 1, "iob": 2, "conll": 3, "pdf": 0}
    with make_tempdir() as d:
        for filename in filenames:
            Path(d / filename).touch()

        # No suffix (implicit or explicit None) returns every file.
        assert len(walk_directory(d)) == 7
        assert len(walk_directory(d, suffix=None)) == 7
        for suffix, n_expected in expected_by_suffix.items():
            assert len(walk_directory(d, suffix=suffix)) == n_expected

View File

@ -0,0 +1,33 @@
import os
from pathlib import Path
from typer.testing import CliRunner
from spacy.cli._util import app
from .util import make_tempdir
def test_convert_auto():
    """`convert` on a directory of .iob files writes one .spacy file per input."""
    inputs = ["data1.iob", "data2.iob", "data3.iob"]
    outputs = ["data1.spacy", "data2.spacy", "data3.spacy"]
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for filename in inputs:
            Path(d_in / filename).touch()

        # ensure that "automatic" suffix detection works
        cli_args = ["convert", str(d_in), str(d_out)]
        result = CliRunner().invoke(app, cli_args)
        assert "Generated output file" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 3
        for filename in outputs:
            assert filename in out_files
def test_convert_auto_conflict():
    """`convert` on a directory with mixed file types reports it and writes nothing."""
    inputs = ["data1.iob", "data2.iob", "data3.json"]
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for filename in inputs:
            Path(d_in / filename).touch()

        # ensure that "automatic" suffix detection warns when there are different file types
        cli_args = ["convert", str(d_in), str(d_out)]
        result = CliRunner().invoke(app, cli_args)
        assert "All input files must be same type" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 0

View File

@ -3,6 +3,7 @@ import logging
from unittest import mock from unittest import mock
import pytest import pytest
from spacy.language import Language from spacy.language import Language
from spacy.scorer import Scorer
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.training import Example from spacy.training import Example
@ -126,6 +127,112 @@ def test_evaluate_no_pipe(nlp):
nlp.evaluate([Example.from_dict(doc, annots)]) nlp.evaluate([Example.from_dict(doc, annots)])
def test_evaluate_textcat_multilabel(en_vocab):
    """Test that evaluate works with a multilabel textcat pipe."""
    nlp = Language(en_vocab)
    component = nlp.add_pipe("textcat_multilabel")
    for name in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        component.add_label(name)
    nlp.initialize()

    gold = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}
    example = Example.from_dict(nlp.make_doc("hello world"), gold)
    scores = nlp.evaluate([example])

    labels = nlp.get_pipe("textcat_multilabel").labels
    # Every label of the component gets a per-type score ...
    for label in labels:
        assert scores["cats_f_per_type"].get(label) is not None
    # ... and gold categories unknown to the component get none.
    for key in example.reference.cats.keys():
        if key in labels:
            continue
        assert scores["cats_f_per_type"].get(key) is None
def test_evaluate_multiple_textcat_final(en_vocab):
    """Test that evaluate evaluates the final textcat component in a pipeline
    with more than one textcat or textcat_multilabel."""
    nlp = Language(en_vocab)
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    # Each category is listed exactly once: repeated keys in a dict literal are
    # silently collapsed, so duplicates would only obscure the gold annotation.
    annots = {
        "cats": {
            "POSITIVE": 1.0,
            "NEGATIVE": 0.0,
            "FEATURE": 1.0,
            "QUESTION": 1.0,
        }
    }
    doc = nlp.make_doc("hello world")
    example = Example.from_dict(doc, annots)
    scores = nlp.evaluate([example])
    # get the labels from the final pipe
    labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
    for label in labels:
        assert scores["cats_f_per_type"].get(label) is not None
    # gold categories that the final pipe does not know must not be scored
    for key in example.reference.cats.keys():
        if key not in labels:
            assert scores["cats_f_per_type"].get(key) is None
def test_evaluate_multiple_textcat_separate(en_vocab):
    """Test that evaluate can evaluate multiple textcat components separately
    with custom scorers."""

    def custom_textcat_score(examples, **kwargs):
        # Delegate to the stock cats scorer and prefix every key so that the
        # results do not share names with the textcat_multilabel scores.
        scores = Scorer.score_cats(
            examples,
            "cats",
            multi_label=False,
            **kwargs,
        )
        return {f"custom_{k}": v for k, v in scores.items()}

    @spacy.registry.scorers("test_custom_textcat_scorer")
    def make_custom_textcat_scorer():
        return custom_textcat_score

    nlp = Language(en_vocab)
    textcat = nlp.add_pipe(
        "textcat",
        config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
    )
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    # Each category is listed exactly once: repeated keys in a dict literal are
    # silently collapsed, so duplicates would only obscure the gold annotation.
    annots = {
        "cats": {
            "POSITIVE": 1.0,
            "NEGATIVE": 0.0,
            "FEATURE": 1.0,
            "QUESTION": 1.0,
        }
    }
    doc = nlp.make_doc("hello world")
    example = Example.from_dict(doc, annots)
    scores = nlp.evaluate([example])
    # check custom scores for the textcat pipe
    assert "custom_cats_f_per_type" in scores
    labels = nlp.get_pipe("textcat").labels
    assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
    # check default scores for the textcat_multilabel pipe
    assert "cats_f_per_type" in scores
    labels = nlp.get_pipe("textcat_multilabel").labels
    assert set(scores["cats_f_per_type"].keys()) == set(labels)
def vector_modification_pipe(doc): def vector_modification_pipe(doc):
doc.vector += 1 doc.vector += 1
return doc return doc

View File

@ -8,7 +8,7 @@ from spacy import prefer_gpu, require_gpu, require_cpu
from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList, import_file from spacy.util import dot_to_object, SimpleFrozenList, import_file
from spacy.util import to_ternary_int from spacy.util import to_ternary_int, find_available_port
from thinc.api import Config, Optimizer, ConfigValidationError from thinc.api import Config, Optimizer, ConfigValidationError
from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
@ -434,3 +434,16 @@ def test_to_ternary_int():
assert to_ternary_int(-10) == -1 assert to_ternary_int(-10) == -1
assert to_ternary_int("string") == -1 assert to_ternary_int("string") == -1
assert to_ternary_int([0, "string"]) == -1 assert to_ternary_int([0, "string"]) == -1
def test_find_available_port():
    """`find_available_port` returns a free port as-is and, with `auto_select`,
    warns and moves to the next port when the requested one is occupied."""
    from wsgiref.simple_server import make_server, demo_app

    host, port = "0.0.0.0", 5000
    assert find_available_port(port, host) == port, "Port 5000 isn't free"

    # Occupy the port with a throwaway WSGI server for the busy-port check.
    with make_server(host, port, demo_app):
        with pytest.warns(UserWarning, match="already in use"):
            found_port = find_available_port(port, host, auto_select=True)
        assert found_port == port + 1, "Didn't find next port"

View File

@ -110,7 +110,7 @@ def test_tokenization(sented_doc):
) )
example.predicted[1].is_sent_start = False example.predicted[1].is_sent_start = False
scores = scorer.score([example]) scores = scorer.score([example])
assert scores["token_acc"] == approx(0.66666666) assert scores["token_acc"] == 0.5
assert scores["token_p"] == 0.5 assert scores["token_p"] == 0.5
assert scores["token_r"] == approx(0.33333333) assert scores["token_r"] == approx(0.33333333)
assert scores["token_f"] == 0.4 assert scores["token_f"] == 0.4

View File

@ -31,6 +31,7 @@ import shlex
import inspect import inspect
import pkgutil import pkgutil
import logging import logging
import socket
try: try:
import cupy.random import cupy.random
@ -1736,3 +1737,50 @@ def all_equal(iterable):
(or if the input is an empty sequence), False otherwise.""" (or if the input is an empty sequence), False otherwise."""
g = itertools.groupby(iterable) g = itertools.groupby(iterable)
return next(g, True) and not next(g, False) return next(g, True) and not next(g, False)
def _is_port_in_use(port: int, host: str = "localhost") -> bool:
    """Check if 'host:port' is in use by trying to bind a TCP socket to it.

    port (int): the port to check
    host (str): the host to check (default "localhost")
    RETURNS (bool): True if binding fails with a socket error, False if the
        bind succeeds.
    """
    # The context manager closes the probe socket on every exit path.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        try:
            probe.bind((host, port))
        except OSError:  # socket.error is an alias of OSError
            return True
    return False
def find_available_port(start: int, host: str, auto_select: bool = False) -> int:
    """Given a starting port and a host, handle finding a port.

    If `auto_select` is False, a busy port will raise an error.

    If `auto_select` is True, the next free higher port will be used.

    start (int): the port to start looking from
    host (str): the host to find a port on
    auto_select (bool): whether to automatically select a new port if the given port is busy (default False)
    RETURNS (int): The port to use.
    RAISES (ValueError): if `start` is busy and `auto_select` is False, or if
        no port from `start` up to 65535 can be bound.
    """
    max_port = 65535  # highest port number that is tried
    if not _is_port_in_use(start, host):
        return start
    if not auto_select:
        raise ValueError(Errors.E1050.format(port=start))
    # `start` is already known to be busy, so scan the remaining ports and
    # probe each candidate exactly once.
    for port in range(start + 1, max_port + 1):
        if not _is_port_in_use(port, host):
            # if we get here, the port changed
            warnings.warn(
                Warnings.W124.format(host=host, port=start, serve_port=port)
            )
            return port
    raise ValueError(Errors.E1049.format(host=host))

View File

@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ | | `before_update` <Tag variant="new">3.5</Tag> | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |

View File

@ -56,8 +56,9 @@ how the component should be configured. You can override its settings via the
> ``` > ```
| Setting | Description | | Setting | Description |
| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | | `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
@ -86,15 +87,17 @@ be a token pattern (list) or a phrase pattern (string). For example:
> ``` > ```
| Name | Description | | Name | Description |
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | | `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
| `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | | `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | | `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | | `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | | `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
## EntityRuler.initialize {id="initialize",tag="method",version="3"} ## EntityRuler.initialize {id="initialize",tag="method",version="3"}

View File

@ -87,7 +87,10 @@ it compares to another value.
> ``` > ```
| Attribute | Description | | Attribute | Description |
| -------------------------- | -------------------------------------------------------------------------------------------------------- | | -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `REGEX` | Attribute value matches the regular expression at any position in the string. ~~Any~~ |
| `FUZZY` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, -1)`. The default method allows a Levenshtein edit distance of up to 30% of the pattern string length, with a minimum allowance of 2 edits. ~~Any~~ |
| `FUZZY1`, `FUZZY2`, ... `FUZZY9` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, N)`. The default method allows a Levenshtein edit distance of at most N (1-9). ~~Any~~ |
| `IN` | Attribute value is a member of a list. ~~Any~~ | | `IN` | Attribute value is a member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ a member of a list. ~~Any~~ | | `NOT_IN` | Attribute value is _not_ a member of a list. ~~Any~~ |
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ | | `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
@ -95,6 +98,9 @@ it compares to another value.
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attribute) has a non-empty intersection with a list. ~~Any~~ | | `INTERSECTS` | Attribute value (for `MORPH` or custom list attribute) has a non-empty intersection with a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
As of spaCy v3.5, `REGEX` and `FUZZY` can be used in combination with `IN` and
`NOT_IN`.
## Matcher.\_\_init\_\_ {id="init",tag="method"} ## Matcher.\_\_init\_\_ {id="init",tag="method"}
Create the rule-based `Matcher`. If `validate=True` is set, all patterns added Create the rule-based `Matcher`. If `validate=True` is set, all patterns added
@ -110,9 +116,10 @@ string where an integer is expected) or unexpected property names.
> ``` > ```
| Name | Description | | Name | Description |
| ---------- | ----------------------------------------------------------------------------------------------------- | | --------------- | ----------------------------------------------------------------------------------------------------- |
| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ | | `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
| `validate` | Validate all patterns added to this matcher. ~~bool~~ | | `validate` | Validate all patterns added to this matcher. ~~bool~~ |
| `fuzzy_compare` | The comparison method used for the `FUZZY` operators. ~~Callable[[str, str, int], bool]~~ |
## Matcher.\_\_call\_\_ {id="call",tag="method"} ## Matcher.\_\_call\_\_ {id="call",tag="method"}

View File

@ -76,7 +76,7 @@ core pipeline components, the individual score names start with the `Token` or
Scores the tokenization: Scores the tokenization:
- `token_acc`: number of correct tokens / number of gold tokens - `token_acc`: number of correct tokens / number of predicted tokens
- `token_p`, `token_r`, `token_f`: precision, recall and F-score for token - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
character spans character spans

View File

@ -47,13 +47,14 @@ how the component should be configured. You can override its settings via the
> ``` > ```
| Setting | Description | | Setting | Description |
| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ | | `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ |
| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ | | `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ |
| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ | | `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ |
| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ | | `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ |
| `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ | | `phrase_matcher_attr` | Token attribute to match on, passed to the internal `PhraseMatcher` as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | | `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate` | Whether patterns should be validated, passed to `Matcher` and `PhraseMatcher` as `validate`. Defaults to `False`. ~~bool~~ |
| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans_key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ | | `overwrite` | Whether to remove any existing spans under `Doc.spans[spans_key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | | `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
@ -80,7 +81,7 @@ token pattern (list) or a phrase pattern (string). For example:
> ``` > ```
| Name | Description | | Name | Description |
| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | | `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current span ruler while creating phrase patterns with the nlp object. ~~str~~ | | `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current span ruler while creating phrase patterns with the nlp object. ~~str~~ |
| _keyword-only_ | | | _keyword-only_ | |
@ -89,6 +90,7 @@ token pattern (list) or a phrase pattern (string). For example:
| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ | | `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ |
| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ | | `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ |
| `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ | | `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | | `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans_key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ | | `overwrite` | Whether to remove any existing spans under `Doc.spans[spans_key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | | `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |

View File

@ -237,7 +237,7 @@ browser. Will run a simple web server.
> ``` > ```
| Name | Description | | Name | Description |
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | | `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ | | `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
@ -246,6 +246,7 @@ browser. Will run a simple web server.
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | | `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ | | `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ | | `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
| `auto_select_port` | If `True`, automatically switch to a different port if the specified port is already in use. Defaults to `False`. ~~bool~~ |
### displacy.render {id="displacy.render",tag="method",version="2"} ### displacy.render {id="displacy.render",tag="method",version="2"}

View File

@ -365,6 +365,46 @@ else:
</Accordion> </Accordion>
#### Fuzzy matching {id="fuzzy",version="3.5"}
Fuzzy matching allows you to match tokens with alternate spellings, typos, etc.
without specifying every possible variant.
```python
# Matches "favourite", "favorites", "gavorite", "theatre", "theatr", ...
pattern = [{"TEXT": {"FUZZY": "favorite"}},
{"TEXT": {"FUZZY": "theater"}}]
```
The `FUZZY` attribute allows fuzzy matches for any attribute string value,
including custom attributes. Just like `REGEX`, it always needs to be applied to
an attribute like `TEXT` or `LOWER`. By default, `FUZZY` allows a Levenshtein
edit distance of up to 30% of the pattern string length, with a minimum
allowance of 2 edits. Using
the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
allowed edit distance directly.
```python
# Match lowercase with fuzzy matching (allows 3 edits: 30% of the 10-character pattern)
pattern = [{"LOWER": {"FUZZY": "definitely"}}]
# Match custom attribute values with fuzzy matching (allows 3 edits: 30% of the 10-character pattern)
pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
# Match with exact Levenshtein edit distance limits (allows 3 edits)
pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
```
#### Regex and fuzzy matching with lists {id="regex-fuzzy-lists",version="3.5"}
Starting in spaCy v3.5, both `REGEX` and `FUZZY` can be combined with the
attributes `IN` and `NOT_IN`:
```python
pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}]
```
--- ---
#### Operators and quantifiers {id="quantifiers"} #### Operators and quantifiers {id="quantifiers"}