diff --git a/requirements.txt b/requirements.txt index 0440835f2..5bc1c8684 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our libraries -spacy-legacy>=3.0.10,<3.1.0 +spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/setup.cfg b/setup.cfg index cf6e6f84b..cee8c0c33 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ classifiers = Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 Topic :: Scientific/Engineering project_urls = Release notes = https://github.com/explosion/spaCy/releases @@ -41,7 +42,7 @@ setup_requires = thinc>=8.1.0,<8.2.0 install_requires = # Our libraries - spacy-legacy>=3.0.10,<3.1.0 + spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c46abffe5..0f4e9f599 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -583,6 +583,10 @@ def setup_gpu(use_gpu: int, silent=None) -> None: def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]: + """Given a directory and a suffix, recursively find all files matching the suffix. + Directories or files with names beginning with a . are ignored, but hidden flags on + filesystems are not checked. 
+ When provided with a suffix `None`, there is no suffix-based filtering.""" if not path.is_dir(): return [path] paths = [path] diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py index 9d170bc95..f0df4e757 100644 --- a/spacy/cli/apply.py +++ b/spacy/cli/apply.py @@ -53,9 +53,7 @@ def _stream_jsonl(path: Path, field: str) -> Iterable[str]: """ for entry in srsly.read_jsonl(path): if field not in entry: - msg.fail( - f"{path} does not contain the required '{field}' field.", exits=1 - ) + msg.fail(f"{path} does not contain the required '{field}' field.", exits=1) else: yield entry[field] @@ -118,8 +116,10 @@ def apply( paths = walk_directory(data_path) if len(paths) == 0: docbin.to_disk(output_file) - msg.warn("Did not find data to process," - f" {data_path} seems to be an empty directory.") + msg.warn( + "Did not find data to process," + f" {data_path} seems to be an empty directory." + ) return nlp = load_model(model) msg.good(f"Loaded model {model}") diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 7f365ae2c..68d454b3e 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = { "json": json_to_docs, } +AUTO = "auto" + # File types that can be written to stdout FILE_TYPES_STDOUT = ("json",) @@ -49,7 +51,7 @@ def convert_cli( model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"), morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), - converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), + converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", 
exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), @@ -70,8 +72,8 @@ def convert_cli( output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir silent = output_dir == "-" msg = Printer(no_print=silent) - verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map) converter = _get_converter(msg, converter, input_path) + verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map) convert( input_path, output_dir, @@ -100,7 +102,7 @@ def convert( model: Optional[str] = None, morphology: bool = False, merge_subtokens: bool = False, - converter: str = "auto", + converter: str, ner_map: Optional[Path] = None, lang: Optional[str] = None, concatenate: bool = False, @@ -212,18 +214,22 @@ def verify_cli_args( input_locs = walk_directory(input_path, converter) if len(input_locs) == 0: msg.fail("No input files in directory", input_path, exits=1) - file_types = list(set([loc.suffix[1:] for loc in input_locs])) - if converter == "auto" and len(file_types) >= 2: - file_types_str = ",".join(file_types) - msg.fail("All input files must be same type", file_types_str, exits=1) - if converter != "auto" and converter not in CONVERTERS: + if converter not in CONVERTERS: msg.fail(f"Can't find converter for {converter}", exits=1) def _get_converter(msg, converter, input_path: Path): if input_path.is_dir(): - input_path = walk_directory(input_path, converter)[0] - if converter == "auto": + if converter == AUTO: + input_locs = walk_directory(input_path, suffix=None) + file_types = list(set([loc.suffix[1:] for loc in input_locs])) + if len(file_types) >= 2: + file_types_str = ",".join(file_types) + msg.fail("All input files must be same type", file_types_str, exits=1) + input_path = input_locs[0] + else: + input_path = walk_directory(input_path, suffix=converter)[0] + if converter 
== AUTO: converter = input_path.suffix[1:] if converter == "ner" or converter == "iob": with input_path.open(encoding="utf8") as file_: diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index bc32001d7..a3cfd96dd 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -11,6 +11,7 @@ from .render import DependencyRenderer, EntityRenderer, SpanRenderer from ..tokens import Doc, Span from ..errors import Errors, Warnings from ..util import is_in_jupyter +from ..util import find_available_port _html = {} @@ -36,7 +37,7 @@ def render( jupyter (bool): Override Jupyter auto-detection. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. - RETURNS (str): Rendered HTML markup. + RETURNS (str): Rendered SVG or HTML markup. DOCS: https://spacy.io/api/top-level#displacy.render USAGE: https://spacy.io/usage/visualizers @@ -82,6 +83,7 @@ def serve( manual: bool = False, port: int = 5000, host: str = "0.0.0.0", + auto_select_port: bool = False, ) -> None: """Serve displaCy visualisation. @@ -93,15 +95,20 @@ def serve( manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. port (int): Port to serve visualisation. host (str): Host to serve visualisation. + auto_select_port (bool): Automatically select a port if the specified port is in use. 
DOCS: https://spacy.io/api/top-level#displacy.serve USAGE: https://spacy.io/usage/visualizers """ from wsgiref import simple_server + port = find_available_port(port, host, auto_select_port) + if is_in_jupyter(): warnings.warn(Warnings.W011) - render(docs, style=style, page=page, minify=minify, options=options, manual=manual) + render( + docs, style=style, page=page, minify=minify, options=options, manual=manual + ) httpd = simple_server.make_server(host, port, app) print(f"\nUsing the '{style}' visualizer") print(f"Serving on http://{host}:{port} ...\n") diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 50dc3466c..f74222dc2 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -94,7 +94,7 @@ class SpanRenderer: parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (str): Rendered HTML markup. + RETURNS (str): Rendered SVG or HTML markup. """ rendered = [] for i, p in enumerate(parsed): @@ -510,7 +510,7 @@ class EntityRenderer: parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (str): Rendered HTML markup. + RETURNS (str): Rendered SVG or HTML markup. """ rendered = [] for i, p in enumerate(parsed): diff --git a/spacy/errors.py b/spacy/errors.py index b5fe0cbf6..e76d6a653 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -214,7 +214,8 @@ class Warnings(metaclass=ErrorsWithCodes): "is a Cython extension type.") W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option " "`enabled` ({enabled}). 
Be aware that this might affect other components in your pipeline.") - W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same " + W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") + W125 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same " "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.") @@ -964,10 +965,14 @@ class Errors(metaclass=ErrorsWithCodes): E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default " "knowledge base, use `InMemoryLookupKB`.") E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.") - E1048 = ("Invalid rich group config '{label}'.") - E1049 = ("Length > 63 in rich group config '{label}'.") - E1050 = ("Rich group config {label} specifies lengths that are not in ascending order.") - E1051 = ("Error splitting UTF-8 byte string into separate characters.") + E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}") + E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " + "with `displacy.serve(doc, port)`") + E1050 = ("Port {port} is already in use. 
Please specify an available port with `displacy.serve(doc, port)` " + "or use `auto_select_port=True` to pick an available port automatically.") + E1051 = ("Invalid rich group config '{label}'.") + E1052 = ("Length > 63 in rich group config '{label}'.") + E1053 = ("Rich group config {label} specifies lengths that are not in ascending order.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/matcher/levenshtein.pyx b/spacy/matcher/levenshtein.pyx index 8463d913d..0e8cd26da 100644 --- a/spacy/matcher/levenshtein.pyx +++ b/spacy/matcher/levenshtein.pyx @@ -4,6 +4,8 @@ from libc.stdint cimport int64_t from typing import Optional +from ..util import registry + cdef extern from "polyleven.c": int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k) @@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None): if k is None: k = -1 return polyleven(a, b, k) + + +cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1): + if fuzzy >= 0: + max_edits = fuzzy + else: + # allow at least two edits (to allow at least one transposition) and up + # to 30% of the pattern string length + max_edits = max(2, round(0.3 * len(pattern_text))) + return levenshtein(input_text, pattern_text, max_edits) <= max_edits + + +@registry.misc("spacy.levenshtein_compare.v1") +def make_levenshtein_compare(): + return levenshtein_compare diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 455f978cc..51854d562 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -77,3 +77,4 @@ cdef class Matcher: cdef public object _extensions cdef public object _extra_predicates cdef public object _seen_attrs + cdef public object _fuzzy_compare diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index 390629ff8..77ea7b7a6 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -5,7 +5,8 @@ from ..vocab import Vocab from ..tokens import Doc, Span class Matcher: - 
def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ... + def __init__(self, vocab: Vocab, validate: bool = ..., + fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ... def __reduce__(self) -> Any: ... def __len__(self) -> int: ... def __contains__(self, key: str) -> bool: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index c4a057ca0..ea1b4b66b 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True +# cython: binding=True, infer_types=True, profile=True from typing import List, Iterable from libcpp.vector cimport vector @@ -20,10 +20,12 @@ from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB +from .levenshtein import levenshtein_compare from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings from ..strings import get_string_id from ..attrs import IDS +from ..util import registry DEF PADDING = 5 @@ -36,11 +38,13 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=True): + def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare): """Create the Matcher. vocab (Vocab): The vocabulary object, which must be shared with the - documents the matcher will operate on. + validate (bool): Validate all patterns added to this matcher. + fuzzy_compare (Callable[[str, str, int], bool]): The comparison method + for the FUZZY operators. 
""" self._extra_predicates = [] self._patterns = {} @@ -51,9 +55,10 @@ cdef class Matcher: self.vocab = vocab self.mem = Pool() self.validate = validate + self._fuzzy_compare = fuzzy_compare def __reduce__(self): - data = (self.vocab, self._patterns, self._callbacks) + data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare) return (unpickle_matcher, data, None, None) def __len__(self): @@ -128,7 +133,7 @@ cdef class Matcher: for pattern in patterns: try: specs = _preprocess_pattern(pattern, self.vocab, - self._extensions, self._extra_predicates) + self._extensions, self._extra_predicates, self._fuzzy_compare) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: for attr, _ in spec[1]: @@ -326,8 +331,8 @@ cdef class Matcher: return key -def unpickle_matcher(vocab, patterns, callbacks): - matcher = Matcher(vocab) +def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare): + matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare) for key, pattern in patterns.items(): callback = callbacks.get(key, None) matcher.add(key, pattern, on_match=callback) @@ -754,7 +759,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. 
@@ -781,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates, vocab) + predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -826,16 +831,45 @@ def _get_attr_values(spec, string_store): # These predicate helper classes are used to match the REGEX, IN, >= etc # extensions to the matcher introduced in #3173. +class _FuzzyPredicate: + operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5", + "FUZZY6", "FUZZY7", "FUZZY8", "FUZZY9") + + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, + regex=False, fuzzy=None, fuzzy_compare=None): + self.i = i + self.attr = attr + self.value = value + self.predicate = predicate + self.is_extension = is_extension + if self.predicate not in self.operators: + raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) + fuzz = self.predicate[len("FUZZY"):] # number after prefix + self.fuzzy = int(fuzz) if fuzz else -1 + self.fuzzy_compare = fuzzy_compare + self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + + def __call__(self, Token token): + if self.is_extension: + value = token._.get(self.attr) + else: + value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] + if self.value == value: + return True + return self.fuzzy_compare(value, self.value, self.fuzzy) + + class _RegexPredicate: operators = ("REGEX",) - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, + regex=False, fuzzy=None, fuzzy_compare=None): self.i = i self.attr 
= attr self.value = re.compile(value) self.predicate = predicate self.is_extension = is_extension - self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -850,18 +884,28 @@ class _RegexPredicate: class _SetPredicate: operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, + regex=False, fuzzy=None, fuzzy_compare=None): self.i = i self.attr = attr self.vocab = vocab + self.regex = regex + self.fuzzy = fuzzy + self.fuzzy_compare = fuzzy_compare if self.attr == MORPH: # normalize morph strings self.value = set(self.vocab.morphology.add(v) for v in value) else: - self.value = set(get_string_id(v) for v in value) + if self.regex: + self.value = set(re.compile(v) for v in value) + elif self.fuzzy is not None: + # add to string store + self.value = set(self.vocab.strings.add(v) for v in value) + else: + self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension - self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -889,9 +933,29 @@ class _SetPredicate: return False if self.predicate == "IN": - return value in self.value + if self.regex: + value = self.vocab.strings[value] + return any(bool(v.search(value)) for v in self.value) + elif self.fuzzy is not None: + value = self.vocab.strings[value] + return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy) + for v in 
self.value) + elif value in self.value: + return True + else: + return False elif self.predicate == "NOT_IN": - return value not in self.value + if self.regex: + value = self.vocab.strings[value] + return not any(bool(v.search(value)) for v in self.value) + elif self.fuzzy is not None: + value = self.vocab.strings[value] + return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy) + for v in self.value) + elif value in self.value: + return False + else: + return True elif self.predicate == "IS_SUBSET": return value <= self.value elif self.predicate == "IS_SUPERSET": @@ -906,13 +970,14 @@ class _SetPredicate: class _ComparisonPredicate: operators = ("==", "!=", ">=", "<=", ">", "<") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, + regex=False, fuzzy=None, fuzzy_compare=None): self.i = i self.attr = attr self.value = value self.predicate = predicate self.is_extension = is_extension - self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -935,7 +1000,7 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates, vocab): +def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare): predicate_types = { "REGEX": _RegexPredicate, "IN": _SetPredicate, @@ -949,6 +1014,16 @@ def _get_extra_predicates(spec, extra_predicates, vocab): "<=": _ComparisonPredicate, ">": _ComparisonPredicate, "<": _ComparisonPredicate, + "FUZZY": _FuzzyPredicate, + "FUZZY1": _FuzzyPredicate, + "FUZZY2": _FuzzyPredicate, + "FUZZY3": _FuzzyPredicate, + "FUZZY4": _FuzzyPredicate, + "FUZZY5": _FuzzyPredicate, + "FUZZY6": _FuzzyPredicate, + "FUZZY7": _FuzzyPredicate, + "FUZZY8": _FuzzyPredicate, 
+ "FUZZY9": _FuzzyPredicate, } seen_predicates = {pred.key: pred.i for pred in extra_predicates} output = [] @@ -966,22 +1041,47 @@ def _get_extra_predicates(spec, extra_predicates, vocab): attr = "ORTH" attr = IDS.get(attr.upper()) if isinstance(value, dict): - processed = False - value_with_upper_keys = {k.upper(): v for k, v in value.items()} - for type_, cls in predicate_types.items(): - if type_ in value_with_upper_keys: - predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab) - # Don't create a redundant predicates. - # This helps with efficiency, as we're caching the results. - if predicate.key in seen_predicates: - output.append(seen_predicates[predicate.key]) - else: - extra_predicates.append(predicate) - output.append(predicate.i) - seen_predicates[predicate.key] = predicate.i - processed = True - if not processed: - warnings.warn(Warnings.W035.format(pattern=value)) + output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, + extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare)) + return output + + +def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types, + extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None): + output = [] + for type_, value in value_dict.items(): + type_ = type_.upper() + cls = predicate_types.get(type_) + if cls is None: + warnings.warn(Warnings.W035.format(pattern=value_dict)) + # ignore unrecognized predicate type + continue + elif cls == _RegexPredicate: + if isinstance(value, dict): + # add predicates inside regex operator + output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, + extra_predicates, seen_predicates, + regex=True)) + continue + elif cls == _FuzzyPredicate: + if isinstance(value, dict): + # add predicates inside fuzzy operator + fuzz = type_[len("FUZZY"):] # number after prefix + fuzzy_val = int(fuzz) if fuzz else -1 + output.extend(_get_extra_predicates_dict(attr, value, vocab, 
predicate_types, + extra_predicates, seen_predicates, + fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare)) + continue + predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, + regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare) + # Don't create redundant predicates. + # This helps with efficiency, as we're caching the results. + if predicate.key in seen_predicates: + output.append(seen_predicates[predicate.key]) + else: + extra_predicates.append(predicate) + output.append(predicate.i) + seen_predicates[predicate.key] = predicate.i return output diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index b8ff9ce87..f5cb72ee4 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -196,15 +196,15 @@ def _verify_rich_config_group( ) -> None: if lengths is not None or rows is not None: if lengths is None or rows is None: - raise ValueError(Errors.E1048.format(label=label)) + raise ValueError(Errors.E1051.format(label=label)) if len(lengths) != len(rows): - raise ValueError(Errors.E1048.format(label=label)) + raise ValueError(Errors.E1051.format(label=label)) if any([length < 1 for length in lengths]): - raise ValueError(Errors.E1048.format(label=label)) + raise ValueError(Errors.E1051.format(label=label)) if lengths[-1] > 63: - raise ValueError(Errors.E1049.format(label=label)) + raise ValueError(Errors.E1052.format(label=label)) if len(lengths) != len(set(lengths)) or lengths != sorted(lengths): - raise ValueError(Errors.E1050.format(label=label)) + raise ValueError(Errors.E1053.format(label=label)) @registry.architectures("spacy.RichMultiHashEmbed.v1") @@ -259,7 +259,7 @@ def RichMultiHashEmbed( _verify_rich_config_group("suffix", suff_lengths, suff_rows) if "PREFIX" in attrs or "SUFFIX" in attrs: - warnings.warn(Warnings.W124) + warnings.warn(Warnings.W125) if pref_rows is not None: rows.extend(pref_rows) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 8154a077d..6a3755533 100644 
--- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -11,6 +11,7 @@ from ..errors import Errors, Warnings from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher +from ..matcher.levenshtein import levenshtein_compare from ..scorer import get_ner_prf @@ -23,6 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] assigns=["doc.ents", "token.ent_type", "token.ent_iob"], default_config={ "phrase_matcher_attr": None, + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, "validate": False, "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, @@ -39,6 +41,7 @@ def make_entity_ruler( nlp: Language, name: str, phrase_matcher_attr: Optional[Union[int, str]], + matcher_fuzzy_compare: Callable, validate: bool, overwrite_ents: bool, ent_id_sep: str, @@ -48,6 +51,7 @@ def make_entity_ruler( nlp, name, phrase_matcher_attr=phrase_matcher_attr, + matcher_fuzzy_compare=matcher_fuzzy_compare, validate=validate, overwrite_ents=overwrite_ents, ent_id_sep=ent_id_sep, @@ -81,6 +85,7 @@ class EntityRuler(Pipe): name: str = "entity_ruler", *, phrase_matcher_attr: Optional[Union[int, str]] = None, + matcher_fuzzy_compare: Callable = levenshtein_compare, validate: bool = False, overwrite_ents: bool = False, ent_id_sep: str = DEFAULT_ENT_ID_SEP, @@ -99,7 +104,10 @@ class EntityRuler(Pipe): added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. phrase_matcher_attr (int / str): Token attribute to match on, passed - to the internal PhraseMatcher as `attr` + to the internal PhraseMatcher as `attr`. + matcher_fuzzy_compare (Callable): The fuzzy comparison method for the + internal Matcher. Defaults to + spacy.matcher.levenshtein.levenshtein_compare. 
validate (bool): Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate` patterns (iterable): Optional patterns to load in. @@ -117,7 +125,10 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) # type: ignore self.phrase_patterns = defaultdict(list) # type: ignore self._validate = validate - self.matcher = Matcher(nlp.vocab, validate=validate) + self.matcher_fuzzy_compare = matcher_fuzzy_compare + self.matcher = Matcher( + nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare + ) self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate @@ -337,7 +348,11 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(tuple) - self.matcher = Matcher(self.nlp.vocab, validate=self._validate) + self.matcher = Matcher( + self.nlp.vocab, + validate=self._validate, + fuzzy_compare=self.matcher_fuzzy_compare, + ) self.phrase_matcher = PhraseMatcher( self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) @@ -431,7 +446,8 @@ class EntityRuler(Pipe): self.overwrite = cfg.get("overwrite", False) self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr + self.nlp.vocab, + attr=self.phrase_matcher_attr, ) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) else: diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index 807a4ffe5..b0669c0ef 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -13,6 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry from ..tokens import Doc, Span from ..scorer import Scorer from ..matcher import Matcher, PhraseMatcher +from ..matcher.levenshtein import levenshtein_compare from .. 
import util PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] @@ -28,6 +29,7 @@ DEFAULT_SPANS_KEY = "ruler" "overwrite_ents": False, "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, "ent_id_sep": "__unused__", + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, }, default_score_weights={ "ents_f": 1.0, @@ -40,6 +42,7 @@ def make_entity_ruler( nlp: Language, name: str, phrase_matcher_attr: Optional[Union[int, str]], + matcher_fuzzy_compare: Callable, validate: bool, overwrite_ents: bool, scorer: Optional[Callable], @@ -57,6 +60,7 @@ def make_entity_ruler( annotate_ents=True, ents_filter=ents_filter, phrase_matcher_attr=phrase_matcher_attr, + matcher_fuzzy_compare=matcher_fuzzy_compare, validate=validate, overwrite=False, scorer=scorer, @@ -72,6 +76,7 @@ def make_entity_ruler( "annotate_ents": False, "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}, "phrase_matcher_attr": None, + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, "validate": False, "overwrite": True, "scorer": { @@ -94,6 +99,7 @@ def make_span_ruler( annotate_ents: bool, ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]], phrase_matcher_attr: Optional[Union[int, str]], + matcher_fuzzy_compare: Callable, validate: bool, overwrite: bool, scorer: Optional[Callable], @@ -106,6 +112,7 @@ def make_span_ruler( annotate_ents=annotate_ents, ents_filter=ents_filter, phrase_matcher_attr=phrase_matcher_attr, + matcher_fuzzy_compare=matcher_fuzzy_compare, validate=validate, overwrite=overwrite, scorer=scorer, @@ -170,7 +177,7 @@ def prioritize_existing_ents_filter( @registry.misc("spacy.prioritize_existing_ents_filter.v1") -def make_preverse_existing_ents_filter(): +def make_preserve_existing_ents_filter(): return prioritize_existing_ents_filter @@ -216,6 +223,7 @@ class SpanRuler(Pipe): [Iterable[Span], Iterable[Span]], Iterable[Span] ] = util.filter_chain_spans, phrase_matcher_attr: Optional[Union[int, str]] = None, + 
matcher_fuzzy_compare: Callable = levenshtein_compare, validate: bool = False, overwrite: bool = False, scorer: Optional[Callable] = partial( @@ -246,6 +254,9 @@ class SpanRuler(Pipe): phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. + matcher_fuzzy_compare (Callable): The fuzzy comparison method for the + internal Matcher. Defaults to + spacy.matcher.levenshtein.levenshtein_compare. validate (bool): Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. overwrite (bool): Whether to remove any existing spans under this spans @@ -266,6 +277,7 @@ class SpanRuler(Pipe): self.spans_filter = spans_filter self.ents_filter = ents_filter self.scorer = scorer + self.matcher_fuzzy_compare = matcher_fuzzy_compare self._match_label_id_map: Dict[int, Dict[str, str]] = {} self.clear() @@ -451,7 +463,11 @@ class SpanRuler(Pipe): DOCS: https://spacy.io/api/spanruler#clear """ self._patterns: List[PatternType] = [] - self.matcher: Matcher = Matcher(self.nlp.vocab, validate=self.validate) + self.matcher: Matcher = Matcher( + self.nlp.vocab, + validate=self.validate, + fuzzy_compare=self.matcher_fuzzy_compare, + ) self.phrase_matcher: PhraseMatcher = PhraseMatcher( self.nlp.vocab, attr=self.phrase_matcher_attr, diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 65121114d..650a01949 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -74,7 +74,7 @@ subword_features = true default_config={ "threshold": 0.0, "model": DEFAULT_SINGLE_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_scorer.v1"}, + "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, }, default_score_weights={ "cats_score": 1.0, @@ -117,7 +117,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: ) -@registry.scorers("spacy.textcat_scorer.v1") +@registry.scorers("spacy.textcat_scorer.v2") def make_textcat_scorer(): return 
textcat_score diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 328cee723..41c0e2f63 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -74,7 +74,7 @@ subword_features = true default_config={ "threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, + "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, }, default_score_weights={ "cats_score": 1.0, @@ -120,7 +120,7 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, ) -@registry.scorers("spacy.textcat_multilabel_scorer.v1") +@registry.scorers("spacy.textcat_multilabel_scorer.v2") def make_textcat_multilabel_scorer(): return textcat_multilabel_score diff --git a/spacy/schemas.py b/spacy/schemas.py index e48fe1702..3675c12dd 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -156,12 +156,22 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): - REGEX: Optional[StrictStr] = Field(None, alias="regex") + REGEX: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="regex") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects") + FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy") + FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1") + FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2") + FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3") + FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4") + FUZZY5: Optional[Union[StrictStr, 
"TokenPatternString"]] = Field(None, alias="fuzzy5") + FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6") + FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7") + FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8") + FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9") class Config: extra = "forbid" diff --git a/spacy/scorer.py b/spacy/scorer.py index 16fc303a0..d8c383ab8 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -476,14 +476,12 @@ class Scorer: f_per_type = {label: PRFScore() for label in labels} auc_per_type = {label: ROCAUCScore() for label in labels} labels = set(labels) - if labels: - for eg in examples: - labels.update(eg.predicted.cats.keys()) - labels.update(eg.reference.cats.keys()) for example in examples: # Through this loop, None in the gold_cats indicates missing label. pred_cats = getter(example.predicted, attr) + pred_cats = {k: v for k, v in pred_cats.items() if k in labels} gold_cats = getter(example.reference, attr) + gold_cats = {k: v for k, v in gold_cats.items() if k in labels} for label in labels: pred_score = pred_cats.get(label, 0.0) diff --git a/spacy/tests/matcher/test_levenshtein.py b/spacy/tests/matcher/test_levenshtein.py index d30e36132..5afb7e1fc 100644 --- a/spacy/tests/matcher/test_levenshtein.py +++ b/spacy/tests/matcher/test_levenshtein.py @@ -1,5 +1,6 @@ import pytest from spacy.matcher import levenshtein +from spacy.matcher.levenshtein import levenshtein_compare # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests @@ -42,3 +43,31 @@ from spacy.matcher import levenshtein ) def test_levenshtein(dist, a, b): assert levenshtein(a, b) == dist + + +@pytest.mark.parametrize( + "a,b,fuzzy,expected", + [ + ("a", "a", 1, True), + ("a", "a", 0, True), + ("a", "a", -1, True), + ("a", "ab", 1, True), + ("a", "ab", 0, False), + ("a", "ab", -1, True), + ("ab", "ac", 1, 
True), + ("ab", "ac", -1, True), + ("abc", "cde", 4, True), + ("abc", "cde", -1, False), + ("abcdef", "cdefgh", 4, True), + ("abcdef", "cdefgh", 3, False), + ("abcdef", "cdefgh", -1, False), # default (2 for length 6) + ("abcdefgh", "cdefghijk", 5, True), + ("abcdefgh", "cdefghijk", 4, False), + ("abcdefgh", "cdefghijk", -1, False), # default (2) + ("abcdefgh", "cdefghijkl", 6, True), + ("abcdefgh", "cdefghijkl", 5, False), + ("abcdefgh", "cdefghijkl", -1, False), # default (2) + ], +) +def test_levenshtein_compare(a, b, fuzzy, expected): + assert levenshtein_compare(a, b, fuzzy) == expected diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index ac905eeb4..09ab6c7dc 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -118,6 +118,155 @@ def test_matcher_match_multi(matcher): ] +@pytest.mark.parametrize( + "rules,match_locs", + [ + ( + { + "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + }, + [(2, 4)], + ), + ( + { + "Java": [[{"LOWER": {"FUZZY": "java"}}]], + }, + [(5, 6)], + ), + ( + { + "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY": "java"}}]], + }, + [(2, 4), (5, 6), (8, 9)], + ), + # only the second pattern matches (check that predicate keys used for + # caching don't collide) + ( + { + "A": [[{"ORTH": {"FUZZY": "Javascripts"}}]], + "B": [[{"ORTH": {"FUZZY5": "Javascripts"}}]], + }, + [(8, 9)], + ), + ], +) +def test_matcher_match_fuzzy(en_vocab, rules, match_locs): + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(en_vocab, words=words) + + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns) + assert match_locs == [(start, end) for m_id, start, end in matcher(doc)] + + +@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"]) +def 
test_matcher_match_fuzzy_set_op_longest(en_vocab, set_op): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {set_op: ["Google", "Now"]}}, "OP": "+"}]] + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["They", "like", "Goggle", "Noo"] + doc = Doc(en_vocab, words=words) + assert len(matcher(doc)) == 1 + + +def test_matcher_match_fuzzy_set_multiple(en_vocab): + rules = { + "GoogleNow": [ + [ + { + "ORTH": {"FUZZY": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]}, + "OP": "+", + } + ] + ] + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["They", "like", "Goggle", "Noo"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 3, 4), + ] + + +@pytest.mark.parametrize("fuzzyn", range(1, 10)) +def test_matcher_match_fuzzyn_all_insertions(en_vocab, fuzzyn): + matcher = Matcher(en_vocab) + matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]]) + # words with increasing edit distance + words = ["GoogleNow" + "a" * i for i in range(0, 10)] + doc = Doc(en_vocab, words) + assert len(matcher(doc)) == fuzzyn + 1 + + +@pytest.mark.parametrize("fuzzyn", range(1, 6)) +def test_matcher_match_fuzzyn_various_edits(en_vocab, fuzzyn): + matcher = Matcher(en_vocab) + matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]]) + # words with increasing edit distance of different edit types + words = [ + "GoogleNow", + "GoogleNuw", + "GoogleNuew", + "GoogleNoweee", + "GiggleNuw3", + "gouggle5New", + ] + doc = Doc(en_vocab, words) + assert len(matcher(doc)) == fuzzyn + 1 + + +@pytest.mark.parametrize("greedy", ["FIRST", "LONGEST"]) +@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"]) +def test_matcher_match_fuzzyn_set_op_longest(en_vocab, greedy, set_op): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY2": {set_op: ["Google", "Now"]}}, "OP": "+"}]] + } + 
matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy=greedy) + + words = ["They", "like", "Goggle", "Noo"] + doc = Doc(matcher.vocab, words=words) + spans = matcher(doc, as_spans=True) + assert len(spans) == 1 + if set_op == "IN": + assert spans[0].text == "Goggle Noo" + else: + assert spans[0].text == "They like" + + +def test_matcher_match_fuzzyn_set_multiple(en_vocab): + rules = { + "GoogleNow": [ + [ + { + "ORTH": {"FUZZY1": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]}, + "OP": "+", + } + ] + ] + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["They", "like", "Goggle", "Noo"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 3, 4), + ] + + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" matcher = Matcher(en_vocab) @@ -437,6 +586,30 @@ def test_matcher_regex(en_vocab): assert len(matches) == 0 +def test_matcher_regex_set_in(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"ORTH": {"REGEX": {"IN": [r"(?:a)", r"(?:an)"]}}}] + matcher.add("A_OR_AN", [pattern]) + doc = Doc(en_vocab, words=["an", "a", "hi"]) + matches = matcher(doc) + assert len(matches) == 2 + doc = Doc(en_vocab, words=["bye"]) + matches = matcher(doc) + assert len(matches) == 0 + + +def test_matcher_regex_set_not_in(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"ORTH": {"REGEX": {"NOT_IN": [r"(?:a)", r"(?:an)"]}}}] + matcher.add("A_OR_AN", [pattern]) + doc = Doc(en_vocab, words=["an", "a", "hi"]) + matches = matcher(doc) + assert len(matches) == 1 + doc = Doc(en_vocab, words=["bye"]) + matches = matcher(doc) + assert len(matches) == 1 + + def test_matcher_regex_shape(en_vocab): matcher = Matcher(en_vocab) pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}] diff --git a/spacy/tests/pipeline/test_entity_ruler.py 
b/spacy/tests/pipeline/test_entity_ruler.py index 6851e2a7c..417f930cb 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -382,6 +382,43 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory): assert doc.ents[0].label_ == "FOOBAR" +@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) +def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory): + ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}] + ruler.add_patterns(patterns) + doc = nlp("helloo") + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "HELLO" + + +@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) +def test_entity_ruler_fuzzy(nlp, entity_ruler_factory): + ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}] + ruler.add_patterns(patterns) + doc = nlp("helloo") + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "HELLO" + + +@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) +def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory): + @registry.misc("test_fuzzy_compare_disabled") + def make_test_fuzzy_compare_disabled(): + return lambda x, y, z: False + + ruler = nlp.add_pipe( + entity_ruler_factory, + name="entity_ruler", + config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}}, + ) + patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}] + ruler.add_patterns(patterns) + doc = nlp("helloo") + assert len(doc.ents) == 0 + + @pytest.mark.parametrize("n_process", [1, 2]) @pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory): diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 155ce99a2..d042f3445 100644 --- 
a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -895,3 +895,26 @@ def test_textcat_multi_threshold(): scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0}) assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 + + +@pytest.mark.parametrize( + "component_name,scorer", + [ + ("textcat", "spacy.textcat_scorer.v1"), + ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"), + ], +) +def test_textcat_legacy_scorers(component_name, scorer): + """Check that legacy scorers are registered and produce the expected score + keys.""" + nlp = English() + nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}}) + + train_examples = [] + for text, annotations in TRAIN_DATA_SINGLE_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + nlp.initialize(get_examples=lambda: train_examples) + + # score the model (it's not actually trained but that doesn't matter) + scores = nlp.evaluate(train_examples) + assert 0 <= scores["cats_score"] <= 1 diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c6768a3fd..c88e20de2 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -4,6 +4,7 @@ from collections import Counter from typing import Tuple, List, Dict, Any import pkg_resources import time +from pathlib import Path import spacy import numpy @@ -15,7 +16,7 @@ from thinc.api import Config, ConfigValidationError from spacy import about from spacy.cli import info -from spacy.cli._util import is_subpath_of, load_project_config +from spacy.cli._util import is_subpath_of, load_project_config, walk_directory from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands @@ -1185,3 +1186,26 @@ def test_upload_download_local_file(): download_file(remote_file, local_file) with local_file.open(mode="r") as file_: assert file_.read() == content + + +def 
test_walk_directory(): + with make_tempdir() as d: + files = [ + "data1.iob", + "data2.iob", + "data3.json", + "data4.conll", + "data5.conll", + "data6.conll", + "data7.txt", + ] + + for f in files: + Path(d / f).touch() + + assert (len(walk_directory(d))) == 7 + assert (len(walk_directory(d, suffix=None))) == 7 + assert (len(walk_directory(d, suffix="json"))) == 1 + assert (len(walk_directory(d, suffix="iob"))) == 2 + assert (len(walk_directory(d, suffix="conll"))) == 3 + assert (len(walk_directory(d, suffix="pdf"))) == 0 diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py new file mode 100644 index 000000000..873a3ff66 --- /dev/null +++ b/spacy/tests/test_cli_app.py @@ -0,0 +1,33 @@ +import os +from pathlib import Path +from typer.testing import CliRunner + +from spacy.cli._util import app +from .util import make_tempdir + + +def test_convert_auto(): + with make_tempdir() as d_in, make_tempdir() as d_out: + for f in ["data1.iob", "data2.iob", "data3.iob"]: + Path(d_in / f).touch() + + # ensure that "automatic" suffix detection works + result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)]) + assert "Generated output file" in result.stdout + out_files = os.listdir(d_out) + assert len(out_files) == 3 + assert "data1.spacy" in out_files + assert "data2.spacy" in out_files + assert "data3.spacy" in out_files + + +def test_convert_auto_conflict(): + with make_tempdir() as d_in, make_tempdir() as d_out: + for f in ["data1.iob", "data2.iob", "data3.json"]: + Path(d_in / f).touch() + + # ensure that "automatic" suffix detection warns when there are different file types + result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)]) + assert "All input files must be same type" in result.stdout + out_files = os.listdir(d_out) + assert len(out_files) == 0 diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 03a98d32f..03790eb86 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ 
-3,6 +3,7 @@ import logging from unittest import mock import pytest from spacy.language import Language +from spacy.scorer import Scorer from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.training import Example @@ -126,6 +127,112 @@ def test_evaluate_no_pipe(nlp): nlp.evaluate([Example.from_dict(doc, annots)]) +def test_evaluate_textcat_multilabel(en_vocab): + """Test that evaluate works with a multilabel textcat pipe.""" + nlp = Language(en_vocab) + textcat_multilabel = nlp.add_pipe("textcat_multilabel") + for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"): + textcat_multilabel.add_label(label) + nlp.initialize() + + annots = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}} + doc = nlp.make_doc("hello world") + example = Example.from_dict(doc, annots) + scores = nlp.evaluate([example]) + labels = nlp.get_pipe("textcat_multilabel").labels + for label in labels: + assert scores["cats_f_per_type"].get(label) is not None + for key in example.reference.cats.keys(): + if key not in labels: + assert scores["cats_f_per_type"].get(key) is None + + +def test_evaluate_multiple_textcat_final(en_vocab): + """Test that evaluate evaluates the final textcat component in a pipeline + with more than one textcat or textcat_multilabel.""" + nlp = Language(en_vocab) + textcat = nlp.add_pipe("textcat") + for label in ("POSITIVE", "NEGATIVE"): + textcat.add_label(label) + textcat_multilabel = nlp.add_pipe("textcat_multilabel") + for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"): + textcat_multilabel.add_label(label) + nlp.initialize() + + annots = { + "cats": { + "POSITIVE": 1.0, + "NEGATIVE": 0.0, + "FEATURE": 1.0, + "QUESTION": 1.0, + "POSITIVE": 1.0, + "NEGATIVE": 0.0, + } + } + doc = nlp.make_doc("hello world") + example = Example.from_dict(doc, annots) + scores = nlp.evaluate([example]) + # get the labels from the final pipe + labels = nlp.get_pipe(nlp.pipe_names[-1]).labels + for label in labels: + assert scores["cats_f_per_type"].get(label) is not 
None + for key in example.reference.cats.keys(): + if key not in labels: + assert scores["cats_f_per_type"].get(key) is None + + +def test_evaluate_multiple_textcat_separate(en_vocab): + """Test that evaluate can evaluate multiple textcat components separately + with custom scorers.""" + + def custom_textcat_score(examples, **kwargs): + scores = Scorer.score_cats( + examples, + "cats", + multi_label=False, + **kwargs, + ) + return {f"custom_{k}": v for k, v in scores.items()} + + @spacy.registry.scorers("test_custom_textcat_scorer") + def make_custom_textcat_scorer(): + return custom_textcat_score + + nlp = Language(en_vocab) + textcat = nlp.add_pipe( + "textcat", + config={"scorer": {"@scorers": "test_custom_textcat_scorer"}}, + ) + for label in ("POSITIVE", "NEGATIVE"): + textcat.add_label(label) + textcat_multilabel = nlp.add_pipe("textcat_multilabel") + for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"): + textcat_multilabel.add_label(label) + nlp.initialize() + + annots = { + "cats": { + "POSITIVE": 1.0, + "NEGATIVE": 0.0, + "FEATURE": 1.0, + "QUESTION": 1.0, + "POSITIVE": 1.0, + "NEGATIVE": 0.0, + } + } + doc = nlp.make_doc("hello world") + example = Example.from_dict(doc, annots) + scores = nlp.evaluate([example]) + # check custom scores for the textcat pipe + assert "custom_cats_f_per_type" in scores + labels = nlp.get_pipe("textcat").labels + assert set(scores["custom_cats_f_per_type"].keys()) == set(labels) + # check default scores for the textcat_multilabel pipe + assert "cats_f_per_type" in scores + labels = nlp.get_pipe("textcat_multilabel").labels + assert set(scores["cats_f_per_type"].keys()) == set(labels) + + def vector_modification_pipe(doc): doc.vector += 1 return doc diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 1c9b045ac..618f17334 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -8,7 +8,7 @@ from spacy import prefer_gpu, require_gpu, require_cpu from spacy.ml._precomputable_affine import 
PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding from spacy.util import dot_to_object, SimpleFrozenList, import_file -from spacy.util import to_ternary_int +from spacy.util import to_ternary_int, find_available_port from thinc.api import Config, Optimizer, ConfigValidationError from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps from thinc.compat import has_cupy_gpu, has_torch_mps_gpu @@ -434,3 +434,16 @@ def test_to_ternary_int(): assert to_ternary_int(-10) == -1 assert to_ternary_int("string") == -1 assert to_ternary_int([0, "string"]) == -1 + + +def test_find_available_port(): + host = "0.0.0.0" + port = 5000 + assert find_available_port(port, host) == port, "Port 5000 isn't free" + + from wsgiref.simple_server import make_server, demo_app + + with make_server(host, port, demo_app) as httpd: + with pytest.warns(UserWarning, match="already in use"): + found_port = find_available_port(port, host, auto_select=True) + assert found_port == port + 1, "Didn't find next port" diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 408ea7140..7de31822e 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -26,6 +26,8 @@ def setup_table( return final_cols, final_widths, ["r" for _ in final_widths] +# We cannot rename this method as it's directly imported +# and used by external packages such as spacy-loggers. @registry.loggers("spacy.ConsoleLogger.v2") def console_logger( progress_bar: bool = False, @@ -33,7 +35,27 @@ def console_logger( output_file: Optional[Union[str, Path]] = None, ): """The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file. - progress_bar (bool): Whether the logger should print the progress bar. + progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass. + console_output (bool): Whether the logger should print the logs on the console. 
+ output_file (Optional[Union[str, Path]]): The file to save the training logs to. + """ + return console_logger_v3( + progress_bar=None if progress_bar is False else "eval", + console_output=console_output, + output_file=output_file, + ) + + +@registry.loggers("spacy.ConsoleLogger.v3") +def console_logger_v3( + progress_bar: Optional[str] = None, + console_output: bool = True, + output_file: Optional[Union[str, Path]] = None, +): + """The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file. + progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values: + train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached). + eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached). console_output (bool): Whether the logger should print the logs on the console. output_file (Optional[Union[str, Path]]): The file to save the training logs to. 
""" @@ -70,6 +92,7 @@ def console_logger( for name, proc in nlp.pipeline if hasattr(proc, "is_trainable") and proc.is_trainable ] + max_steps = nlp.config["training"]["max_steps"] eval_frequency = nlp.config["training"]["eval_frequency"] score_weights = nlp.config["training"]["score_weights"] score_cols = [col for col, value in score_weights.items() if value is not None] @@ -84,6 +107,13 @@ def console_logger( write(msg.row(table_header, widths=table_widths, spacing=spacing)) write(msg.row(["-" * width for width in table_widths], spacing=spacing)) progress = None + expected_progress_types = ("train", "eval") + if progress_bar is not None and progress_bar not in expected_progress_types: + raise ValueError( + Errors.E1048.format( + unexpected=progress_bar, expected=expected_progress_types + ) + ) def log_step(info: Optional[Dict[str, Any]]) -> None: nonlocal progress @@ -141,11 +171,23 @@ def console_logger( ) ) if progress_bar: + if progress_bar == "train": + total = max_steps + desc = f"Last Eval Epoch: {info['epoch']}" + initial = info["step"] + else: + total = eval_frequency + desc = f"Epoch {info['epoch']+1}" + initial = 0 # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( - total=eval_frequency, disable=None, leave=False, file=stderr + total=total, + disable=None, + leave=False, + file=stderr, + initial=initial, ) - progress.set_description(f"Epoch {info['epoch']+1}") + progress.set_description(desc) def finalize() -> None: if output_stream: diff --git a/spacy/util.py b/spacy/util.py index 8d211a9a5..8bf8fb1b0 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -31,6 +31,7 @@ import shlex import inspect import pkgutil import logging +import socket try: import cupy.random @@ -1736,3 +1737,50 @@ def all_equal(iterable): (or if the input is an empty sequence), False otherwise.""" g = itertools.groupby(iterable) return next(g, True) and not next(g, False) + + +def _is_port_in_use(port: int, host: str = "localhost") -> bool: + """Check if 
'host:port' is in use. Return True if it is, False otherwise. + + port (int): the port to check + host (str): the host to check (default "localhost") + RETURNS (bool): Whether 'host:port' is in use. + """ + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.bind((host, port)) + return False + except socket.error: + return True + finally: + s.close() + + +def find_available_port(start: int, host: str, auto_select: bool = False) -> int: + """Given a starting port and a host, handle finding a port. + + If `auto_select` is False, a busy port will raise an error. + + If `auto_select` is True, the next free higher port will be used. + + start (int): the port to start looking from + host (str): the host to find a port on + auto_select (bool): whether to automatically select a new port if the given port is busy (default False) + RETURNS (int): The port to use. + """ + if not _is_port_in_use(start, host): + return start + + port = start + if not auto_select: + raise ValueError(Errors.E1050.format(port=port)) + + while _is_port_in_use(port, host) and port < 65535: + port += 1 + + if port == 65535 and _is_port_in_use(port, host): + raise ValueError(Errors.E1049.format(host=host)) + + # if we get here, the port changed + warnings.warn(Warnings.W124.format(host=host, port=start, serve_port=port)) + return port diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 768844cf3..420e827a0 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). 
~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ | +| `before_update` 3.5 | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ | | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index c2ba33f01..f15c648ff 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -55,13 +55,14 @@ how the component should be configured. You can override its settings via the > nlp.add_pipe("entity_ruler", config=config) > ``` -| Setting | Description | -| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. 
~~Optional[Union[int, str]]~~ | -| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | -| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | -| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | -| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | +| Setting | Description | +| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | +| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | +| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | +| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entityruler.py @@ -85,23 +86,25 @@ be a token pattern (list) or a phrase pattern (string). 
For example: > ruler = EntityRuler(nlp, overwrite_ents=True) > ``` -| Name | Description | -| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | -| `name` 3 | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | -| _keyword-only_ | | -| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | -| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | -| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | -| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | +| Name | Description | +| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | +| `name` 3 | Instance name of the current pipeline component. 
Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | +| _keyword-only_ | | +| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | +| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | +| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | +| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | +| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ## EntityRuler.initialize {#initialize tag="method" new="3"} Initialize the component with data and used before training to load in rules -from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method -is typically called by [`Language.initialize`](/api/language#initialize) and -lets you customize arguments it receives via the +from a [pattern file](/usage/rule-based-matching/#entityruler-files). This +method is typically called by [`Language.initialize`](/api/language#initialize) +and lets you customize arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. @@ -210,10 +213,10 @@ of dicts) or a phrase pattern (string). 
For more details, see the usage guide on | ---------- | ---------------------------------------------------------------- | | `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ | - ## EntityRuler.remove {#remove tag="method" new="3.2.1"} -Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if the ID does not exist. +Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if +the ID does not exist. > #### Example > @@ -224,9 +227,9 @@ Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if th > ruler.remove("apple") > ``` -| Name | Description | -| ---------- | ---------------------------------------------------------------- | -| `id` | The ID of the pattern rule. ~~str~~ | +| Name | Description | +| ---- | ----------------------------------- | +| `id` | The ID of the pattern rule. ~~str~~ | ## EntityRuler.to_disk {#to_disk tag="method"} diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index cd7bfa070..bd5f6ac24 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -86,14 +86,20 @@ it compares to another value. > ] > ``` -| Attribute | Description | -| -------------------------- | -------------------------------------------------------------------------------------------------------- | -| `IN` | Attribute value is member of a list. ~~Any~~ | -| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | -| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ | -| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ | -| `INTERSECTS` | Attribute value (for `MORPH` or custom list attribute) has a non-empty intersection with a list. ~~Any~~ | -| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. 
~~Union[int, float]~~ | +| Attribute | Description | +| -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `REGEX` | Attribute value matches the regular expression at any position in the string. ~~Any~~ | +| `FUZZY` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, -1)`. The default method allows a Levenshtein edit distance of at least 2 and up to 30% of the pattern string length. ~~Any~~ | +| `FUZZY1`, `FUZZY2`, ... `FUZZY9` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, N)`. The default method allows a Levenshtein edit distance of at most N (1-9). ~~Any~~ | +| `IN` | Attribute value is member of a list. ~~Any~~ | +| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ | +| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ | +| `INTERSECTS` | Attribute value (for `MORPH` or custom list attribute) has a non-empty intersection with a list. ~~Any~~ | +| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | + +As of spaCy v3.5, `REGEX` and `FUZZY` can be used in combination with `IN` and +`NOT_IN`. ## Matcher.\_\_init\_\_ {#init tag="method"} @@ -109,10 +115,11 @@ string where an integer is expected) or unexpected property names. > matcher = Matcher(nlp.vocab) > ``` -| Name | Description | -| ---------- | ----------------------------------------------------------------------------------------------------- | -| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. 
~~Vocab~~ | -| `validate` | Validate all patterns added to this matcher. ~~bool~~ | +| Name | Description | +| --------------- | ----------------------------------------------------------------------------------------------------- | +| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ | +| `validate` | Validate all patterns added to this matcher. ~~bool~~ | +| `fuzzy_compare` | The comparison method used for the `FUZZY` operators. ~~Callable[[str, str, int], bool]~~ | ## Matcher.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/spanruler.md b/website/docs/api/spanruler.md index b573f7c58..31f04ccf9 100644 --- a/website/docs/api/spanruler.md +++ b/website/docs/api/spanruler.md @@ -46,16 +46,17 @@ how the component should be configured. You can override its settings via the > nlp.add_pipe("span_ruler", config=config) > ``` -| Setting | Description | -| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ | -| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ | -| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ | -| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ | -| `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. 
Defaults to `False`. ~~bool~~ | -| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | +| Setting | Description | +| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ | +| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ | +| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ | +| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ | +| `phrase_matcher_attr` | Token attribute to match on, passed to the internal `PhraseMatcher` as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `validate` | Whether patterns should be validated, passed to `Matcher` and `PhraseMatcher` as `validate`. Defaults to `False`. ~~bool~~ | +| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ | +| `scorer` | The scoring method. 
Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/span_ruler.py @@ -79,19 +80,20 @@ token pattern (list) or a phrase pattern (string). For example: > ruler = SpanRuler(nlp, overwrite=True) > ``` -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | -| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current span ruler while creating phrase patterns with the nlp object. ~~str~~ | -| _keyword-only_ | | -| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ | -| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ | -| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ | -| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ | -| `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. 
~~bool~~ | -| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | +| Name | Description | +| ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | +| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current span ruler while creating phrase patterns with the nlp object. ~~str~~ | +| _keyword-only_ | | +| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ | +| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ | +| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ | +| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ | +| `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. 
~~Callable~~ | +| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | +| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | ## SpanRuler.initialize {#initialize tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 26a5d42f4..9d3e463d8 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -237,16 +237,17 @@ browser. Will run a simple web server. > displacy.serve([doc1, doc2], style="dep") > ``` -| Name | Description | -| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | -| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ | -| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | -| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | -| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | -| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | -| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ | -| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. 
~~str~~ | +| Name | Description | +| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | +| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ | +| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | +| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | +| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | +| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | +| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ | +| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ | +| `auto_select_port` | If `True`, automatically switch to a different port if the specified port is already in use. Defaults to `False`. ~~bool~~ | ### displacy.render {#displacy.render tag="method" new="2"} @@ -266,7 +267,7 @@ Render a dependency parse tree or named entity visualization. | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ | | `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ | -| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | +| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. 
~~Dict[str, Any]~~ | | `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | @@ -513,7 +514,7 @@ a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). -#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"} +#### spacy.ConsoleLogger.v2 {tag="registered function"} > #### Example config > @@ -564,11 +565,33 @@ start decreasing across epochs. -| Name | Description | -| ---------------- | --------------------------------------------------------------------- | -| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ | -| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ | -| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ | +| Name | Description | +| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- | +| `progress_bar` | Whether the logger should print a progress bar tracking the steps till the next evaluation pass (default: `False`). ~~bool~~ | +| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ | +| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ | + +#### spacy.ConsoleLogger.v3 {#ConsoleLogger tag="registered function"} + +> #### Example config +> +> ```ini +> [training.logger] +> @loggers = "spacy.ConsoleLogger.v3" +> progress_bar = "eval" +> console_output = true +> output_file = "training_log.jsonl" +> ``` + +Writes the results of a training step to the console in a tabular format and +optionally saves them to a `jsonl` file.
+ +| Name | Description | +| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `progress_bar` | Type of progress bar to show in the console: `"train"`, `"eval"` or `None`. | +| | The bar tracks the number of steps until `training.max_steps` and `training.eval_frequency` are reached respectively (default: `None`). ~~Optional[str]~~ | +| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ | +| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ | ## Readers {#readers} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index ad8ea27f3..3e15fca36 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -364,6 +364,46 @@ else: +#### Fuzzy matching {#fuzzy new="3.5"} + +Fuzzy matching allows you to match tokens with alternate spellings, typos, etc. +without specifying every possible variant. + +```python +# Matches "favourite", "favorites", "gavorite", "theatre", "theatr", ... +pattern = [{"TEXT": {"FUZZY": "favorite"}}, + {"TEXT": {"FUZZY": "theater"}}] +``` + +The `FUZZY` attribute allows fuzzy matches for any attribute string value, +including custom attributes. Just like `REGEX`, it always needs to be applied to +an attribute like `TEXT` or `LOWER`. By default, `FUZZY` allows a maximum +Levenshtein edit distance of 2 or 30% of the pattern string length, whichever +is greater. Using the more specific attributes `FUZZY1`..`FUZZY9` you can +specify the maximum allowed edit distance directly.
+ +```python +# Match lowercase with fuzzy matching (allows 2 edits) +pattern = [{"LOWER": {"FUZZY": "definitely"}}] + +# Match custom attribute values with fuzzy matching (allows 2 edits) +pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] + +# Match with exact Levenshtein edit distance limits (allows 3 edits) +pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}] +``` + +#### Regex and fuzzy matching with lists {#regex-fuzzy-lists new="3.5"} + +Starting in spaCy v3.5, both `REGEX` and `FUZZY` can be combined with the +attributes `IN` and `NOT_IN`: + +```python +pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}] + +pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}] +``` + --- #### Operators and quantifiers {#quantifiers} diff --git a/website/meta/universe.json b/website/meta/universe.json index db533c3b2..99d121507 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4062,6 +4062,33 @@ "author_links": { "github": "yasufumy" } + }, + { + "id": "spacy-pythainlp", + "title": "spaCy-PyThaiNLP", + "slogan": "PyThaiNLP for spaCy", + "description": "This package wraps the PyThaiNLP library to add support for Thai to spaCy.", + "github": "PyThaiNLP/spaCy-PyThaiNLP", + "code_example": [ + "import spacy", + "import spacy_pythainlp.core", + "", + "nlp = spacy.blank('th')", + "nlp.add_pipe('pythainlp')", + "doc = nlp('ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน ผมอยากไปเที่ยว')", + "", + "print(list(doc.sents))", + "# output: [ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน , ผมอยากไปเที่ยว]" + ], + "code_language": "python", + "author": "Wannaphong Phatthiyaphaibun", + "author_links": { + "twitter": "@wannaphong_p", + "github": "wannaphong", + "website": "https://iam.wannaphong.com/" + }, + "category": ["pipeline", "research"], + "tags": ["Thai"] } ],