mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-04 20:30:24 +03:00
Merge branch 'master' into feature/lowercasing
This commit is contained in:
commit
80afdd9642
2
.github/azure-steps.yml
vendored
2
.github/azure-steps.yml
vendored
|
@ -107,7 +107,7 @@ steps:
|
||||||
displayName: "Run CPU tests"
|
displayName: "Run CPU tests"
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m pip install --pre thinc-apple-ops
|
python -m pip install 'spacy[apple]'
|
||||||
python -m pytest --pyargs spacy
|
python -m pytest --pyargs spacy
|
||||||
displayName: "Run CPU tests with thinc-apple-ops"
|
displayName: "Run CPU tests with thinc-apple-ops"
|
||||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
||||||
|
|
|
@ -46,6 +46,7 @@ open-source software, released under the [MIT license](https://github.com/explos
|
||||||
| 🛠 **[Changelog]** | Changes and version history. |
|
| 🛠 **[Changelog]** | Changes and version history. |
|
||||||
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
||||||
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
||||||
|
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Analysis"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
|
||||||
|
|
||||||
[spacy 101]: https://spacy.io/usage/spacy-101
|
[spacy 101]: https://spacy.io/usage/spacy-101
|
||||||
[new in v3.0]: https://spacy.io/usage/v3
|
[new in v3.0]: https://spacy.io/usage/v3
|
||||||
|
@ -59,6 +60,7 @@ open-source software, released under the [MIT license](https://github.com/explos
|
||||||
[changelog]: https://spacy.io/usage#changelog
|
[changelog]: https://spacy.io/usage#changelog
|
||||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||||
|
|
||||||
|
|
||||||
## 💬 Where to ask questions
|
## 💬 Where to ask questions
|
||||||
|
|
||||||
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
||||||
|
|
|
@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
|
||||||
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
|
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
|
||||||
numpy==1.19.3; python_version=='3.9'
|
numpy==1.19.3; python_version=='3.9'
|
||||||
numpy==1.21.3; python_version=='3.10'
|
numpy==1.21.3; python_version=='3.10'
|
||||||
numpy; python_version>='3.11'
|
numpy==1.23.2; python_version=='3.11'
|
||||||
|
numpy; python_version>='3.12'
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.10,<3.1.0
|
spacy-legacy>=3.0.11,<3.1.0
|
||||||
spacy-loggers>=1.0.0,<2.0.0
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.1.0,<8.2.0
|
thinc>=8.1.0,<8.2.0
|
||||||
ml_datasets>=0.2.0,<0.3.0
|
ml_datasets>=0.2.0,<0.3.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
wasabi>=0.9.1,<1.1.0
|
wasabi>=0.9.1,<1.2.0
|
||||||
srsly>=2.4.3,<3.0.0
|
srsly>=2.4.3,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.8.0
|
||||||
|
|
|
@ -22,6 +22,7 @@ classifiers =
|
||||||
Programming Language :: Python :: 3.8
|
Programming Language :: Python :: 3.8
|
||||||
Programming Language :: Python :: 3.9
|
Programming Language :: Python :: 3.9
|
||||||
Programming Language :: Python :: 3.10
|
Programming Language :: Python :: 3.10
|
||||||
|
Programming Language :: Python :: 3.11
|
||||||
Topic :: Scientific/Engineering
|
Topic :: Scientific/Engineering
|
||||||
project_urls =
|
project_urls =
|
||||||
Release notes = https://github.com/explosion/spaCy/releases
|
Release notes = https://github.com/explosion/spaCy/releases
|
||||||
|
@ -41,13 +42,13 @@ setup_requires =
|
||||||
thinc>=8.1.0,<8.2.0
|
thinc>=8.1.0,<8.2.0
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.10,<3.1.0
|
spacy-legacy>=3.0.11,<3.1.0
|
||||||
spacy-loggers>=1.0.0,<2.0.0
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.1.0,<8.2.0
|
thinc>=8.1.0,<8.2.0
|
||||||
wasabi>=0.9.1,<1.1.0
|
wasabi>=0.9.1,<1.2.0
|
||||||
srsly>=2.4.3,<3.0.0
|
srsly>=2.4.3,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
|
|
|
@ -16,6 +16,7 @@ from .debug_config import debug_config # noqa: F401
|
||||||
from .debug_model import debug_model # noqa: F401
|
from .debug_model import debug_model # noqa: F401
|
||||||
from .debug_diff import debug_diff # noqa: F401
|
from .debug_diff import debug_diff # noqa: F401
|
||||||
from .evaluate import evaluate # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
|
from .apply import apply # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||||
from .init_config import init_config, fill_config # noqa: F401
|
from .init_config import init_config, fill_config # noqa: F401
|
||||||
|
|
|
@ -582,6 +582,33 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
|
||||||
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
|
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
|
||||||
|
|
||||||
|
|
||||||
|
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
|
||||||
|
"""Given a directory and a suffix, recursively find all files matching the suffix.
|
||||||
|
Directories or files with names beginning with a . are ignored, but hidden flags on
|
||||||
|
filesystems are not checked.
|
||||||
|
When provided with a suffix `None`, there is no suffix-based filtering."""
|
||||||
|
if not path.is_dir():
|
||||||
|
return [path]
|
||||||
|
paths = [path]
|
||||||
|
locs = []
|
||||||
|
seen = set()
|
||||||
|
for path in paths:
|
||||||
|
if str(path) in seen:
|
||||||
|
continue
|
||||||
|
seen.add(str(path))
|
||||||
|
if path.parts[-1].startswith("."):
|
||||||
|
continue
|
||||||
|
elif path.is_dir():
|
||||||
|
paths.extend(path.iterdir())
|
||||||
|
elif suffix is not None and not path.parts[-1].endswith(suffix):
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
locs.append(path)
|
||||||
|
# It's good to sort these, in case the ordering messes up cache.
|
||||||
|
locs.sort()
|
||||||
|
return locs
|
||||||
|
|
||||||
|
|
||||||
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
|
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
|
||||||
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
|
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
|
||||||
as happens with `round(number, ndigits)`"""
|
as happens with `round(number, ndigits)`"""
|
||||||
|
|
143
spacy/cli/apply.py
Normal file
143
spacy/cli/apply.py
Normal file
|
@ -0,0 +1,143 @@
|
||||||
|
import tqdm
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from itertools import chain
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, List, Iterable, cast, Union
|
||||||
|
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
|
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
|
||||||
|
|
||||||
|
from ..tokens import Doc, DocBin
|
||||||
|
from ..vocab import Vocab
|
||||||
|
from ..util import ensure_path, load_model
|
||||||
|
|
||||||
|
|
||||||
|
path_help = """Location of the documents to predict on.
|
||||||
|
Can be a single file in .spacy format or a .jsonl file.
|
||||||
|
Files with other extensions are treated as single plain text documents.
|
||||||
|
If a directory is provided it is traversed recursively to grab
|
||||||
|
all files to be processed.
|
||||||
|
The files can be a mixture of .spacy, .jsonl and text files.
|
||||||
|
If .jsonl is provided the specified field is going
|
||||||
|
to be grabbed ("text" by default)."""
|
||||||
|
|
||||||
|
out_help = "Path to save the resulting .spacy file"
|
||||||
|
code_help = (
|
||||||
|
"Path to Python file with additional " "code (registered functions) to be imported"
|
||||||
|
)
|
||||||
|
gold_help = "Use gold preprocessing provided in the .spacy files"
|
||||||
|
force_msg = (
|
||||||
|
"The provided output file already exists. "
|
||||||
|
"To force overwriting the output file, set the --force or -F flag."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
|
||||||
|
|
||||||
|
|
||||||
|
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
|
||||||
|
"""
|
||||||
|
Stream Doc objects from DocBin.
|
||||||
|
"""
|
||||||
|
docbin = DocBin().from_disk(path)
|
||||||
|
for doc in docbin.get_docs(vocab):
|
||||||
|
yield doc
|
||||||
|
|
||||||
|
|
||||||
|
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
|
||||||
|
"""
|
||||||
|
Stream the value of the given field from each JSONL entry. If the field is
|
||||||
|
not found in an entry, a failure message is printed and the process exits.
|
||||||
|
"""
|
||||||
|
for entry in srsly.read_jsonl(path):
|
||||||
|
if field not in entry:
|
||||||
|
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
|
||||||
|
else:
|
||||||
|
yield entry[field]
|
||||||
|
|
||||||
|
|
||||||
|
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
|
||||||
|
"""
|
||||||
|
Yields strings from text files in paths.
|
||||||
|
"""
|
||||||
|
for path in paths:
|
||||||
|
with open(path, "r") as fin:
|
||||||
|
text = fin.read()
|
||||||
|
yield text
|
||||||
|
|
||||||
|
|
||||||
|
@app.command("apply")
|
||||||
|
def apply_cli(
|
||||||
|
# fmt: off
|
||||||
|
model: str = Arg(..., help="Model name or path"),
|
||||||
|
data_path: Path = Arg(..., help=path_help, exists=True),
|
||||||
|
output_file: Path = Arg(..., help=out_help, dir_okay=False),
|
||||||
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
|
||||||
|
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
|
||||||
|
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||||
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
|
||||||
|
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
|
||||||
|
n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Apply a trained pipeline to documents to get predictions.
|
||||||
|
Expects a loadable spaCy pipeline and path to the data, which
|
||||||
|
can be a directory or a file.
|
||||||
|
The data files can be provided in multiple formats:
|
||||||
|
1. .spacy files
|
||||||
|
2. .jsonl files with a specified "field" to read the text from.
|
||||||
|
3. Files with any other extension are assumed to be containing
|
||||||
|
a single document.
|
||||||
|
DOCS: https://spacy.io/api/cli#apply
|
||||||
|
"""
|
||||||
|
data_path = ensure_path(data_path)
|
||||||
|
output_file = ensure_path(output_file)
|
||||||
|
code_path = ensure_path(code_path)
|
||||||
|
if output_file.exists() and not force_overwrite:
|
||||||
|
msg.fail(force_msg, exits=1)
|
||||||
|
if not data_path.exists():
|
||||||
|
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
|
||||||
|
import_code(code_path)
|
||||||
|
setup_gpu(use_gpu)
|
||||||
|
apply(data_path, output_file, model, text_key, batch_size, n_process)
|
||||||
|
|
||||||
|
|
||||||
|
def apply(
|
||||||
|
data_path: Path,
|
||||||
|
output_file: Path,
|
||||||
|
model: str,
|
||||||
|
json_field: str,
|
||||||
|
batch_size: int,
|
||||||
|
n_process: int,
|
||||||
|
):
|
||||||
|
docbin = DocBin(store_user_data=True)
|
||||||
|
paths = walk_directory(data_path)
|
||||||
|
if len(paths) == 0:
|
||||||
|
docbin.to_disk(output_file)
|
||||||
|
msg.warn(
|
||||||
|
"Did not find data to process,"
|
||||||
|
f" {data_path} seems to be an empty directory."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
nlp = load_model(model)
|
||||||
|
msg.good(f"Loaded model {model}")
|
||||||
|
vocab = nlp.vocab
|
||||||
|
streams: List[DocOrStrStream] = []
|
||||||
|
text_files = []
|
||||||
|
for path in paths:
|
||||||
|
if path.suffix == ".spacy":
|
||||||
|
streams.append(_stream_docbin(path, vocab))
|
||||||
|
elif path.suffix == ".jsonl":
|
||||||
|
streams.append(_stream_jsonl(path, json_field))
|
||||||
|
else:
|
||||||
|
text_files.append(path)
|
||||||
|
if len(text_files) > 0:
|
||||||
|
streams.append(_stream_texts(text_files))
|
||||||
|
datagen = cast(DocOrStrStream, chain(*streams))
|
||||||
|
for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
|
||||||
|
docbin.add(doc)
|
||||||
|
if output_file.suffix == "":
|
||||||
|
output_file = output_file.with_suffix(".spacy")
|
||||||
|
docbin.to_disk(output_file)
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
|
from typing import Callable, Iterable, Mapping, Optional, Any, Union
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
@ -7,7 +7,7 @@ import re
|
||||||
import sys
|
import sys
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt, walk_directory
|
||||||
from ..training import docs_to_json
|
from ..training import docs_to_json
|
||||||
from ..tokens import Doc, DocBin
|
from ..tokens import Doc, DocBin
|
||||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||||
|
@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
|
||||||
"json": json_to_docs,
|
"json": json_to_docs,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AUTO = "auto"
|
||||||
|
|
||||||
|
|
||||||
# File types that can be written to stdout
|
# File types that can be written to stdout
|
||||||
FILE_TYPES_STDOUT = ("json",)
|
FILE_TYPES_STDOUT = ("json",)
|
||||||
|
@ -49,7 +51,7 @@ def convert_cli(
|
||||||
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
||||||
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||||
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||||
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||||
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
||||||
|
@ -70,8 +72,8 @@ def convert_cli(
|
||||||
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
||||||
silent = output_dir == "-"
|
silent = output_dir == "-"
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
|
||||||
converter = _get_converter(msg, converter, input_path)
|
converter = _get_converter(msg, converter, input_path)
|
||||||
|
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||||
convert(
|
convert(
|
||||||
input_path,
|
input_path,
|
||||||
output_dir,
|
output_dir,
|
||||||
|
@ -100,7 +102,7 @@ def convert(
|
||||||
model: Optional[str] = None,
|
model: Optional[str] = None,
|
||||||
morphology: bool = False,
|
morphology: bool = False,
|
||||||
merge_subtokens: bool = False,
|
merge_subtokens: bool = False,
|
||||||
converter: str = "auto",
|
converter: str,
|
||||||
ner_map: Optional[Path] = None,
|
ner_map: Optional[Path] = None,
|
||||||
lang: Optional[str] = None,
|
lang: Optional[str] = None,
|
||||||
concatenate: bool = False,
|
concatenate: bool = False,
|
||||||
|
@ -189,33 +191,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def walk_directory(path: Path, converter: str) -> List[Path]:
|
|
||||||
if not path.is_dir():
|
|
||||||
return [path]
|
|
||||||
paths = [path]
|
|
||||||
locs = []
|
|
||||||
seen = set()
|
|
||||||
for path in paths:
|
|
||||||
if str(path) in seen:
|
|
||||||
continue
|
|
||||||
seen.add(str(path))
|
|
||||||
if path.parts[-1].startswith("."):
|
|
||||||
continue
|
|
||||||
elif path.is_dir():
|
|
||||||
paths.extend(path.iterdir())
|
|
||||||
elif converter == "json" and not path.parts[-1].endswith("json"):
|
|
||||||
continue
|
|
||||||
elif converter == "conll" and not path.parts[-1].endswith("conll"):
|
|
||||||
continue
|
|
||||||
elif converter == "iob" and not path.parts[-1].endswith("iob"):
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
locs.append(path)
|
|
||||||
# It's good to sort these, in case the ordering messes up cache.
|
|
||||||
locs.sort()
|
|
||||||
return locs
|
|
||||||
|
|
||||||
|
|
||||||
def verify_cli_args(
|
def verify_cli_args(
|
||||||
msg: Printer,
|
msg: Printer,
|
||||||
input_path: Path,
|
input_path: Path,
|
||||||
|
@ -239,18 +214,22 @@ def verify_cli_args(
|
||||||
input_locs = walk_directory(input_path, converter)
|
input_locs = walk_directory(input_path, converter)
|
||||||
if len(input_locs) == 0:
|
if len(input_locs) == 0:
|
||||||
msg.fail("No input files in directory", input_path, exits=1)
|
msg.fail("No input files in directory", input_path, exits=1)
|
||||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
if converter not in CONVERTERS:
|
||||||
if converter == "auto" and len(file_types) >= 2:
|
|
||||||
file_types_str = ",".join(file_types)
|
|
||||||
msg.fail("All input files must be same type", file_types_str, exits=1)
|
|
||||||
if converter != "auto" and converter not in CONVERTERS:
|
|
||||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||||
|
|
||||||
|
|
||||||
def _get_converter(msg, converter, input_path: Path):
|
def _get_converter(msg, converter, input_path: Path):
|
||||||
if input_path.is_dir():
|
if input_path.is_dir():
|
||||||
input_path = walk_directory(input_path, converter)[0]
|
if converter == AUTO:
|
||||||
if converter == "auto":
|
input_locs = walk_directory(input_path, suffix=None)
|
||||||
|
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||||
|
if len(file_types) >= 2:
|
||||||
|
file_types_str = ",".join(file_types)
|
||||||
|
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||||
|
input_path = input_locs[0]
|
||||||
|
else:
|
||||||
|
input_path = walk_directory(input_path, suffix=converter)[0]
|
||||||
|
if converter == AUTO:
|
||||||
converter = input_path.suffix[1:]
|
converter = input_path.suffix[1:]
|
||||||
if converter == "ner" or converter == "iob":
|
if converter == "ner" or converter == "iob":
|
||||||
with input_path.open(encoding="utf8") as file_:
|
with input_path.open(encoding="utf8") as file_:
|
||||||
|
|
|
@ -11,6 +11,7 @@ from .render import DependencyRenderer, EntityRenderer, SpanRenderer
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..util import is_in_jupyter
|
from ..util import is_in_jupyter
|
||||||
|
from ..util import find_available_port
|
||||||
|
|
||||||
|
|
||||||
_html = {}
|
_html = {}
|
||||||
|
@ -36,7 +37,7 @@ def render(
|
||||||
jupyter (bool): Override Jupyter auto-detection.
|
jupyter (bool): Override Jupyter auto-detection.
|
||||||
options (dict): Visualiser-specific options, e.g. colors.
|
options (dict): Visualiser-specific options, e.g. colors.
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
RETURNS (str): Rendered HTML markup.
|
RETURNS (str): Rendered SVG or HTML markup.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
DOCS: https://spacy.io/api/top-level#displacy.render
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://spacy.io/usage/visualizers
|
||||||
|
@ -82,6 +83,7 @@ def serve(
|
||||||
manual: bool = False,
|
manual: bool = False,
|
||||||
port: int = 5000,
|
port: int = 5000,
|
||||||
host: str = "0.0.0.0",
|
host: str = "0.0.0.0",
|
||||||
|
auto_select_port: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Serve displaCy visualisation.
|
"""Serve displaCy visualisation.
|
||||||
|
|
||||||
|
@ -93,15 +95,20 @@ def serve(
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
host (str): Host to serve visualisation.
|
host (str): Host to serve visualisation.
|
||||||
|
auto_select_port (bool): Automatically select a port if the specified port is in use.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
DOCS: https://spacy.io/api/top-level#displacy.serve
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
|
|
||||||
|
port = find_available_port(port, host, auto_select_port)
|
||||||
|
|
||||||
if is_in_jupyter():
|
if is_in_jupyter():
|
||||||
warnings.warn(Warnings.W011)
|
warnings.warn(Warnings.W011)
|
||||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
render(
|
||||||
|
docs, style=style, page=page, minify=minify, options=options, manual=manual
|
||||||
|
)
|
||||||
httpd = simple_server.make_server(host, port, app)
|
httpd = simple_server.make_server(host, port, app)
|
||||||
print(f"\nUsing the '{style}' visualizer")
|
print(f"\nUsing the '{style}' visualizer")
|
||||||
print(f"Serving on http://{host}:{port} ...\n")
|
print(f"Serving on http://{host}:{port} ...\n")
|
||||||
|
|
|
@ -94,7 +94,7 @@ class SpanRenderer:
|
||||||
parsed (list): Dependency parses to render.
|
parsed (list): Dependency parses to render.
|
||||||
page (bool): Render parses wrapped as full HTML page.
|
page (bool): Render parses wrapped as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (str): Rendered HTML markup.
|
RETURNS (str): Rendered SVG or HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = []
|
rendered = []
|
||||||
for i, p in enumerate(parsed):
|
for i, p in enumerate(parsed):
|
||||||
|
@ -510,7 +510,7 @@ class EntityRenderer:
|
||||||
parsed (list): Dependency parses to render.
|
parsed (list): Dependency parses to render.
|
||||||
page (bool): Render parses wrapped as full HTML page.
|
page (bool): Render parses wrapped as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (str): Rendered HTML markup.
|
RETURNS (str): Rendered SVG or HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = []
|
rendered = []
|
||||||
for i, p in enumerate(parsed):
|
for i, p in enumerate(parsed):
|
||||||
|
|
|
@ -214,6 +214,7 @@ class Warnings(metaclass=ErrorsWithCodes):
|
||||||
"is a Cython extension type.")
|
"is a Cython extension type.")
|
||||||
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
||||||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||||
|
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||||
|
|
||||||
|
|
||||||
class Errors(metaclass=ErrorsWithCodes):
|
class Errors(metaclass=ErrorsWithCodes):
|
||||||
|
@ -345,6 +346,11 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"clear the existing vectors and resize the table.")
|
"clear the existing vectors and resize the table.")
|
||||||
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
||||||
"to end with the attribute {attr}. Got: {bad_attr}.")
|
"to end with the attribute {attr}. Got: {bad_attr}.")
|
||||||
|
E079 = ("Error computing states in beam: number of predicted beams "
|
||||||
|
"({pbeams}) does not equal number of gold beams ({gbeams}).")
|
||||||
|
E080 = ("Duplicate state found in beam: {key}.")
|
||||||
|
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
|
||||||
|
"does not equal number of losses ({losses}).")
|
||||||
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
||||||
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
||||||
"match.")
|
"match.")
|
||||||
|
@ -957,6 +963,11 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
|
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
|
||||||
"knowledge base, use `InMemoryLookupKB`.")
|
"knowledge base, use `InMemoryLookupKB`.")
|
||||||
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
|
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
|
||||||
|
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
|
||||||
|
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||||
|
"with `displacy.serve(doc, port)`")
|
||||||
|
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port)` "
|
||||||
|
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
|
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
|
||||||
afgelopen aldus alhoewel anderzijds
|
afgelopen aldus alhoewel anderzijds
|
||||||
|
|
||||||
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
|
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
|
||||||
|
|
|
@ -4,6 +4,8 @@ from libc.stdint cimport int64_t
|
||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
cdef extern from "polyleven.c":
|
cdef extern from "polyleven.c":
|
||||||
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
|
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
|
||||||
|
@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
|
||||||
if k is None:
|
if k is None:
|
||||||
k = -1
|
k = -1
|
||||||
return polyleven(<PyObject*>a, <PyObject*>b, k)
|
return polyleven(<PyObject*>a, <PyObject*>b, k)
|
||||||
|
|
||||||
|
|
||||||
|
cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
|
||||||
|
if fuzzy >= 0:
|
||||||
|
max_edits = fuzzy
|
||||||
|
else:
|
||||||
|
# allow at least two edits (to allow at least one transposition) and up
|
||||||
|
# to 30% of the pattern string length
|
||||||
|
max_edits = max(2, round(0.3 * len(pattern_text)))
|
||||||
|
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||||
|
|
||||||
|
|
||||||
|
@registry.misc("spacy.levenshtein_compare.v1")
|
||||||
|
def make_levenshtein_compare():
|
||||||
|
return levenshtein_compare
|
||||||
|
|
|
@ -77,3 +77,4 @@ cdef class Matcher:
|
||||||
cdef public object _extensions
|
cdef public object _extensions
|
||||||
cdef public object _extra_predicates
|
cdef public object _extra_predicates
|
||||||
cdef public object _seen_attrs
|
cdef public object _seen_attrs
|
||||||
|
cdef public object _fuzzy_compare
|
||||||
|
|
|
@ -5,7 +5,8 @@ from ..vocab import Vocab
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
|
|
||||||
class Matcher:
|
class Matcher:
|
||||||
def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ...
|
def __init__(self, vocab: Vocab, validate: bool = ...,
|
||||||
|
fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
|
||||||
def __reduce__(self) -> Any: ...
|
def __reduce__(self) -> Any: ...
|
||||||
def __len__(self) -> int: ...
|
def __len__(self) -> int: ...
|
||||||
def __contains__(self, key: str) -> bool: ...
|
def __contains__(self, key: str) -> bool: ...
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: binding=True, infer_types=True, profile=True
|
||||||
from typing import List, Iterable
|
from typing import List, Iterable
|
||||||
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
@ -20,10 +20,12 @@ from ..tokens.token cimport Token
|
||||||
from ..tokens.morphanalysis cimport MorphAnalysis
|
from ..tokens.morphanalysis cimport MorphAnalysis
|
||||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
|
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
|
||||||
|
|
||||||
|
from .levenshtein import levenshtein_compare
|
||||||
from ..schemas import validate_token_pattern
|
from ..schemas import validate_token_pattern
|
||||||
from ..errors import Errors, MatchPatternError, Warnings
|
from ..errors import Errors, MatchPatternError, Warnings
|
||||||
from ..strings import get_string_id
|
from ..strings import get_string_id
|
||||||
from ..attrs import IDS
|
from ..attrs import IDS
|
||||||
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
@ -36,11 +38,13 @@ cdef class Matcher:
|
||||||
USAGE: https://spacy.io/usage/rule-based-matching
|
USAGE: https://spacy.io/usage/rule-based-matching
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, validate=True):
|
def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
|
||||||
"""Create the Matcher.
|
"""Create the Matcher.
|
||||||
|
|
||||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||||
documents the matcher will operate on.
|
validate (bool): Validate all patterns added to this matcher.
|
||||||
|
fuzzy_compare (Callable[[str, str, int], bool]): The comparison method
|
||||||
|
for the FUZZY operators.
|
||||||
"""
|
"""
|
||||||
self._extra_predicates = []
|
self._extra_predicates = []
|
||||||
self._patterns = {}
|
self._patterns = {}
|
||||||
|
@ -51,9 +55,10 @@ cdef class Matcher:
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.validate = validate
|
self.validate = validate
|
||||||
|
self._fuzzy_compare = fuzzy_compare
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
data = (self.vocab, self._patterns, self._callbacks)
|
data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
|
||||||
return (unpickle_matcher, data, None, None)
|
return (unpickle_matcher, data, None, None)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
@ -128,7 +133,7 @@ cdef class Matcher:
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
try:
|
try:
|
||||||
specs = _preprocess_pattern(pattern, self.vocab,
|
specs = _preprocess_pattern(pattern, self.vocab,
|
||||||
self._extensions, self._extra_predicates)
|
self._extensions, self._extra_predicates, self._fuzzy_compare)
|
||||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||||
for spec in specs:
|
for spec in specs:
|
||||||
for attr, _ in spec[1]:
|
for attr, _ in spec[1]:
|
||||||
|
@ -326,8 +331,8 @@ cdef class Matcher:
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
|
||||||
def unpickle_matcher(vocab, patterns, callbacks):
|
def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
|
||||||
matcher = Matcher(vocab)
|
matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
|
||||||
for key, pattern in patterns.items():
|
for key, pattern in patterns.items():
|
||||||
callback = callbacks.get(key, None)
|
callback = callbacks.get(key, None)
|
||||||
matcher.add(key, pattern, on_match=callback)
|
matcher.add(key, pattern, on_match=callback)
|
||||||
|
@ -754,7 +759,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
||||||
return id_attr.value
|
return id_attr.value
|
||||||
|
|
||||||
|
|
||||||
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
|
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
|
||||||
"""This function interprets the pattern, converting the various bits of
|
"""This function interprets the pattern, converting the various bits of
|
||||||
syntactic sugar before we compile it into a struct with init_pattern.
|
syntactic sugar before we compile it into a struct with init_pattern.
|
||||||
|
|
||||||
|
@ -781,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
|
||||||
ops = _get_operators(spec)
|
ops = _get_operators(spec)
|
||||||
attr_values = _get_attr_values(spec, string_store)
|
attr_values = _get_attr_values(spec, string_store)
|
||||||
extensions = _get_extensions(spec, string_store, extensions_table)
|
extensions = _get_extensions(spec, string_store, extensions_table)
|
||||||
predicates = _get_extra_predicates(spec, extra_predicates, vocab)
|
predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
|
||||||
for op in ops:
|
for op in ops:
|
||||||
tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
|
tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
|
||||||
return tokens
|
return tokens
|
||||||
|
@ -826,16 +831,45 @@ def _get_attr_values(spec, string_store):
|
||||||
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
||||||
# extensions to the matcher introduced in #3173.
|
# extensions to the matcher introduced in #3173.
|
||||||
|
|
||||||
|
class _FuzzyPredicate:
|
||||||
|
operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5",
|
||||||
|
"FUZZY6", "FUZZY7", "FUZZY8", "FUZZY9")
|
||||||
|
|
||||||
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||||
|
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
|
self.i = i
|
||||||
|
self.attr = attr
|
||||||
|
self.value = value
|
||||||
|
self.predicate = predicate
|
||||||
|
self.is_extension = is_extension
|
||||||
|
if self.predicate not in self.operators:
|
||||||
|
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||||
|
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||||
|
self.fuzzy = int(fuzz) if fuzz else -1
|
||||||
|
self.fuzzy_compare = fuzzy_compare
|
||||||
|
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||||
|
|
||||||
|
def __call__(self, Token token):
|
||||||
|
if self.is_extension:
|
||||||
|
value = token._.get(self.attr)
|
||||||
|
else:
|
||||||
|
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
|
||||||
|
if self.value == value:
|
||||||
|
return True
|
||||||
|
return self.fuzzy_compare(value, self.value, self.fuzzy)
|
||||||
|
|
||||||
|
|
||||||
class _RegexPredicate:
|
class _RegexPredicate:
|
||||||
operators = ("REGEX",)
|
operators = ("REGEX",)
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||||
|
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.value = re.compile(value)
|
self.value = re.compile(value)
|
||||||
self.predicate = predicate
|
self.predicate = predicate
|
||||||
self.is_extension = is_extension
|
self.is_extension = is_extension
|
||||||
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||||
if self.predicate not in self.operators:
|
if self.predicate not in self.operators:
|
||||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||||
|
|
||||||
|
@ -850,18 +884,28 @@ class _RegexPredicate:
|
||||||
class _SetPredicate:
|
class _SetPredicate:
|
||||||
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
|
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||||
|
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
self.regex = regex
|
||||||
|
self.fuzzy = fuzzy
|
||||||
|
self.fuzzy_compare = fuzzy_compare
|
||||||
if self.attr == MORPH:
|
if self.attr == MORPH:
|
||||||
# normalize morph strings
|
# normalize morph strings
|
||||||
self.value = set(self.vocab.morphology.add(v) for v in value)
|
self.value = set(self.vocab.morphology.add(v) for v in value)
|
||||||
else:
|
else:
|
||||||
self.value = set(get_string_id(v) for v in value)
|
if self.regex:
|
||||||
|
self.value = set(re.compile(v) for v in value)
|
||||||
|
elif self.fuzzy is not None:
|
||||||
|
# add to string store
|
||||||
|
self.value = set(self.vocab.strings.add(v) for v in value)
|
||||||
|
else:
|
||||||
|
self.value = set(get_string_id(v) for v in value)
|
||||||
self.predicate = predicate
|
self.predicate = predicate
|
||||||
self.is_extension = is_extension
|
self.is_extension = is_extension
|
||||||
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||||
if self.predicate not in self.operators:
|
if self.predicate not in self.operators:
|
||||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||||
|
|
||||||
|
@ -889,9 +933,29 @@ class _SetPredicate:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if self.predicate == "IN":
|
if self.predicate == "IN":
|
||||||
return value in self.value
|
if self.regex:
|
||||||
|
value = self.vocab.strings[value]
|
||||||
|
return any(bool(v.search(value)) for v in self.value)
|
||||||
|
elif self.fuzzy is not None:
|
||||||
|
value = self.vocab.strings[value]
|
||||||
|
return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
|
||||||
|
for v in self.value)
|
||||||
|
elif value in self.value:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
elif self.predicate == "NOT_IN":
|
elif self.predicate == "NOT_IN":
|
||||||
return value not in self.value
|
if self.regex:
|
||||||
|
value = self.vocab.strings[value]
|
||||||
|
return not any(bool(v.search(value)) for v in self.value)
|
||||||
|
elif self.fuzzy is not None:
|
||||||
|
value = self.vocab.strings[value]
|
||||||
|
return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
|
||||||
|
for v in self.value)
|
||||||
|
elif value in self.value:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
elif self.predicate == "IS_SUBSET":
|
elif self.predicate == "IS_SUBSET":
|
||||||
return value <= self.value
|
return value <= self.value
|
||||||
elif self.predicate == "IS_SUPERSET":
|
elif self.predicate == "IS_SUPERSET":
|
||||||
|
@ -906,13 +970,14 @@ class _SetPredicate:
|
||||||
class _ComparisonPredicate:
|
class _ComparisonPredicate:
|
||||||
operators = ("==", "!=", ">=", "<=", ">", "<")
|
operators = ("==", "!=", ">=", "<=", ">", "<")
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||||
|
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.value = value
|
self.value = value
|
||||||
self.predicate = predicate
|
self.predicate = predicate
|
||||||
self.is_extension = is_extension
|
self.is_extension = is_extension
|
||||||
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||||
if self.predicate not in self.operators:
|
if self.predicate not in self.operators:
|
||||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||||
|
|
||||||
|
@ -935,7 +1000,7 @@ class _ComparisonPredicate:
|
||||||
return value < self.value
|
return value < self.value
|
||||||
|
|
||||||
|
|
||||||
def _get_extra_predicates(spec, extra_predicates, vocab):
|
def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
|
||||||
predicate_types = {
|
predicate_types = {
|
||||||
"REGEX": _RegexPredicate,
|
"REGEX": _RegexPredicate,
|
||||||
"IN": _SetPredicate,
|
"IN": _SetPredicate,
|
||||||
|
@ -949,6 +1014,16 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
||||||
"<=": _ComparisonPredicate,
|
"<=": _ComparisonPredicate,
|
||||||
">": _ComparisonPredicate,
|
">": _ComparisonPredicate,
|
||||||
"<": _ComparisonPredicate,
|
"<": _ComparisonPredicate,
|
||||||
|
"FUZZY": _FuzzyPredicate,
|
||||||
|
"FUZZY1": _FuzzyPredicate,
|
||||||
|
"FUZZY2": _FuzzyPredicate,
|
||||||
|
"FUZZY3": _FuzzyPredicate,
|
||||||
|
"FUZZY4": _FuzzyPredicate,
|
||||||
|
"FUZZY5": _FuzzyPredicate,
|
||||||
|
"FUZZY6": _FuzzyPredicate,
|
||||||
|
"FUZZY7": _FuzzyPredicate,
|
||||||
|
"FUZZY8": _FuzzyPredicate,
|
||||||
|
"FUZZY9": _FuzzyPredicate,
|
||||||
}
|
}
|
||||||
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
|
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
|
||||||
output = []
|
output = []
|
||||||
|
@ -966,22 +1041,47 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
||||||
attr = "ORTH"
|
attr = "ORTH"
|
||||||
attr = IDS.get(attr.upper())
|
attr = IDS.get(attr.upper())
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
processed = False
|
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||||
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
|
extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
|
||||||
for type_, cls in predicate_types.items():
|
return output
|
||||||
if type_ in value_with_upper_keys:
|
|
||||||
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
|
|
||||||
# Don't create a redundant predicates.
|
def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
|
||||||
# This helps with efficiency, as we're caching the results.
|
extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
if predicate.key in seen_predicates:
|
output = []
|
||||||
output.append(seen_predicates[predicate.key])
|
for type_, value in value_dict.items():
|
||||||
else:
|
type_ = type_.upper()
|
||||||
extra_predicates.append(predicate)
|
cls = predicate_types.get(type_)
|
||||||
output.append(predicate.i)
|
if cls is None:
|
||||||
seen_predicates[predicate.key] = predicate.i
|
warnings.warn(Warnings.W035.format(pattern=value_dict))
|
||||||
processed = True
|
# ignore unrecognized predicate type
|
||||||
if not processed:
|
continue
|
||||||
warnings.warn(Warnings.W035.format(pattern=value))
|
elif cls == _RegexPredicate:
|
||||||
|
if isinstance(value, dict):
|
||||||
|
# add predicates inside regex operator
|
||||||
|
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||||
|
extra_predicates, seen_predicates,
|
||||||
|
regex=True))
|
||||||
|
continue
|
||||||
|
elif cls == _FuzzyPredicate:
|
||||||
|
if isinstance(value, dict):
|
||||||
|
# add predicates inside fuzzy operator
|
||||||
|
fuzz = type_[len("FUZZY"):] # number after prefix
|
||||||
|
fuzzy_val = int(fuzz) if fuzz else -1
|
||||||
|
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||||
|
extra_predicates, seen_predicates,
|
||||||
|
fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
|
||||||
|
continue
|
||||||
|
predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
|
||||||
|
regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
|
||||||
|
# Don't create redundant predicates.
|
||||||
|
# This helps with efficiency, as we're caching the results.
|
||||||
|
if predicate.key in seen_predicates:
|
||||||
|
output.append(seen_predicates[predicate.key])
|
||||||
|
else:
|
||||||
|
extra_predicates.append(predicate)
|
||||||
|
output.append(predicate.i)
|
||||||
|
seen_predicates[predicate.key] = predicate.i
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,7 @@ from ..errors import Errors, Warnings
|
||||||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
|
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..matcher import Matcher, PhraseMatcher
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
|
from ..matcher.levenshtein import levenshtein_compare
|
||||||
from ..scorer import get_ner_prf
|
from ..scorer import get_ner_prf
|
||||||
|
|
||||||
|
|
||||||
|
@ -23,6 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
||||||
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
|
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
|
||||||
default_config={
|
default_config={
|
||||||
"phrase_matcher_attr": None,
|
"phrase_matcher_attr": None,
|
||||||
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
"validate": False,
|
"validate": False,
|
||||||
"overwrite_ents": False,
|
"overwrite_ents": False,
|
||||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||||
|
@ -39,6 +41,7 @@ def make_entity_ruler(
|
||||||
nlp: Language,
|
nlp: Language,
|
||||||
name: str,
|
name: str,
|
||||||
phrase_matcher_attr: Optional[Union[int, str]],
|
phrase_matcher_attr: Optional[Union[int, str]],
|
||||||
|
matcher_fuzzy_compare: Callable,
|
||||||
validate: bool,
|
validate: bool,
|
||||||
overwrite_ents: bool,
|
overwrite_ents: bool,
|
||||||
ent_id_sep: str,
|
ent_id_sep: str,
|
||||||
|
@ -48,6 +51,7 @@ def make_entity_ruler(
|
||||||
nlp,
|
nlp,
|
||||||
name,
|
name,
|
||||||
phrase_matcher_attr=phrase_matcher_attr,
|
phrase_matcher_attr=phrase_matcher_attr,
|
||||||
|
matcher_fuzzy_compare=matcher_fuzzy_compare,
|
||||||
validate=validate,
|
validate=validate,
|
||||||
overwrite_ents=overwrite_ents,
|
overwrite_ents=overwrite_ents,
|
||||||
ent_id_sep=ent_id_sep,
|
ent_id_sep=ent_id_sep,
|
||||||
|
@ -81,6 +85,7 @@ class EntityRuler(Pipe):
|
||||||
name: str = "entity_ruler",
|
name: str = "entity_ruler",
|
||||||
*,
|
*,
|
||||||
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
||||||
|
matcher_fuzzy_compare: Callable = levenshtein_compare,
|
||||||
validate: bool = False,
|
validate: bool = False,
|
||||||
overwrite_ents: bool = False,
|
overwrite_ents: bool = False,
|
||||||
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
||||||
|
@ -99,7 +104,10 @@ class EntityRuler(Pipe):
|
||||||
added. Used to disable the current entity ruler while creating
|
added. Used to disable the current entity ruler while creating
|
||||||
phrase patterns with the nlp object.
|
phrase patterns with the nlp object.
|
||||||
phrase_matcher_attr (int / str): Token attribute to match on, passed
|
phrase_matcher_attr (int / str): Token attribute to match on, passed
|
||||||
to the internal PhraseMatcher as `attr`
|
to the internal PhraseMatcher as `attr`.
|
||||||
|
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
|
||||||
|
internal Matcher. Defaults to
|
||||||
|
spacy.matcher.levenshtein.levenshtein_compare.
|
||||||
validate (bool): Whether patterns should be validated, passed to
|
validate (bool): Whether patterns should be validated, passed to
|
||||||
Matcher and PhraseMatcher as `validate`
|
Matcher and PhraseMatcher as `validate`
|
||||||
patterns (iterable): Optional patterns to load in.
|
patterns (iterable): Optional patterns to load in.
|
||||||
|
@ -117,7 +125,10 @@ class EntityRuler(Pipe):
|
||||||
self.token_patterns = defaultdict(list) # type: ignore
|
self.token_patterns = defaultdict(list) # type: ignore
|
||||||
self.phrase_patterns = defaultdict(list) # type: ignore
|
self.phrase_patterns = defaultdict(list) # type: ignore
|
||||||
self._validate = validate
|
self._validate = validate
|
||||||
self.matcher = Matcher(nlp.vocab, validate=validate)
|
self.matcher_fuzzy_compare = matcher_fuzzy_compare
|
||||||
|
self.matcher = Matcher(
|
||||||
|
nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
|
||||||
|
)
|
||||||
self.phrase_matcher_attr = phrase_matcher_attr
|
self.phrase_matcher_attr = phrase_matcher_attr
|
||||||
self.phrase_matcher = PhraseMatcher(
|
self.phrase_matcher = PhraseMatcher(
|
||||||
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
|
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
|
||||||
|
@ -337,7 +348,11 @@ class EntityRuler(Pipe):
|
||||||
self.token_patterns = defaultdict(list)
|
self.token_patterns = defaultdict(list)
|
||||||
self.phrase_patterns = defaultdict(list)
|
self.phrase_patterns = defaultdict(list)
|
||||||
self._ent_ids = defaultdict(tuple)
|
self._ent_ids = defaultdict(tuple)
|
||||||
self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
|
self.matcher = Matcher(
|
||||||
|
self.nlp.vocab,
|
||||||
|
validate=self._validate,
|
||||||
|
fuzzy_compare=self.matcher_fuzzy_compare,
|
||||||
|
)
|
||||||
self.phrase_matcher = PhraseMatcher(
|
self.phrase_matcher = PhraseMatcher(
|
||||||
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
|
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
|
||||||
)
|
)
|
||||||
|
@ -431,7 +446,8 @@ class EntityRuler(Pipe):
|
||||||
self.overwrite = cfg.get("overwrite", False)
|
self.overwrite = cfg.get("overwrite", False)
|
||||||
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
|
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
|
||||||
self.phrase_matcher = PhraseMatcher(
|
self.phrase_matcher = PhraseMatcher(
|
||||||
self.nlp.vocab, attr=self.phrase_matcher_attr
|
self.nlp.vocab,
|
||||||
|
attr=self.phrase_matcher_attr,
|
||||||
)
|
)
|
||||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -13,6 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..matcher import Matcher, PhraseMatcher
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
|
from ..matcher.levenshtein import levenshtein_compare
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
||||||
|
@ -28,6 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
|
||||||
"overwrite_ents": False,
|
"overwrite_ents": False,
|
||||||
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||||
"ent_id_sep": "__unused__",
|
"ent_id_sep": "__unused__",
|
||||||
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"ents_f": 1.0,
|
"ents_f": 1.0,
|
||||||
|
@ -40,6 +42,7 @@ def make_entity_ruler(
|
||||||
nlp: Language,
|
nlp: Language,
|
||||||
name: str,
|
name: str,
|
||||||
phrase_matcher_attr: Optional[Union[int, str]],
|
phrase_matcher_attr: Optional[Union[int, str]],
|
||||||
|
matcher_fuzzy_compare: Callable,
|
||||||
validate: bool,
|
validate: bool,
|
||||||
overwrite_ents: bool,
|
overwrite_ents: bool,
|
||||||
scorer: Optional[Callable],
|
scorer: Optional[Callable],
|
||||||
|
@ -57,6 +60,7 @@ def make_entity_ruler(
|
||||||
annotate_ents=True,
|
annotate_ents=True,
|
||||||
ents_filter=ents_filter,
|
ents_filter=ents_filter,
|
||||||
phrase_matcher_attr=phrase_matcher_attr,
|
phrase_matcher_attr=phrase_matcher_attr,
|
||||||
|
matcher_fuzzy_compare=matcher_fuzzy_compare,
|
||||||
validate=validate,
|
validate=validate,
|
||||||
overwrite=False,
|
overwrite=False,
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
|
@ -72,6 +76,7 @@ def make_entity_ruler(
|
||||||
"annotate_ents": False,
|
"annotate_ents": False,
|
||||||
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
|
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
|
||||||
"phrase_matcher_attr": None,
|
"phrase_matcher_attr": None,
|
||||||
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
"validate": False,
|
"validate": False,
|
||||||
"overwrite": True,
|
"overwrite": True,
|
||||||
"scorer": {
|
"scorer": {
|
||||||
|
@ -94,6 +99,7 @@ def make_span_ruler(
|
||||||
annotate_ents: bool,
|
annotate_ents: bool,
|
||||||
ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
|
ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
|
||||||
phrase_matcher_attr: Optional[Union[int, str]],
|
phrase_matcher_attr: Optional[Union[int, str]],
|
||||||
|
matcher_fuzzy_compare: Callable,
|
||||||
validate: bool,
|
validate: bool,
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
scorer: Optional[Callable],
|
scorer: Optional[Callable],
|
||||||
|
@ -106,6 +112,7 @@ def make_span_ruler(
|
||||||
annotate_ents=annotate_ents,
|
annotate_ents=annotate_ents,
|
||||||
ents_filter=ents_filter,
|
ents_filter=ents_filter,
|
||||||
phrase_matcher_attr=phrase_matcher_attr,
|
phrase_matcher_attr=phrase_matcher_attr,
|
||||||
|
matcher_fuzzy_compare=matcher_fuzzy_compare,
|
||||||
validate=validate,
|
validate=validate,
|
||||||
overwrite=overwrite,
|
overwrite=overwrite,
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
|
@ -170,7 +177,7 @@ def prioritize_existing_ents_filter(
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
|
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
|
||||||
def make_preverse_existing_ents_filter():
|
def make_preserve_existing_ents_filter():
|
||||||
return prioritize_existing_ents_filter
|
return prioritize_existing_ents_filter
|
||||||
|
|
||||||
|
|
||||||
|
@ -216,6 +223,7 @@ class SpanRuler(Pipe):
|
||||||
[Iterable[Span], Iterable[Span]], Iterable[Span]
|
[Iterable[Span], Iterable[Span]], Iterable[Span]
|
||||||
] = util.filter_chain_spans,
|
] = util.filter_chain_spans,
|
||||||
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
||||||
|
matcher_fuzzy_compare: Callable = levenshtein_compare,
|
||||||
validate: bool = False,
|
validate: bool = False,
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
scorer: Optional[Callable] = partial(
|
scorer: Optional[Callable] = partial(
|
||||||
|
@ -246,6 +254,9 @@ class SpanRuler(Pipe):
|
||||||
phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to
|
phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to
|
||||||
match on, passed to the internal PhraseMatcher as `attr`. Defaults
|
match on, passed to the internal PhraseMatcher as `attr`. Defaults
|
||||||
to `None`.
|
to `None`.
|
||||||
|
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
|
||||||
|
internal Matcher. Defaults to
|
||||||
|
spacy.matcher.levenshtein.levenshtein_compare.
|
||||||
validate (bool): Whether patterns should be validated, passed to
|
validate (bool): Whether patterns should be validated, passed to
|
||||||
Matcher and PhraseMatcher as `validate`.
|
Matcher and PhraseMatcher as `validate`.
|
||||||
overwrite (bool): Whether to remove any existing spans under this spans
|
overwrite (bool): Whether to remove any existing spans under this spans
|
||||||
|
@ -266,6 +277,7 @@ class SpanRuler(Pipe):
|
||||||
self.spans_filter = spans_filter
|
self.spans_filter = spans_filter
|
||||||
self.ents_filter = ents_filter
|
self.ents_filter = ents_filter
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
|
self.matcher_fuzzy_compare = matcher_fuzzy_compare
|
||||||
self._match_label_id_map: Dict[int, Dict[str, str]] = {}
|
self._match_label_id_map: Dict[int, Dict[str, str]] = {}
|
||||||
self.clear()
|
self.clear()
|
||||||
|
|
||||||
|
@ -451,7 +463,11 @@ class SpanRuler(Pipe):
|
||||||
DOCS: https://spacy.io/api/spanruler#clear
|
DOCS: https://spacy.io/api/spanruler#clear
|
||||||
"""
|
"""
|
||||||
self._patterns: List[PatternType] = []
|
self._patterns: List[PatternType] = []
|
||||||
self.matcher: Matcher = Matcher(self.nlp.vocab, validate=self.validate)
|
self.matcher: Matcher = Matcher(
|
||||||
|
self.nlp.vocab,
|
||||||
|
validate=self.validate,
|
||||||
|
fuzzy_compare=self.matcher_fuzzy_compare,
|
||||||
|
)
|
||||||
self.phrase_matcher: PhraseMatcher = PhraseMatcher(
|
self.phrase_matcher: PhraseMatcher = PhraseMatcher(
|
||||||
self.nlp.vocab,
|
self.nlp.vocab,
|
||||||
attr=self.phrase_matcher_attr,
|
attr=self.phrase_matcher_attr,
|
||||||
|
|
|
@ -74,7 +74,7 @@ subword_features = true
|
||||||
default_config={
|
default_config={
|
||||||
"threshold": 0.0,
|
"threshold": 0.0,
|
||||||
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||||
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
|
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"cats_score": 1.0,
|
"cats_score": 1.0,
|
||||||
|
@ -117,7 +117,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.scorers("spacy.textcat_scorer.v1")
|
@registry.scorers("spacy.textcat_scorer.v2")
|
||||||
def make_textcat_scorer():
|
def make_textcat_scorer():
|
||||||
return textcat_score
|
return textcat_score
|
||||||
|
|
||||||
|
|
|
@ -74,7 +74,7 @@ subword_features = true
|
||||||
default_config={
|
default_config={
|
||||||
"threshold": 0.5,
|
"threshold": 0.5,
|
||||||
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
||||||
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
|
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"cats_score": 1.0,
|
"cats_score": 1.0,
|
||||||
|
@ -120,7 +120,7 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
|
@registry.scorers("spacy.textcat_multilabel_scorer.v2")
|
||||||
def make_textcat_multilabel_scorer():
|
def make_textcat_multilabel_scorer():
|
||||||
return textcat_multilabel_score
|
return textcat_multilabel_score
|
||||||
|
|
||||||
|
|
|
@ -156,12 +156,22 @@ def validate_token_pattern(obj: list) -> List[str]:
|
||||||
|
|
||||||
|
|
||||||
class TokenPatternString(BaseModel):
|
class TokenPatternString(BaseModel):
|
||||||
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
REGEX: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="regex")
|
||||||
IN: Optional[List[StrictStr]] = Field(None, alias="in")
|
IN: Optional[List[StrictStr]] = Field(None, alias="in")
|
||||||
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
||||||
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
|
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
|
||||||
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
|
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
|
||||||
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
|
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
|
||||||
|
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
|
||||||
|
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
|
||||||
|
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
|
||||||
|
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
|
||||||
|
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
|
||||||
|
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
|
||||||
|
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
|
||||||
|
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
|
||||||
|
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
|
||||||
|
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
extra = "forbid"
|
extra = "forbid"
|
||||||
|
|
|
@ -476,14 +476,12 @@ class Scorer:
|
||||||
f_per_type = {label: PRFScore() for label in labels}
|
f_per_type = {label: PRFScore() for label in labels}
|
||||||
auc_per_type = {label: ROCAUCScore() for label in labels}
|
auc_per_type = {label: ROCAUCScore() for label in labels}
|
||||||
labels = set(labels)
|
labels = set(labels)
|
||||||
if labels:
|
|
||||||
for eg in examples:
|
|
||||||
labels.update(eg.predicted.cats.keys())
|
|
||||||
labels.update(eg.reference.cats.keys())
|
|
||||||
for example in examples:
|
for example in examples:
|
||||||
# Through this loop, None in the gold_cats indicates missing label.
|
# Through this loop, None in the gold_cats indicates missing label.
|
||||||
pred_cats = getter(example.predicted, attr)
|
pred_cats = getter(example.predicted, attr)
|
||||||
|
pred_cats = {k: v for k, v in pred_cats.items() if k in labels}
|
||||||
gold_cats = getter(example.reference, attr)
|
gold_cats = getter(example.reference, attr)
|
||||||
|
gold_cats = {k: v for k, v in gold_cats.items() if k in labels}
|
||||||
|
|
||||||
for label in labels:
|
for label in labels:
|
||||||
pred_score = pred_cats.get(label, 0.0)
|
pred_score = pred_cats.get(label, 0.0)
|
||||||
|
|
|
@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
|
||||||
|
|
||||||
# head before start
|
# head before start
|
||||||
arr = doc.to_array(["HEAD"])
|
arr = doc.to_array(["HEAD"])
|
||||||
arr[0] = -1
|
arr[0] = numpy.int32(-1).astype(numpy.uint64)
|
||||||
doc_from_array = Doc(en_vocab, words=words)
|
doc_from_array = Doc(en_vocab, words=words)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
doc_from_array.from_array(["HEAD"], arr)
|
doc_from_array.from_array(["HEAD"], arr)
|
||||||
|
|
||||||
# head after end
|
# head after end
|
||||||
arr = doc.to_array(["HEAD"])
|
arr = doc.to_array(["HEAD"])
|
||||||
arr[0] = 5
|
arr[0] = numpy.int32(5).astype(numpy.uint64)
|
||||||
doc_from_array = Doc(en_vocab, words=words)
|
doc_from_array = Doc(en_vocab, words=words)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
doc_from_array.from_array(["HEAD"], arr)
|
doc_from_array.from_array(["HEAD"], arr)
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
|
from typing import List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from random import Random
|
from random import Random
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Span, SpanGroup
|
from spacy.tokens import Span, SpanGroup, Doc
|
||||||
|
from spacy.util import filter_spans
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -240,3 +243,13 @@ def test_span_group_extend(doc):
|
||||||
def test_span_group_dealloc(span_group):
|
def test_span_group_dealloc(span_group):
|
||||||
with pytest.raises(AttributeError):
|
with pytest.raises(AttributeError):
|
||||||
print(span_group.doc)
|
print(span_group.doc)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(11975)
|
||||||
|
def test_span_group_typing(doc: Doc):
|
||||||
|
"""Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
|
||||||
|
span_group: SpanGroup = doc.spans["SPANS"]
|
||||||
|
spans: List[Span] = list(span_group)
|
||||||
|
for i, span in enumerate(span_group):
|
||||||
|
assert span == span_group[i] == spans[i]
|
||||||
|
filter_spans(span_group)
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.matcher import levenshtein
|
from spacy.matcher import levenshtein
|
||||||
|
from spacy.matcher.levenshtein import levenshtein_compare
|
||||||
|
|
||||||
|
|
||||||
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
||||||
|
@ -42,3 +43,31 @@ from spacy.matcher import levenshtein
|
||||||
)
|
)
|
||||||
def test_levenshtein(dist, a, b):
|
def test_levenshtein(dist, a, b):
|
||||||
assert levenshtein(a, b) == dist
|
assert levenshtein(a, b) == dist
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"a,b,fuzzy,expected",
|
||||||
|
[
|
||||||
|
("a", "a", 1, True),
|
||||||
|
("a", "a", 0, True),
|
||||||
|
("a", "a", -1, True),
|
||||||
|
("a", "ab", 1, True),
|
||||||
|
("a", "ab", 0, False),
|
||||||
|
("a", "ab", -1, True),
|
||||||
|
("ab", "ac", 1, True),
|
||||||
|
("ab", "ac", -1, True),
|
||||||
|
("abc", "cde", 4, True),
|
||||||
|
("abc", "cde", -1, False),
|
||||||
|
("abcdef", "cdefgh", 4, True),
|
||||||
|
("abcdef", "cdefgh", 3, False),
|
||||||
|
("abcdef", "cdefgh", -1, False), # default (2 for length 6)
|
||||||
|
("abcdefgh", "cdefghijk", 5, True),
|
||||||
|
("abcdefgh", "cdefghijk", 4, False),
|
||||||
|
("abcdefgh", "cdefghijk", -1, False), # default (2)
|
||||||
|
("abcdefgh", "cdefghijkl", 6, True),
|
||||||
|
("abcdefgh", "cdefghijkl", 5, False),
|
||||||
|
("abcdefgh", "cdefghijkl", -1, False), # default (2)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_levenshtein_compare(a, b, fuzzy, expected):
|
||||||
|
assert levenshtein_compare(a, b, fuzzy) == expected
|
||||||
|
|
|
@ -118,6 +118,155 @@ def test_matcher_match_multi(matcher):
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"rules,match_locs",
|
||||||
|
[
|
||||||
|
(
|
||||||
|
{
|
||||||
|
"GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
|
||||||
|
},
|
||||||
|
[(2, 4)],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
{
|
||||||
|
"Java": [[{"LOWER": {"FUZZY": "java"}}]],
|
||||||
|
},
|
||||||
|
[(5, 6)],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
{
|
||||||
|
"JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
|
||||||
|
"GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
|
||||||
|
"Java": [[{"LOWER": {"FUZZY": "java"}}]],
|
||||||
|
},
|
||||||
|
[(2, 4), (5, 6), (8, 9)],
|
||||||
|
),
|
||||||
|
# only the second pattern matches (check that predicate keys used for
|
||||||
|
# caching don't collide)
|
||||||
|
(
|
||||||
|
{
|
||||||
|
"A": [[{"ORTH": {"FUZZY": "Javascripts"}}]],
|
||||||
|
"B": [[{"ORTH": {"FUZZY5": "Javascripts"}}]],
|
||||||
|
},
|
||||||
|
[(8, 9)],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_matcher_match_fuzzy(en_vocab, rules, match_locs):
|
||||||
|
words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
for key, patterns in rules.items():
|
||||||
|
matcher.add(key, patterns)
|
||||||
|
assert match_locs == [(start, end) for m_id, start, end in matcher(doc)]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
|
||||||
|
def test_matcher_match_fuzzy_set_op_longest(en_vocab, set_op):
|
||||||
|
rules = {
|
||||||
|
"GoogleNow": [[{"ORTH": {"FUZZY": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
|
||||||
|
}
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
for key, patterns in rules.items():
|
||||||
|
matcher.add(key, patterns, greedy="LONGEST")
|
||||||
|
|
||||||
|
words = ["They", "like", "Goggle", "Noo"]
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_match_fuzzy_set_multiple(en_vocab):
|
||||||
|
rules = {
|
||||||
|
"GoogleNow": [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
|
||||||
|
"OP": "+",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
for key, patterns in rules.items():
|
||||||
|
matcher.add(key, patterns, greedy="LONGEST")
|
||||||
|
|
||||||
|
words = ["They", "like", "Goggle", "Noo"]
|
||||||
|
doc = Doc(matcher.vocab, words=words)
|
||||||
|
assert matcher(doc) == [
|
||||||
|
(doc.vocab.strings["GoogleNow"], 3, 4),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("fuzzyn", range(1, 10))
|
||||||
|
def test_matcher_match_fuzzyn_all_insertions(en_vocab, fuzzyn):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]])
|
||||||
|
# words with increasing edit distance
|
||||||
|
words = ["GoogleNow" + "a" * i for i in range(0, 10)]
|
||||||
|
doc = Doc(en_vocab, words)
|
||||||
|
assert len(matcher(doc)) == fuzzyn + 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("fuzzyn", range(1, 6))
|
||||||
|
def test_matcher_match_fuzzyn_various_edits(en_vocab, fuzzyn):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]])
|
||||||
|
# words with increasing edit distance of different edit types
|
||||||
|
words = [
|
||||||
|
"GoogleNow",
|
||||||
|
"GoogleNuw",
|
||||||
|
"GoogleNuew",
|
||||||
|
"GoogleNoweee",
|
||||||
|
"GiggleNuw3",
|
||||||
|
"gouggle5New",
|
||||||
|
]
|
||||||
|
doc = Doc(en_vocab, words)
|
||||||
|
assert len(matcher(doc)) == fuzzyn + 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("greedy", ["FIRST", "LONGEST"])
|
||||||
|
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
|
||||||
|
def test_matcher_match_fuzzyn_set_op_longest(en_vocab, greedy, set_op):
|
||||||
|
rules = {
|
||||||
|
"GoogleNow": [[{"ORTH": {"FUZZY2": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
|
||||||
|
}
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
for key, patterns in rules.items():
|
||||||
|
matcher.add(key, patterns, greedy=greedy)
|
||||||
|
|
||||||
|
words = ["They", "like", "Goggle", "Noo"]
|
||||||
|
doc = Doc(matcher.vocab, words=words)
|
||||||
|
spans = matcher(doc, as_spans=True)
|
||||||
|
assert len(spans) == 1
|
||||||
|
if set_op == "IN":
|
||||||
|
assert spans[0].text == "Goggle Noo"
|
||||||
|
else:
|
||||||
|
assert spans[0].text == "They like"
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_match_fuzzyn_set_multiple(en_vocab):
|
||||||
|
rules = {
|
||||||
|
"GoogleNow": [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"ORTH": {"FUZZY1": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
|
||||||
|
"OP": "+",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
for key, patterns in rules.items():
|
||||||
|
matcher.add(key, patterns, greedy="LONGEST")
|
||||||
|
|
||||||
|
words = ["They", "like", "Goggle", "Noo"]
|
||||||
|
doc = Doc(matcher.vocab, words=words)
|
||||||
|
assert matcher(doc) == [
|
||||||
|
(doc.vocab.strings["GoogleNow"], 3, 4),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_empty_dict(en_vocab):
|
def test_matcher_empty_dict(en_vocab):
|
||||||
"""Test matcher allows empty token specs, meaning match on any token."""
|
"""Test matcher allows empty token specs, meaning match on any token."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
|
@ -437,6 +586,30 @@ def test_matcher_regex(en_vocab):
|
||||||
assert len(matches) == 0
|
assert len(matches) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_regex_set_in(en_vocab):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"ORTH": {"REGEX": {"IN": [r"(?:a)", r"(?:an)"]}}}]
|
||||||
|
matcher.add("A_OR_AN", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["an", "a", "hi"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 2
|
||||||
|
doc = Doc(en_vocab, words=["bye"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_regex_set_not_in(en_vocab):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"ORTH": {"REGEX": {"NOT_IN": [r"(?:a)", r"(?:an)"]}}}]
|
||||||
|
matcher.add("A_OR_AN", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["an", "a", "hi"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
doc = Doc(en_vocab, words=["bye"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_regex_shape(en_vocab):
|
def test_matcher_regex_shape(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
|
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
|
||||||
|
|
|
@ -382,6 +382,43 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
|
||||||
assert doc.ents[0].label_ == "FOOBAR"
|
assert doc.ents[0].label_ == "FOOBAR"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
|
||||||
|
def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
|
||||||
|
ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
|
||||||
|
patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
doc = nlp("helloo")
|
||||||
|
assert len(doc.ents) == 1
|
||||||
|
assert doc.ents[0].label_ == "HELLO"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
|
||||||
|
def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
|
||||||
|
ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
|
||||||
|
patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
doc = nlp("helloo")
|
||||||
|
assert len(doc.ents) == 1
|
||||||
|
assert doc.ents[0].label_ == "HELLO"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
|
||||||
|
def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):
|
||||||
|
@registry.misc("test_fuzzy_compare_disabled")
|
||||||
|
def make_test_fuzzy_compare_disabled():
|
||||||
|
return lambda x, y, z: False
|
||||||
|
|
||||||
|
ruler = nlp.add_pipe(
|
||||||
|
entity_ruler_factory,
|
||||||
|
name="entity_ruler",
|
||||||
|
config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
|
||||||
|
)
|
||||||
|
patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
doc = nlp("helloo")
|
||||||
|
assert len(doc.ents) == 0
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("n_process", [1, 2])
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
|
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
|
||||||
def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
|
def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
|
||||||
|
|
|
@ -895,3 +895,26 @@ def test_textcat_multi_threshold():
|
||||||
|
|
||||||
scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
|
scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
|
||||||
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
|
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"component_name,scorer",
|
||||||
|
[
|
||||||
|
("textcat", "spacy.textcat_scorer.v1"),
|
||||||
|
("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_textcat_legacy_scorers(component_name, scorer):
|
||||||
|
"""Check that legacy scorers are registered and produce the expected score
|
||||||
|
keys."""
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
|
||||||
|
|
||||||
|
train_examples = []
|
||||||
|
for text, annotations in TRAIN_DATA_SINGLE_LABEL:
|
||||||
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
|
||||||
|
# score the model (it's not actually trained but that doesn't matter)
|
||||||
|
scores = nlp.evaluate(train_examples)
|
||||||
|
assert 0 <= scores["cats_score"] <= 1
|
||||||
|
|
|
@ -4,7 +4,9 @@ from collections import Counter
|
||||||
from typing import Tuple, List, Dict, Any
|
from typing import Tuple, List, Dict, Any
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import spacy
|
||||||
import numpy
|
import numpy
|
||||||
import pytest
|
import pytest
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -14,7 +16,7 @@ from thinc.api import Config, ConfigValidationError
|
||||||
|
|
||||||
from spacy import about
|
from spacy import about
|
||||||
from spacy.cli import info
|
from spacy.cli import info
|
||||||
from spacy.cli._util import is_subpath_of, load_project_config
|
from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
|
||||||
from spacy.cli._util import parse_config_overrides, string_to_list
|
from spacy.cli._util import parse_config_overrides, string_to_list
|
||||||
from spacy.cli._util import substitute_project_variables
|
from spacy.cli._util import substitute_project_variables
|
||||||
from spacy.cli._util import validate_project_commands
|
from spacy.cli._util import validate_project_commands
|
||||||
|
@ -32,6 +34,7 @@ from spacy.cli.package import _is_permitted_package_name
|
||||||
from spacy.cli.project.remote_storage import RemoteStorage
|
from spacy.cli.project.remote_storage import RemoteStorage
|
||||||
from spacy.cli.project.run import _check_requirements
|
from spacy.cli.project.run import _check_requirements
|
||||||
from spacy.cli.validate import get_model_pkgs
|
from spacy.cli.validate import get_model_pkgs
|
||||||
|
from spacy.cli.apply import apply
|
||||||
from spacy.cli.find_threshold import find_threshold
|
from spacy.cli.find_threshold import find_threshold
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.nl import Dutch
|
from spacy.lang.nl import Dutch
|
||||||
|
@ -885,6 +888,82 @@ def test_span_length_freq_dist_output_must_be_correct():
|
||||||
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_empty_dir():
|
||||||
|
with make_tempdir() as data_path:
|
||||||
|
output = data_path / "test.spacy"
|
||||||
|
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_docbin():
|
||||||
|
with make_tempdir() as data_path:
|
||||||
|
output = data_path / "testout.spacy"
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
doc = nlp("testing apply cli.")
|
||||||
|
# test empty DocBin case
|
||||||
|
docbin = DocBin()
|
||||||
|
docbin.to_disk(data_path / "testin.spacy")
|
||||||
|
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||||
|
docbin.add(doc)
|
||||||
|
docbin.to_disk(data_path / "testin.spacy")
|
||||||
|
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_jsonl():
|
||||||
|
with make_tempdir() as data_path:
|
||||||
|
output = data_path / "testout.spacy"
|
||||||
|
data = [{"field": "Testing apply cli.", "key": 234}]
|
||||||
|
data2 = [{"field": "234"}]
|
||||||
|
srsly.write_jsonl(data_path / "test.jsonl", data)
|
||||||
|
apply(data_path, output, "blank:en", "field", 1, 1)
|
||||||
|
srsly.write_jsonl(data_path / "test2.jsonl", data2)
|
||||||
|
apply(data_path, output, "blank:en", "field", 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_txt():
|
||||||
|
with make_tempdir() as data_path:
|
||||||
|
output = data_path / "testout.spacy"
|
||||||
|
with open(data_path / "test.foo", "w") as ftest:
|
||||||
|
ftest.write("Testing apply cli.")
|
||||||
|
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_mixed():
|
||||||
|
with make_tempdir() as data_path:
|
||||||
|
output = data_path / "testout.spacy"
|
||||||
|
text = "Testing apply cli"
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
doc = nlp(text)
|
||||||
|
jsonl_data = [{"text": text}]
|
||||||
|
srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
|
||||||
|
docbin = DocBin()
|
||||||
|
docbin.add(doc)
|
||||||
|
docbin.to_disk(data_path / "testin.spacy")
|
||||||
|
with open(data_path / "test.txt", "w") as ftest:
|
||||||
|
ftest.write(text)
|
||||||
|
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||||
|
# Check whether it worked
|
||||||
|
result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
|
||||||
|
assert len(result) == 3
|
||||||
|
for doc in result:
|
||||||
|
assert doc.text == text
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_user_data():
|
||||||
|
Doc.set_extension("ext", default=0)
|
||||||
|
val = ("ext", 0)
|
||||||
|
with make_tempdir() as data_path:
|
||||||
|
output = data_path / "testout.spacy"
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
doc = nlp("testing apply cli.")
|
||||||
|
doc._.ext = val
|
||||||
|
docbin = DocBin(store_user_data=True)
|
||||||
|
docbin.add(doc)
|
||||||
|
docbin.to_disk(data_path / "testin.spacy")
|
||||||
|
apply(data_path, output, "blank:en", "", 1, 1)
|
||||||
|
result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
|
||||||
|
assert result[0]._.ext == val
|
||||||
|
|
||||||
|
|
||||||
def test_local_remote_storage():
|
def test_local_remote_storage():
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
filename = "a.txt"
|
filename = "a.txt"
|
||||||
|
@ -1107,3 +1186,26 @@ def test_upload_download_local_file():
|
||||||
download_file(remote_file, local_file)
|
download_file(remote_file, local_file)
|
||||||
with local_file.open(mode="r") as file_:
|
with local_file.open(mode="r") as file_:
|
||||||
assert file_.read() == content
|
assert file_.read() == content
|
||||||
|
|
||||||
|
|
||||||
|
def test_walk_directory():
|
||||||
|
with make_tempdir() as d:
|
||||||
|
files = [
|
||||||
|
"data1.iob",
|
||||||
|
"data2.iob",
|
||||||
|
"data3.json",
|
||||||
|
"data4.conll",
|
||||||
|
"data5.conll",
|
||||||
|
"data6.conll",
|
||||||
|
"data7.txt",
|
||||||
|
]
|
||||||
|
|
||||||
|
for f in files:
|
||||||
|
Path(d / f).touch()
|
||||||
|
|
||||||
|
assert (len(walk_directory(d))) == 7
|
||||||
|
assert (len(walk_directory(d, suffix=None))) == 7
|
||||||
|
assert (len(walk_directory(d, suffix="json"))) == 1
|
||||||
|
assert (len(walk_directory(d, suffix="iob"))) == 2
|
||||||
|
assert (len(walk_directory(d, suffix="conll"))) == 3
|
||||||
|
assert (len(walk_directory(d, suffix="pdf"))) == 0
|
||||||
|
|
33
spacy/tests/test_cli_app.py
Normal file
33
spacy/tests/test_cli_app.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
|
from spacy.cli._util import app
|
||||||
|
from .util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_auto():
|
||||||
|
with make_tempdir() as d_in, make_tempdir() as d_out:
|
||||||
|
for f in ["data1.iob", "data2.iob", "data3.iob"]:
|
||||||
|
Path(d_in / f).touch()
|
||||||
|
|
||||||
|
# ensure that "automatic" suffix detection works
|
||||||
|
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
|
||||||
|
assert "Generated output file" in result.stdout
|
||||||
|
out_files = os.listdir(d_out)
|
||||||
|
assert len(out_files) == 3
|
||||||
|
assert "data1.spacy" in out_files
|
||||||
|
assert "data2.spacy" in out_files
|
||||||
|
assert "data3.spacy" in out_files
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_auto_conflict():
|
||||||
|
with make_tempdir() as d_in, make_tempdir() as d_out:
|
||||||
|
for f in ["data1.iob", "data2.iob", "data3.json"]:
|
||||||
|
Path(d_in / f).touch()
|
||||||
|
|
||||||
|
# ensure that "automatic" suffix detection warns when there are different file types
|
||||||
|
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
|
||||||
|
assert "All input files must be same type" in result.stdout
|
||||||
|
out_files = os.listdir(d_out)
|
||||||
|
assert len(out_files) == 0
|
|
@ -3,6 +3,7 @@ import logging
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
|
from spacy.scorer import Scorer
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
|
@ -126,6 +127,112 @@ def test_evaluate_no_pipe(nlp):
|
||||||
nlp.evaluate([Example.from_dict(doc, annots)])
|
nlp.evaluate([Example.from_dict(doc, annots)])
|
||||||
|
|
||||||
|
|
||||||
|
def test_evaluate_textcat_multilabel(en_vocab):
|
||||||
|
"""Test that evaluate works with a multilabel textcat pipe."""
|
||||||
|
nlp = Language(en_vocab)
|
||||||
|
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
|
||||||
|
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
|
||||||
|
textcat_multilabel.add_label(label)
|
||||||
|
nlp.initialize()
|
||||||
|
|
||||||
|
annots = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}
|
||||||
|
doc = nlp.make_doc("hello world")
|
||||||
|
example = Example.from_dict(doc, annots)
|
||||||
|
scores = nlp.evaluate([example])
|
||||||
|
labels = nlp.get_pipe("textcat_multilabel").labels
|
||||||
|
for label in labels:
|
||||||
|
assert scores["cats_f_per_type"].get(label) is not None
|
||||||
|
for key in example.reference.cats.keys():
|
||||||
|
if key not in labels:
|
||||||
|
assert scores["cats_f_per_type"].get(key) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_evaluate_multiple_textcat_final(en_vocab):
|
||||||
|
"""Test that evaluate evaluates the final textcat component in a pipeline
|
||||||
|
with more than one textcat or textcat_multilabel."""
|
||||||
|
nlp = Language(en_vocab)
|
||||||
|
textcat = nlp.add_pipe("textcat")
|
||||||
|
for label in ("POSITIVE", "NEGATIVE"):
|
||||||
|
textcat.add_label(label)
|
||||||
|
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
|
||||||
|
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
|
||||||
|
textcat_multilabel.add_label(label)
|
||||||
|
nlp.initialize()
|
||||||
|
|
||||||
|
annots = {
|
||||||
|
"cats": {
|
||||||
|
"POSITIVE": 1.0,
|
||||||
|
"NEGATIVE": 0.0,
|
||||||
|
"FEATURE": 1.0,
|
||||||
|
"QUESTION": 1.0,
|
||||||
|
"POSITIVE": 1.0,
|
||||||
|
"NEGATIVE": 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
doc = nlp.make_doc("hello world")
|
||||||
|
example = Example.from_dict(doc, annots)
|
||||||
|
scores = nlp.evaluate([example])
|
||||||
|
# get the labels from the final pipe
|
||||||
|
labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
|
||||||
|
for label in labels:
|
||||||
|
assert scores["cats_f_per_type"].get(label) is not None
|
||||||
|
for key in example.reference.cats.keys():
|
||||||
|
if key not in labels:
|
||||||
|
assert scores["cats_f_per_type"].get(key) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_evaluate_multiple_textcat_separate(en_vocab):
|
||||||
|
"""Test that evaluate can evaluate multiple textcat components separately
|
||||||
|
with custom scorers."""
|
||||||
|
|
||||||
|
def custom_textcat_score(examples, **kwargs):
|
||||||
|
scores = Scorer.score_cats(
|
||||||
|
examples,
|
||||||
|
"cats",
|
||||||
|
multi_label=False,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
return {f"custom_{k}": v for k, v in scores.items()}
|
||||||
|
|
||||||
|
@spacy.registry.scorers("test_custom_textcat_scorer")
|
||||||
|
def make_custom_textcat_scorer():
|
||||||
|
return custom_textcat_score
|
||||||
|
|
||||||
|
nlp = Language(en_vocab)
|
||||||
|
textcat = nlp.add_pipe(
|
||||||
|
"textcat",
|
||||||
|
config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
|
||||||
|
)
|
||||||
|
for label in ("POSITIVE", "NEGATIVE"):
|
||||||
|
textcat.add_label(label)
|
||||||
|
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
|
||||||
|
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
|
||||||
|
textcat_multilabel.add_label(label)
|
||||||
|
nlp.initialize()
|
||||||
|
|
||||||
|
annots = {
|
||||||
|
"cats": {
|
||||||
|
"POSITIVE": 1.0,
|
||||||
|
"NEGATIVE": 0.0,
|
||||||
|
"FEATURE": 1.0,
|
||||||
|
"QUESTION": 1.0,
|
||||||
|
"POSITIVE": 1.0,
|
||||||
|
"NEGATIVE": 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
doc = nlp.make_doc("hello world")
|
||||||
|
example = Example.from_dict(doc, annots)
|
||||||
|
scores = nlp.evaluate([example])
|
||||||
|
# check custom scores for the textcat pipe
|
||||||
|
assert "custom_cats_f_per_type" in scores
|
||||||
|
labels = nlp.get_pipe("textcat").labels
|
||||||
|
assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
|
||||||
|
# check default scores for the textcat_multilabel pipe
|
||||||
|
assert "cats_f_per_type" in scores
|
||||||
|
labels = nlp.get_pipe("textcat_multilabel").labels
|
||||||
|
assert set(scores["cats_f_per_type"].keys()) == set(labels)
|
||||||
|
|
||||||
|
|
||||||
def vector_modification_pipe(doc):
|
def vector_modification_pipe(doc):
|
||||||
doc.vector += 1
|
doc.vector += 1
|
||||||
return doc
|
return doc
|
||||||
|
|
|
@ -8,7 +8,7 @@ from spacy import prefer_gpu, require_gpu, require_cpu
|
||||||
from spacy.ml._precomputable_affine import PrecomputableAffine
|
from spacy.ml._precomputable_affine import PrecomputableAffine
|
||||||
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
||||||
from spacy.util import dot_to_object, SimpleFrozenList, import_file
|
from spacy.util import dot_to_object, SimpleFrozenList, import_file
|
||||||
from spacy.util import to_ternary_int
|
from spacy.util import to_ternary_int, find_available_port
|
||||||
from thinc.api import Config, Optimizer, ConfigValidationError
|
from thinc.api import Config, Optimizer, ConfigValidationError
|
||||||
from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
|
from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
|
||||||
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
|
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
|
||||||
|
@ -434,3 +434,16 @@ def test_to_ternary_int():
|
||||||
assert to_ternary_int(-10) == -1
|
assert to_ternary_int(-10) == -1
|
||||||
assert to_ternary_int("string") == -1
|
assert to_ternary_int("string") == -1
|
||||||
assert to_ternary_int([0, "string"]) == -1
|
assert to_ternary_int([0, "string"]) == -1
|
||||||
|
|
||||||
|
|
||||||
|
def test_find_available_port():
|
||||||
|
host = "0.0.0.0"
|
||||||
|
port = 5000
|
||||||
|
assert find_available_port(port, host) == port, "Port 5000 isn't free"
|
||||||
|
|
||||||
|
from wsgiref.simple_server import make_server, demo_app
|
||||||
|
|
||||||
|
with make_server(host, port, demo_app) as httpd:
|
||||||
|
with pytest.warns(UserWarning, match="already in use"):
|
||||||
|
found_port = find_available_port(port, host, auto_select=True)
|
||||||
|
assert found_port == port + 1, "Didn't find next port"
|
||||||
|
|
|
@ -359,6 +359,7 @@ cdef class Doc:
|
||||||
for annot in annotations:
|
for annot in annotations:
|
||||||
if annot:
|
if annot:
|
||||||
if annot is heads or annot is sent_starts or annot is ent_iobs:
|
if annot is heads or annot is sent_starts or annot is ent_iobs:
|
||||||
|
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
if attrs.ndim == 1:
|
if attrs.ndim == 1:
|
||||||
attrs[i] = annot[i]
|
attrs[i] = annot[i]
|
||||||
|
@ -1558,6 +1559,7 @@ cdef class Doc:
|
||||||
|
|
||||||
for j, (attr, annot) in enumerate(token_annotations.items()):
|
for j, (attr, annot) in enumerate(token_annotations.items()):
|
||||||
if attr is HEAD:
|
if attr is HEAD:
|
||||||
|
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
array[i, j] = annot[i]
|
array[i, j] = annot[i]
|
||||||
elif attr is MORPH:
|
elif attr is MORPH:
|
||||||
|
|
|
@ -95,8 +95,8 @@ class Span:
|
||||||
self,
|
self,
|
||||||
start_idx: int,
|
start_idx: int,
|
||||||
end_idx: int,
|
end_idx: int,
|
||||||
label: int = ...,
|
label: Union[int, str] = ...,
|
||||||
kb_id: int = ...,
|
kb_id: Union[int, str] = ...,
|
||||||
vector: Optional[Floats1d] = ...,
|
vector: Optional[Floats1d] = ...,
|
||||||
) -> Span: ...
|
) -> Span: ...
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -299,7 +299,7 @@ cdef class Span:
|
||||||
for ancestor in ancestors:
|
for ancestor in ancestors:
|
||||||
ancestor_i = ancestor.i - self.c.start
|
ancestor_i = ancestor.i - self.c.start
|
||||||
if ancestor_i in range(length):
|
if ancestor_i in range(length):
|
||||||
array[i, head_col] = ancestor_i - i
|
array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
|
||||||
|
|
||||||
# if there is no appropriate ancestor, define a new artificial root
|
# if there is no appropriate ancestor, define a new artificial root
|
||||||
value = array[i, head_col]
|
value = array[i, head_col]
|
||||||
|
@ -307,7 +307,7 @@ cdef class Span:
|
||||||
new_root = old_to_new_root.get(ancestor_i, None)
|
new_root = old_to_new_root.get(ancestor_i, None)
|
||||||
if new_root is not None:
|
if new_root is not None:
|
||||||
# take the same artificial root as a previous token from the same sentence
|
# take the same artificial root as a previous token from the same sentence
|
||||||
array[i, head_col] = new_root - i
|
array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
|
||||||
else:
|
else:
|
||||||
# set this token as the new artificial root
|
# set this token as the new artificial root
|
||||||
array[i, head_col] = 0
|
array[i, head_col] = 0
|
||||||
|
|
|
@ -18,6 +18,7 @@ class SpanGroup:
|
||||||
def doc(self) -> Doc: ...
|
def doc(self) -> Doc: ...
|
||||||
@property
|
@property
|
||||||
def has_overlap(self) -> bool: ...
|
def has_overlap(self) -> bool: ...
|
||||||
|
def __iter__(self): ...
|
||||||
def __len__(self) -> int: ...
|
def __len__(self) -> int: ...
|
||||||
def append(self, span: Span) -> None: ...
|
def append(self, span: Span) -> None: ...
|
||||||
def extend(self, spans: Iterable[Span]) -> None: ...
|
def extend(self, spans: Iterable[Span]) -> None: ...
|
||||||
|
|
|
@ -158,6 +158,16 @@ cdef class SpanGroup:
|
||||||
return self._concat(other)
|
return self._concat(other)
|
||||||
return NotImplemented
|
return NotImplemented
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
"""
|
||||||
|
Iterate over the spans in this SpanGroup.
|
||||||
|
YIELDS (Span): A span in this SpanGroup.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/spangroup#iter
|
||||||
|
"""
|
||||||
|
for i in range(self.c.size()):
|
||||||
|
yield self[i]
|
||||||
|
|
||||||
def append(self, Span span):
|
def append(self, Span span):
|
||||||
"""Add a span to the group. The span must refer to the same Doc
|
"""Add a span to the group. The span must refer to the same Doc
|
||||||
object as the span group.
|
object as the span group.
|
||||||
|
|
|
@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
if key not in IDS:
|
if key not in IDS:
|
||||||
raise ValueError(Errors.E974.format(obj="token", key=key))
|
raise ValueError(Errors.E974.format(obj="token", key=key))
|
||||||
elif key in ["ORTH", "SPACY"]:
|
elif key in ["ORTH", "SPACY"]:
|
||||||
pass
|
continue
|
||||||
elif key == "HEAD":
|
elif key == "HEAD":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
|
row = [h-i if h is not None else 0 for i, h in enumerate(value)]
|
||||||
elif key == "DEP":
|
elif key == "DEP":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
|
row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
|
||||||
elif key == "SENT_START":
|
elif key == "SENT_START":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([to_ternary_int(v) for v in value])
|
row = [to_ternary_int(v) for v in value]
|
||||||
elif key == "MORPH":
|
elif key == "MORPH":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.morphology.add(v) for v in value])
|
row = [vocab.morphology.add(v) for v in value]
|
||||||
else:
|
else:
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
if not all(isinstance(v, str) for v in value):
|
if not all(isinstance(v, str) for v in value):
|
||||||
types = set([type(v) for v in value])
|
types = set([type(v) for v in value])
|
||||||
raise TypeError(Errors.E969.format(field=key, types=types)) from None
|
raise TypeError(Errors.E969.format(field=key, types=types)) from None
|
||||||
values.append([vocab.strings.add(v) for v in value])
|
row = [vocab.strings.add(v) for v in value]
|
||||||
array = numpy.asarray(values, dtype="uint64")
|
values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
|
||||||
|
array = numpy.array(values, dtype=numpy.uint64)
|
||||||
return attrs, array.T
|
return attrs, array.T
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -26,6 +26,8 @@ def setup_table(
|
||||||
return final_cols, final_widths, ["r" for _ in final_widths]
|
return final_cols, final_widths, ["r" for _ in final_widths]
|
||||||
|
|
||||||
|
|
||||||
|
# We cannot rename this method as it's directly imported
|
||||||
|
# and used by external packages such as spacy-loggers.
|
||||||
@registry.loggers("spacy.ConsoleLogger.v2")
|
@registry.loggers("spacy.ConsoleLogger.v2")
|
||||||
def console_logger(
|
def console_logger(
|
||||||
progress_bar: bool = False,
|
progress_bar: bool = False,
|
||||||
|
@ -33,7 +35,27 @@ def console_logger(
|
||||||
output_file: Optional[Union[str, Path]] = None,
|
output_file: Optional[Union[str, Path]] = None,
|
||||||
):
|
):
|
||||||
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
|
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
|
||||||
progress_bar (bool): Whether the logger should print the progress bar.
|
progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass.
|
||||||
|
console_output (bool): Whether the logger should print the logs on the console.
|
||||||
|
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
||||||
|
"""
|
||||||
|
return console_logger_v3(
|
||||||
|
progress_bar=None if progress_bar is False else "eval",
|
||||||
|
console_output=console_output,
|
||||||
|
output_file=output_file,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.loggers("spacy.ConsoleLogger.v3")
|
||||||
|
def console_logger_v3(
|
||||||
|
progress_bar: Optional[str] = None,
|
||||||
|
console_output: bool = True,
|
||||||
|
output_file: Optional[Union[str, Path]] = None,
|
||||||
|
):
|
||||||
|
"""The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file.
|
||||||
|
progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values:
|
||||||
|
train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached).
|
||||||
|
eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached).
|
||||||
console_output (bool): Whether the logger should print the logs on the console.
|
console_output (bool): Whether the logger should print the logs on the console.
|
||||||
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
||||||
"""
|
"""
|
||||||
|
@ -70,6 +92,7 @@ def console_logger(
|
||||||
for name, proc in nlp.pipeline
|
for name, proc in nlp.pipeline
|
||||||
if hasattr(proc, "is_trainable") and proc.is_trainable
|
if hasattr(proc, "is_trainable") and proc.is_trainable
|
||||||
]
|
]
|
||||||
|
max_steps = nlp.config["training"]["max_steps"]
|
||||||
eval_frequency = nlp.config["training"]["eval_frequency"]
|
eval_frequency = nlp.config["training"]["eval_frequency"]
|
||||||
score_weights = nlp.config["training"]["score_weights"]
|
score_weights = nlp.config["training"]["score_weights"]
|
||||||
score_cols = [col for col, value in score_weights.items() if value is not None]
|
score_cols = [col for col, value in score_weights.items() if value is not None]
|
||||||
|
@ -84,6 +107,13 @@ def console_logger(
|
||||||
write(msg.row(table_header, widths=table_widths, spacing=spacing))
|
write(msg.row(table_header, widths=table_widths, spacing=spacing))
|
||||||
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
|
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
|
||||||
progress = None
|
progress = None
|
||||||
|
expected_progress_types = ("train", "eval")
|
||||||
|
if progress_bar is not None and progress_bar not in expected_progress_types:
|
||||||
|
raise ValueError(
|
||||||
|
Errors.E1048.format(
|
||||||
|
unexpected=progress_bar, expected=expected_progress_types
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def log_step(info: Optional[Dict[str, Any]]) -> None:
|
def log_step(info: Optional[Dict[str, Any]]) -> None:
|
||||||
nonlocal progress
|
nonlocal progress
|
||||||
|
@ -141,11 +171,23 @@ def console_logger(
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if progress_bar:
|
if progress_bar:
|
||||||
|
if progress_bar == "train":
|
||||||
|
total = max_steps
|
||||||
|
desc = f"Last Eval Epoch: {info['epoch']}"
|
||||||
|
initial = info["step"]
|
||||||
|
else:
|
||||||
|
total = eval_frequency
|
||||||
|
desc = f"Epoch {info['epoch']+1}"
|
||||||
|
initial = 0
|
||||||
# Set disable=None, so that it disables on non-TTY
|
# Set disable=None, so that it disables on non-TTY
|
||||||
progress = tqdm.tqdm(
|
progress = tqdm.tqdm(
|
||||||
total=eval_frequency, disable=None, leave=False, file=stderr
|
total=total,
|
||||||
|
disable=None,
|
||||||
|
leave=False,
|
||||||
|
file=stderr,
|
||||||
|
initial=initial,
|
||||||
)
|
)
|
||||||
progress.set_description(f"Epoch {info['epoch']+1}")
|
progress.set_description(desc)
|
||||||
|
|
||||||
def finalize() -> None:
|
def finalize() -> None:
|
||||||
if output_stream:
|
if output_stream:
|
||||||
|
|
|
@ -31,6 +31,7 @@ import shlex
|
||||||
import inspect
|
import inspect
|
||||||
import pkgutil
|
import pkgutil
|
||||||
import logging
|
import logging
|
||||||
|
import socket
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cupy.random
|
import cupy.random
|
||||||
|
@ -1736,3 +1737,50 @@ def all_equal(iterable):
|
||||||
(or if the input is an empty sequence), False otherwise."""
|
(or if the input is an empty sequence), False otherwise."""
|
||||||
g = itertools.groupby(iterable)
|
g = itertools.groupby(iterable)
|
||||||
return next(g, True) and not next(g, False)
|
return next(g, True) and not next(g, False)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_port_in_use(port: int, host: str = "localhost") -> bool:
|
||||||
|
"""Check if 'host:port' is in use. Return True if it is, False otherwise.
|
||||||
|
|
||||||
|
port (int): the port to check
|
||||||
|
host (str): the host to check (default "localhost")
|
||||||
|
RETURNS (bool): Whether 'host:port' is in use.
|
||||||
|
"""
|
||||||
|
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
|
try:
|
||||||
|
s.bind((host, port))
|
||||||
|
return False
|
||||||
|
except socket.error:
|
||||||
|
return True
|
||||||
|
finally:
|
||||||
|
s.close()
|
||||||
|
|
||||||
|
|
||||||
|
def find_available_port(start: int, host: str, auto_select: bool = False) -> int:
|
||||||
|
"""Given a starting port and a host, handle finding a port.
|
||||||
|
|
||||||
|
If `auto_select` is False, a busy port will raise an error.
|
||||||
|
|
||||||
|
If `auto_select` is True, the next free higher port will be used.
|
||||||
|
|
||||||
|
start (int): the port to start looking from
|
||||||
|
host (str): the host to find a port on
|
||||||
|
auto_select (bool): whether to automatically select a new port if the given port is busy (default False)
|
||||||
|
RETURNS (int): The port to use.
|
||||||
|
"""
|
||||||
|
if not _is_port_in_use(start, host):
|
||||||
|
return start
|
||||||
|
|
||||||
|
port = start
|
||||||
|
if not auto_select:
|
||||||
|
raise ValueError(Errors.E1050.format(port=port))
|
||||||
|
|
||||||
|
while _is_port_in_use(port, host) and port < 65535:
|
||||||
|
port += 1
|
||||||
|
|
||||||
|
if port == 65535 and _is_port_in_use(port, host):
|
||||||
|
raise ValueError(Errors.E1049.format(host=host))
|
||||||
|
|
||||||
|
# if we get here, the port changed
|
||||||
|
warnings.warn(Warnings.W124.format(host=host, port=start, serve_port=port))
|
||||||
|
return port
|
||||||
|
|
|
@ -12,6 +12,7 @@ menu:
|
||||||
- ['train', 'train']
|
- ['train', 'train']
|
||||||
- ['pretrain', 'pretrain']
|
- ['pretrain', 'pretrain']
|
||||||
- ['evaluate', 'evaluate']
|
- ['evaluate', 'evaluate']
|
||||||
|
- ['apply', 'apply']
|
||||||
- ['find-threshold', 'find-threshold']
|
- ['find-threshold', 'find-threshold']
|
||||||
- ['assemble', 'assemble']
|
- ['assemble', 'assemble']
|
||||||
- ['package', 'package']
|
- ['package', 'package']
|
||||||
|
@ -474,7 +475,7 @@ report span characteristics such as the average span length and the span (or
|
||||||
span boundary) distinctiveness. The distinctiveness measure shows how different
|
span boundary) distinctiveness. The distinctiveness measure shows how different
|
||||||
the tokens are with respect to the rest of the corpus using the KL-divergence of
|
the tokens are with respect to the rest of the corpus using the KL-divergence of
|
||||||
the token distributions. To learn more, you can check out Papay et al.'s work on
|
the token distributions. To learn more, you can check out Papay et al.'s work on
|
||||||
[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
|
[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
@ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| **CREATES** | Training results and optional metrics and visualizations. |
|
| **CREATES** | Training results and optional metrics and visualizations. |
|
||||||
|
|
||||||
|
## apply {#apply new="3.5" tag="command"}
|
||||||
|
|
||||||
|
Applies a trained pipeline to data and stores the resulting annotated documents
|
||||||
|
in a `DocBin`. The input can be a single file or a directory. The recognized
|
||||||
|
input formats are:
|
||||||
|
|
||||||
|
1. `.spacy`
|
||||||
|
2. `.jsonl` containing a user specified `text_key`
|
||||||
|
3. Files with any other extension are assumed to be plain text files containing
|
||||||
|
a single document.
|
||||||
|
|
||||||
|
When a directory is provided it is traversed recursively to collect all files.
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||||
|
| `data-path` | Location of the data to apply the pipeline to, in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
|
||||||
|
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
|
||||||
|
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
|
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
|
||||||
|
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
|
||||||
|
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||||
|
| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
|
||||||
|
| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
|
||||||
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
|
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
|
||||||
|
|
||||||
## find-threshold {#find-threshold new="3.5" tag="command"}
|
## find-threshold {#find-threshold new="3.5" tag="command"}
|
||||||
|
|
||||||
Runs prediction trials for a trained model with varying thresholds to maximize
|
Runs prediction trials for a trained model with varying thresholds to maximize
|
||||||
|
@ -1187,7 +1219,6 @@ be provided.
|
||||||
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
|
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||||
|
|
|
@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
|
||||||
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
||||||
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
||||||
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||||
| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
|
| `before_update` <Tag variant="new">3.5</Tag> | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
|
||||||
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
|
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
|
||||||
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
||||||
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
||||||
|
|
|
@ -55,13 +55,14 @@ how the component should be configured. You can override its settings via the
|
||||||
> nlp.add_pipe("entity_ruler", config=config)
|
> nlp.add_pipe("entity_ruler", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Setting | Description |
|
| Setting | Description |
|
||||||
| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
||||||
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
|
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
|
||||||
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
|
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
|
||||||
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
|
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
|
||||||
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
|
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
|
||||||
|
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/entityruler.py
|
%%GITHUB_SPACY/spacy/pipeline/entityruler.py
|
||||||
|
@ -85,23 +86,25 @@ be a token pattern (list) or a phrase pattern (string). For example:
|
||||||
> ruler = EntityRuler(nlp, overwrite_ents=True)
|
> ruler = EntityRuler(nlp, overwrite_ents=True)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
|
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
|
||||||
| `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
|
| `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
||||||
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
|
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
|
||||||
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
|
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
|
||||||
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
|
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
|
||||||
| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
|
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
|
||||||
|
| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
|
||||||
|
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
## EntityRuler.initialize {#initialize tag="method" new="3"}
|
## EntityRuler.initialize {#initialize tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the component with data. Used before training to load in rules
|
Initialize the component with data. Used before training to load in rules
|
||||||
from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method
|
from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
|
||||||
is typically called by [`Language.initialize`](/api/language#initialize) and
|
method is typically called by [`Language.initialize`](/api/language#initialize)
|
||||||
lets you customize arguments it receives via the
|
and lets you customize arguments it receives via the
|
||||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||||
config.
|
config.
|
||||||
|
|
||||||
|
@ -210,10 +213,10 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on
|
||||||
| ---------- | ---------------------------------------------------------------- |
|
| ---------- | ---------------------------------------------------------------- |
|
||||||
| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
|
| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
|
||||||
|
|
||||||
|
|
||||||
## EntityRuler.remove {#remove tag="method" new="3.2.1"}
|
## EntityRuler.remove {#remove tag="method" new="3.2.1"}
|
||||||
|
|
||||||
Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if the ID does not exist.
|
Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
|
||||||
|
the ID does not exist.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -224,9 +227,9 @@ Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if th
|
||||||
> ruler.remove("apple")
|
> ruler.remove("apple")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------- | ---------------------------------------------------------------- |
|
| ---- | ----------------------------------- |
|
||||||
| `id` | The ID of the pattern rule. ~~str~~ |
|
| `id` | The ID of the pattern rule. ~~str~~ |
|
||||||
|
|
||||||
## EntityRuler.to_disk {#to_disk tag="method"}
|
## EntityRuler.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
|
||||||
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
|
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
|
||||||
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
|
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
|
||||||
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
|
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
|
||||||
| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ |
|
| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
|
||||||
| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
|
| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
|
||||||
| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. ~~bool~~ |
|
| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. ~~bool~~ |
|
||||||
| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |
|
| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |
|
||||||
|
|
|
@ -86,14 +86,20 @@ it compares to another value.
|
||||||
> ]
|
> ]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Attribute | Description |
|
| Attribute | Description |
|
||||||
| -------------------------- | -------------------------------------------------------------------------------------------------------- |
|
| -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
| `REGEX` | Attribute value matches the regular expression at any position in the string. ~~Any~~ |
|
||||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
| `FUZZY` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, -1)`. The default method allows a Levenshtein edit distance of at least 2 and up to 30% of the pattern string length. ~~Any~~ |
|
||||||
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
|
| `FUZZY1`, `FUZZY2`, ... `FUZZY9` | Attribute value matches if the `fuzzy_compare` method matches for `(value, pattern, N)`. The default method allows a Levenshtein edit distance of at most N (1-9). ~~Any~~ |
|
||||||
| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
|
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||||
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attribute) has a non-empty intersection with a list. ~~Any~~ |
|
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
|
||||||
|
| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
|
||||||
|
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
|
||||||
|
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||||
|
|
||||||
|
As of spaCy v3.5, `REGEX` and `FUZZY` can be used in combination with `IN` and
|
||||||
|
`NOT_IN`.
|
||||||
|
|
||||||
## Matcher.\_\_init\_\_ {#init tag="method"}
|
## Matcher.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
@ -109,10 +115,11 @@ string where an integer is expected) or unexpected property names.
|
||||||
> matcher = Matcher(nlp.vocab)
|
> matcher = Matcher(nlp.vocab)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------- | ----------------------------------------------------------------------------------------------------- |
|
| --------------- | ----------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
|
| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
|
||||||
| `validate` | Validate all patterns added to this matcher. ~~bool~~ |
|
| `validate` | Validate all patterns added to this matcher. ~~bool~~ |
|
||||||
|
| `fuzzy_compare` | The comparison method used for the `FUZZY` operators. ~~Callable[[str, str, int], bool]~~ |
|
||||||
|
|
||||||
## Matcher.\_\_call\_\_ {#call tag="method"}
|
## Matcher.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -202,6 +202,23 @@ already present in the current span group.
|
||||||
| `other` | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
|
| `other` | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
|
||||||
| **RETURNS** | The span group. ~~SpanGroup~~ |
|
| **RETURNS** | The span group. ~~SpanGroup~~ |
|
||||||
|
|
||||||
|
## SpanGroup.\_\_iter\_\_ {#iter tag="method" new="3.5"}
|
||||||
|
|
||||||
|
Iterate over the spans in this span group.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
|
||||||
|
> for error_span in doc.spans["errors"]:
|
||||||
|
>     print(error_span)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------- | ----------------------------------- |
|
||||||
|
| **YIELDS** | A span in this span group. ~~Span~~ |
|
||||||
|
|
||||||
## SpanGroup.append {#append tag="method"}
|
## SpanGroup.append {#append tag="method"}
|
||||||
|
|
||||||
Add a [`Span`](/api/span) object to the group. The span must refer to the same
|
Add a [`Span`](/api/span) object to the group. The span must refer to the same
|
||||||
|
|
|
@ -46,16 +46,17 @@ how the component should be configured. You can override its settings via the
|
||||||
> nlp.add_pipe("span_ruler", config=config)
|
> nlp.add_pipe("span_ruler", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Setting | Description |
|
| Setting | Description |
|
||||||
| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ |
|
| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ |
|
||||||
| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ |
|
| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ |
|
||||||
| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ |
|
| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ |
|
||||||
| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ |
|
| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ |
|
||||||
| `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
| `phrase_matcher_attr` | Token attribute to match on, passed to the internal `PhraseMatcher` as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
||||||
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
|
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
|
||||||
| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ |
|
| `validate` | Whether patterns should be validated, passed to `Matcher` and `PhraseMatcher` as `validate`. Defaults to `False`. ~~bool~~ |
|
||||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans_key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ |
|
||||||
|
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/span_ruler.py
|
%%GITHUB_SPACY/spacy/pipeline/span_ruler.py
|
||||||
|
@ -79,19 +80,20 @@ token pattern (list) or a phrase pattern (string). For example:
|
||||||
> ruler = SpanRuler(nlp, overwrite=True)
|
> ruler = SpanRuler(nlp, overwrite=True)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
|
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
|
||||||
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current span ruler while creating phrase patterns with the nlp object. ~~str~~ |
|
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current span ruler while creating phrase patterns with the nlp object. ~~str~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ |
|
| `spans_key` | The spans key to save the spans under. If `None`, no spans are saved. Defaults to `"ruler"`. ~~Optional[str]~~ |
|
||||||
| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ |
|
| `spans_filter` | The optional method to filter spans before they are assigned to doc.spans. Defaults to `None`. ~~Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]]~~ |
|
||||||
| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ |
|
| `annotate_ents` | Whether to save spans to doc.ents. Defaults to `False`. ~~bool~~ |
|
||||||
| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ |
|
| `ents_filter` | The method to filter spans before they are assigned to doc.ents. Defaults to `util.filter_chain_spans`. ~~Callable[[Iterable[Span], Iterable[Span]], List[Span]]~~ |
|
||||||
| `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
| `phrase_matcher_attr` | Token attribute to match on, passed to the internal PhraseMatcher as `attr`. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
||||||
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
|
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
|
||||||
| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ |
|
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
|
||||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
| `overwrite` | Whether to remove any existing spans under `Doc.spans[spans_key]` if `spans_key` is set, or to remove any ents under `Doc.ents` if `annotate_ents` is set. Defaults to `True`. ~~bool~~ |
|
||||||
|
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
## SpanRuler.initialize {#initialize tag="method"}
|
## SpanRuler.initialize {#initialize tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -237,16 +237,17 @@ browser. Will run a simple web server.
|
||||||
> displacy.serve([doc1, doc2], style="dep")
|
> displacy.serve([doc1, doc2], style="dep")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
||||||
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
||||||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||||
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
||||||
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
|
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
|
||||||
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
|
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
|
||||||
|
| `auto_select_port` | If `True`, automatically switch to a different port if the specified port is already in use. Defaults to `False`. ~~bool~~ |
|
||||||
|
|
||||||
### displacy.render {#displacy.render tag="method" new="2"}
|
### displacy.render {#displacy.render tag="method" new="2"}
|
||||||
|
|
||||||
|
@ -266,7 +267,7 @@ Render a dependency parse tree or named entity visualization.
|
||||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
|
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
|
||||||
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
||||||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ |
|
||||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||||
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
||||||
|
@ -513,7 +514,7 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
|
||||||
Instead of using one of the built-in loggers, you can
|
Instead of using one of the built-in loggers, you can
|
||||||
[implement your own](/usage/training#custom-logging).
|
[implement your own](/usage/training#custom-logging).
|
||||||
|
|
||||||
#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}
|
#### spacy.ConsoleLogger.v2 {tag="registered function"}
|
||||||
|
|
||||||
> #### Example config
|
> #### Example config
|
||||||
>
|
>
|
||||||
|
@ -564,11 +565,33 @@ start decreasing across epochs.
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------- | --------------------------------------------------------------------- |
|
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
|
| `progress_bar` | Whether the logger should print a progress bar tracking the steps till the next evaluation pass (default: `False`). ~~bool~~ |
|
||||||
| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ |
|
| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
|
||||||
| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
|
| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |
|
||||||
|
|
||||||
|
#### spacy.ConsoleLogger.v3 {#ConsoleLogger tag="registered function"}
|
||||||
|
|
||||||
|
> #### Example config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [training.logger]
|
||||||
|
> @loggers = "spacy.ConsoleLogger.v3"
|
||||||
|
> progress_bar = "eval"
|
||||||
|
> console_output = true
|
||||||
|
> output_file = "training_log.jsonl"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Writes the results of a training step to the console in a tabular format and
|
||||||
|
optionally saves them to a `jsonl` file.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `progress_bar` | Type of progress bar to show in the console: `"train"`, `"eval"` or `None`. |
|
||||||
|
| | The bar tracks the number of steps until `training.max_steps` and `training.eval_frequency` are reached respectively (default: `None`). ~~Optional[str]~~ |
|
||||||
|
| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
|
||||||
|
| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |
|
||||||
|
|
||||||
## Readers {#readers}
|
## Readers {#readers}
|
||||||
|
|
||||||
|
|
|
@ -364,6 +364,46 @@ else:
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
|
#### Fuzzy matching {#fuzzy new="3.5"}
|
||||||
|
|
||||||
|
Fuzzy matching allows you to match tokens with alternate spellings, typos, etc.
|
||||||
|
without specifying every possible variant.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Matches "favourite", "favorites", "gavorite", "theatre", "theatr", ...
|
||||||
|
pattern = [{"TEXT": {"FUZZY": "favorite"}},
|
||||||
|
{"TEXT": {"FUZZY": "theater"}}]
|
||||||
|
```
|
||||||
|
|
||||||
|
The `FUZZY` attribute allows fuzzy matches for any attribute string value,
|
||||||
|
including custom attributes. Just like `REGEX`, it always needs to be applied to
|
||||||
|
an attribute like `TEXT` or `LOWER`. By default `FUZZY` allows a Levenshtein
|
||||||
|
edit distance of at least 2 and up to 30% of the pattern string length. Using
|
||||||
|
the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
|
||||||
|
allowed edit distance directly.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Match lowercase with fuzzy matching (allows 2 edits)
|
||||||
|
pattern = [{"LOWER": {"FUZZY": "definitely"}}]
|
||||||
|
|
||||||
|
# Match custom attribute values with fuzzy matching (allows 2 edits)
|
||||||
|
pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
|
||||||
|
|
||||||
|
# Match with exact Levenshtein edit distance limits (allows 3 edits)
|
||||||
|
pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Regex and fuzzy matching with lists {#regex-fuzzy-lists new="3.5"}
|
||||||
|
|
||||||
|
Starting in spaCy v3.5, both `REGEX` and `FUZZY` can be combined with the
|
||||||
|
attributes `IN` and `NOT_IN`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
|
||||||
|
|
||||||
|
pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}]
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
#### Operators and quantifiers {#quantifiers}
|
#### Operators and quantifiers {#quantifiers}
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
{ "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
|
{ "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
|
||||||
{
|
{
|
||||||
"text": "Custom Solutions",
|
"text": "Custom Solutions",
|
||||||
"url": "https://explosion.ai/spacy-tailored-pipelines"
|
"url": "https://explosion.ai/custom-solutions"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,7 +51,7 @@
|
||||||
{ "text": "Online Course", "url": "https://course.spacy.io" },
|
{ "text": "Online Course", "url": "https://course.spacy.io" },
|
||||||
{
|
{
|
||||||
"text": "Custom Solutions",
|
"text": "Custom Solutions",
|
||||||
"url": "https://explosion.ai/spacy-tailored-pipelines"
|
"url": "https://explosion.ai/custom-solutions"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
@ -1023,25 +1023,6 @@
|
||||||
},
|
},
|
||||||
"category": ["pipeline"]
|
"category": ["pipeline"]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"id": "spacy-sentence-segmenter",
|
|
||||||
"title": "Sentence Segmenter",
|
|
||||||
"slogan": "Custom sentence segmentation for spaCy",
|
|
||||||
"code_example": [
|
|
||||||
"from seg.newline.segmenter import NewLineSegmenter",
|
|
||||||
"import spacy",
|
|
||||||
"",
|
|
||||||
"nlseg = NewLineSegmenter()",
|
|
||||||
"nlp = spacy.load('en')",
|
|
||||||
"nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
|
|
||||||
"doc = nlp(my_doc_text)"
|
|
||||||
],
|
|
||||||
"author": "tc64",
|
|
||||||
"author_links": {
|
|
||||||
"github": "tc64"
|
|
||||||
},
|
|
||||||
"category": ["pipeline"]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"id": "spacy_cld",
|
"id": "spacy_cld",
|
||||||
"title": "spaCy-CLD",
|
"title": "spaCy-CLD",
|
||||||
|
@ -1468,13 +1449,26 @@
|
||||||
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
|
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
|
||||||
"code_example": [
|
"code_example": [
|
||||||
"import spacy",
|
"import spacy",
|
||||||
"import scattertext as st",
|
|
||||||
"",
|
"",
|
||||||
"nlp = spacy.load('en')",
|
"from scattertext import SampleCorpora, produce_scattertext_explorer",
|
||||||
"corpus = st.CorpusFromPandas(convention_df,",
|
"from scattertext import produce_scattertext_html",
|
||||||
" category_col='party',",
|
"from scattertext.CorpusFromPandas import CorpusFromPandas",
|
||||||
" text_col='text',",
|
"",
|
||||||
" nlp=nlp).build()"
|
"nlp = spacy.load('en_core_web_sm')",
|
||||||
|
"convention_df = SampleCorpora.ConventionData2012.get_data()",
|
||||||
|
"corpus = CorpusFromPandas(convention_df,",
|
||||||
|
" category_col='party',",
|
||||||
|
" text_col='text',",
|
||||||
|
" nlp=nlp).build()",
|
||||||
|
"",
|
||||||
|
"html = produce_scattertext_html(corpus,",
|
||||||
|
" category='democrat',",
|
||||||
|
" category_name='Democratic',",
|
||||||
|
" not_category_name='Republican',",
|
||||||
|
" minimum_term_frequency=5,",
|
||||||
|
" width_in_pixels=1000)",
|
||||||
|
"open('./simple.html', 'wb').write(html.encode('utf-8'))",
|
||||||
|
"print('Open ./simple.html in Chrome or Firefox.')"
|
||||||
],
|
],
|
||||||
"author": "Jason Kessler",
|
"author": "Jason Kessler",
|
||||||
"author_links": {
|
"author_links": {
|
||||||
|
@ -4068,6 +4062,33 @@
|
||||||
"author_links": {
|
"author_links": {
|
||||||
"github": "yasufumy"
|
"github": "yasufumy"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "spacy-pythainlp",
|
||||||
|
"title": "spaCy-PyThaiNLP",
|
||||||
|
"slogan": "PyThaiNLP for spaCy",
|
||||||
|
"description": "This package wraps the PyThaiNLP library to add support for Thai to spaCy.",
|
||||||
|
"github": "PyThaiNLP/spaCy-PyThaiNLP",
|
||||||
|
"code_example": [
|
||||||
|
"import spacy",
|
||||||
|
"import spacy_pythainlp.core",
|
||||||
|
"",
|
||||||
|
"nlp = spacy.blank('th')",
|
||||||
|
"nlp.add_pipe('pythainlp')",
|
||||||
|
"doc = nlp('ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน ผมอยากไปเที่ยว')",
|
||||||
|
"",
|
||||||
|
"print(list(doc.sents))",
|
||||||
|
"# output: [ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน , ผมอยากไปเที่ยว]"
|
||||||
|
],
|
||||||
|
"code_language": "python",
|
||||||
|
"author": "Wannaphong Phatthiyaphaibun",
|
||||||
|
"author_links": {
|
||||||
|
"twitter": "@wannaphong_p",
|
||||||
|
"github": "wannaphong",
|
||||||
|
"website": "https://iam.wannaphong.com/"
|
||||||
|
},
|
||||||
|
"category": ["pipeline", "research"],
|
||||||
|
"tags": ["Thai"]
|
||||||
}
|
}
|
||||||
|
|
||||||
],
|
],
|
||||||
|
|
|
@ -105,13 +105,13 @@ const Landing = ({ data }) => {
|
||||||
|
|
||||||
<LandingBannerGrid>
|
<LandingBannerGrid>
|
||||||
<LandingBanner
|
<LandingBanner
|
||||||
to="https://explosion.ai/spacy-tailored-pipelines"
|
to="https://explosion.ai/custom-solutions"
|
||||||
button="Learn more"
|
button="Learn more"
|
||||||
background="#E4F4F9"
|
background="#E4F4F9"
|
||||||
color="#1e1935"
|
color="#1e1935"
|
||||||
small
|
small
|
||||||
>
|
>
|
||||||
<Link to="https://explosion.ai/spacy-tailored-pipelines" hidden>
|
<Link to="https://explosion.ai/custom-solutions" hidden>
|
||||||
<img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
|
<img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
|
||||||
</Link>
|
</Link>
|
||||||
<strong>
|
<strong>
|
||||||
|
|
Loading…
Reference in New Issue
Block a user