Merge remote-tracking branch 'upstream/master' into accuracy-speed
Commit 7b092a4c5e
.github/azure-steps.yml (vendored): 2 changed lines

@@ -107,7 +107,7 @@ steps:
     displayName: "Run CPU tests"

   - script: |
-      python -m pip install --pre thinc-apple-ops
+      python -m pip install 'spacy[apple]'
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
.github/workflows/lock.yml (vendored): 8 changed lines

@@ -15,11 +15,11 @@ jobs:
   action:
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v3
+      - uses: dessant/lock-threads@v4
         with:
           process-only: 'issues'
           issue-inactive-days: '30'
-          issue-comment: >
-            This thread has been automatically locked since there
-            has not been any recent activity after it was closed.
+          issue-comment: >
+            This thread has been automatically locked since there
+            has not been any recent activity after it was closed.
            Please open a new issue for related bugs.
@@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
 production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the MIT license.
+open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

 💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

@@ -46,6 +46,7 @@ open-source software, released under the MIT license.
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
+| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -59,6 +60,7 @@ open-source software, released under the MIT license.
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md

 ## 💬 Where to ask questions

 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
@@ -41,7 +41,7 @@ jobs:
   matrix:
     # We're only running one platform per Python version to speed up builds
     Python36Linux:
-      imageName: "ubuntu-latest"
+      imageName: "ubuntu-20.04"
       python.version: "3.6"
     # Python36Windows:
     #   imageName: "windows-latest"

@@ -50,7 +50,7 @@ jobs:
     #   imageName: "macos-latest"
     #   python.version: "3.6"
     # Python37Linux:
-    #   imageName: "ubuntu-latest"
+    #   imageName: "ubuntu-20.04"
     #   python.version: "3.7"
     Python37Windows:
       imageName: "windows-latest"
@@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
 numpy==1.21.3; python_version=='3.10'
-numpy; python_version>='3.11'
+numpy==1.23.2; python_version=='3.11'
+numpy; python_version>='3.12'
@@ -1,12 +1,12 @@
 # Our libraries
-spacy-legacy>=3.0.10,<3.1.0
+spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.1.0,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.9.1,<1.1.0
+wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0
@@ -22,6 +22,7 @@ classifiers =
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases

@@ -41,13 +42,13 @@ setup_requires =
     thinc>=8.1.0,<8.2.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.10,<3.1.0
+    spacy-legacy>=3.0.11,<3.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.1.0,<8.2.0
-    wasabi>=0.9.1,<1.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
@@ -17,6 +17,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
+from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
@@ -161,15 +161,15 @@ def load_project_config(
         sys.exit(1)
     validate_project_version(config)
     validate_project_commands(config)
+    if interpolate:
+        err = f"{PROJECT_FILE} validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config, overrides)
     # Make sure directories defined in config exist
     for subdir in config.get("directories", []):
         dir_path = path / subdir
         if not dir_path.exists():
             dir_path.mkdir(parents=True)
-    if interpolate:
-        err = f"{PROJECT_FILE} validation error"
-        with show_validation_error(title=err, hint_fill=False):
-            config = substitute_project_variables(config, overrides)
     return config
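Moving interpolation before directory creation matters for project files like the one in the regression test further below; a minimal sketch of such a `project.yml`, written as a Python dict mirroring the test data rather than any real project:

```python
# After the fix, load_project_config() interpolates "${vars.lang}_model" to
# "en_model" before creating the directory, instead of creating a literal
# "${vars.lang}_model" folder on disk.
project = {
    "vars": {"lang": "en"},
    "directories": ["cfg", "${vars.lang}_model"],
    "commands": [{"name": "x", "script": ["hello ${vars.lang}"]}],
}
```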
@@ -585,6 +585,33 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
         local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")


+def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
+    """Given a directory and a suffix, recursively find all files matching the suffix.
+    Directories or files with names beginning with a . are ignored, but hidden flags on
+    filesystems are not checked.
+    When provided with a suffix `None`, there is no suffix-based filtering."""
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        elif suffix is not None and not path.parts[-1].endswith(suffix):
+            continue
+        else:
+            locs.append(path)
+    # It's good to sort these, in case the ordering messes up cache.
+    locs.sort()
+    return locs
+
+
 def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
     """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
     as happens with `round(number, ndigits)`"""
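The helper is shared by the `convert` and `apply` commands; a small usage sketch (the `corpus/` directory name is just an example):

```python
from pathlib import Path

from spacy.cli._util import walk_directory

# Recursively collect all .jsonl files under corpus/, skipping dotfiles.
# Passing suffix=None (the default) collects every file instead.
jsonl_paths = walk_directory(Path("corpus"), suffix="jsonl")
```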
spacy/cli/apply.py (new file, 143 lines)

@@ -0,0 +1,143 @@
import tqdm
import srsly

from itertools import chain
from pathlib import Path
from typing import Optional, List, Iterable, cast, Union

from wasabi import msg

from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory

from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model


path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
Files with other extensions are treated as single plain text documents.
If a directory is provided it is traversed recursively to grab
all files to be processed.
The files can be a mixture of .spacy, .jsonl and text files.
If .jsonl is provided the specified field is going
to be grabbed ("text" by default)."""

out_help = "Path to save the resulting .spacy file"
code_help = (
    "Path to Python file with additional " "code (registered functions) to be imported"
)
gold_help = "Use gold preprocessing provided in the .spacy files"
force_msg = (
    "The provided output file already exists. "
    "To force overwriting the output file, set the --force or -F flag."
)


DocOrStrStream = Union[Iterable[str], Iterable[Doc]]


def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
    """
    Stream Doc objects from DocBin.
    """
    docbin = DocBin().from_disk(path)
    for doc in docbin.get_docs(vocab):
        yield doc


def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
    """
    Stream "text" field from JSONL. If the field "text" is
    not found it raises error.
    """
    for entry in srsly.read_jsonl(path):
        if field not in entry:
            msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
        else:
            yield entry[field]


def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
    """
    Yields strings from text files in paths.
    """
    for path in paths:
        with open(path, "r") as fin:
            text = fin.read()
            yield text


@app.command("apply")
def apply_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help=path_help, exists=True),
    output_file: Path = Arg(..., help=out_help, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
    text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
    force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
):
    """
    Apply a trained pipeline to documents to get predictions.
    Expects a loadable spaCy pipeline and path to the data, which
    can be a directory or a file.
    The data files can be provided in multiple formats:
    1. .spacy files
    2. .jsonl files with a specified "field" to read the text from.
    3. Files with any other extension are assumed to be containing
       a single document.
    DOCS: https://spacy.io/api/cli#apply
    """
    data_path = ensure_path(data_path)
    output_file = ensure_path(output_file)
    code_path = ensure_path(code_path)
    if output_file.exists() and not force_overwrite:
        msg.fail(force_msg, exits=1)
    if not data_path.exists():
        msg.fail(f"Couldn't find data path: {data_path}", exits=1)
    import_code(code_path)
    setup_gpu(use_gpu)
    apply(data_path, output_file, model, text_key, batch_size, n_process)


def apply(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
):
    docbin = DocBin(store_user_data=True)
    paths = walk_directory(data_path)
    if len(paths) == 0:
        docbin.to_disk(output_file)
        msg.warn(
            "Did not find data to process,"
            f" {data_path} seems to be an empty directory."
        )
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
    vocab = nlp.vocab
    streams: List[DocOrStrStream] = []
    text_files = []
    for path in paths:
        if path.suffix == ".spacy":
            streams.append(_stream_docbin(path, vocab))
        elif path.suffix == ".jsonl":
            streams.append(_stream_jsonl(path, json_field))
        else:
            text_files.append(path)
    if len(text_files) > 0:
        streams.append(_stream_texts(text_files))
    datagen = cast(DocOrStrStream, chain(*streams))
    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
        docbin.add(doc)
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    docbin.to_disk(output_file)
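A quick sketch of calling the new command from Python, mirroring what the tests further below do (the directory and output names are placeholders):

```python
from pathlib import Path

from spacy.cli.apply import apply

# Run a pipeline over every .spacy/.jsonl/plain-text file under ./data and
# store the annotated docs in a single DocBin at ./predictions.spacy.
apply(Path("data"), Path("predictions.spacy"), "blank:en", "text", 1, 1)
```

On the command line the equivalent is roughly `python -m spacy apply blank:en data predictions.spacy`.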
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union
 from enum import Enum
 from pathlib import Path
 from wasabi import Printer

@@ -7,7 +7,7 @@ import re
 import sys
 import itertools

-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, walk_directory
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs

@@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
     "json": json_to_docs,
 }

+AUTO = "auto"
+
 # File types that can be written to stdout
 FILE_TYPES_STDOUT = ("json",)

@@ -49,7 +51,7 @@ def convert_cli(
     model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
     morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
     merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
-    converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
+    converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
     ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
     lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
     concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),

@@ -70,8 +72,8 @@ def convert_cli(
     output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
     silent = output_dir == "-"
     msg = Printer(no_print=silent)
-    verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
     converter = _get_converter(msg, converter, input_path)
+    verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
     convert(
         input_path,
         output_dir,

@@ -100,7 +102,7 @@ def convert(
     model: Optional[str] = None,
     morphology: bool = False,
     merge_subtokens: bool = False,
-    converter: str = "auto",
+    converter: str,
     ner_map: Optional[Path] = None,
     lang: Optional[str] = None,
     concatenate: bool = False,

@@ -189,33 +191,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
     return None


-def walk_directory(path: Path, converter: str) -> List[Path]:
-    if not path.is_dir():
-        return [path]
-    paths = [path]
-    locs = []
-    seen = set()
-    for path in paths:
-        if str(path) in seen:
-            continue
-        seen.add(str(path))
-        if path.parts[-1].startswith("."):
-            continue
-        elif path.is_dir():
-            paths.extend(path.iterdir())
-        elif converter == "json" and not path.parts[-1].endswith("json"):
-            continue
-        elif converter == "conll" and not path.parts[-1].endswith("conll"):
-            continue
-        elif converter == "iob" and not path.parts[-1].endswith("iob"):
-            continue
-        else:
-            locs.append(path)
-    # It's good to sort these, in case the ordering messes up cache.
-    locs.sort()
-    return locs
-
-
 def verify_cli_args(
     msg: Printer,
     input_path: Path,

@@ -239,18 +214,22 @@ def verify_cli_args(
     input_locs = walk_directory(input_path, converter)
     if len(input_locs) == 0:
         msg.fail("No input files in directory", input_path, exits=1)
     file_types = list(set([loc.suffix[1:] for loc in input_locs]))
-    if converter == "auto" and len(file_types) >= 2:
-        file_types_str = ",".join(file_types)
-        msg.fail("All input files must be same type", file_types_str, exits=1)
-    if converter != "auto" and converter not in CONVERTERS:
+    if converter not in CONVERTERS:
         msg.fail(f"Can't find converter for {converter}", exits=1)


 def _get_converter(msg, converter, input_path: Path):
     if input_path.is_dir():
-        input_path = walk_directory(input_path, converter)[0]
-    if converter == "auto":
+        if converter == AUTO:
+            input_locs = walk_directory(input_path, suffix=None)
+            file_types = list(set([loc.suffix[1:] for loc in input_locs]))
+            if len(file_types) >= 2:
+                file_types_str = ",".join(file_types)
+                msg.fail("All input files must be same type", file_types_str, exits=1)
+            input_path = input_locs[0]
+        else:
+            input_path = walk_directory(input_path, suffix=converter)[0]
+    if converter == AUTO:
         converter = input_path.suffix[1:]
     if converter == "ner" or converter == "iob":
         with input_path.open(encoding="utf8") as file_:
@@ -101,8 +101,8 @@ def project_run(
         if not (project_dir / dep).exists():
             err = f"Missing dependency specified by command '{subcommand}': {dep}"
             err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
-            err_kwargs = {"exits": 1} if not dry else {}
-            msg.fail(err, err_help, **err_kwargs)
+            err_exits = 1 if not dry else None
+            msg.fail(err, err_help, exits=err_exits)
     check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
     with working_dir(project_dir) as current_dir:
         msg.divider(subcommand)
@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
 [paths]
@@ -36,7 +36,7 @@ def render(
     jupyter (bool): Override Jupyter auto-detection.
     options (dict): Visualiser-specific options, e.g. colors.
     manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
-    RETURNS (str): Rendered HTML markup.
+    RETURNS (str): Rendered SVG or HTML markup.

     DOCS: https://spacy.io/api/top-level#displacy.render
     USAGE: https://spacy.io/usage/visualizers
@@ -94,7 +94,7 @@ class SpanRenderer:
         parsed (list): Dependency parses to render.
         page (bool): Render parses wrapped as full HTML page.
         minify (bool): Minify HTML markup.
-        RETURNS (str): Rendered HTML markup.
+        RETURNS (str): Rendered SVG or HTML markup.
         """
         rendered = []
         for i, p in enumerate(parsed):
@@ -510,7 +510,7 @@ class EntityRenderer:
         parsed (list): Dependency parses to render.
         page (bool): Render parses wrapped as full HTML page.
         minify (bool): Minify HTML markup.
-        RETURNS (str): Rendered HTML markup.
+        RETURNS (str): Rendered SVG or HTML markup.
         """
         rendered = []
         for i, p in enumerate(parsed):
@@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes):
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")

@@ -957,6 +962,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
              "knowledge base, use `InMemoryLookupKB`.")
     E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
+    E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")


 # Deprecated model shortcuts, only used in errors and warnings
@@ -15,7 +15,7 @@

 STOP_WORDS = set(
     """
-aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
+aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
 afgelopen aldus alhoewel anderzijds

 ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
@@ -328,9 +328,9 @@ class EditTreeLemmatizer(TrainablePipe):

             tree = dict(tree)
             if "orig" in tree:
-                tree["orig"] = self.vocab.strings[tree["orig"]]
+                tree["orig"] = self.vocab.strings.add(tree["orig"])
             if "orig" in tree:
-                tree["subst"] = self.vocab.strings[tree["subst"]]
+                tree["subst"] = self.vocab.strings.add(tree["subst"])

             trees.append(tree)
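The switch from indexing to `.add()` matters because a plain lookup assumes the string is already interned; a standalone illustration of the `StringStore` behaviour the fix relies on:

```python
from spacy.strings import StringStore

strings = StringStore()
# add() interns the string and returns its 64-bit hash; the hash can then be
# resolved back to the text, which the lemmatizer needs when deserializing
# label data containing strings never seen in the training examples.
orig_hash = strings.add("S")
assert strings[orig_hash] == "S"
```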
@@ -170,7 +170,7 @@ def prioritize_existing_ents_filter(


 @registry.misc("spacy.prioritize_existing_ents_filter.v1")
-def make_preverse_existing_ents_filter():
+def make_preserve_existing_ents_filter():
     return prioritize_existing_ents_filter
@@ -272,7 +272,10 @@ class SpanCategorizer(TrainablePipe):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
+            scores = self.model.ops.alloc2f(0, 0)
+        else:
+            scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores

     def set_candidates(
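The new branch triggers when the suggester returns no candidates for an entire batch; a rough sketch of the empty `Ragged` such a batch produces, mirroring the updated test suggester further below:

```python
from thinc.api import get_current_ops
from thinc.types import Ragged

ops = get_current_ops()
# Two docs with zero suggested spans each: no data rows, per-doc lengths of 0.
indices = Ragged(ops.xp.zeros((0, 0), dtype="i"), ops.asarray1i([0, 0]))
assert indices.lengths.sum() == 0  # predict() now returns an empty score matrix
```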
@@ -74,7 +74,7 @@ subword_features = true
     default_config={
         "threshold": 0.0,
         "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
     },
     default_score_weights={
         "cats_score": 1.0,

@@ -117,7 +117,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     )


-@registry.scorers("spacy.textcat_scorer.v1")
+@registry.scorers("spacy.textcat_scorer.v2")
 def make_textcat_scorer():
     return textcat_score
@@ -74,7 +74,7 @@ subword_features = true
     default_config={
         "threshold": 0.5,
         "model": DEFAULT_MULTI_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
     },
     default_score_weights={
         "cats_score": 1.0,

@@ -120,7 +120,7 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str,
     )


-@registry.scorers("spacy.textcat_multilabel_scorer.v1")
+@registry.scorers("spacy.textcat_multilabel_scorer.v2")
 def make_textcat_multilabel_scorer():
     return textcat_multilabel_score
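The v1 scorers stay registered through `spacy-legacy` (hence the requirements bump above), so a pipeline can still opt into the old behaviour explicitly; a sketch based on the accompanying test:

```python
from spacy.lang.en import English

nlp = English()
# Request the legacy v1 scorer instead of the new v2 default.
nlp.add_pipe("textcat", config={"scorer": {"@scorers": "spacy.textcat_scorer.v1"}})
```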
@@ -476,14 +476,12 @@ class Scorer:
         f_per_type = {label: PRFScore() for label in labels}
         auc_per_type = {label: ROCAUCScore() for label in labels}
         labels = set(labels)
-        if labels:
-            for eg in examples:
-                labels.update(eg.predicted.cats.keys())
-                labels.update(eg.reference.cats.keys())
         for example in examples:
             # Through this loop, None in the gold_cats indicates missing label.
             pred_cats = getter(example.predicted, attr)
+            pred_cats = {k: v for k, v in pred_cats.items() if k in labels}
             gold_cats = getter(example.reference, attr)
+            gold_cats = {k: v for k, v in gold_cats.items() if k in labels}

             for label in labels:
                 pred_score = pred_cats.get(label, 0.0)
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
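The explicit cast is needed because `Doc.to_array` returns a `uint64` array and recent NumPy releases refuse to assign a negative Python integer into it; a minimal standalone illustration (behaviour assumed for NumPy 1.24 and later):

```python
import numpy

arr = numpy.zeros((1,), dtype=numpy.uint64)
# Direct assignment of a negative offset (arr[0] = -1) raises OverflowError on
# newer NumPy, so the value is wrapped through an explicit signed-to-unsigned cast.
arr[0] = numpy.int32(-1).astype(numpy.uint64)
```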
@@ -1,7 +1,10 @@
+from typing import List
+
 import pytest
 from random import Random
 from spacy.matcher import Matcher
-from spacy.tokens import Span, SpanGroup
+from spacy.tokens import Span, SpanGroup, Doc
+from spacy.util import filter_spans


 @pytest.fixture

@@ -240,3 +243,13 @@ def test_span_group_extend(doc):
 def test_span_group_dealloc(span_group):
     with pytest.raises(AttributeError):
         print(span_group.doc)
+
+
+@pytest.mark.issue(11975)
+def test_span_group_typing(doc: Doc):
+    """Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
+    span_group: SpanGroup = doc.spans["SPANS"]
+    spans: List[Span] = list(span_group)
+    for i, span in enumerate(span_group):
+        assert span == span_group[i] == spans[i]
+    filter_spans(span_group)
@@ -60,10 +60,45 @@ def test_initialize_from_labels():
     nlp2 = Language()
     lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
     lemmatizer2.initialize(
-        get_examples=lambda: train_examples,
+        # We want to check that the strings in replacement nodes are
+        # added to the string store. Avoid that they get added through
+        # the examples.
+        get_examples=lambda: train_examples[:1],
         labels=lemmatizer.label_data,
     )
     assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
+    assert lemmatizer2.label_data == {
+        "trees": [
+            {"orig": "S", "subst": "s"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 0,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "s", "subst": ""},
+            {
+                "prefix_len": 0,
+                "suffix_len": 1,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 2,
+            },
+            {
+                "prefix_len": 0,
+                "suffix_len": 0,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "E", "subst": "e"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 5,
+                "suffix_tree": 4294967295,
+            },
+        ],
+        "labels": (1, 3, 4, 6),
+    }


 def test_no_data():
@@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():


 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
+    # Test with a suggester that can return 0 suggestions

-    @registry.misc("test_zero_suggester")
-    def make_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+    @registry.misc("test_mixed_zero_suggester")
+    def make_mixed_zero_suggester():
+        def mixed_zero_suggester(docs, *, ops=None):
             if ops is None:
                 ops = get_current_ops()
-            return Ragged(
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
-            )
+            spans = []
+            lengths = []
+            for doc in docs:
+                if len(doc) > 0 and len(doc) % 2 == 0:
+                    spans.append((0, 1))
+                    lengths.append(1)
+                else:
+                    lengths.append(0)
+            spans = ops.asarray2i(spans)
+            lengths_array = ops.asarray1i(lengths)
+            if len(spans) > 0:
+                output = Ragged(ops.xp.vstack(spans), lengths_array)
+            else:
+                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+            return output

-        return zero_suggester
+        return mixed_zero_suggester

     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
         "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
+            "suggester": {"@misc": "test_mixed_zero_suggester"},
+            "spans_key": SPAN_KEY,
+        },
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)

@@ -397,6 +412,16 @@ def test_zero_suggestions():
     assert set(spancat.labels) == {"LOC", "PERSON"}

     nlp.update(train_examples, sgd=optimizer)
+    # empty doc
+    nlp("")
+    # single doc with zero suggestions
+    nlp("one")
+    # single doc with one suggestion
+    nlp("two two")
+    # batch with mixed zero/one suggestions
+    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+    # batch with no suggestions
+    list(nlp.pipe(["", "one", "three three three"]))


 def test_set_candidates():
@@ -895,3 +895,26 @@ def test_textcat_multi_threshold():

     scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
     assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+
+
+@pytest.mark.parametrize(
+    "component_name,scorer",
+    [
+        ("textcat", "spacy.textcat_scorer.v1"),
+        ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
+    ],
+)
+def test_textcat_legacy_scorers(component_name, scorer):
+    """Check that legacy scorers are registered and produce the expected score
+    keys."""
+    nlp = English()
+    nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    # score the model (it's not actually trained but that doesn't matter)
+    scores = nlp.evaluate(train_examples)
+    assert 0 <= scores["cats_score"] <= 1
@@ -4,7 +4,9 @@ from collections import Counter
 from typing import Tuple, List, Dict, Any
 import pkg_resources
 import time
+from pathlib import Path

 import spacy
 import numpy
 import pytest
 import srsly

@@ -14,7 +16,7 @@ from thinc.api import Config, ConfigValidationError
 from spacy import about
 from spacy.cli import info
-from spacy.cli._util import is_subpath_of, load_project_config
+from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands

@@ -32,6 +34,7 @@ from spacy.cli.package import _is_permitted_package_name
 from spacy.cli.project.remote_storage import RemoteStorage
 from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
+from spacy.cli.apply import apply
 from spacy.cli.find_threshold import find_threshold
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch

@@ -123,6 +126,25 @@ def test_issue7055():
     assert "model" in filled_cfg["components"]["ner"]


+@pytest.mark.issue(11235)
+def test_issue11235():
+    """
+    Test that the cli handles interpolation in the directory names correctly when loading project config.
+    """
+    lang_var = "en"
+    variables = {"lang": lang_var}
+    commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
+    directories = ["cfg", "${vars.lang}_model"]
+    project = {"commands": commands, "vars": variables, "directories": directories}
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d)
+        # Check that the directories are interpolated and created correctly
+        assert os.path.exists(d / "cfg")
+        assert os.path.exists(d / f"{lang_var}_model")
+    assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
+
+
 def test_cli_info():
     nlp = Dutch()
     nlp.add_pipe("textcat")
@@ -866,6 +888,82 @@ def test_span_length_freq_dist_output_must_be_correct():
     assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]


+def test_applycli_empty_dir():
+    with make_tempdir() as data_path:
+        output = data_path / "test.spacy"
+        apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_docbin():
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        nlp = spacy.blank("en")
+        doc = nlp("testing apply cli.")
+        # test empty DocBin case
+        docbin = DocBin()
+        docbin.to_disk(data_path / "testin.spacy")
+        apply(data_path, output, "blank:en", "text", 1, 1)
+        docbin.add(doc)
+        docbin.to_disk(data_path / "testin.spacy")
+        apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_jsonl():
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        data = [{"field": "Testing apply cli.", "key": 234}]
+        data2 = [{"field": "234"}]
+        srsly.write_jsonl(data_path / "test.jsonl", data)
+        apply(data_path, output, "blank:en", "field", 1, 1)
+        srsly.write_jsonl(data_path / "test2.jsonl", data2)
+        apply(data_path, output, "blank:en", "field", 1, 1)
+
+
+def test_applycli_txt():
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        with open(data_path / "test.foo", "w") as ftest:
+            ftest.write("Testing apply cli.")
+        apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_mixed():
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        text = "Testing apply cli"
+        nlp = spacy.blank("en")
+        doc = nlp(text)
+        jsonl_data = [{"text": text}]
+        srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
+        docbin = DocBin()
+        docbin.add(doc)
+        docbin.to_disk(data_path / "testin.spacy")
+        with open(data_path / "test.txt", "w") as ftest:
+            ftest.write(text)
+        apply(data_path, output, "blank:en", "text", 1, 1)
+        # Check whether it worked
+        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
+        assert len(result) == 3
+        for doc in result:
+            assert doc.text == text
+
+
+def test_applycli_user_data():
+    Doc.set_extension("ext", default=0)
+    val = ("ext", 0)
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        nlp = spacy.blank("en")
+        doc = nlp("testing apply cli.")
+        doc._.ext = val
+        docbin = DocBin(store_user_data=True)
+        docbin.add(doc)
+        docbin.to_disk(data_path / "testin.spacy")
+        apply(data_path, output, "blank:en", "", 1, 1)
+        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
+        assert result[0]._.ext == val
+
+
 def test_local_remote_storage():
     with make_tempdir() as d:
         filename = "a.txt"

@@ -1088,3 +1186,26 @@ def test_upload_download_local_file():
         download_file(remote_file, local_file)
         with local_file.open(mode="r") as file_:
             assert file_.read() == content
+
+
+def test_walk_directory():
+    with make_tempdir() as d:
+        files = [
+            "data1.iob",
+            "data2.iob",
+            "data3.json",
+            "data4.conll",
+            "data5.conll",
+            "data6.conll",
+            "data7.txt",
+        ]
+
+        for f in files:
+            Path(d / f).touch()
+
+        assert (len(walk_directory(d))) == 7
+        assert (len(walk_directory(d, suffix=None))) == 7
+        assert (len(walk_directory(d, suffix="json"))) == 1
+        assert (len(walk_directory(d, suffix="iob"))) == 2
+        assert (len(walk_directory(d, suffix="conll"))) == 3
+        assert (len(walk_directory(d, suffix="pdf"))) == 0
spacy/tests/test_cli_app.py (new file, 33 lines)

@@ -0,0 +1,33 @@
import os
from pathlib import Path
from typer.testing import CliRunner

from spacy.cli._util import app
from .util import make_tempdir


def test_convert_auto():
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.iob"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection works
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "Generated output file" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 3
        assert "data1.spacy" in out_files
        assert "data2.spacy" in out_files
        assert "data3.spacy" in out_files


def test_convert_auto_conflict():
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.json"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection warns when there are different file types
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "All input files must be same type" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 0
@@ -3,6 +3,7 @@ import logging
 from unittest import mock
 import pytest
 from spacy.language import Language
+from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.training import Example

@@ -126,6 +127,112 @@ def test_evaluate_no_pipe(nlp):
     nlp.evaluate([Example.from_dict(doc, annots)])


+def test_evaluate_textcat_multilabel(en_vocab):
+    """Test that evaluate works with a multilabel textcat pipe."""
+    nlp = Language(en_vocab)
+    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+        textcat_multilabel.add_label(label)
+    nlp.initialize()
+
+    annots = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}
+    doc = nlp.make_doc("hello world")
+    example = Example.from_dict(doc, annots)
+    scores = nlp.evaluate([example])
+    labels = nlp.get_pipe("textcat_multilabel").labels
+    for label in labels:
+        assert scores["cats_f_per_type"].get(label) is not None
+    for key in example.reference.cats.keys():
+        if key not in labels:
+            assert scores["cats_f_per_type"].get(key) is None
+
+
+def test_evaluate_multiple_textcat_final(en_vocab):
+    """Test that evaluate evaluates the final textcat component in a pipeline
+    with more than one textcat or textcat_multilabel."""
+    nlp = Language(en_vocab)
+    textcat = nlp.add_pipe("textcat")
+    for label in ("POSITIVE", "NEGATIVE"):
+        textcat.add_label(label)
+    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+        textcat_multilabel.add_label(label)
+    nlp.initialize()
+
+    annots = {
+        "cats": {
+            "POSITIVE": 1.0,
+            "NEGATIVE": 0.0,
+            "FEATURE": 1.0,
+            "QUESTION": 1.0,
+            "POSITIVE": 1.0,
+            "NEGATIVE": 0.0,
+        }
+    }
+    doc = nlp.make_doc("hello world")
+    example = Example.from_dict(doc, annots)
+    scores = nlp.evaluate([example])
+    # get the labels from the final pipe
+    labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
+    for label in labels:
+        assert scores["cats_f_per_type"].get(label) is not None
+    for key in example.reference.cats.keys():
+        if key not in labels:
+            assert scores["cats_f_per_type"].get(key) is None
+
+
+def test_evaluate_multiple_textcat_separate(en_vocab):
+    """Test that evaluate can evaluate multiple textcat components separately
+    with custom scorers."""
+
+    def custom_textcat_score(examples, **kwargs):
+        scores = Scorer.score_cats(
+            examples,
+            "cats",
+            multi_label=False,
+            **kwargs,
+        )
+        return {f"custom_{k}": v for k, v in scores.items()}
+
+    @spacy.registry.scorers("test_custom_textcat_scorer")
+    def make_custom_textcat_scorer():
+        return custom_textcat_score
+
+    nlp = Language(en_vocab)
+    textcat = nlp.add_pipe(
+        "textcat",
+        config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
+    )
+    for label in ("POSITIVE", "NEGATIVE"):
+        textcat.add_label(label)
+    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+        textcat_multilabel.add_label(label)
+    nlp.initialize()
+
+    annots = {
+        "cats": {
+            "POSITIVE": 1.0,
+            "NEGATIVE": 0.0,
+            "FEATURE": 1.0,
+            "QUESTION": 1.0,
+            "POSITIVE": 1.0,
+            "NEGATIVE": 0.0,
+        }
+    }
+    doc = nlp.make_doc("hello world")
+    example = Example.from_dict(doc, annots)
+    scores = nlp.evaluate([example])
+    # check custom scores for the textcat pipe
+    assert "custom_cats_f_per_type" in scores
+    labels = nlp.get_pipe("textcat").labels
+    assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
+    # check default scores for the textcat_multilabel pipe
+    assert "cats_f_per_type" in scores
+    labels = nlp.get_pipe("textcat_multilabel").labels
+    assert set(scores["cats_f_per_type"].keys()) == set(labels)
+
+
 def vector_modification_pipe(doc):
     doc.vector += 1
     return doc
@@ -359,6 +359,7 @@ cdef class Doc:
         for annot in annotations:
             if annot:
                 if annot is heads or annot is sent_starts or annot is ent_iobs:
+                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                     for i in range(len(words)):
                         if attrs.ndim == 1:
                             attrs[i] = annot[i]

@@ -1558,6 +1559,7 @@ cdef class Doc:

         for j, (attr, annot) in enumerate(token_annotations.items()):
             if attr is HEAD:
+                annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                 for i in range(len(words)):
                     array[i, j] = annot[i]
             elif attr is MORPH:
|
@ -95,8 +95,8 @@ class Span:
|
|||
self,
|
||||
start_idx: int,
|
||||
end_idx: int,
|
||||
label: int = ...,
|
||||
kb_id: int = ...,
|
||||
label: Union[int, str] = ...,
|
||||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
) -> Span: ...
|
||||
@property
|
||||
|
|
|
@ -299,7 +299,7 @@ cdef class Span:
|
|||
for ancestor in ancestors:
|
||||
ancestor_i = ancestor.i - self.c.start
|
||||
if ancestor_i in range(length):
|
||||
array[i, head_col] = ancestor_i - i
|
||||
array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
|
||||
|
||||
# if there is no appropriate ancestor, define a new artificial root
|
||||
value = array[i, head_col]
|
||||
|
@ -307,7 +307,7 @@ cdef class Span:
|
|||
new_root = old_to_new_root.get(ancestor_i, None)
|
||||
if new_root is not None:
|
||||
# take the same artificial root as a previous token from the same sentence
|
||||
array[i, head_col] = new_root - i
|
||||
array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
|
||||
else:
|
||||
# set this token as the new artificial root
|
||||
array[i, head_col] = 0
|
||||
|
|
|
@ -18,6 +18,7 @@ class SpanGroup:
|
|||
def doc(self) -> Doc: ...
|
||||
@property
|
||||
def has_overlap(self) -> bool: ...
|
||||
def __iter__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def append(self, span: Span) -> None: ...
|
||||
def extend(self, spans: Iterable[Span]) -> None: ...
|
||||
|
|
|
@ -158,6 +158,16 @@ cdef class SpanGroup:
|
|||
return self._concat(other)
|
||||
return NotImplemented
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
Iterate over the spans in this SpanGroup.
|
||||
YIELDS (Span): A span in this SpanGroup.
|
||||
|
||||
DOCS: https://spacy.io/api/spangroup#iter
|
||||
"""
|
||||
for i in range(self.c.size()):
|
||||
yield self[i]
|
||||
|
||||
def append(self, Span span):
|
||||
"""Add a span to the group. The span must refer to the same Doc
|
||||
object as the span group.
|
||||
|
|
|
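With `__iter__` in place a span group can be looped over directly; a quick sketch (the `"SPANS"` key and labels are just example values):

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Berlin is a city in Germany")
doc.spans["SPANS"] = [Span(doc, 0, 1, "PLACE"), Span(doc, 5, 6, "PLACE")]
# SpanGroup now iterates like any other sequence of spans.
for span in doc.spans["SPANS"]:
    print(span.text, span.label_)
```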
@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T
|
@ -26,6 +26,8 @@ def setup_table(
|
|||
return final_cols, final_widths, ["r" for _ in final_widths]
|
||||
|
||||
|
||||
# We cannot rename this method as it's directly imported
|
||||
# and used by external packages such as spacy-loggers.
|
||||
@registry.loggers("spacy.ConsoleLogger.v2")
|
||||
def console_logger(
|
||||
progress_bar: bool = False,
|
||||
|
@ -33,7 +35,27 @@ def console_logger(
|
|||
output_file: Optional[Union[str, Path]] = None,
|
||||
):
|
||||
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
|
||||
progress_bar (bool): Whether the logger should print the progress bar.
|
||||
progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass.
|
||||
console_output (bool): Whether the logger should print the logs on the console.
|
||||
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
||||
"""
|
||||
return console_logger_v3(
|
||||
progress_bar=None if progress_bar is False else "eval",
|
||||
console_output=console_output,
|
||||
output_file=output_file,
|
||||
)
|
||||
|
||||
|
||||
@registry.loggers("spacy.ConsoleLogger.v3")
|
||||
def console_logger_v3(
|
||||
progress_bar: Optional[str] = None,
|
||||
console_output: bool = True,
|
||||
output_file: Optional[Union[str, Path]] = None,
|
||||
):
|
||||
"""The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file.
|
||||
progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values:
|
||||
train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached).
|
||||
eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached).
|
||||
console_output (bool): Whether the logger should print the logs on the console.
|
||||
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
||||
"""
|
||||
|
@ -70,6 +92,7 @@ def console_logger(
|
|||
for name, proc in nlp.pipeline
|
||||
if hasattr(proc, "is_trainable") and proc.is_trainable
|
||||
]
|
||||
max_steps = nlp.config["training"]["max_steps"]
|
||||
eval_frequency = nlp.config["training"]["eval_frequency"]
|
||||
score_weights = nlp.config["training"]["score_weights"]
|
||||
score_cols = [col for col, value in score_weights.items() if value is not None]
|
||||
|
@ -84,6 +107,13 @@ def console_logger(
|
|||
write(msg.row(table_header, widths=table_widths, spacing=spacing))
|
||||
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
|
||||
progress = None
|
||||
expected_progress_types = ("train", "eval")
|
||||
if progress_bar is not None and progress_bar not in expected_progress_types:
|
||||
raise ValueError(
|
||||
Errors.E1048.format(
|
||||
unexpected=progress_bar, expected=expected_progress_types
|
||||
)
|
||||
)
|
||||
|
||||
def log_step(info: Optional[Dict[str, Any]]) -> None:
|
||||
nonlocal progress
|
||||
|
@ -141,11 +171,23 @@ def console_logger(
|
|||
)
|
||||
)
|
||||
if progress_bar:
|
||||
if progress_bar == "train":
|
||||
total = max_steps
|
||||
desc = f"Last Eval Epoch: {info['epoch']}"
|
||||
initial = info["step"]
|
||||
else:
|
||||
total = eval_frequency
|
||||
desc = f"Epoch {info['epoch']+1}"
|
||||
initial = 0
|
||||
# Set disable=None, so that it disables on non-TTY
|
||||
progress = tqdm.tqdm(
|
||||
total=eval_frequency, disable=None, leave=False, file=stderr
|
||||
total=total,
|
||||
disable=None,
|
||||
leave=False,
|
||||
file=stderr,
|
||||
initial=initial,
|
||||
)
|
||||
progress.set_description(f"Epoch {info['epoch']+1}")
|
||||
progress.set_description(desc)
|
||||
|
||||
def finalize() -> None:
|
||||
if output_stream:
|
||||
|
|
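A rough sketch of picking up the new logger through the registry; the parameter values are illustrative, and in a training config the same logger is selected via `[training.logger]` with `@loggers = "spacy.ConsoleLogger.v3"`:

```python
import spacy

# Resolve the registered factory and build a logger whose progress bar tracks
# the whole training run instead of the steps between two evaluations.
make_logger = spacy.registry.loggers.get("spacy.ConsoleLogger.v3")
logger = make_logger(progress_bar="train", console_output=True, output_file=None)
```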
|
@ -1643,7 +1643,9 @@ def _pipe(
|
|||
docs: Iterable["Doc"],
|
||||
proc: "PipeCallable",
|
||||
name: str,
|
||||
default_error_handler: Callable[[str, "PipeCallable", List["Doc"], Exception], NoReturn],
|
||||
default_error_handler: Callable[
|
||||
[str, "PipeCallable", List["Doc"], Exception], NoReturn
|
||||
],
|
||||
kwargs: Mapping[str, Any],
|
||||
) -> Iterator["Doc"]:
|
||||
if hasattr(proc, "pipe"):
|
||||
|
|
|
@@ -13,6 +13,7 @@ menu:
   - ['pretrain', 'pretrain']
   - ['evaluate', 'evaluate']
   - ['benchmark', 'benchmark']
+  - ['apply', 'apply']
   - ['find-threshold', 'find-threshold']
   - ['assemble', 'assemble']
   - ['package', 'package']

@@ -475,7 +476,7 @@ report span characteristics such as the average span length and the span (or
 span boundary) distinctiveness. The distinctiveness measure shows how different
 the tokens are with respect to the rest of the corpus using the KL-divergence of
 the token distributions. To learn more, you can check out Papay et al.'s work on
-[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
+[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).

 </Infobox>
@ -1195,6 +1196,37 @@ $ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuff
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Pipeline speed with 95% confidence interval. |

## apply {#apply new="3.5" tag="command"}

Applies a trained pipeline to data and stores the resulting annotated documents
in a `DocBin`. The input can be a single file or a directory. The recognized
input formats are:

1. `.spacy`
2. `.jsonl` containing a user specified `text_key`
3. Files with any other extension are assumed to be plain text files containing
   a single document.

When a directory is provided it is traversed recursively to collect all files.

```cli
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
```

| Name | Description |
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
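
For illustration, the resulting `DocBin` can be loaded back into `Doc` objects like this (a minimal sketch; the pipeline name and the `output.spacy` path are placeholders):

```python
import spacy
from spacy.tokens import DocBin

# Load the same (or a compatible) pipeline to provide the vocab.
nlp = spacy.load("en_core_web_sm")  # placeholder pipeline name

# Read the DocBin produced by `spacy apply` and restore the annotated Docs.
doc_bin = DocBin().from_disk("output.spacy")  # placeholder output path
for doc in doc_bin.get_docs(nlp.vocab):
    print(doc.text, [(ent.text, ent.label_) for ent in doc.ents])
```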

## find-threshold {#find-threshold new="3.5" tag="command"}

Runs prediction trials for a trained model with varying thresholds to maximize

@ -1220,7 +1252,6 @@ be provided.
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
> ```

| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
| `before_update` <Tag variant="new">3.5</Tag> | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
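
As an illustration of the `before_update` entry above, a minimal sketch of a callback with the documented signature; the registry name `"illustrate_before_update"` and the logging body are assumptions, and the callback would be referenced from a `[training.before_update]` block via `@callbacks`:

```python
from typing import Any, Dict

import spacy
from spacy.language import Language


@spacy.registry.callbacks("illustrate_before_update")
def make_before_update():
    # Returns the callback that is invoked at the start of every training step.
    def before_update(nlp: Language, args: Dict[str, Any]) -> None:
        # `args` carries the current step and epoch, as described above.
        if args["step"] % 100 == 0:
            print(f"epoch {args['epoch']}, step {args['step']}: {nlp.pipe_names}")

    return before_update
```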
@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ |
| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~ |
| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |
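
A quick sketch of the affix and flag attributes above, using a blank English vocab (the example word is arbitrary; the values follow from the default lengths `N=1` and `N=3`):

```python
import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["afternoon"]  # looking up a string returns its Lexeme

print(lex.prefix_)  # "a"   -- first character (N=1)
print(lex.suffix_)  # "oon" -- last three characters (N=3)
print(lex.is_alpha, lex.is_digit)  # True False
```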
@ -202,6 +202,23 @@ already present in the current span group.
| `other` | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
| **RETURNS** | The span group. ~~SpanGroup~~ |

## SpanGroup.\_\_iter\_\_ {#iter tag="method" new="3.5"}

Iterate over the spans in this span group.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> for error_span in doc.spans["errors"]:
>     print(error_span)
> ```

| Name | Description |
| ---------- | ----------------------------------- |
| **YIELDS** | A span in this span group. ~~Span~~ |

## SpanGroup.append {#append tag="method"}

Add a [`Span`](/api/span) object to the group. The span must refer to the same
@ -266,7 +266,7 @@ Render a dependency parse tree or named entity visualization.
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
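
For instance, an entity visualization rendered as an HTML fragment (a minimal sketch; `en_core_web_sm` is a placeholder for any installed pipeline with an entity recognizer):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # placeholder: any installed pipeline with NER
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# page=False (the default) returns an HTML fragment rather than a full page.
html = displacy.render(doc, style="ent", page=False, minify=True)
```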
@ -513,7 +513,7 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
Instead of using one of the built-in loggers, you can
[implement your own](/usage/training#custom-logging).

#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}
#### spacy.ConsoleLogger.v2 {tag="registered function"}

> #### Example config
>
@ -564,11 +564,33 @@ start decreasing across epochs.

</Accordion>

| Name | Description |
| ---------------- | --------------------------------------------------------------------- |
| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ |
| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
| Name | Description |
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| `progress_bar` | Whether the logger should print a progress bar tracking the steps till the next evaluation pass (default: `False`). ~~bool~~ |
| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |

#### spacy.ConsoleLogger.v3 {#ConsoleLogger tag="registered function"}

> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v3"
> progress_bar = "all_steps"
> console_output = true
> output_file = "training_log.jsonl"
> ```

Writes the results of a training step to the console in a tabular format and
optionally saves them to a `jsonl` file.

| Name | Description |
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `progress_bar` | Type of progress bar to show in the console: `"train"`, `"eval"` or `None`. |
| | The bar tracks the number of steps until `training.max_steps` and `training.eval_frequency` are reached respectively (default: `None`). ~~Optional[str]~~ |
| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |

## Readers {#readers}
@ -1004,6 +1026,54 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`.
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
| **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ |

### training.biluo_to_iob {#biluo_to_iob tag="function"}

Convert a sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags to
[IOB](/usage/linguistic-features#accessing-ner) tags. This is useful if you want
to use the BILUO tags with a model that only supports IOB tags.

> #### Example
>
> ```python
> from spacy.training import biluo_to_iob
>
> tags = ["O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
> iob_tags = biluo_to_iob(tags)
> assert iob_tags == ["O", "O", "B-LOC", "I-LOC", "I-LOC", "O"]
> ```

| Name | Description |
| ----------- | --------------------------------------------------------------------------------------- |
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
| **RETURNS** | A list of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |

### training.iob_to_biluo {#iob_to_biluo tag="function"}

Convert a sequence of [IOB](/usage/linguistic-features#accessing-ner) tags to
[BILUO](/usage/linguistic-features#accessing-ner) tags. This is useful if you
want to use the IOB tags with a model that only supports BILUO tags.

<Infobox title="Changed in v3.0" variant="warning" id="iob_to_biluo">

This method was previously available as `spacy.gold.iob_to_biluo`.

</Infobox>

> #### Example
>
> ```python
> from spacy.training import iob_to_biluo
>
> tags = ["O", "O", "B-LOC", "I-LOC", "O"]
> biluo_tags = iob_to_biluo(tags)
> assert biluo_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
> ```

| Name | Description |
| ----------- | ------------------------------------------------------------------------------------- |
| `tags` | A sequence of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
| **RETURNS** | A list of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |

## Utility functions {#util source="spacy/util.py"}

spaCy comes with a small collection of utility functions located in
@ -308,14 +308,14 @@ Load state from a binary string.
> assert type(PERSON) == int
> ```

| Name | Description |
| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
| Name | Description |
| ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |

## Serialization fields {#serialization-fields}
@ -66,8 +66,8 @@ The English CNN pipelines have new word vectors:
| Package | Model Version | TAG | Parser LAS | NER F |
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
| [`en_core_web_md`](/models/en#en_core_web_lg) | v3.4.0 | 97.2 | 90.3 | 85.5 |
| [`en_core_web_lg`](/models/en#en_core_web_md) | v3.3.0 | 97.4 | 90.1 | 85.3 |
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0 | 97.2 | 90.3 | 85.5 |
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0 | 97.4 | 90.1 | 85.3 |
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |

## Notes about upgrading from v3.3 {#upgrading}
@ -45,7 +45,7 @@
{ "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
{
"text": "Custom Solutions",
"url": "https://explosion.ai/spacy-tailored-pipelines"
"url": "https://explosion.ai/custom-solutions"
}
]
}

@ -51,7 +51,7 @@
{ "text": "Online Course", "url": "https://course.spacy.io" },
{
"text": "Custom Solutions",
"url": "https://explosion.ai/spacy-tailored-pipelines"
"url": "https://explosion.ai/custom-solutions"
}
]
},
@ -1023,25 +1023,6 @@
},
"category": ["pipeline"]
},
{
"id": "spacy-sentence-segmenter",
"title": "Sentence Segmenter",
"slogan": "Custom sentence segmentation for spaCy",
"code_example": [
"from seg.newline.segmenter import NewLineSegmenter",
"import spacy",
"",
"nlseg = NewLineSegmenter()",
"nlp = spacy.load('en')",
"nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
"doc = nlp(my_doc_text)"
],
"author": "tc64",
"author_links": {
"github": "tc64"
},
"category": ["pipeline"]
},
{
"id": "spacy_cld",
"title": "spaCy-CLD",
@ -1468,13 +1449,26 @@
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
"code_example": [
"import spacy",
"import scattertext as st",
"",
"nlp = spacy.load('en')",
"corpus = st.CorpusFromPandas(convention_df,",
" category_col='party',",
" text_col='text',",
" nlp=nlp).build()"
"from scattertext import SampleCorpora, produce_scattertext_explorer",
"from scattertext import produce_scattertext_html",
"from scattertext.CorpusFromPandas import CorpusFromPandas",
"",
"nlp = spacy.load('en_core_web_sm')",
"convention_df = SampleCorpora.ConventionData2012.get_data()",
"corpus = CorpusFromPandas(convention_df,",
" category_col='party',",
" text_col='text',",
" nlp=nlp).build()",
"",
"html = produce_scattertext_html(corpus,",
" category='democrat',",
" category_name='Democratic',",
" not_category_name='Republican',",
" minimum_term_frequency=5,",
" width_in_pixels=1000)",
"open('./simple.html', 'wb').write(html.encode('utf-8'))",
"print('Open ./simple.html in Chrome or Firefox.')"
],
"author": "Jason Kessler",
"author_links": {
@ -4068,6 +4062,33 @@
"author_links": {
"github": "yasufumy"
}
},
{
"id": "spacy-pythainlp",
"title": "spaCy-PyThaiNLP",
"slogan": "PyThaiNLP for spaCy",
"description": "This package wraps the PyThaiNLP library to add support for Thai to spaCy.",
"github": "PyThaiNLP/spaCy-PyThaiNLP",
"code_example": [
"import spacy",
"import spacy_pythainlp.core",
"",
"nlp = spacy.blank('th')",
"nlp.add_pipe('pythainlp')",
"doc = nlp('ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน ผมอยากไปเที่ยว')",
"",
"print(list(doc.sents))",
"# output: [ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน , ผมอยากไปเที่ยว]"
],
"code_language": "python",
"author": "Wannaphong Phatthiyaphaibun",
"author_links": {
"twitter": "@wannaphong_p",
"github": "wannaphong",
"website": "https://iam.wannaphong.com/"
},
"category": ["pipeline", "research"],
"tags": ["Thai"]
}
],
@ -105,13 +105,13 @@ const Landing = ({ data }) => {

<LandingBannerGrid>
<LandingBanner
to="https://explosion.ai/spacy-tailored-pipelines"
to="https://explosion.ai/custom-solutions"
button="Learn more"
background="#E4F4F9"
color="#1e1935"
small
>
<Link to="https://explosion.ai/spacy-tailored-pipelines" hidden>
<Link to="https://explosion.ai/custom-solutions" hidden>
<img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
</Link>
<strong>