mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 08:14:15 +03:00
Merge branch 'develop' into pr/6444
This commit is contained in:
commit
05a2812ae0
|
@ -36,3 +36,44 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
|
scikit-learn
|
||||||
|
------------
|
||||||
|
|
||||||
|
* Files: scorer.py
|
||||||
|
|
||||||
|
The following implementation of roc_auc_score() is adapted from
|
||||||
|
scikit-learn, which is distributed under the following license:
|
||||||
|
|
||||||
|
New BSD License
|
||||||
|
|
||||||
|
Copyright (c) 2007–2019 The scikit-learn developers.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
a. Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
b. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
c. Neither the name of the Scikit-learn Developers nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written
|
||||||
|
permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
||||||
|
DAMAGE.
|
||||||
|
|
|
@ -5,7 +5,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0rc0,<8.1.0",
|
"thinc>=8.0.0rc2,<8.1.0",
|
||||||
"blis>=0.4.0,<0.8.0",
|
"blis>=0.4.0,<0.8.0",
|
||||||
"pathy",
|
"pathy",
|
||||||
"numpy==1.15.0; python_version<='3.7'",
|
"numpy==1.15.0; python_version<='3.7'",
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0rc0,<8.1.0
|
thinc>=8.0.0rc2,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
ml_datasets==0.2.0a0
|
ml_datasets==0.2.0a0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
@ -15,6 +15,7 @@ numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.5.0,<1.7.0
|
pydantic>=1.5.0,<1.7.0
|
||||||
|
jinja2
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging>=20.0
|
packaging>=20.0
|
||||||
|
@ -26,4 +27,3 @@ pytest>=4.6.5
|
||||||
pytest-timeout>=1.3.0,<2.0.0
|
pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.5.0,<3.6.0
|
flake8>=3.5.0,<3.6.0
|
||||||
jinja2
|
|
||||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.0rc0,<8.1.0
|
thinc>=8.0.0rc2,<8.1.0
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0rc0,<8.1.0
|
thinc>=8.0.0rc2,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
wasabi>=0.8.0,<1.1.0
|
wasabi>=0.8.0,<1.1.0
|
||||||
srsly>=2.3.0,<3.0.0
|
srsly>=2.3.0,<3.0.0
|
||||||
|
|
|
@ -7,7 +7,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa
|
||||||
warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa
|
warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa
|
||||||
|
|
||||||
# These are imported as part of the API
|
# These are imported as part of the API
|
||||||
from thinc.api import prefer_gpu, require_gpu # noqa: F401
|
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
|
|
||||||
from . import pipeline # noqa: F401
|
from . import pipeline # noqa: F401
|
||||||
|
|
|
@ -272,7 +272,11 @@ def show_validation_error(
|
||||||
msg.fail(title)
|
msg.fail(title)
|
||||||
print(err.text.strip())
|
print(err.text.strip())
|
||||||
if hint_fill and "value_error.missing" in err.error_types:
|
if hint_fill and "value_error.missing" in err.error_types:
|
||||||
config_path = file_path if file_path is not None else "config.cfg"
|
config_path = (
|
||||||
|
file_path
|
||||||
|
if file_path is not None and str(file_path) != "-"
|
||||||
|
else "config.cfg"
|
||||||
|
)
|
||||||
msg.text(
|
msg.text(
|
||||||
"If your config contains missing values, you can run the 'init "
|
"If your config contains missing values, you can run the 'init "
|
||||||
"fill-config' command to fill in all the defaults, if possible:",
|
"fill-config' command to fill in all the defaults, if possible:",
|
||||||
|
|
|
@ -19,7 +19,7 @@ from .. import util
|
||||||
def debug_config_cli(
|
def debug_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
|
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
|
||||||
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
|
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
|
||||||
|
|
|
@ -37,7 +37,7 @@ BLANK_MODEL_THRESHOLD = 2000
|
||||||
def debug_data_cli(
|
def debug_data_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
|
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
|
||||||
|
|
|
@ -22,7 +22,7 @@ from .. import util
|
||||||
def debug_model_cli(
|
def debug_model_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
|
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
|
||||||
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
|
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
|
||||||
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
|
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
|
||||||
|
|
|
@ -35,7 +35,7 @@ def download_cli(
|
||||||
|
|
||||||
|
|
||||||
def download(model: str, direct: bool = False, *pip_args) -> None:
|
def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
if not is_package("spacy") and "--no-deps" not in pip_args:
|
if not (is_package("spacy") or is_package("spacy-nightly")) and "--no-deps" not in pip_args:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Skipping pipeline package dependencies and setting `--no-deps`. "
|
"Skipping pipeline package dependencies and setting `--no-deps`. "
|
||||||
"You don't seem to have the spaCy package itself installed "
|
"You don't seem to have the spaCy package itself installed "
|
||||||
|
|
|
@ -5,6 +5,7 @@ from wasabi import Printer, diff_strings
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
import srsly
|
import srsly
|
||||||
import re
|
import re
|
||||||
|
from jinja2 import Template
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
||||||
|
@ -127,10 +128,6 @@ def init_config(
|
||||||
) -> None:
|
) -> None:
|
||||||
is_stdout = str(output_file) == "-"
|
is_stdout = str(output_file) == "-"
|
||||||
msg = Printer(no_print=is_stdout)
|
msg = Printer(no_print=is_stdout)
|
||||||
try:
|
|
||||||
from jinja2 import Template
|
|
||||||
except ImportError:
|
|
||||||
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
|
|
||||||
with TEMPLATE_PATH.open("r") as f:
|
with TEMPLATE_PATH.open("r") as f:
|
||||||
template = Template(f.read())
|
template = Template(f.read())
|
||||||
# Filter out duplicates since tok2vec and transformer are added by template
|
# Filter out duplicates since tok2vec and transformer are added by template
|
||||||
|
|
|
@ -62,7 +62,7 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
|
||||||
def init_pipeline_cli(
|
def init_pipeline_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
output_path: Path = Arg(..., help="Output directory for the prepared data"),
|
output_path: Path = Arg(..., help="Output directory for the prepared data"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
|
@ -88,7 +88,7 @@ def init_pipeline_cli(
|
||||||
def init_labels_cli(
|
def init_labels_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
output_path: Path = Arg(..., help="Output directory for the labels"),
|
output_path: Path = Arg(..., help="Output directory for the labels"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
|
|
|
@ -103,6 +103,9 @@ def package(
|
||||||
)
|
)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
shutil.copytree(str(input_dir), str(package_path / model_name_v))
|
shutil.copytree(str(input_dir), str(package_path / model_name_v))
|
||||||
|
license_path = package_path / model_name_v / "LICENSE"
|
||||||
|
if license_path.exists():
|
||||||
|
shutil.move(str(license_path), str(main_path))
|
||||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
|
@ -238,7 +241,7 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
TEMPLATE_MANIFEST = """
|
TEMPLATE_MANIFEST = """
|
||||||
include meta.json
|
include meta.json
|
||||||
include config.cfg
|
include LICENSE
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,7 @@ from ..util import load_config
|
||||||
def pretrain_cli(
|
def pretrain_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
|
||||||
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||||
|
@ -79,7 +79,7 @@ def pretrain_cli(
|
||||||
|
|
||||||
|
|
||||||
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||||
if resume_path:
|
if resume_path:
|
||||||
|
|
|
@ -18,7 +18,7 @@ from .. import util
|
||||||
def train_cli(
|
def train_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
|
@ -41,7 +41,7 @@ def train_cli(
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
# Make sure all files and paths exists if they are needed
|
# Make sure all files and paths exists if they are needed
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
if output_path is not None and not output_path.exists():
|
if output_path is not None and not output_path.exists():
|
||||||
output_path.mkdir(parents=True)
|
output_path.mkdir(parents=True)
|
||||||
|
|
|
@ -17,7 +17,9 @@ tolerance = 0.2
|
||||||
get_length = null
|
get_length = null
|
||||||
|
|
||||||
[pretraining.objective]
|
[pretraining.objective]
|
||||||
type = "characters"
|
@architectures = "spacy.PretrainCharacters.v1"
|
||||||
|
maxout_pieces = 3
|
||||||
|
hidden_size = 300
|
||||||
n_characters = 4
|
n_characters = 4
|
||||||
|
|
||||||
[pretraining.optimizer]
|
[pretraining.optimizer]
|
||||||
|
|
|
@ -125,8 +125,9 @@ class Warnings:
|
||||||
class Errors:
|
class Errors:
|
||||||
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
||||||
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
|
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
|
||||||
"This usually happens when spaCy calls `nlp.{method}` with custom "
|
"This usually happens when spaCy calls `nlp.{method}` with a custom "
|
||||||
"component name that's not registered on the current language class. "
|
"component name that's not registered on the current language class. "
|
||||||
|
"If you're using a Transformer, make sure to install 'spacy-transformers'. "
|
||||||
"If you're using a custom component, make sure you've added the "
|
"If you're using a custom component, make sure you've added the "
|
||||||
"decorator `@Language.component` (for function components) or "
|
"decorator `@Language.component` (for function components) or "
|
||||||
"`@Language.factory` (for class components).\n\nAvailable "
|
"`@Language.factory` (for class components).\n\nAvailable "
|
||||||
|
@ -456,6 +457,9 @@ class Errors:
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E896 = ("There was an error using the static vectors. Ensure that the vectors "
|
||||||
|
"of the vocab are properly initialized, or set 'include_static_vectors' "
|
||||||
|
"to False.")
|
||||||
E897 = ("Field '{field}' should be a dot-notation string referring to the "
|
E897 = ("Field '{field}' should be a dot-notation string referring to the "
|
||||||
"relevant section in the config, but found type {type} instead.")
|
"relevant section in the config, but found type {type} instead.")
|
||||||
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
|
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
|
||||||
|
@ -483,8 +487,8 @@ class Errors:
|
||||||
"has been applied.")
|
"has been applied.")
|
||||||
E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
|
E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
|
||||||
"dimension refers to the width of the vectors table.")
|
"dimension refers to the width of the vectors table.")
|
||||||
E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
|
E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
|
||||||
E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
|
"are: {supported}")
|
||||||
E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
|
E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
|
||||||
E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
|
E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
|
||||||
E910 = ("Encountered NaN value when computing loss for component '{name}'.")
|
E910 = ("Encountered NaN value when computing loss for component '{name}'.")
|
||||||
|
@ -712,6 +716,10 @@ class Errors:
|
||||||
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
|
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
|
||||||
"token itself. To set the morph from this MorphAnalysis, set from "
|
"token itself. To set the morph from this MorphAnalysis, set from "
|
||||||
"the string value with: `token.set_morph(str(other_morph))`.")
|
"the string value with: `token.set_morph(str(other_morph))`.")
|
||||||
|
E1014 = ("Error loading DocBin data. It doesn't look like the data is in "
|
||||||
|
"DocBin (.spacy) format. If your data is in spaCy v2's JSON "
|
||||||
|
"training format, convert it using `python -m spacy convert "
|
||||||
|
"file.json .`.")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
|
@ -23,10 +23,7 @@ def forward(model: Model, docs, is_train: bool):
|
||||||
keys, vals = model.ops.xp.unique(keys, return_counts=True)
|
keys, vals = model.ops.xp.unique(keys, return_counts=True)
|
||||||
batch_keys.append(keys)
|
batch_keys.append(keys)
|
||||||
batch_vals.append(vals)
|
batch_vals.append(vals)
|
||||||
# The dtype here matches what thinc is expecting -- which differs per
|
lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype="int32")
|
||||||
# platform (by int definition). This should be fixed once the problem
|
|
||||||
# is fixed on Thinc's side.
|
|
||||||
lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
|
|
||||||
batch_keys = model.ops.xp.concatenate(batch_keys)
|
batch_keys = model.ops.xp.concatenate(batch_keys)
|
||||||
batch_vals = model.ops.asarray(model.ops.xp.concatenate(batch_vals), dtype="f")
|
batch_vals = model.ops.asarray(model.ops.xp.concatenate(batch_vals), dtype="f")
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from .entity_linker import * # noqa
|
from .entity_linker import * # noqa
|
||||||
|
from .multi_task import * # noqa
|
||||||
from .parser import * # noqa
|
from .parser import * # noqa
|
||||||
from .tagger import * # noqa
|
from .tagger import * # noqa
|
||||||
from .textcat import * # noqa
|
from .textcat import * # noqa
|
||||||
|
|
|
@ -1,7 +1,14 @@
|
||||||
from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
|
from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
|
||||||
import numpy
|
|
||||||
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
||||||
from thinc.api import MultiSoftmax, list2array
|
from thinc.api import MultiSoftmax, list2array
|
||||||
|
from thinc.api import to_categorical, CosineDistance, L2Distance
|
||||||
|
|
||||||
|
from ...util import registry
|
||||||
|
from ...errors import Errors
|
||||||
|
from ...attrs import ID
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
# This lets us add type hints for mypy etc. without causing circular imports
|
# This lets us add type hints for mypy etc. without causing circular imports
|
||||||
|
@ -9,6 +16,74 @@ if TYPE_CHECKING:
|
||||||
from ...tokens import Doc # noqa: F401
|
from ...tokens import Doc # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.PretrainVectors.v1")
|
||||||
|
def create_pretrain_vectors(
|
||||||
|
maxout_pieces: int, hidden_size: int, loss: str
|
||||||
|
) -> Callable[["Vocab", Model], Model]:
|
||||||
|
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
|
||||||
|
model = build_cloze_multi_task_model(
|
||||||
|
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
|
||||||
|
)
|
||||||
|
model.attrs["loss"] = create_vectors_loss()
|
||||||
|
return model
|
||||||
|
|
||||||
|
def create_vectors_loss() -> Callable:
|
||||||
|
if loss == "cosine":
|
||||||
|
distance = CosineDistance(normalize=True, ignore_zeros=True)
|
||||||
|
return partial(get_vectors_loss, distance=distance)
|
||||||
|
elif loss == "L2":
|
||||||
|
distance = L2Distance(normalize=True)
|
||||||
|
return partial(get_vectors_loss, distance=distance)
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))
|
||||||
|
|
||||||
|
return create_vectors_objective
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.PretrainCharacters.v1")
|
||||||
|
def create_pretrain_characters(
|
||||||
|
maxout_pieces: int, hidden_size: int, n_characters: int
|
||||||
|
) -> Callable[["Vocab", Model], Model]:
|
||||||
|
def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
|
||||||
|
model = build_cloze_characters_multi_task_model(
|
||||||
|
vocab,
|
||||||
|
tok2vec,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
maxout_pieces=maxout_pieces,
|
||||||
|
nr_char=n_characters,
|
||||||
|
)
|
||||||
|
model.attrs["loss"] = partial(get_characters_loss, nr_char=n_characters)
|
||||||
|
return model
|
||||||
|
|
||||||
|
return create_characters_objective
|
||||||
|
|
||||||
|
|
||||||
|
def get_vectors_loss(ops, docs, prediction, distance):
|
||||||
|
"""Compute a loss based on a distance between the documents' vectors and
|
||||||
|
the prediction.
|
||||||
|
"""
|
||||||
|
# The simplest way to implement this would be to vstack the
|
||||||
|
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||||
|
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||||
|
# and look them up all at once. This prevents data copying.
|
||||||
|
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||||
|
target = docs[0].vocab.vectors.data[ids]
|
||||||
|
d_target, loss = distance(prediction, target)
|
||||||
|
return loss, d_target
|
||||||
|
|
||||||
|
|
||||||
|
def get_characters_loss(ops, docs, prediction, nr_char):
|
||||||
|
"""Compute a loss based on a number of characters predicted from the docs."""
|
||||||
|
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
|
||||||
|
target_ids = target_ids.reshape((-1,))
|
||||||
|
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
|
||||||
|
target = target.reshape((-1, 256 * nr_char))
|
||||||
|
diff = prediction - target
|
||||||
|
loss = (diff ** 2).sum()
|
||||||
|
d_target = diff / float(prediction.shape[0])
|
||||||
|
return loss, d_target
|
||||||
|
|
||||||
|
|
||||||
def build_multi_task_model(
|
def build_multi_task_model(
|
||||||
tok2vec: Model,
|
tok2vec: Model,
|
||||||
maxout_pieces: int,
|
maxout_pieces: int,
|
||||||
|
@ -33,23 +108,19 @@ def build_multi_task_model(
|
||||||
|
|
||||||
|
|
||||||
def build_cloze_multi_task_model(
|
def build_cloze_multi_task_model(
|
||||||
vocab: "Vocab",
|
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
|
||||||
tok2vec: Model,
|
|
||||||
maxout_pieces: int,
|
|
||||||
hidden_size: int,
|
|
||||||
nO: Optional[int] = None,
|
|
||||||
) -> Model:
|
) -> Model:
|
||||||
# nO = vocab.vectors.data.shape[1]
|
nO = vocab.vectors.data.shape[1]
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
list2array(),
|
list2array(),
|
||||||
Maxout(
|
Maxout(
|
||||||
nO=nO,
|
nO=hidden_size,
|
||||||
nI=tok2vec.get_dim("nO"),
|
nI=tok2vec.get_dim("nO"),
|
||||||
nP=maxout_pieces,
|
nP=maxout_pieces,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
),
|
),
|
||||||
Linear(nO=nO, nI=nO, init_W=zero_init),
|
Linear(nO=nO, nI=hidden_size, init_W=zero_init),
|
||||||
)
|
)
|
||||||
model = chain(tok2vec, output_layer)
|
model = chain(tok2vec, output_layer)
|
||||||
model = build_masked_language_model(vocab, model)
|
model = build_masked_language_model(vocab, model)
|
||||||
|
|
|
@ -42,9 +42,13 @@ def forward(
|
||||||
rows = model.ops.flatten(
|
rows = model.ops.flatten(
|
||||||
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
|
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
|
||||||
|
except ValueError:
|
||||||
|
raise RuntimeError(Errors.E896)
|
||||||
output = Ragged(
|
output = Ragged(
|
||||||
model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True),
|
vectors_data,
|
||||||
model.ops.asarray([len(doc) for doc in docs], dtype="i"),
|
model.ops.asarray([len(doc) for doc in docs], dtype="i")
|
||||||
)
|
)
|
||||||
mask = None
|
mask = None
|
||||||
if is_train:
|
if is_train:
|
||||||
|
|
|
@ -67,9 +67,6 @@ class Morphologizer(Tagger):
|
||||||
vocab: Vocab,
|
vocab: Vocab,
|
||||||
model: Model,
|
model: Model,
|
||||||
name: str = "morphologizer",
|
name: str = "morphologizer",
|
||||||
*,
|
|
||||||
labels_morph: Optional[dict] = None,
|
|
||||||
labels_pos: Optional[dict] = None,
|
|
||||||
):
|
):
|
||||||
"""Initialize a morphologizer.
|
"""Initialize a morphologizer.
|
||||||
|
|
||||||
|
@ -77,8 +74,6 @@ class Morphologizer(Tagger):
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
labels_morph (dict): Mapping of morph + POS tags to morph labels.
|
|
||||||
labels_pos (dict): Mapping of morph + POS tags to POS tags.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/morphologizer#init
|
DOCS: https://nightly.spacy.io/api/morphologizer#init
|
||||||
"""
|
"""
|
||||||
|
@ -90,7 +85,7 @@ class Morphologizer(Tagger):
|
||||||
# store mappings from morph+POS labels to token-level annotations:
|
# store mappings from morph+POS labels to token-level annotations:
|
||||||
# 1) labels_morph stores a mapping from morph+POS->morph
|
# 1) labels_morph stores a mapping from morph+POS->morph
|
||||||
# 2) labels_pos stores a mapping from morph+POS->POS
|
# 2) labels_pos stores a mapping from morph+POS->POS
|
||||||
cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
|
cfg = {"labels_morph": {}, "labels_pos": {}}
|
||||||
self.cfg = dict(sorted(cfg.items()))
|
self.cfg = dict(sorted(cfg.items()))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -47,7 +47,7 @@ class MultitaskObjective(Tagger):
|
||||||
side-objective.
|
side-objective.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, model, name="nn_labeller", *, labels, target):
|
def __init__(self, vocab, model, name="nn_labeller", *, target):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -67,7 +67,7 @@ class MultitaskObjective(Tagger):
|
||||||
self.make_label = target
|
self.make_label = target
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E016)
|
raise ValueError(Errors.E016)
|
||||||
cfg = {"labels": labels or {}, "target": target}
|
cfg = {"labels": {}, "target": target}
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -81,10 +81,13 @@ class MultitaskObjective(Tagger):
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def initialize(self, get_examples, nlp=None):
|
def initialize(self, get_examples, nlp=None, labels=None):
|
||||||
if not hasattr(get_examples, "__call__"):
|
if not hasattr(get_examples, "__call__"):
|
||||||
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
||||||
raise ValueError(err)
|
raise ValueError(err)
|
||||||
|
if labels is not None:
|
||||||
|
self.labels = labels
|
||||||
|
else:
|
||||||
for example in get_examples():
|
for example in get_examples():
|
||||||
for token in example.y:
|
for token in example.y:
|
||||||
label = self.make_label(token)
|
label = self.make_label(token)
|
||||||
|
|
|
@ -61,14 +61,13 @@ class Tagger(TrainablePipe):
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/tagger
|
DOCS: https://nightly.spacy.io/api/tagger
|
||||||
"""
|
"""
|
||||||
def __init__(self, vocab, model, name="tagger", *, labels=None):
|
def __init__(self, vocab, model, name="tagger"):
|
||||||
"""Initialize a part-of-speech tagger.
|
"""Initialize a part-of-speech tagger.
|
||||||
|
|
||||||
vocab (Vocab): The shared vocabulary.
|
vocab (Vocab): The shared vocabulary.
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
labels (List): The set of labels. Defaults to None.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/tagger#init
|
DOCS: https://nightly.spacy.io/api/tagger#init
|
||||||
"""
|
"""
|
||||||
|
@ -76,7 +75,7 @@ class Tagger(TrainablePipe):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
cfg = {"labels": labels or []}
|
cfg = {"labels": []}
|
||||||
self.cfg = dict(sorted(cfg.items()))
|
self.cfg = dict(sorted(cfg.items()))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
|
||||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||||
component: str = Field(..., title="Component to find the layer to pretrain")
|
component: str = Field(..., title="Component to find the layer to pretrain")
|
||||||
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
|
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
|
||||||
|
objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
|
||||||
# TODO: use a more detailed schema for this?
|
|
||||||
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
|
|
|
@ -720,44 +720,10 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#############################################################################
|
|
||||||
#
|
|
||||||
# The following implementation of roc_auc_score() is adapted from
|
# The following implementation of roc_auc_score() is adapted from
|
||||||
# scikit-learn, which is distributed under the following license:
|
# scikit-learn, which is distributed under the New BSD License.
|
||||||
#
|
|
||||||
# New BSD License
|
|
||||||
#
|
|
||||||
# Copyright (c) 2007–2019 The scikit-learn developers.
|
# Copyright (c) 2007–2019 The scikit-learn developers.
|
||||||
# All rights reserved.
|
# See licenses/3rd_party_licenses.txt
|
||||||
#
|
|
||||||
#
|
|
||||||
# Redistribution and use in source and binary forms, with or without
|
|
||||||
# modification, are permitted provided that the following conditions are met:
|
|
||||||
#
|
|
||||||
# a. Redistributions of source code must retain the above copyright notice,
|
|
||||||
# this list of conditions and the following disclaimer.
|
|
||||||
# b. Redistributions in binary form must reproduce the above copyright
|
|
||||||
# notice, this list of conditions and the following disclaimer in the
|
|
||||||
# documentation and/or other materials provided with the distribution.
|
|
||||||
# c. Neither the name of the Scikit-learn Developers nor the names of
|
|
||||||
# its contributors may be used to endorse or promote products
|
|
||||||
# derived from this software without specific prior written
|
|
||||||
# permission.
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
|
|
||||||
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
||||||
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
||||||
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
|
||||||
# DAMAGE.
|
|
||||||
|
|
||||||
|
|
||||||
def _roc_auc_score(y_true, y_score):
|
def _roc_auc_score(y_true, y_score):
|
||||||
"""Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
|
"""Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
|
||||||
from prediction scores.
|
from prediction scores.
|
||||||
|
|
|
@ -135,7 +135,7 @@ def test_initialize_examples():
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
|
||||||
fix_random_seed(0)
|
fix_random_seed(0)
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
||||||
|
@ -177,11 +177,58 @@ def test_overfitting_IO():
|
||||||
|
|
||||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||||
texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
|
texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
|
||||||
batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
|
batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
|
||||||
batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
|
batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
|
||||||
no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
|
no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
|
||||||
assert_equal(batch_deps_1, batch_deps_2)
|
assert_equal(batch_cats_1, batch_cats_2)
|
||||||
assert_equal(batch_deps_1, no_batch_deps)
|
assert_equal(batch_cats_1, no_batch_cats)
|
||||||
|
|
||||||
|
|
||||||
|
def test_overfitting_IO_multi():
|
||||||
|
# Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
|
||||||
|
fix_random_seed(0)
|
||||||
|
nlp = English()
|
||||||
|
# Set exclusive labels to False
|
||||||
|
config = {"model": {"linear_model": {"exclusive_classes": False}}}
|
||||||
|
textcat = nlp.add_pipe("textcat", config=config)
|
||||||
|
train_examples = []
|
||||||
|
for text, annotations in TRAIN_DATA:
|
||||||
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
assert textcat.model.get_dim("nO") == 2
|
||||||
|
|
||||||
|
for i in range(50):
|
||||||
|
losses = {}
|
||||||
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
|
assert losses["textcat"] < 0.01
|
||||||
|
|
||||||
|
# test the trained model
|
||||||
|
test_text = "I am happy."
|
||||||
|
doc = nlp(test_text)
|
||||||
|
cats = doc.cats
|
||||||
|
assert cats["POSITIVE"] > 0.9
|
||||||
|
|
||||||
|
# Also test the results are still the same after IO
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
nlp.to_disk(tmp_dir)
|
||||||
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
|
doc2 = nlp2(test_text)
|
||||||
|
cats2 = doc2.cats
|
||||||
|
assert cats2["POSITIVE"] > 0.9
|
||||||
|
|
||||||
|
# Test scoring
|
||||||
|
scores = nlp.evaluate(train_examples)
|
||||||
|
assert scores["cats_micro_f"] == 1.0
|
||||||
|
assert scores["cats_score"] == 1.0
|
||||||
|
assert "cats_score_desc" in scores
|
||||||
|
|
||||||
|
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||||
|
texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
|
||||||
|
batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
|
||||||
|
batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
|
||||||
|
no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
|
||||||
|
assert_equal(batch_cats_1, batch_cats_2)
|
||||||
|
assert_equal(batch_cats_1, no_batch_cats)
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
|
|
@ -1,35 +1,38 @@
|
||||||
from thinc.api import fix_random_seed
|
import pytest
|
||||||
|
from thinc.api import Config, fix_random_seed
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
from spacy.pipeline.textcat import default_model_config, bow_model_config
|
||||||
|
from spacy.pipeline.textcat import cnn_model_config
|
||||||
from spacy.tokens import Span
|
from spacy.tokens import Span
|
||||||
from spacy import displacy
|
from spacy import displacy
|
||||||
from spacy.pipeline import merge_entities
|
from spacy.pipeline import merge_entities
|
||||||
|
from spacy.training import Example
|
||||||
|
|
||||||
|
|
||||||
def test_issue5551():
|
@pytest.mark.parametrize(
|
||||||
|
"textcat_config", [default_model_config, bow_model_config, cnn_model_config]
|
||||||
|
)
|
||||||
|
def test_issue5551(textcat_config):
|
||||||
"""Test that after fixing the random seed, the results of the pipeline are truly identical"""
|
"""Test that after fixing the random seed, the results of the pipeline are truly identical"""
|
||||||
component = "textcat"
|
component = "textcat"
|
||||||
pipe_cfg = {
|
|
||||||
"model": {
|
pipe_cfg = Config().from_str(textcat_config)
|
||||||
"@architectures": "spacy.TextCatBOW.v1",
|
|
||||||
"exclusive_classes": True,
|
|
||||||
"ngram_size": 2,
|
|
||||||
"no_output_layer": False,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
results = []
|
results = []
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
fix_random_seed(0)
|
fix_random_seed(0)
|
||||||
nlp = English()
|
nlp = English()
|
||||||
example = (
|
text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
|
||||||
"Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
|
annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}
|
||||||
{"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
|
|
||||||
)
|
|
||||||
pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
|
pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
|
||||||
for label in set(example[1]["cats"]):
|
for label in set(annots["cats"]):
|
||||||
pipe.add_label(label)
|
pipe.add_label(label)
|
||||||
|
# Train
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
|
doc = nlp.make_doc(text)
|
||||||
|
nlp.update([Example.from_dict(doc, annots)])
|
||||||
# Store the result of each iteration
|
# Store the result of each iteration
|
||||||
result = pipe.model.predict([nlp.make_doc(example[0])])
|
result = pipe.model.predict([doc])
|
||||||
results.append(list(result[0]))
|
results.append(list(result[0]))
|
||||||
# All results should be the same because of the fixed seed
|
# All results should be the same because of the fixed seed
|
||||||
assert len(results) == 3
|
assert len(results) == 3
|
||||||
|
|
|
@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.language import Language, DEFAULT_CONFIG
|
from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
|
||||||
from spacy.util import registry, load_model_from_config
|
from spacy.util import registry, load_model_from_config, load_config
|
||||||
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
|
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
|
||||||
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
|
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
|
||||||
from spacy.schemas import ConfigSchema
|
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
|
||||||
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
nlp_config_string = """
|
nlp_config_string = """
|
||||||
[paths]
|
[paths]
|
||||||
train = null
|
train = null
|
||||||
|
@ -63,6 +63,59 @@ factory = "tagger"
|
||||||
width = ${components.tok2vec.model.width}
|
width = ${components.tok2vec.model.width}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pretrain_config_string = """
|
||||||
|
[paths]
|
||||||
|
train = null
|
||||||
|
dev = null
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
|
||||||
|
[training]
|
||||||
|
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
size = 666
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tok2vec", "tagger"]
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 342
|
||||||
|
depth = 4
|
||||||
|
window_size = 1
|
||||||
|
embed_size = 2000
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
||||||
|
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.width}
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
parser_config_string = """
|
parser_config_string = """
|
||||||
[model]
|
[model]
|
||||||
|
@ -126,6 +179,14 @@ def test_create_nlp_from_config():
|
||||||
load_model_from_config(Config(bad_cfg), auto_fill=True)
|
load_model_from_config(Config(bad_cfg), auto_fill=True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_nlp_from_pretraining_config():
|
||||||
|
"""Test that the default pretraining config validates properly"""
|
||||||
|
config = Config().from_str(pretrain_config_string)
|
||||||
|
pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
|
||||||
|
filled = config.merge(pretrain_config)
|
||||||
|
resolved = registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
|
||||||
|
|
||||||
|
|
||||||
def test_create_nlp_from_config_multiple_instances():
|
def test_create_nlp_from_config_multiple_instances():
|
||||||
"""Test that the nlp object is created correctly for a config with multiple
|
"""Test that the nlp object is created correctly for a config with multiple
|
||||||
instances of the same component."""
|
instances of the same component."""
|
||||||
|
|
|
@ -4,7 +4,7 @@ import ctypes
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from spacy.about import __version__ as spacy_version
|
from spacy.about import __version__ as spacy_version
|
||||||
from spacy import util
|
from spacy import util
|
||||||
from spacy import prefer_gpu, require_gpu
|
from spacy import prefer_gpu, require_gpu, require_cpu
|
||||||
from spacy.ml._precomputable_affine import PrecomputableAffine
|
from spacy.ml._precomputable_affine import PrecomputableAffine
|
||||||
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
||||||
from spacy.util import dot_to_object, SimpleFrozenList
|
from spacy.util import dot_to_object, SimpleFrozenList
|
||||||
|
@ -15,6 +15,8 @@ from spacy.lang.nl import Dutch
|
||||||
from spacy.language import DEFAULT_CONFIG_PATH
|
from spacy.language import DEFAULT_CONFIG_PATH
|
||||||
from spacy.schemas import ConfigSchemaTraining
|
from spacy.schemas import ConfigSchemaTraining
|
||||||
|
|
||||||
|
from thinc.api import get_current_ops, NumpyOps, CupyOps
|
||||||
|
|
||||||
from .util import get_random_doc
|
from .util import get_random_doc
|
||||||
|
|
||||||
|
|
||||||
|
@ -81,6 +83,8 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
|
||||||
def test_prefer_gpu():
|
def test_prefer_gpu():
|
||||||
try:
|
try:
|
||||||
import cupy # noqa: F401
|
import cupy # noqa: F401
|
||||||
|
prefer_gpu()
|
||||||
|
assert isinstance(get_current_ops(), CupyOps)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
assert not prefer_gpu()
|
assert not prefer_gpu()
|
||||||
|
|
||||||
|
@ -88,10 +92,24 @@ def test_prefer_gpu():
|
||||||
def test_require_gpu():
|
def test_require_gpu():
|
||||||
try:
|
try:
|
||||||
import cupy # noqa: F401
|
import cupy # noqa: F401
|
||||||
|
require_gpu()
|
||||||
|
assert isinstance(get_current_ops(), CupyOps)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
require_gpu()
|
require_gpu()
|
||||||
|
|
||||||
|
def test_require_cpu():
|
||||||
|
require_cpu()
|
||||||
|
assert isinstance(get_current_ops(), NumpyOps)
|
||||||
|
try:
|
||||||
|
import cupy # noqa: F401
|
||||||
|
require_gpu()
|
||||||
|
assert isinstance(get_current_ops(), CupyOps)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
require_cpu()
|
||||||
|
assert isinstance(get_current_ops(), NumpyOps)
|
||||||
|
|
||||||
|
|
||||||
def test_ascii_filenames():
|
def test_ascii_filenames():
|
||||||
"""Test that all filenames in the project are ASCII.
|
"""Test that all filenames in the project are ASCII.
|
||||||
|
|
|
@ -2,6 +2,7 @@ import pytest
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
from spacy.util import ensure_path
|
from spacy.util import ensure_path
|
||||||
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_handles_no_word(tokenizer):
|
def test_tokenizer_handles_no_word(tokenizer):
|
||||||
|
@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_special_cases_with_affixes_preserve_spacy():
|
||||||
|
tokenizer = English().tokenizer
|
||||||
|
# reset all special cases
|
||||||
|
tokenizer.rules = {}
|
||||||
|
|
||||||
|
# in-place modification (only merges)
|
||||||
|
text = "''a'' "
|
||||||
|
tokenizer.add_special_case("''", [{"ORTH": "''"}])
|
||||||
|
assert tokenizer(text).text == text
|
||||||
|
|
||||||
|
# not in-place (splits and merges)
|
||||||
|
tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
|
||||||
|
text = "ab ab ab ''ab ab'' ab'' ''ab"
|
||||||
|
assert tokenizer(text).text == text
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_special_cases_with_period(tokenizer):
|
def test_tokenizer_special_cases_with_period(tokenizer):
|
||||||
text = "_SPECIAL_."
|
text = "_SPECIAL_."
|
||||||
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
|
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
|
||||||
|
|
|
@ -72,6 +72,10 @@ def test_readers():
|
||||||
def test_cat_readers(reader, additional_config):
|
def test_cat_readers(reader, additional_config):
|
||||||
nlp_config_string = """
|
nlp_config_string = """
|
||||||
[training]
|
[training]
|
||||||
|
seed = 0
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
cats_macro_auc = 1.0
|
||||||
|
|
||||||
[corpora]
|
[corpora]
|
||||||
@readers = "PLACEHOLDER"
|
@readers = "PLACEHOLDER"
|
||||||
|
@ -92,9 +96,7 @@ def test_cat_readers(reader, additional_config):
|
||||||
config["corpora"]["@readers"] = reader
|
config["corpora"]["@readers"] = reader
|
||||||
config["corpora"].update(additional_config)
|
config["corpora"].update(additional_config)
|
||||||
nlp = load_model_from_config(config, auto_fill=True)
|
nlp = load_model_from_config(config, auto_fill=True)
|
||||||
T = registry.resolve(
|
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
|
||||||
nlp.config["training"].interpolate(), schema=ConfigSchemaTraining
|
|
||||||
)
|
|
||||||
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||||
train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
|
train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
|
||||||
optimizer = T["optimizer"]
|
optimizer = T["optimizer"]
|
||||||
|
|
|
@ -514,6 +514,11 @@ def test_roundtrip_docs_to_docbin(doc):
|
||||||
([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
|
([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
|
||||||
),
|
),
|
||||||
([" ", "a"], ["a"], ([[], [0]], [[1]])),
|
([" ", "a"], ["a"], ([[], [0]], [[1]])),
|
||||||
|
(
|
||||||
|
["a", "''", "'", ","],
|
||||||
|
["a'", "''", ","],
|
||||||
|
([[0], [0, 1], [1], [2]], [[0, 1], [1, 2], [3]]),
|
||||||
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_align(tokens_a, tokens_b, expected): # noqa
|
def test_align(tokens_a, tokens_b, expected): # noqa
|
||||||
|
@ -698,7 +703,7 @@ def test_alignment_spaces(en_vocab):
|
||||||
align = Alignment.from_strings(other_tokens, spacy_tokens)
|
align = Alignment.from_strings(other_tokens, spacy_tokens)
|
||||||
assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
|
assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
|
||||||
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
|
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
|
||||||
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
|
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
|
||||||
assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
|
assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
|
||||||
|
|
||||||
# multiple leading whitespace tokens
|
# multiple leading whitespace tokens
|
||||||
|
@ -707,7 +712,7 @@ def test_alignment_spaces(en_vocab):
|
||||||
align = Alignment.from_strings(other_tokens, spacy_tokens)
|
align = Alignment.from_strings(other_tokens, spacy_tokens)
|
||||||
assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
|
assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
|
||||||
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
|
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
|
||||||
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
|
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
|
||||||
assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
|
assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
|
||||||
|
|
||||||
# both with leading whitespace, not identical
|
# both with leading whitespace, not identical
|
||||||
|
|
|
@ -338,7 +338,7 @@ cdef class Tokenizer:
|
||||||
# Copy special case tokens into doc and adjust token and
|
# Copy special case tokens into doc and adjust token and
|
||||||
# character offsets
|
# character offsets
|
||||||
idx_offset = 0
|
idx_offset = 0
|
||||||
orig_final_spacy = doc.c[span_end + offset - 1].spacy
|
orig_final_spacy = doc.c[span_end - 1].spacy
|
||||||
orig_idx = doc.c[i].idx
|
orig_idx = doc.c[i].idx
|
||||||
for j in range(cached.length):
|
for j in range(cached.length):
|
||||||
tokens[i + offset + j] = cached.data.tokens[j]
|
tokens[i + offset + j] = cached.data.tokens[j]
|
||||||
|
|
|
@ -198,7 +198,10 @@ class DocBin:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/docbin#from_bytes
|
DOCS: https://nightly.spacy.io/api/docbin#from_bytes
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
|
msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
|
||||||
|
except zlib.error:
|
||||||
|
raise ValueError(Errors.E1014)
|
||||||
self.attrs = msg["attrs"]
|
self.attrs = msg["attrs"]
|
||||||
self.strings = set(msg["strings"])
|
self.strings = set(msg["strings"])
|
||||||
lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
|
lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
|
||||||
|
|
|
@ -7,8 +7,8 @@ from ..errors import Errors
|
||||||
|
|
||||||
def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
|
def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
|
||||||
# Create character-to-token mappings
|
# Create character-to-token mappings
|
||||||
char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
|
char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
|
||||||
char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
|
char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))
|
||||||
str_a = "".join(A).lower()
|
str_a = "".join(A).lower()
|
||||||
str_b = "".join(B).lower()
|
str_b = "".join(B).lower()
|
||||||
cdef int len_str_a = len(str_a)
|
cdef int len_str_a = len(str_a)
|
||||||
|
@ -36,8 +36,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
|
||||||
if prev_token_idx_b != token_idx_b:
|
if prev_token_idx_b != token_idx_b:
|
||||||
b2a.append(set())
|
b2a.append(set())
|
||||||
# Process the alignment at the current position
|
# Process the alignment at the current position
|
||||||
if A[token_idx_a] == B[token_idx_b]:
|
if A[token_idx_a] == B[token_idx_b] and \
|
||||||
# Current tokens are identical
|
(char_idx_a == 0 or \
|
||||||
|
char_to_token_a[char_idx_a - 1] < token_idx_a) and \
|
||||||
|
(char_idx_b == 0 or \
|
||||||
|
char_to_token_b[char_idx_b - 1] < token_idx_b):
|
||||||
|
# Current tokens are identical and both character offsets are the
|
||||||
|
# start of a token (either at the beginning of the document or the
|
||||||
|
# previous character belongs to a different token)
|
||||||
a2b[-1].add(token_idx_b)
|
a2b[-1].add(token_idx_b)
|
||||||
b2a[-1].add(token_idx_a)
|
b2a[-1].add(token_idx_a)
|
||||||
char_idx_a += len(A[token_idx_a])
|
char_idx_a += len(A[token_idx_a])
|
||||||
|
|
|
@ -103,7 +103,7 @@ def load_vectors_into_model(
|
||||||
"with the packaged vectors. Make sure that the vectors package you're "
|
"with the packaged vectors. Make sure that the vectors package you're "
|
||||||
"loading is compatible with the current version of spaCy."
|
"loading is compatible with the current version of spaCy."
|
||||||
)
|
)
|
||||||
err = ConfigValidationError.from_error(e, config=None, title=title, desc=desc)
|
err = ConfigValidationError.from_error(e, title=title, desc=desc)
|
||||||
raise err from None
|
raise err from None
|
||||||
nlp.vocab.vectors = vectors_nlp.vocab.vectors
|
nlp.vocab.vectors = vectors_nlp.vocab.vectors
|
||||||
if add_strings:
|
if add_strings:
|
||||||
|
|
|
@ -28,7 +28,7 @@ def train(
|
||||||
use_gpu: int = -1,
|
use_gpu: int = -1,
|
||||||
stdout: IO = sys.stdout,
|
stdout: IO = sys.stdout,
|
||||||
stderr: IO = sys.stderr,
|
stderr: IO = sys.stderr,
|
||||||
) -> None:
|
) -> Tuple["Language", Optional[Path]]:
|
||||||
"""Train a pipeline.
|
"""Train a pipeline.
|
||||||
|
|
||||||
nlp (Language): The initialized nlp object with the full config.
|
nlp (Language): The initialized nlp object with the full config.
|
||||||
|
@ -40,7 +40,7 @@ def train(
|
||||||
stderr (file): A second file-like object to write output messages. To disable
|
stderr (file): A second file-like object to write output messages. To disable
|
||||||
printing, set to io.StringIO.
|
printing, set to io.StringIO.
|
||||||
|
|
||||||
RETURNS (Path / None): The path to the final exported model.
|
RETURNS (tuple): The final nlp object and the path to the exported model.
|
||||||
"""
|
"""
|
||||||
# We use no_print here so we can respect the stdout/stderr options.
|
# We use no_print here so we can respect the stdout/stderr options.
|
||||||
msg = Printer(no_print=True)
|
msg = Printer(no_print=True)
|
||||||
|
@ -105,17 +105,18 @@ def train(
|
||||||
raise e
|
raise e
|
||||||
finally:
|
finally:
|
||||||
finalize_logger()
|
finalize_logger()
|
||||||
|
if optimizer.averages:
|
||||||
|
nlp.use_params(optimizer.averages)
|
||||||
if output_path is not None:
|
if output_path is not None:
|
||||||
final_model_path = output_path / DIR_MODEL_LAST
|
final_model_path = output_path / DIR_MODEL_LAST
|
||||||
if optimizer.averages:
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
nlp.to_disk(final_model_path)
|
|
||||||
else:
|
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
# This will only run if we don't hit an error
|
# This will only run if we don't hit an error
|
||||||
stdout.write(
|
stdout.write(
|
||||||
msg.good("Saved pipeline to output directory", final_model_path) + "\n"
|
msg.good("Saved pipeline to output directory", final_model_path) + "\n"
|
||||||
)
|
)
|
||||||
|
return (nlp, final_model_path)
|
||||||
|
else:
|
||||||
|
return (nlp, None)
|
||||||
|
|
||||||
|
|
||||||
def train_while_improving(
|
def train_while_improving(
|
||||||
|
|
|
@ -1,23 +1,17 @@
|
||||||
from typing import Optional, Callable, Iterable, Union, List
|
from typing import Optional, Callable, Iterable, Union, List
|
||||||
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
|
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
|
||||||
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
|
from thinc.api import set_dropout_rate
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from functools import partial
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import srsly
|
import srsly
|
||||||
import numpy
|
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from .example import Example
|
from .example import Example
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..attrs import ID
|
|
||||||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
|
||||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
|
||||||
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
|
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
|
||||||
from ..errors import Errors
|
from ..util import registry, load_model_from_config, dot_to_object
|
||||||
from ..util import registry, load_model_from_config, resolve_dot_names
|
|
||||||
|
|
||||||
|
|
||||||
def pretrain(
|
def pretrain(
|
||||||
|
@ -38,7 +32,8 @@ def pretrain(
|
||||||
_config = nlp.config.interpolate()
|
_config = nlp.config.interpolate()
|
||||||
T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
|
T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
|
||||||
P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
|
P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
|
||||||
corpus = resolve_dot_names(_config, [P["corpus"]])[0]
|
corpus = dot_to_object(_config, P["corpus"])
|
||||||
|
corpus = registry.resolve({"corpus": corpus})["corpus"]
|
||||||
batcher = P["batcher"]
|
batcher = P["batcher"]
|
||||||
model = create_pretraining_model(nlp, P)
|
model = create_pretraining_model(nlp, P)
|
||||||
optimizer = P["optimizer"]
|
optimizer = P["optimizer"]
|
||||||
|
@ -48,6 +43,7 @@ def pretrain(
|
||||||
else:
|
else:
|
||||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
||||||
epoch_resume = 0
|
epoch_resume = 0
|
||||||
|
objective = model.attrs["loss"]
|
||||||
# TODO: move this to logger function?
|
# TODO: move this to logger function?
|
||||||
tracker = ProgressTracker(frequency=10000)
|
tracker = ProgressTracker(frequency=10000)
|
||||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
||||||
|
@ -68,7 +64,6 @@ def pretrain(
|
||||||
with (output_dir / "log.jsonl").open("a") as file_:
|
with (output_dir / "log.jsonl").open("a") as file_:
|
||||||
file_.write(srsly.json_dumps(log) + "\n")
|
file_.write(srsly.json_dumps(log) + "\n")
|
||||||
|
|
||||||
objective = create_objective(P["objective"])
|
|
||||||
# TODO: I think we probably want this to look more like the
|
# TODO: I think we probably want this to look more like the
|
||||||
# 'create_train_batches' function?
|
# 'create_train_batches' function?
|
||||||
for epoch in range(epoch_resume, P["max_epochs"]):
|
for epoch in range(epoch_resume, P["max_epochs"]):
|
||||||
|
@ -131,58 +126,6 @@ def make_update(
|
||||||
return float(loss)
|
return float(loss)
|
||||||
|
|
||||||
|
|
||||||
def create_objective(config: Config):
|
|
||||||
"""Create the objective for pretraining.
|
|
||||||
|
|
||||||
We'd like to replace this with a registry function but it's tricky because
|
|
||||||
we're also making a model choice based on this. For now we hard-code support
|
|
||||||
for two types (characters, vectors). For characters you can specify
|
|
||||||
n_characters, for vectors you can specify the loss.
|
|
||||||
|
|
||||||
Bleh.
|
|
||||||
"""
|
|
||||||
objective_type = config["type"]
|
|
||||||
if objective_type == "characters":
|
|
||||||
return partial(get_characters_loss, nr_char=config["n_characters"])
|
|
||||||
elif objective_type == "vectors":
|
|
||||||
if config["loss"] == "cosine":
|
|
||||||
distance = CosineDistance(normalize=True, ignore_zeros=True)
|
|
||||||
return partial(get_vectors_loss, distance=distance)
|
|
||||||
elif config["loss"] == "L2":
|
|
||||||
distance = L2Distance(normalize=True, ignore_zeros=True)
|
|
||||||
return partial(get_vectors_loss, distance=distance)
|
|
||||||
else:
|
|
||||||
raise ValueError(Errors.E906.format(loss_type=config["loss"]))
|
|
||||||
else:
|
|
||||||
raise ValueError(Errors.E907.format(objective_type=objective_type))
|
|
||||||
|
|
||||||
|
|
||||||
def get_vectors_loss(ops, docs, prediction, distance):
|
|
||||||
"""Compute a loss based on a distance between the documents' vectors and
|
|
||||||
the prediction.
|
|
||||||
"""
|
|
||||||
# The simplest way to implement this would be to vstack the
|
|
||||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
|
||||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
|
||||||
# and look them up all at once. This prevents data copying.
|
|
||||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
|
||||||
target = docs[0].vocab.vectors.data[ids]
|
|
||||||
d_target, loss = distance(prediction, target)
|
|
||||||
return loss, d_target
|
|
||||||
|
|
||||||
|
|
||||||
def get_characters_loss(ops, docs, prediction, nr_char):
|
|
||||||
"""Compute a loss based on a number of characters predicted from the docs."""
|
|
||||||
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
|
|
||||||
target_ids = target_ids.reshape((-1,))
|
|
||||||
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
|
|
||||||
target = target.reshape((-1, 256 * nr_char))
|
|
||||||
diff = prediction - target
|
|
||||||
loss = (diff ** 2).sum()
|
|
||||||
d_target = diff / float(prediction.shape[0])
|
|
||||||
return loss, d_target
|
|
||||||
|
|
||||||
|
|
||||||
def create_pretraining_model(nlp, pretrain_config):
|
def create_pretraining_model(nlp, pretrain_config):
|
||||||
"""Define a network for the pretraining. We simply add an output layer onto
|
"""Define a network for the pretraining. We simply add an output layer onto
|
||||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||||
|
@ -191,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
|
||||||
The actual tok2vec layer is stored as a reference, and only this bit will be
|
The actual tok2vec layer is stored as a reference, and only this bit will be
|
||||||
serialized to file and read back in when calling the 'train' command.
|
serialized to file and read back in when calling the 'train' command.
|
||||||
"""
|
"""
|
||||||
|
nlp.initialize()
|
||||||
component = nlp.get_pipe(pretrain_config["component"])
|
component = nlp.get_pipe(pretrain_config["component"])
|
||||||
if pretrain_config.get("layer"):
|
if pretrain_config.get("layer"):
|
||||||
tok2vec = component.model.get_ref(pretrain_config["layer"])
|
tok2vec = component.model.get_ref(pretrain_config["layer"])
|
||||||
else:
|
else:
|
||||||
tok2vec = component.model
|
tok2vec = component.model
|
||||||
|
|
||||||
# TODO
|
create_function = pretrain_config["objective"]
|
||||||
maxout_pieces = 3
|
model = create_function(nlp.vocab, tok2vec)
|
||||||
hidden_size = 300
|
|
||||||
if pretrain_config["objective"]["type"] == "vectors":
|
|
||||||
model = build_cloze_multi_task_model(
|
|
||||||
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
|
|
||||||
)
|
|
||||||
elif pretrain_config["objective"]["type"] == "characters":
|
|
||||||
model = build_cloze_characters_multi_task_model(
|
|
||||||
nlp.vocab,
|
|
||||||
tok2vec,
|
|
||||||
hidden_size=hidden_size,
|
|
||||||
maxout_pieces=maxout_pieces,
|
|
||||||
nr_char=pretrain_config["objective"]["n_characters"],
|
|
||||||
)
|
|
||||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||||
set_dropout_rate(model, pretrain_config["dropout"])
|
set_dropout_rate(model, pretrain_config["dropout"])
|
||||||
return model
|
return model
|
||||||
|
|
|
@ -465,16 +465,22 @@ def load_config(
|
||||||
) -> Config:
|
) -> Config:
|
||||||
"""Load a config file. Takes care of path validation and section order.
|
"""Load a config file. Takes care of path validation and section order.
|
||||||
|
|
||||||
path (Union[str, Path]): Path to the config file.
|
path (Union[str, Path]): Path to the config file or "-" to read from stdin.
|
||||||
overrides: (Dict[str, Any]): Config overrides as nested dict or
|
overrides: (Dict[str, Any]): Config overrides as nested dict or
|
||||||
dict keyed by section values in dot notation.
|
dict keyed by section values in dot notation.
|
||||||
interpolate (bool): Whether to interpolate and resolve variables.
|
interpolate (bool): Whether to interpolate and resolve variables.
|
||||||
RETURNS (Config): The loaded config.
|
RETURNS (Config): The loaded config.
|
||||||
"""
|
"""
|
||||||
config_path = ensure_path(path)
|
config_path = ensure_path(path)
|
||||||
if not config_path.exists() or not config_path.is_file():
|
config = Config(section_order=CONFIG_SECTION_ORDER)
|
||||||
|
if str(config_path) == "-": # read from standard input
|
||||||
|
return config.from_str(
|
||||||
|
sys.stdin.read(), overrides=overrides, interpolate=interpolate
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if not config_path or not config_path.exists() or not config_path.is_file():
|
||||||
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||||
return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
|
return config.from_disk(
|
||||||
config_path, overrides=overrides, interpolate=interpolate
|
config_path, overrides=overrides, interpolate=interpolate
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -143,10 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline.
|
||||||
|
|
||||||
Construct an embedding layer that separately embeds a number of lexical
|
Construct an embedding layer that separately embeds a number of lexical
|
||||||
attributes using hash embedding, concatenates the results, and passes it through
|
attributes using hash embedding, concatenates the results, and passes it through
|
||||||
a feed-forward subnetwork to build a mixed representation. The features used
|
a feed-forward subnetwork to build a mixed representation. The features used can
|
||||||
can be configured with the `attrs` argument. The suggested attributes are
|
be configured with the `attrs` argument. The suggested attributes are `NORM`,
|
||||||
`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
|
`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
|
||||||
some subword information, without constructing a fully character-based
|
subword information, without constructing a fully character-based
|
||||||
representation. If pretrained vectors are available, they can be included in the
|
representation. If pretrained vectors are available, they can be included in the
|
||||||
representation as well, in which case the vectors table will be kept static (i.e. it's
|
representation as well, in which case the vectors table will be kept static (i.e. it's
|
||||||
not updated).
|
not updated).
|
||||||
|
@ -394,9 +394,10 @@ tokens. The layer therefore requires a reduction operation in order to calculate
|
||||||
a single token vector given zero or more wordpiece vectors.
|
a single token vector given zero or more wordpiece vectors.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
|
| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
|
||||||
| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
|
| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
|
||||||
|
| `upstream` | A string to identify the "upstream" `Transformer` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Transformer` component. You'll almost never have multiple upstream `Transformer` components, so the wildcard string will almost always be fine. ~~str~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
|
|
||||||
### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
|
### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
|
||||||
|
@ -563,7 +564,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`.
|
||||||
|
|
||||||
<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
|
<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
|
||||||
|
|
||||||
The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as an argument.
|
The v1 was functionally similar, but used an internal `tok2vec` instead of
|
||||||
|
taking it as an argument.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
|
|
@ -126,7 +126,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||||
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
|
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
|
||||||
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
|
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
|
||||||
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
|
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
|
||||||
|
@ -224,8 +224,8 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
|
||||||
```
|
```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||||
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
|
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||||
|
@ -428,7 +428,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ |
|
| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ |
|
||||||
| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
|
| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
|
||||||
|
@ -601,8 +601,8 @@ will not be available.
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
|
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
|
||||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||||
|
@ -743,8 +743,8 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||||
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
|
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
|
||||||
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
|
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
|
||||||
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
|
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
|
||||||
|
@ -788,8 +788,8 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||||
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
|
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||||
|
@ -828,8 +828,8 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
|
||||||
```
|
```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||||
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||||
|
|
|
@ -66,9 +66,6 @@ shortcut for this and instantiate the component using its string name and
|
||||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||||
| _keyword-only_ | |
|
|
||||||
| `labels_morph` | Mapping of morph + POS tags to morph labels. ~~Dict[str, str]~~ |
|
|
||||||
| `labels_pos` | Mapping of morph + POS tags to POS tags. ~~Dict[str, str]~~ |
|
|
||||||
|
|
||||||
## Morphologizer.\_\_call\_\_ {#call tag="method"}
|
## Morphologizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -21,16 +21,12 @@ architectures and their arguments and hyperparameters.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
|
> from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
|
||||||
> config = {
|
> config = {"model": DEFAULT_TAGGER_MODEL}
|
||||||
> "set_morphology": False,
|
|
||||||
> "model": DEFAULT_TAGGER_MODEL,
|
|
||||||
> }
|
|
||||||
> nlp.add_pipe("tagger", config=config)
|
> nlp.add_pipe("tagger", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Setting | Description |
|
| Setting | Description |
|
||||||
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `set_morphology` | Whether to set morphological features. Defaults to `False`. ~~bool~~ |
|
|
||||||
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
@ -63,8 +59,6 @@ shortcut for this and instantiate the component using its string name and
|
||||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
|
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||||
| _keyword-only_ | |
|
|
||||||
| `set_morphology` | Whether to set morphological features. ~~bool~~ |
|
|
||||||
|
|
||||||
## Tagger.\_\_call\_\_ {#call tag="method"}
|
## Tagger.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -171,6 +171,25 @@ and _before_ loading any pipelines.
|
||||||
| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ |
|
| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ |
|
||||||
| **RETURNS** | `True` ~~bool~~ |
|
| **RETURNS** | `True` ~~bool~~ |
|
||||||
|
|
||||||
|
### spacy.require_cpu {#spacy.require_cpu tag="function" new="3.0.0"}
|
||||||
|
|
||||||
|
Allocate data and perform operations on CPU.
|
||||||
|
If data has already been allocated on GPU, it will not
|
||||||
|
be moved. Ideally, this function should be called right after importing spaCy
|
||||||
|
and _before_ loading any pipelines.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> import spacy
|
||||||
|
> spacy.require_cpu()
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------ |
|
||||||
|
| **RETURNS** | `True` ~~bool~~ |
|
||||||
|
|
||||||
## displaCy {#displacy source="spacy/displacy"}
|
## displaCy {#displacy source="spacy/displacy"}
|
||||||
|
|
||||||
As of v2.0, spaCy comes with a built-in visualization suite. For more info and
|
As of v2.0, spaCy comes with a built-in visualization suite. For more info and
|
||||||
|
|
|
@ -158,29 +158,37 @@ The other way to install spaCy is to clone its
|
||||||
source. That is the common way if you want to make changes to the code base.
|
source. That is the common way if you want to make changes to the code base.
|
||||||
You'll need to make sure that you have a development environment consisting of a
|
You'll need to make sure that you have a development environment consisting of a
|
||||||
Python distribution including header files, a compiler,
|
Python distribution including header files, a compiler,
|
||||||
[pip](https://pip.pypa.io/en/latest/installing/),
|
[pip](https://pip.pypa.io/en/stable/) and [git](https://git-scm.com) installed.
|
||||||
[virtualenv](https://virtualenv.pypa.io/) and [git](https://git-scm.com)
|
The compiler part is the trickiest. How to do that depends on your system. See
|
||||||
installed. The compiler part is the trickiest. How to do that depends on your
|
notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
|
||||||
system. See notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and
|
|
||||||
[Windows](#source-windows) for details.
|
[Windows](#source-windows) for details.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python -m pip install -U pip # update pip
|
$ python -m pip install -U pip setuptools wheel # install/update build tools
|
||||||
$ git clone https://github.com/explosion/spaCy # clone spaCy
|
$ git clone https://github.com/explosion/spaCy # clone spaCy
|
||||||
$ cd spaCy # navigate into dir
|
$ cd spaCy # navigate into dir
|
||||||
|
|
||||||
$ python -m venv .env # create environment in .env
|
$ python -m venv .env # create environment in .env
|
||||||
$ source .env/bin/activate # activate virtual env
|
$ source .env/bin/activate # activate virtual env
|
||||||
$ export PYTHONPATH=`pwd` # set Python path to spaCy dir
|
$ pip install . # compile and install spaCy
|
||||||
$ pip install -r requirements.txt # install all requirements
|
|
||||||
$ python setup.py build_ext --inplace # compile spaCy
|
|
||||||
$ python setup.py install # install spaCy
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Compared to regular install via pip, the
|
To install with extras:
|
||||||
[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs
|
|
||||||
developer dependencies such as Cython. See the [quickstart widget](#quickstart)
|
```bash
|
||||||
to get the right commands for your platform and Python version.
|
$ pip install .[lookups,cuda102] # install spaCy with extras
|
||||||
|
```
|
||||||
|
|
||||||
|
To install all dependencies required for development:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Compared to a regular install via pip, the
|
||||||
|
[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally includes
|
||||||
|
developer dependencies such as Cython and the libraries required to run the test
|
||||||
|
suite. See the [quickstart widget](#quickstart) to get the right commands for
|
||||||
|
your platform and Python version.
|
||||||
|
|
||||||
<a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a>
|
<a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a>
|
||||||
|
|
||||||
|
@ -195,6 +203,32 @@ to get the right commands for your platform and Python version.
|
||||||
[Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
|
[Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
|
||||||
that matches the version that was used to compile your Python interpreter.
|
that matches the version that was used to compile your Python interpreter.
|
||||||
|
|
||||||
|
#### Additional options for developers {#source-developers}
|
||||||
|
|
||||||
|
Some additional options may be useful for spaCy developers who are editing the
|
||||||
|
source code and recompiling frequently.
|
||||||
|
|
||||||
|
- Install in editable mode. Changes to `.py` files will be reflected as soon as
|
||||||
|
the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require
|
||||||
|
the `pip install` or `python setup.py build_ext` command below to be run
|
||||||
|
again. Before installing in editable mode, be sure you have removed any
|
||||||
|
previous installs with `pip uninstall spacy`, which you may need to run
|
||||||
|
multiple times to remove all traces of earlier installs.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ pip install -r requirements.txt
|
||||||
|
$ pip install --no-build-isolation --editable .
|
||||||
|
```
|
||||||
|
|
||||||
|
- Build in parallel using `N` CPUs to speed up compilation and then install in
|
||||||
|
editable mode:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ pip install -r requirements.txt
|
||||||
|
$ python setup.py build_ext --inplace -j N
|
||||||
|
$ pip install --no-build-isolation --editable .
|
||||||
|
```
|
||||||
|
|
||||||
### Building an executable {#executable}
|
### Building an executable {#executable}
|
||||||
|
|
||||||
The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that
|
The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that
|
||||||
|
|
|
@ -502,7 +502,7 @@ with Model.define_operators({">>": chain}):
|
||||||
|
|
||||||
## Create new trainable components {#components}
|
## Create new trainable components {#components}
|
||||||
|
|
||||||
In addition to [swapping out](#swap-architectures) default models in built-in
|
In addition to [swapping out](#swap-architectures) layers in existing
|
||||||
components, you can also implement an entirely new,
|
components, you can also implement an entirely new,
|
||||||
[trainable](/usage/processing-pipelines#trainable-components) pipeline component
|
[trainable](/usage/processing-pipelines#trainable-components) pipeline component
|
||||||
from scratch. This can be done by creating a new class inheriting from
|
from scratch. This can be done by creating a new class inheriting from
|
||||||
|
@ -523,20 +523,28 @@ overview of the `TrainablePipe` methods used by
|
||||||
This section outlines an example use-case of implementing a **novel relation
|
This section outlines an example use-case of implementing a **novel relation
|
||||||
extraction component** from scratch. We'll implement a binary relation
|
extraction component** from scratch. We'll implement a binary relation
|
||||||
extraction method that determines whether or not **two entities** in a document
|
extraction method that determines whether or not **two entities** in a document
|
||||||
are related, and if so, what type of relation. We'll allow multiple types of
|
are related, and if so, what type of relation connects them. We allow multiple
|
||||||
relations between two such entities (multi-label setting). There are two major
|
types of relations between two such entities (a multi-label setting). There are
|
||||||
steps required:
|
two major steps required:
|
||||||
|
|
||||||
1. Implement a [machine learning model](#component-rel-model) specific to this
|
1. Implement a [machine learning model](#component-rel-model) specific to this
|
||||||
task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
|
task. It will have to extract candidate relation instances from a
|
||||||
a relation for the available candidate pairs.
|
[`Doc`](/api/doc) and predict the corresponding scores for each relation
|
||||||
2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
|
label.
|
||||||
machine learning model that sets annotations on the [`Doc`](/api/doc) passing
|
2. Implement a custom [pipeline component](#component-rel-pipe) - powered by the
|
||||||
through the pipeline.
|
machine learning model from step 1 - that translates the predicted scores
|
||||||
|
into annotations that are stored on the [`Doc`](/api/doc) objects as they
|
||||||
|
pass through the `nlp` pipeline.
|
||||||
|
|
||||||
<!-- TODO: <Project id="tutorials/ner-relations">
|
<Project id="tutorials/rel_component">
|
||||||
|
Run this example use-case by using our project template. It includes all the
|
||||||
</Project> -->
|
code to create the ML model and the pipeline component from scratch.
|
||||||
|
It also contains two config files to train the model:
|
||||||
|
one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer.
|
||||||
|
The project applies the relation extraction component to identify biomolecular
|
||||||
|
interactions in a sample dataset, but you can easily swap in your own dataset
|
||||||
|
for your experiments in any other domain.
|
||||||
|
</Project>
|
||||||
|
|
||||||
#### Step 1: Implementing the Model {#component-rel-model}
|
#### Step 1: Implementing the Model {#component-rel-model}
|
||||||
|
|
||||||
|
@ -552,41 +560,17 @@ matrix** (~~Floats2d~~) of predictions:
|
||||||
> for details.
|
> for details.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Register the model architecture
|
### The model architecture
|
||||||
@registry.architectures.register("rel_model.v1")
|
@spacy.registry.architectures.register("rel_model.v1")
|
||||||
def create_relation_model(...) -> Model[List[Doc], Floats2d]:
|
def create_relation_model(...) -> Model[List[Doc], Floats2d]:
|
||||||
model = ... # 👈 model will go here
|
model = ... # 👈 model will go here
|
||||||
return model
|
return model
|
||||||
```
|
```
|
||||||
|
|
||||||
The first layer in this model will typically be an
|
We adopt a **modular approach** to the definition of this relation model, and
|
||||||
[embedding layer](/usage/embeddings-transformers) such as a
|
define it as chaining two layers together: the first layer that generates an
|
||||||
[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
|
instance tensor from a given set of documents, and the second layer that
|
||||||
layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
|
transforms the instance tensor into a final tensor holding the predictions:
|
||||||
transforms each **document into a list of tokens**, with each token being
|
|
||||||
represented by its embedding in the vector space.
|
|
||||||
|
|
||||||
Next, we need a method that **generates pairs of entities** that we want to
|
|
||||||
classify as being related or not. As these candidate pairs are typically formed
|
|
||||||
within one document, this function takes a [`Doc`](/api/doc) as input and
|
|
||||||
outputs a `List` of `Span` tuples. For instance, a very straightforward
|
|
||||||
implementation would be to just take any two entities from the same document:
|
|
||||||
|
|
||||||
```python
|
|
||||||
### Simple candidate generation
|
|
||||||
def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
|
|
||||||
candidates = []
|
|
||||||
for ent1 in doc.ents:
|
|
||||||
for ent2 in doc.ents:
|
|
||||||
candidates.append((ent1, ent2))
|
|
||||||
return candidates
|
|
||||||
```
|
|
||||||
|
|
||||||
But we could also refine this further by **excluding relations** of an entity
|
|
||||||
with itself, and posing a **maximum distance** (in number of tokens) between two
|
|
||||||
entities. We register this function in the
|
|
||||||
[`@misc` registry](/api/top-level#registry) so we can refer to it from the
|
|
||||||
config, and easily swap it out for any other candidate generation function.
|
|
||||||
|
|
||||||
> #### config.cfg (excerpt)
|
> #### config.cfg (excerpt)
|
||||||
>
|
>
|
||||||
|
@ -594,18 +578,159 @@ config, and easily swap it out for any other candidate generation function.
|
||||||
> [model]
|
> [model]
|
||||||
> @architectures = "rel_model.v1"
|
> @architectures = "rel_model.v1"
|
||||||
>
|
>
|
||||||
> [model.tok2vec]
|
> [model.create_instance_tensor]
|
||||||
> # ...
|
> # ...
|
||||||
>
|
>
|
||||||
> [model.get_candidates]
|
> [model.classification_layer]
|
||||||
> @misc = "rel_cand_generator.v1"
|
> # ...
|
||||||
> max_length = 20
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Extended candidate generation {highlight="1,2,7,8"}
|
### The model architecture {highlight="6"}
|
||||||
@registry.misc.register("rel_cand_generator.v1")
|
@spacy.registry.architectures.register("rel_model.v1")
|
||||||
def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
|
def create_relation_model(
|
||||||
|
create_instance_tensor: Model[List[Doc], Floats2d],
|
||||||
|
classification_layer: Model[Floats2d, Floats2d],
|
||||||
|
) -> Model[List[Doc], Floats2d]:
|
||||||
|
model = chain(create_instance_tensor, classification_layer)
|
||||||
|
return model
|
||||||
|
```
|
||||||
|
|
||||||
|
The `classification_layer` could be something like a
|
||||||
|
[Linear](https://thinc.ai/docs/api-layers#linear) layer followed by a
|
||||||
|
[logistic](https://thinc.ai/docs/api-layers#logistic) activation function:
|
||||||
|
|
||||||
|
> #### config.cfg (excerpt)
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [model.classification_layer]
|
||||||
|
> @architectures = "rel_classification_layer.v1"
|
||||||
|
> nI = null
|
||||||
|
> nO = null
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### The classification layer
|
||||||
|
@spacy.registry.architectures.register("rel_classification_layer.v1")
|
||||||
|
def create_classification_layer(
|
||||||
|
nO: int = None, nI: int = None
|
||||||
|
) -> Model[Floats2d, Floats2d]:
|
||||||
|
return chain(Linear(nO=nO, nI=nI), Logistic())
|
||||||
|
```
|
||||||
|
|
||||||
|
The first layer that **creates the instance tensor** can be defined by
|
||||||
|
implementing a
|
||||||
|
[custom forward function](https://thinc.ai/docs/usage-models#weights-layers-forward)
|
||||||
|
with an appropriate backpropagation callback. We also define an
|
||||||
|
[initialization method](https://thinc.ai/docs/usage-models#weights-layers-init)
|
||||||
|
that ensures that the layer is properly set up for training.
|
||||||
|
|
||||||
|
We omit some of the implementation details here, and refer to the
|
||||||
|
[spaCy project](https://github.com/explosion/projects/tree/v3/tutorials/rel_component)
|
||||||
|
that has the full implementation.
|
||||||
|
|
||||||
|
> #### config.cfg (excerpt)
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [model.create_instance_tensor]
|
||||||
|
> @architectures = "rel_instance_tensor.v1"
|
||||||
|
>
|
||||||
|
> [model.create_instance_tensor.tok2vec]
|
||||||
|
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
> # ...
|
||||||
|
>
|
||||||
|
> [model.create_instance_tensor.pooling]
|
||||||
|
> @layers = "reduce_mean.v1"
|
||||||
|
>
|
||||||
|
> [model.create_instance_tensor.get_instances]
|
||||||
|
> # ...
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### The layer that creates the instance tensor
|
||||||
|
@spacy.registry.architectures.register("rel_instance_tensor.v1")
|
||||||
|
def create_tensors(
|
||||||
|
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||||
|
pooling: Model[Ragged, Floats2d],
|
||||||
|
get_instances: Callable[[Doc], List[Tuple[Span, Span]]],
|
||||||
|
) -> Model[List[Doc], Floats2d]:
|
||||||
|
|
||||||
|
return Model(
|
||||||
|
"instance_tensors",
|
||||||
|
instance_forward,
|
||||||
|
init=instance_init,
|
||||||
|
layers=[tok2vec, pooling],
|
||||||
|
refs={"tok2vec": tok2vec, "pooling": pooling},
|
||||||
|
attrs={"get_instances": get_instances},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# The custom forward function
|
||||||
|
def instance_forward(
|
||||||
|
model: Model[List[Doc], Floats2d],
|
||||||
|
docs: List[Doc],
|
||||||
|
is_train: bool,
|
||||||
|
) -> Tuple[Floats2d, Callable]:
|
||||||
|
tok2vec = model.get_ref("tok2vec")
|
||||||
|
tokvecs, bp_tokvecs = tok2vec(docs, is_train)
|
||||||
|
get_instances = model.attrs["get_instances"]
|
||||||
|
all_instances = [get_instances(doc) for doc in docs]
|
||||||
|
pooling = model.get_ref("pooling")
|
||||||
|
relations = ...
|
||||||
|
|
||||||
|
def backprop(d_relations: Floats2d) -> List[Doc]:
|
||||||
|
d_tokvecs = ...
|
||||||
|
return bp_tokvecs(d_tokvecs)
|
||||||
|
|
||||||
|
return relations, backprop
|
||||||
|
|
||||||
|
|
||||||
|
# The custom initialization method
|
||||||
|
def instance_init(
|
||||||
|
model: Model,
|
||||||
|
X: List[Doc] = None,
|
||||||
|
Y: Floats2d = None,
|
||||||
|
) -> Model:
|
||||||
|
tok2vec = model.get_ref("tok2vec")
|
||||||
|
tok2vec.initialize(X)
|
||||||
|
return model
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
This custom layer uses an [embedding layer](/usage/embeddings-transformers) such
|
||||||
|
as a [`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer).
|
||||||
|
This layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
|
||||||
|
transforms each **document into a list of tokens**, with each token being
|
||||||
|
represented by its embedding in the vector space.
|
||||||
|
|
||||||
|
The `pooling` layer will be applied to summarize the token vectors into **entity
|
||||||
|
vectors**, as named entities (represented by ~~Span~~ objects) can consist of
|
||||||
|
one or multiple tokens. For instance, the pooling layer could resort to
|
||||||
|
calculating the average of all token vectors in an entity. Thinc provides
|
||||||
|
several
|
||||||
|
[built-in pooling operators](https://thinc.ai/docs/api-layers#reduction-ops) for
|
||||||
|
this purpose.
|
||||||
|
|
||||||
|
Finally, we need a `get_instances` method that **generates pairs of entities**
|
||||||
|
that we want to classify as being related or not. As these candidate pairs are
|
||||||
|
typically formed within one document, this function takes a [`Doc`](/api/doc) as
|
||||||
|
input and outputs a `List` of `Span` tuples. For instance, the following
|
||||||
|
implementation takes any two entities from the same document, as long as they
|
||||||
|
are within a **maximum distance** (in number of tokens) of each other:
|
||||||
|
|
||||||
|
> #### config.cfg (excerpt)
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
>
|
||||||
|
> [model.create_instance_tensor.get_instances]
|
||||||
|
> @misc = "rel_instance_generator.v1"
|
||||||
|
> max_length = 100
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### Candidate generation
|
||||||
|
@spacy.registry.misc.register("rel_instance_generator.v1")
|
||||||
|
def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
|
||||||
def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
|
def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
|
||||||
candidates = []
|
candidates = []
|
||||||
for ent1 in doc.ents:
|
for ent1 in doc.ents:
|
||||||
|
@ -617,45 +742,39 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
|
||||||
return get_candidates
|
return get_candidates
|
||||||
```
|
```
|
||||||
|
|
||||||
Finally, we require a method that transforms the candidate entity pairs into a
|
This function is added to the [`@misc` registry](/api/top-level#registry) so we
|
||||||
2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or
|
can refer to it from the config, and easily swap it out for any other candidate
|
||||||
[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be
|
generation function.
|
||||||
processed by a final `output_layer` of the network. Putting all this together,
|
|
||||||
we can define our relation model in a config file as such:
|
|
||||||
|
|
||||||
```ini
|
#### Intermezzo: define how to store the relations data {#component-rel-attribute}
|
||||||
### config.cfg
|
|
||||||
[model]
|
|
||||||
@architectures = "rel_model.v1"
|
|
||||||
# ...
|
|
||||||
|
|
||||||
[model.tok2vec]
|
> #### Example output
|
||||||
# ...
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Amsterdam is the capital of the Netherlands.")
|
||||||
|
> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
|
||||||
|
> for value, rel_dict in doc._.rel.items():
|
||||||
|
> print(f"{value}: {rel_dict}")
|
||||||
|
>
|
||||||
|
> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
|
||||||
|
> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
|
||||||
|
> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
|
||||||
|
> ```
|
||||||
|
|
||||||
[model.get_candidates]
|
For our new relation extraction component, we will use a custom
|
||||||
@misc = "rel_cand_generator.v1"
|
[extension attribute](/usage/processing-pipelines#custom-components-attributes)
|
||||||
max_length = 20
|
`doc._.rel` in which we store relation data. The attribute refers to a
|
||||||
|
dictionary, keyed by the **start offsets of each entity** involved in the
|
||||||
[model.create_candidate_tensor]
|
candidate relation. The values in the dictionary refer to another dictionary
|
||||||
@misc = "rel_cand_tensor.v1"
|
where relation labels are mapped to values between 0 and 1. We assume anything
|
||||||
|
above 0.5 to be a `True` relation. The ~~Example~~ instances that we'll use as
|
||||||
[model.output_layer]
|
training data will include their gold-standard relation annotations in
|
||||||
@architectures = "rel_output_layer.v1"
|
`example.reference._.rel`.
|
||||||
# ...
|
|
||||||
```
|
|
||||||
|
|
||||||
<!-- TODO: link to project for implementation details -->
|
|
||||||
<!-- TODO: maybe embed files from project that show the architectures? -->
|
|
||||||
|
|
||||||
When creating this model, we store the custom functions as
|
|
||||||
[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
|
|
||||||
references, so we can access them easily:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
tok2vec_layer = model.get_ref("tok2vec")
|
### Registering the extension attribute
|
||||||
output_layer = model.get_ref("output_layer")
|
from spacy.tokens import Doc
|
||||||
create_candidate_tensor = model.attrs["create_candidate_tensor"]
|
Doc.set_extension("rel", default={})
|
||||||
get_candidates = model.attrs["get_candidates"]
|
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Step 2: Implementing the pipeline component {#component-rel-pipe}
|
#### Step 2: Implementing the pipeline component {#component-rel-pipe}
|
||||||
|
@ -698,19 +817,44 @@ class RelationExtractor(TrainablePipe):
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
Before the model can be used, it needs to be
|
Typically, the **constructor** defines the vocab, the Machine Learning model,
|
||||||
[initialized](/usage/training#initialization). This function receives a callback
|
and the name of this component. Additionally, this component, just like the
|
||||||
to access the full **training data set**, or a representative sample. This data
|
`textcat` and the `tagger`, stores an **internal list of labels**. The ML model
|
||||||
set can be used to deduce all **relevant labels**. Alternatively, a list of
|
will predict scores for each label. We add convenience methods to easily
|
||||||
labels can be provided to `initialize`, or you can call
|
retrieve and add to them.
|
||||||
`RelationExtractor.add_label` directly. The number of labels defines the output
|
|
||||||
dimensionality of the network, and will be used to do
|
```python
|
||||||
|
### The constructor (continued)
|
||||||
|
def __init__(self, vocab, model, name="rel"):
|
||||||
|
"""Create a component instance."""
|
||||||
|
# ...
|
||||||
|
self.cfg = {"labels": []}
|
||||||
|
|
||||||
|
@property
|
||||||
|
def labels(self) -> Tuple[str]:
|
||||||
|
"""Returns the labels currently added to the component."""
|
||||||
|
return tuple(self.cfg["labels"])
|
||||||
|
|
||||||
|
def add_label(self, label: str):
|
||||||
|
"""Add a new label to the pipe."""
|
||||||
|
self.cfg["labels"] = list(self.labels) + [label]
|
||||||
|
```
|
||||||
|
|
||||||
|
After creation, the component needs to be
|
||||||
|
[initialized](/usage/training#initialization). This method can define the
|
||||||
|
relevant labels in two ways: explicitly by setting the `labels` argument in the
|
||||||
|
[`initialize` block](/api/data-formats#config-initialize) of the config, or
|
||||||
|
implicitly by deducing them from the `get_examples` callback that generates the
|
||||||
|
full **training data set**, or a representative sample.
|
||||||
|
|
||||||
|
The final number of labels defines the output dimensionality of the network, and
|
||||||
|
will be used to do
|
||||||
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
|
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
|
||||||
layers of the neural network. This is triggered by calling
|
layers of the neural network. This is triggered by calling
|
||||||
[`Model.initialize`](https://thinc.ai/api/model#initialize).
|
[`Model.initialize`](https://thinc.ai/api/model#initialize).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### The initialize method {highlight="12,18,22"}
|
### The initialize method {highlight="12,15,18,22"}
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
|
||||||
def initialize(
|
def initialize(
|
||||||
|
@ -741,7 +885,7 @@ Typically, this happens when the pipeline is set up before training in
|
||||||
[`spacy train`](/api/cli#training). After initialization, the pipeline component
|
[`spacy train`](/api/cli#training). After initialization, the pipeline component
|
||||||
and its internal model can be trained and used to make predictions.
|
and its internal model can be trained and used to make predictions.
|
||||||
|
|
||||||
During training, the function [`update`](/api/pipe#update) is invoked which
|
During training, the method [`update`](/api/pipe#update) is invoked which
|
||||||
delegates to
|
delegates to
|
||||||
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
|
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
|
||||||
[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
|
[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
|
||||||
|
@ -761,18 +905,18 @@ def update(
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
losses: Optional[Dict[str, float]] = None,
|
losses: Optional[Dict[str, float]] = None,
|
||||||
) -> Dict[str, float]:
|
) -> Dict[str, float]:
|
||||||
...
|
# ...
|
||||||
docs = [ex.predicted for ex in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
predictions, backprop = self.model.begin_update(docs)
|
predictions, backprop = self.model.begin_update(docs)
|
||||||
loss, gradient = self.get_loss(examples, predictions)
|
loss, gradient = self.get_loss(examples, predictions)
|
||||||
backprop(gradient)
|
backprop(gradient)
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
...
|
# ...
|
||||||
return losses
|
return losses
|
||||||
```
|
```
|
||||||
|
|
||||||
When the internal model is trained, the component can be used to make novel
|
After training the model, the component can be used to make novel
|
||||||
**predictions**. The [`predict`](/api/pipe#predict) function needs to be
|
**predictions**. The [`predict`](/api/pipe#predict) method needs to be
|
||||||
implemented for each subclass of `TrainablePipe`. In our case, we can simply
|
implemented for each subclass of `TrainablePipe`. In our case, we can simply
|
||||||
delegate to the internal model's
|
delegate to the internal model's
|
||||||
[predict](https://thinc.ai/docs/api-model#predict) function that takes a batch
|
[predict](https://thinc.ai/docs/api-model#predict) function that takes a batch
|
||||||
|
@ -788,42 +932,21 @@ def predict(self, docs: Iterable[Doc]) -> Floats2d:
|
||||||
The final method that needs to be implemented, is
|
The final method that needs to be implemented, is
|
||||||
[`set_annotations`](/api/pipe#set_annotations). This function takes the
|
[`set_annotations`](/api/pipe#set_annotations). This function takes the
|
||||||
predictions, and modifies the given `Doc` object in place to store them. For our
|
predictions, and modifies the given `Doc` object in place to store them. For our
|
||||||
relation extraction component, we store the data as a dictionary in a custom
|
relation extraction component, we store the data in the
|
||||||
[extension attribute](/usage/processing-pipelines#custom-components-attributes)
|
[custom attribute](#component-rel-attribute) `doc._.rel`.
|
||||||
`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of
|
|
||||||
each entity**, as this defines an entity pair uniquely within one document.
|
|
||||||
|
|
||||||
To interpret the scores predicted by the relation extraction model correctly, we
|
To interpret the scores predicted by the relation extraction model correctly, we
|
||||||
need to refer to the model's `get_candidates` function that defined which pairs
|
need to refer to the model's `get_instances` function that defined which pairs
|
||||||
of entities were relevant candidates, so that the predictions can be linked to
|
of entities were relevant candidates, so that the predictions can be linked to
|
||||||
those exact entities:
|
those exact entities:
|
||||||
|
|
||||||
> #### Example output
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> doc = nlp("Amsterdam is the capital of the Netherlands.")
|
|
||||||
> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
|
|
||||||
> for value, rel_dict in doc._.rel.items():
|
|
||||||
> print(f"{value}: {rel_dict}")
|
|
||||||
>
|
|
||||||
> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
|
|
||||||
> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
|
|
||||||
> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
|
|
||||||
> ```
|
|
||||||
|
|
||||||
```python
|
|
||||||
### Registering the extension attribute
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
Doc.set_extension("rel", default={})
|
|
||||||
```
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### The set_annotations method {highlight="5-6,10"}
|
### The set_annotations method {highlight="5-6,10"}
|
||||||
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
|
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
|
||||||
c = 0
|
c = 0
|
||||||
get_candidates = self.model.attrs["get_candidates"]
|
get_instances = self.model.attrs["get_instances"]
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
for (e1, e2) in get_candidates(doc):
|
for (e1, e2) in get_instances(doc):
|
||||||
offset = (e1.start, e2.start)
|
offset = (e1.start, e2.start)
|
||||||
if offset not in doc._.rel:
|
if offset not in doc._.rel:
|
||||||
doc._.rel[offset] = {}
|
doc._.rel[offset] = {}
|
||||||
|
@ -837,15 +960,15 @@ Under the hood, when the pipe is applied to a document, it delegates to the
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### The __call__ method
|
### The __call__ method
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, doc: Doc):
|
||||||
predictions = self.predict([doc])
|
predictions = self.predict([doc])
|
||||||
self.set_annotations([doc], predictions)
|
self.set_annotations([doc], predictions)
|
||||||
return doc
|
return doc
|
||||||
```
|
```
|
||||||
|
|
||||||
There is one more optional method to implement: [`score`](/api/pipe#score)
|
There is one more optional method to implement: [`score`](/api/pipe#score)
|
||||||
calculates the performance of your component on a set of examples, and
|
calculates the performance of your component on a set of examples, and returns
|
||||||
returns the results as a dictionary:
|
the results as a dictionary:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### The score method
|
### The score method
|
||||||
|
@ -861,8 +984,8 @@ def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
This is particularly useful to see the scores on the development corpus
|
This is particularly useful for calculating relevant scores on the development
|
||||||
when training the component with [`spacy train`](/api/cli#training).
|
corpus when training the component with [`spacy train`](/api/cli#training).
|
||||||
|
|
||||||
Once our `TrainablePipe` subclass is fully implemented, we can
|
Once our `TrainablePipe` subclass is fully implemented, we can
|
||||||
[register](/usage/processing-pipelines#custom-components-factories) the
|
[register](/usage/processing-pipelines#custom-components-factories) the
|
||||||
|
@ -879,14 +1002,8 @@ assigns it a name and lets you create the component with
|
||||||
>
|
>
|
||||||
> [components.relation_extractor.model]
|
> [components.relation_extractor.model]
|
||||||
> @architectures = "rel_model.v1"
|
> @architectures = "rel_model.v1"
|
||||||
>
|
|
||||||
> [components.relation_extractor.model.tok2vec]
|
|
||||||
> # ...
|
> # ...
|
||||||
>
|
>
|
||||||
> [components.relation_extractor.model.get_candidates]
|
|
||||||
> @misc = "rel_cand_generator.v1"
|
|
||||||
> max_length = 20
|
|
||||||
>
|
|
||||||
> [training.score_weights]
|
> [training.score_weights]
|
||||||
> rel_micro_p = 0.0
|
> rel_micro_p = 0.0
|
||||||
> rel_micro_r = 0.0
|
> rel_micro_r = 0.0
|
||||||
|
@ -924,6 +1041,12 @@ def make_relation_extractor(nlp, name, model):
|
||||||
return RelationExtractor(nlp.vocab, model, name)
|
return RelationExtractor(nlp.vocab, model, name)
|
||||||
```
|
```
|
||||||
|
|
||||||
<!-- TODO: <Project id="tutorials/ner-relations">
|
<Project id="tutorials/rel_component">
|
||||||
|
Run this example use-case by using our project template. It includes all the
|
||||||
</Project> -->
|
code to create the ML model and the pipeline component from scratch.
|
||||||
|
It contains two config files to train the model:
|
||||||
|
one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer.
|
||||||
|
The project applies the relation extraction component to identify biomolecular
|
||||||
|
interactions, but you can easily swap in your own dataset for your experiments
|
||||||
|
in any other domain.
|
||||||
|
</Project>
|
||||||
|
|
|
@ -264,6 +264,26 @@ defined in the config file.
|
||||||
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
|
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Reading from standard input {#config-stdin}
|
||||||
|
|
||||||
|
Setting the config path to `-` on the command line lets you read the config from
|
||||||
|
standard input and pipe it forward from a different process, like
|
||||||
|
[`init config`](/api/cli#init-config) or your own custom script. This is
|
||||||
|
especially useful for quick experiments, as it lets you generate a config on the
|
||||||
|
fly without having to save to and load from disk.
|
||||||
|
|
||||||
|
> #### 💡 Tip: Writing to stdout
|
||||||
|
>
|
||||||
|
> When you run `init config`, you can set the output path to `-` to write to
|
||||||
|
> stdout. In a custom script, you can print the string config, e.g.
|
||||||
|
> `print(nlp.config.to_str())`.
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
|
||||||
|
```
|
||||||
|
|
||||||
|
<!-- TODO: add reference to Prodigy's commands once Prodigy nightly is available -->
|
||||||
|
|
||||||
### Using variable interpolation {#config-interpolation}
|
### Using variable interpolation {#config-interpolation}
|
||||||
|
|
||||||
Another very useful feature of the config system is that it supports variable
|
Another very useful feature of the config system is that it supports variable
|
||||||
|
@ -378,7 +398,8 @@ weights and [resume training](/api/language#resume_training).
|
||||||
If you don't want a component to be updated, you can **freeze** it by adding it
|
If you don't want a component to be updated, you can **freeze** it by adding it
|
||||||
to the `frozen_components` list in the `[training]` block. Frozen components are
|
to the `frozen_components` list in the `[training]` block. Frozen components are
|
||||||
**not updated** during training and are included in the final trained pipeline
|
**not updated** during training and are included in the final trained pipeline
|
||||||
as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize).
|
as-is. They are also excluded when calling
|
||||||
|
[`nlp.initialize`](/api/language#initialize).
|
||||||
|
|
||||||
> #### Note on frozen components
|
> #### Note on frozen components
|
||||||
>
|
>
|
||||||
|
@ -551,8 +572,8 @@ or TensorFlow, make **custom modifications** to the `nlp` object, create custom
|
||||||
optimizers or schedules, or **stream in data** and preprocess it on the fly
|
optimizers or schedules, or **stream in data** and preprocess it on the fly
|
||||||
while training.
|
while training.
|
||||||
|
|
||||||
Each custom function can have any number of arguments that are passed in via
|
Each custom function can have any number of arguments that are passed in via the
|
||||||
the [config](#config), just the built-in functions. If your function defines
|
[config](#config), just like the built-in functions. If your function defines
|
||||||
**default argument values**, spaCy is able to auto-fill your config when you run
|
**default argument values**, spaCy is able to auto-fill your config when you run
|
||||||
[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
|
[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
|
||||||
given parameter is always explicitly set in the config, avoid setting a default
|
given parameter is always explicitly set in the config, avoid setting a default
|
||||||
|
@ -958,10 +979,10 @@ data assets, track changes and share your end-to-end processes with your team.
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
|
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
|
||||||
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
|
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in storage**,
|
||||||
storage**, especially when packing multiple documents together. You can also
|
especially when packing multiple documents together. You can also create `Doc`
|
||||||
create `Doc` objects manually, so you can write your own custom logic to convert
|
objects manually, so you can write your own custom logic to convert and store
|
||||||
and store existing annotations for use in spaCy.
|
existing annotations for use in spaCy.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Training data from Doc objects {highlight="6-9"}
|
### Training data from Doc objects {highlight="6-9"}
|
||||||
|
@ -1300,10 +1321,10 @@ mapping so they know which worker owns which parameter.
|
||||||
As training proceeds, every worker will be computing gradients for **all** of
|
As training proceeds, every worker will be computing gradients for **all** of
|
||||||
the model parameters. When they compute gradients for parameters they don't own,
|
the model parameters. When they compute gradients for parameters they don't own,
|
||||||
they'll **send them to the worker** that does own that parameter, along with a
|
they'll **send them to the worker** that does own that parameter, along with a
|
||||||
version identifier so that the owner can decide whether to discard the
|
version identifier so that the owner can decide whether to discard the gradient.
|
||||||
gradient. Workers use the gradients they receive and the ones they compute
|
Workers use the gradients they receive and the ones they compute locally to
|
||||||
locally to update the parameters they own, and then broadcast the updated array
|
update the parameters they own, and then broadcast the updated array and a new
|
||||||
and a new version ID to the other workers.
|
version ID to the other workers.
|
||||||
|
|
||||||
This training procedure is **asynchronous** and **non-blocking**. Workers always
|
This training procedure is **asynchronous** and **non-blocking**. Workers always
|
||||||
push their gradient increments and parameter updates, they do not have to pull
|
push their gradient increments and parameter updates, they do not have to pull
|
||||||
|
|
|
@ -969,18 +969,18 @@ The [`Language.update`](/api/language#update),
|
||||||
raw text and a dictionary of annotations.
|
raw text and a dictionary of annotations.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Training loop {highlight="11"}
|
### Training loop {highlight="5-8,12"}
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
||||||
("I like London.", {"entities": [(7, 13, "LOC")]}),
|
("I like London.", {"entities": [(7, 13, "LOC")]}),
|
||||||
]
|
]
|
||||||
nlp.initialize()
|
examples = []
|
||||||
for i in range(20):
|
for text, annots in TRAIN_DATA:
|
||||||
random.shuffle(TRAIN_DATA)
|
|
||||||
for batch in minibatch(TRAIN_DATA):
|
|
||||||
examples = []
|
|
||||||
for text, annots in batch:
|
|
||||||
examples.append(Example.from_dict(nlp.make_doc(text), annots))
|
examples.append(Example.from_dict(nlp.make_doc(text), annots))
|
||||||
|
nlp.initialize(lambda: examples)
|
||||||
|
for i in range(20):
|
||||||
|
random.shuffle(examples)
|
||||||
|
for batch in minibatch(examples, size=8):
|
||||||
nlp.update(examples)
|
nlp.update(examples)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -995,7 +995,7 @@ network,
|
||||||
setting up the label scheme.
|
setting up the label scheme.
|
||||||
|
|
||||||
```diff
|
```diff
|
||||||
- nlp.initialize(examples)
|
- nlp.begin_training()
|
||||||
+ nlp.initialize(lambda: examples)
|
+ nlp.initialize(lambda: examples)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -120,13 +120,9 @@ function parseArgs(raw) {
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
function formatCode(html, lang, prompt) {
|
function convertLine(line, i) {
|
||||||
if (lang === 'cli') {
|
console.log(line, i)
|
||||||
const cliRegex = /^(\$ )?python -m spacy/
|
const cliRegex = /^(\$ )?python -m spacy/
|
||||||
const lines = html
|
|
||||||
.trim()
|
|
||||||
.split('\n')
|
|
||||||
.map((line, i) => {
|
|
||||||
if (cliRegex.test(line)) {
|
if (cliRegex.test(line)) {
|
||||||
const text = line.replace(cliRegex, '')
|
const text = line.replace(cliRegex, '')
|
||||||
const args = parseArgs(text)
|
const args = parseArgs(text)
|
||||||
|
@ -135,8 +131,8 @@ function formatCode(html, lang, prompt) {
|
||||||
return value === null || value === true || i === 0 ? key : `${key} ${value}`
|
return value === null || value === true || i === 0 ? key : `${key} ${value}`
|
||||||
})
|
})
|
||||||
return (
|
return (
|
||||||
<Fragment key={i}>
|
<Fragment key={line}>
|
||||||
<span data-prompt="$" className={classes.cliArgSubtle}>
|
<span data-prompt={i === 0 ? '$' : null} className={classes.cliArgSubtle}>
|
||||||
python -m
|
python -m
|
||||||
</span>{' '}
|
</span>{' '}
|
||||||
<span>spacy</span>{' '}
|
<span>spacy</span>{' '}
|
||||||
|
@ -165,7 +161,24 @@ function formatCode(html, lang, prompt) {
|
||||||
}
|
}
|
||||||
const htmlLine = replacePrompt(highlightCode('bash', line), '$')
|
const htmlLine = replacePrompt(highlightCode('bash', line), '$')
|
||||||
return htmlToReact(htmlLine)
|
return htmlToReact(htmlLine)
|
||||||
})
|
}
|
||||||
|
|
||||||
|
function formatCode(html, lang, prompt) {
|
||||||
|
if (lang === 'cli') {
|
||||||
|
const lines = html
|
||||||
|
.trim()
|
||||||
|
.split('\n')
|
||||||
|
.map(line =>
|
||||||
|
line
|
||||||
|
.split(' | ')
|
||||||
|
.map((l, i) => convertLine(l, i))
|
||||||
|
.map((l, j) => (
|
||||||
|
<Fragment>
|
||||||
|
{j !== 0 && <span> | </span>}
|
||||||
|
{l}
|
||||||
|
</Fragment>
|
||||||
|
))
|
||||||
|
)
|
||||||
return lines.map((line, i) => (
|
return lines.map((line, i) => (
|
||||||
<Fragment key={i}>
|
<Fragment key={i}>
|
||||||
{i !== 0 && <br />}
|
{i !== 0 && <br />}
|
||||||
|
|
|
@ -120,7 +120,7 @@ function formatAccuracy(data) {
|
||||||
? null
|
? null
|
||||||
: {
|
: {
|
||||||
label,
|
label,
|
||||||
value: (value * 100).toFixed(2),
|
value: value.toFixed(2),
|
||||||
help: MODEL_META[label],
|
help: MODEL_META[label],
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
Loading…
Reference in New Issue
Block a user