Merge pull request #13286 from explosion/master
Sync `docs/llm_main` with `master`
.github/FUNDING.yml (new file, vendored, 1 addition)
@@ -0,0 +1 @@
+custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]
.github/workflows/tests.yml (4 changes, vendored)
@@ -58,7 +58,7 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.11", "3.12.0-rc.2"]
+        python_version: ["3.12"]
         include:
           - os: windows-latest
             python_version: "3.7"
@@ -68,6 +68,8 @@ jobs:
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
+          - os: macos-latest
+            python_version: "3.11"

    runs-on: ${{ matrix.os }}
LICENSE (2 changes)
@@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
README.md (11 changes)
@@ -39,28 +39,35 @@ open-source software, released under the
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
+| ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
-| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
-| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
+| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
+| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and we'll be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
 [usage guides]: https://spacy.io/usage/
 [api reference]: https://spacy.io/api/
+[gpu processing]: https://spacy.io/usage#gpu
 [models]: https://spacy.io/models
+[large language models]: https://spacy.io/usage/large-language-models
 [universe]: https://spacy.io/universe
 [spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
+[blog]: https://explosion.ai
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
+[swag]: https://explosion.ai/merch

 ## 💬 Where to ask questions

@@ -158,3 +158,45 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+
+SciPy
+-----
+
+* Files: scorer.py
+
+The implementation of trapezoid() is adapted from SciPy, which is distributed
+under the following license:
+
+New BSD License
+
+Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.1.8,<8.3.0",
+    "thinc>=8.2.2,<8.3.0",
     "numpy>=1.15.0; python_version < '3.9'",
     "numpy>=1.25.0; python_version >= '3.9'",
 ]
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.8,<8.3.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
@@ -41,7 +41,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.1.8,<8.3.0
+    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
@@ -49,7 +49,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.1.8,<8.3.0
+    thinc>=8.2.2,<8.3.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.1"
+__version__ = "3.7.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -22,8 +22,17 @@ from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .validate import validate  # noqa: F401
+from .project.assets import project_assets  # type: ignore[attr-defined] # noqa: F401
+from .project.clone import project_clone  # type: ignore[attr-defined] # noqa: F401
+from .project.document import (  # type: ignore[attr-defined] # noqa: F401
+    project_document,
+)
+from .project.dvc import project_update_dvc  # type: ignore[attr-defined] # noqa: F401
+from .project.pull import project_pull  # type: ignore[attr-defined] # noqa: F401
+from .project.push import project_push  # type: ignore[attr-defined] # noqa: F401
+from .project.run import project_run  # type: ignore[attr-defined] # noqa: F401
+from .train import train_cli  # type: ignore[attr-defined] # noqa: F401
+from .validate import validate  # type: ignore[attr-defined] # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu


 @benchmark_cli.command(
@@ -30,12 +30,14 @@ def benchmark_speed_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
     warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     # fmt: on
 ):
     """
     Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
     data in the binary .spacy format.
     """
+    import_code(code_path)
     setup_gpu(use_gpu=use_gpu, silent=False)

     nlp = util.load_model(model)
@@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
     nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = warmup_epochs * docs
+    docs = [doc.copy() for doc in docs * warmup_epochs]
     return annotate(nlp, docs, batch_size)
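Note on the warmup change above: spaCy pipelines annotate Doc objects in place, so repeating the same objects across warmup epochs means later passes run over already-annotated docs and skew the timings. A minimal sketch of the difference (hypothetical repro, not part of this PR):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp.make_doc("A short benchmark sentence.")

    reused = 3 * [doc]                      # old behaviour: same Doc repeated
    copied = [d.copy() for d in [doc] * 3]  # new behaviour: independent copies

    assert reused[0] is reused[1]      # identical objects across "epochs"
    assert copied[0] is not copied[1]  # a fresh Doc per warmup pass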
@@ -7,7 +7,14 @@ from wasabi import msg

 from .. import about
 from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ..util import (
+    get_minor_version,
+    is_in_interactive,
+    is_in_jupyter,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app


@@ -77,6 +84,27 @@ def download(
         "Download and installation successful",
         f"You can now load the package via spacy.load('{model_name}')",
     )
+    if is_in_jupyter():
+        reload_deps_msg = (
+            "If you are in a Jupyter or Colab notebook, you may need to "
+            "restart Python in order to load all the package's dependencies. "
+            "You can do this by selecting the 'Restart kernel' or 'Restart "
+            "runtime' option."
+        )
+        msg.warn(
+            "Restart to reload dependencies",
+            reload_deps_msg,
+        )
+    elif is_in_interactive():
+        reload_deps_msg = (
+            "If you are in an interactive Python session, you may need to "
+            "exit and restart Python to load all the package's dependencies. "
+            "You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
+        )
+        msg.warn(
+            "Restart to reload dependencies",
+            reload_deps_msg,
+        )


 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
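The new restart hints rely on spacy.util helpers to detect the environment. As a rough sketch of how such checks typically work (an assumption for illustration; the real is_in_jupyter/is_in_interactive in spacy/util.py may be implemented differently):

    import sys

    def is_in_jupyter_sketch() -> bool:
        # IPython kernels inject get_ipython() as a global.
        try:
            get_ipython  # type: ignore[name-defined]  # noqa: B018
        except NameError:
            return False
        return True

    def is_in_interactive_sketch() -> bool:
        # A plain REPL sets sys.ps1; `python -i` sets the interactive flag.
        return hasattr(sys, "ps1") or bool(sys.flags.interactive)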
@@ -1,5 +1,7 @@
+import os
 import re
 import shutil
+import subprocess
 import sys
 from collections import defaultdict
 from pathlib import Path
@@ -11,6 +13,7 @@ from thinc.api import Config
 from wasabi import MarkdownRenderer, Printer, get_raw_input

 from .. import about, util
+from ..compat import importlib_metadata
 from ..schemas import ModelMetaSchema, validate
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list

@@ -35,7 +38,7 @@ def package_cli(
     specified output directory, and the data will be copied over. If
     --create-meta is set and a meta.json already exists in the output directory,
     the existing values will be used as the defaults in the command-line prompt.
-    After packaging, "python setup.py sdist" is run in the package directory,
+    After packaging, "python -m build --sdist" is run in the package directory,
     which will create a .tar.gz archive that can be installed via "pip install".

     If additional code files are provided (e.g. Python files containing custom
@@ -78,9 +81,17 @@ def package(
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
-    if create_wheel and not has_wheel():
-        err = "Generating a binary .whl file requires wheel to be installed"
-        msg.fail(err, "pip install wheel", exits=1)
+    if create_wheel and not has_wheel() and not has_build():
+        err = (
+            "Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
+        )
+        msg.fail(err, "pip install build", exits=1)
+    if not has_build():
+        msg.warn(
+            "Generating packages without the 'build' package is deprecated and "
+            "will not be supported in the future. To install 'build': pip "
+            "install build"
+        )
     if not input_path or not input_path.exists():
         msg.fail("Can't locate pipeline data", input_path, exits=1)
     if not output_path or not output_path.exists():
@@ -184,12 +195,37 @@ def package(
     msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
+            # run directly, since util.run_command is not designed to continue
+            # after a command fails
+            ret = subprocess.run(
+                [sys.executable, "-m", "build", ".", "--sdist"],
+                env=os.environ.copy(),
+            )
+            if ret.returncode != 0:
+                msg.warn(
+                    "Creating sdist with 'python -m build' failed. Falling "
+                    "back to deprecated use of 'python setup.py sdist'"
+                )
+                util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
         msg.good(f"Successfully created zipped Python package", zip_file)
     if create_wheel:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
+            # run directly, since util.run_command is not designed to continue
+            # after a command fails
+            ret = subprocess.run(
+                [sys.executable, "-m", "build", ".", "--wheel"],
+                env=os.environ.copy(),
+            )
+            if ret.returncode != 0:
+                msg.warn(
+                    "Creating wheel with 'python -m build' failed. Falling "
+                    "back to deprecated use of 'wheel' with "
+                    "'python setup.py bdist_wheel'"
+                )
+                util.run_command(
+                    [sys.executable, "setup.py", "bdist_wheel"], capture=False
+                )
         wheel_name_squashed = re.sub("_+", "_", model_name_v)
         wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
         msg.good(f"Successfully created binary wheel", wheel)
@@ -209,6 +245,17 @@ def has_wheel() -> bool:
     return False


+def has_build() -> bool:
+    # it's very likely that there is a local directory named build/ (especially
+    # in an editable install), so an import check is not sufficient; instead
+    # check that there is a package version
+    try:
+        importlib_metadata.version("build")
+        return True
+    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
+        return False
+
+
 def get_third_party_dependencies(
     config: Config, exclude: List[str] = util.SimpleFrozenList()
 ) -> List[str]:
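The comment in has_build() is the key detail: a bare `import build` can succeed by picking up a local build/ directory (common in editable installs), so the check goes through distribution metadata instead. The same pattern with the stdlib, for reference (spaCy itself routes this through spacy.compat's importlib_metadata):

    import importlib.metadata

    def package_installed(name: str) -> bool:
        # True only if an installed distribution named `name` exists;
        # unaffected by same-named local directories on sys.path.
        try:
            importlib.metadata.version(name)
            return True
        except importlib.metadata.PackageNotFoundError:
            return False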
spacy/cli/project/__init__.py (new file, empty)

spacy/cli/project/assets.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.assets import *

spacy/cli/project/clone.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.clone import *

spacy/cli/project/document.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.document import *

spacy/cli/project/dvc.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.dvc import *

spacy/cli/project/pull.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.pull import *

spacy/cli/project/push.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.push import *

spacy/cli/project/remote_storage.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.remote_storage import *

spacy/cli/project/run.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.run import *
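Each of these stubs re-exports a weasel module wholesale, so existing spacy.cli.project import paths keep resolving after the project commands moved to the separate weasel package. For example (assuming weasel is installed and exports project_run under that name):

    # Both names resolve to the same function object via the wildcard shim.
    from spacy.cli.project.run import project_run as run_via_spacy
    from weasel.cli.run import project_run as run_via_weasel

    assert run_via_spacy is run_via_weasel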
@@ -271,8 +271,9 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"

 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false

@@ -308,8 +309,9 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"

 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false

@@ -542,14 +544,15 @@ nO = null
 width = ${components.tok2vec.model.encode.width}

 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false

 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@@ -570,15 +573,17 @@ nO = null
 width = ${components.tok2vec.model.encode.width}

 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false

 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 {%- endif %}
@@ -142,7 +142,25 @@ class SpanRenderer:
         spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
         title (str / None): Document title set in Doc.user_data['title'].
         """
-        per_token_info = []
+        per_token_info = self._assemble_per_token_info(tokens, spans)
+        markup = self._render_markup(per_token_info)
+        markup = TPL_SPANS.format(content=markup, dir=self.direction)
+        if title:
+            markup = TPL_TITLE.format(title=title) + markup
+        return markup
+
+    @staticmethod
+    def _assemble_per_token_info(
+        tokens: List[str], spans: List[Dict[str, Any]]
+    ) -> List[Dict[str, List[Dict[str, Any]]]]:
+        """Assembles token info used to generate markup in render_spans().
+        tokens (List[str]): Tokens in text.
+        spans (List[Dict[str, Any]]): Spans in text.
+        RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens
+        and spans.
+        """
+        per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
+
         # we must sort so that we can correctly describe when spans need to "stack"
         # which is determined by their start token, then span length (longer spans on top),
         # then break any remaining ties with the span label
@@ -154,21 +172,22 @@ class SpanRenderer:
                 s["label"],
             ),
         )

         for s in spans:
             # this is the vertical 'slot' that the span will be rendered in
             # vertical_position = span_label_offset + (offset_step * (slot - 1))
             s["render_slot"] = 0

         for idx, token in enumerate(tokens):
             # Identify if a token belongs to a Span (and which) and if it's a
             # start token of said Span. We'll use this for the final HTML render
             token_markup: Dict[str, Any] = {}
             token_markup["text"] = token
-            concurrent_spans = 0
+            intersecting_spans: List[Dict[str, Any]] = []
             entities = []
             for span in spans:
                 ent = {}
                 if span["start_token"] <= idx < span["end_token"]:
-                    concurrent_spans += 1
                     span_start = idx == span["start_token"]
                     ent["label"] = span["label"]
                     ent["is_start"] = span_start
@@ -176,7 +195,12 @@ class SpanRenderer:
                         # When the span starts, we need to know how many other
                         # spans are on the 'span stack' and will be rendered.
                         # This value becomes the vertical render slot for this entire span
-                        span["render_slot"] = concurrent_spans
+                        span["render_slot"] = (
+                            intersecting_spans[-1]["render_slot"]
+                            if len(intersecting_spans)
+                            else 0
+                        ) + 1
+                    intersecting_spans.append(span)
                     ent["render_slot"] = span["render_slot"]
                 kb_id = span.get("kb_id", "")
                 kb_url = span.get("kb_url", "#")
@@ -193,11 +217,8 @@ class SpanRenderer:
                     span["render_slot"] = 0
             token_markup["entities"] = entities
             per_token_info.append(token_markup)
-        markup = self._render_markup(per_token_info)
-        markup = TPL_SPANS.format(content=markup, dir=self.direction)
-        if title:
-            markup = TPL_TITLE.format(title=title) + markup
-        return markup
+
+        return per_token_info

     def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
         """Render the markup from per-token information"""
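The new slot logic places a starting span one level above the topmost span it currently intersects, rather than counting every concurrent span. A simplified, runnable sketch of the rule with hypothetical spans (the real method also sorts the spans first and resets slots when spans end):

    spans = [
        {"label": "A", "start_token": 0, "end_token": 4, "render_slot": 0},
        {"label": "B", "start_token": 1, "end_token": 3, "render_slot": 0},
        {"label": "C", "start_token": 2, "end_token": 3, "render_slot": 0},
    ]
    for idx in range(4):
        intersecting = []
        for span in spans:
            if span["start_token"] <= idx < span["end_token"]:
                if idx == span["start_token"]:
                    # one slot above the top of the current span stack
                    span["render_slot"] = (
                        intersecting[-1]["render_slot"] if intersecting else 0
                    ) + 1
                intersecting.append(span)

    print([(s["label"], s["render_slot"]) for s in spans])
    # [('A', 1), ('B', 2), ('C', 3)] -- nested spans stack upwards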
@@ -227,7 +227,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
             "This usually happens when spaCy calls `nlp.{method}` with a custom "
             "component name that's not registered on the current language class. "
-            "If you're using a Transformer, make sure to install 'spacy-transformers'. "
             "If you're using a custom component, make sure you've added the "
             "decorator `@Language.component` (for function components) or "
             "`@Language.factory` (for class components).\n\nAvailable "
@@ -984,6 +983,10 @@ class Errors(metaclass=ErrorsWithCodes):
              "predicted docs when training {component}.")
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
+    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")


 # Deprecated model shortcuts, only used in errors and warnings
@@ -1,3 +1,11 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
+
+__all__ = [
+    "Candidate",
+    "KnowledgeBase",
+    "InMemoryLookupKB",
+    "get_candidates",
+    "get_candidates_batch",
+]
@@ -6,7 +6,8 @@ _num_words = [
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
-    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
+    "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion",
+    "septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion"
 ]
 _ordinal_words = [
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
@@ -14,7 +15,8 @@ _ordinal_words = [
    "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
    "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
    "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
-    "trillionth", "quadrillionth", "gajillionth", "bazillionth"
+    "trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth",
+    "octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth"
 ]
 # fmt: on
spacy/lang/fo/__init__.py (new file, 18 additions)
@@ -0,0 +1,18 @@
+from ...language import BaseDefaults, Language
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class FaroeseDefaults(BaseDefaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    prefixes = TOKENIZER_PREFIXES
+
+
+class Faroese(Language):
+    lang = "fo"
+    Defaults = FaroeseDefaults
+
+
+__all__ = ["Faroese"]

spacy/lang/fo/tokenizer_exceptions.py (new file, 90 additions)
@@ -0,0 +1,90 @@
+from ...symbols import ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+
+_exc = {}
+
+for orth in [
+    "apr.",
+    "aug.",
+    "avgr.",
+    "árg.",
+    "ávís.",
+    "beinl.",
+    "blkv.",
+    "blaðkv.",
+    "blm.",
+    "blaðm.",
+    "bls.",
+    "blstj.",
+    "blaðstj.",
+    "des.",
+    "eint.",
+    "febr.",
+    "fyrrv.",
+    "góðk.",
+    "h.m.",
+    "innt.",
+    "jan.",
+    "kl.",
+    "m.a.",
+    "mðr.",
+    "mió.",
+    "nr.",
+    "nto.",
+    "nov.",
+    "nút.",
+    "o.a.",
+    "o.a.m.",
+    "o.a.tíl.",
+    "o.fl.",
+    "ff.",
+    "o.m.a.",
+    "o.o.",
+    "o.s.fr.",
+    "o.tíl.",
+    "o.ø.",
+    "okt.",
+    "omf.",
+    "pst.",
+    "ritstj.",
+    "sbr.",
+    "sms.",
+    "smst.",
+    "smb.",
+    "sb.",
+    "sbrt.",
+    "sp.",
+    "sept.",
+    "spf.",
+    "spsk.",
+    "t.e.",
+    "t.s.",
+    "t.s.s.",
+    "tlf.",
+    "tel.",
+    "tsk.",
+    "t.o.v.",
+    "t.d.",
+    "uml.",
+    "ums.",
+    "uppl.",
+    "upprfr.",
+    "uppr.",
+    "útg.",
+    "útl.",
+    "útr.",
+    "vanl.",
+    "v.",
+    "v.h.",
+    "v.ø.o.",
+    "viðm.",
+    "viðv.",
+    "vm.",
+    "v.m.",
+]:
+    _exc[orth] = [{ORTH: orth}]
+    capitalized = orth.capitalize()
+    _exc[capitalized] = [{ORTH: capitalized}]
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
spacy/lang/nn/__init__.py (new file, 20 additions)
@@ -0,0 +1,20 @@
+from ...language import BaseDefaults, Language
+from ..nb import SYNTAX_ITERATORS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class NorwegianNynorskDefaults(BaseDefaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    prefixes = TOKENIZER_PREFIXES
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+
+
+class NorwegianNynorsk(Language):
+    lang = "nn"
+    Defaults = NorwegianNynorskDefaults
+
+
+__all__ = ["NorwegianNynorsk"]

spacy/lang/nn/examples.py (new file, 15 additions)
@@ -0,0 +1,15 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.nn.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
+sentences = [
+    "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
+    "Det er ein meir enn i same periode i fjor.",
+    "Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
+    "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
+]

spacy/lang/nn/punctuation.py (new file, 74 additions)
@@ -0,0 +1,74 @@
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    UNITS,
+)
+from ..punctuation import TOKENIZER_SUFFIXES
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+_list_punct = [x for x in LIST_PUNCT if x != "#"]
+_list_icons = [x for x in LIST_ICONS if x != "°"]
+_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
+_list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
+
+
+_prefixes = (
+    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
+    + _list_punct
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
+
+
+_infixes = (
+    LIST_ELLIPSES
+    + _list_icons
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + _list_quotes
+    + _list_icons
+    + ["—", "–"]
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+    + [r"(?<=[^sSxXzZ])'"]
+)
+_suffixes += [
+    suffix
+    for suffix in TOKENIZER_SUFFIXES
+    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
+]
+
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes

spacy/lang/nn/tokenizer_exceptions.py (new file, 228 additions)
@@ -0,0 +1,228 @@
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+
+_exc = {}
+
+
+for exc_data in [
+    {ORTH: "jan.", NORM: "januar"},
+    {ORTH: "feb.", NORM: "februar"},
+    {ORTH: "mar.", NORM: "mars"},
+    {ORTH: "apr.", NORM: "april"},
+    {ORTH: "jun.", NORM: "juni"},
+    # note: "jul." is in the simple list below without a NORM exception
+    {ORTH: "aug.", NORM: "august"},
+    {ORTH: "sep.", NORM: "september"},
+    {ORTH: "okt.", NORM: "oktober"},
+    {ORTH: "nov.", NORM: "november"},
+    {ORTH: "des.", NORM: "desember"},
+]:
+    _exc[exc_data[ORTH]] = [exc_data]
+
+
+for orth in [
+    "Ap.",
+    "Aq.",
+    "Ca.",
+    "Chr.",
+    "Co.",
+    "Dr.",
+    "F.eks.",
+    "Fr.p.",
+    "Frp.",
+    "Grl.",
+    "Kr.",
+    "Kr.F.",
+    "Kr.F.s",
+    "Mr.",
+    "Mrs.",
+    "Pb.",
+    "Pr.",
+    "Sp.",
+    "St.",
+    "a.m.",
+    "ad.",
+    "adm.dir.",
+    "adr.",
+    "b.c.",
+    "bl.a.",
+    "bla.",
+    "bm.",
+    "bnr.",
+    "bto.",
+    "c.c.",
+    "ca.",
+    "cand.mag.",
+    "co.",
+    "d.d.",
+    "d.m.",
+    "d.y.",
+    "dept.",
+    "dr.",
+    "dr.med.",
+    "dr.philos.",
+    "dr.psychol.",
+    "dss.",
+    "dvs.",
+    "e.Kr.",
+    "e.l.",
+    "eg.",
+    "eig.",
+    "ekskl.",
+    "el.",
+    "et.",
+    "etc.",
+    "etg.",
+    "ev.",
+    "evt.",
+    "f.",
+    "f.Kr.",
+    "f.eks.",
+    "f.o.m.",
+    "fhv.",
+    "fk.",
+    "foreg.",
+    "fork.",
+    "fv.",
+    "fvt.",
+    "g.",
+    "gl.",
+    "gno.",
+    "gnr.",
+    "grl.",
+    "gt.",
+    "h.r.adv.",
+    "hhv.",
+    "hoh.",
+    "hr.",
+    "ifb.",
+    "ifm.",
+    "iht.",
+    "inkl.",
+    "istf.",
+    "jf.",
+    "jr.",
+    "jul.",
+    "juris.",
+    "kfr.",
+    "kgl.",
+    "kgl.res.",
+    "kl.",
+    "komm.",
+    "kr.",
+    "kst.",
+    "lat.",
+    "lø.",
+    "m.a.",
+    "m.a.o.",
+    "m.fl.",
+    "m.m.",
+    "m.v.",
+    "ma.",
+    "mag.art.",
+    "md.",
+    "mfl.",
+    "mht.",
+    "mill.",
+    "min.",
+    "mnd.",
+    "moh.",
+    "mrd.",
+    "muh.",
+    "mv.",
+    "mva.",
+    "n.å.",
+    "ndf.",
+    "nr.",
+    "nto.",
+    "nyno.",
+    "o.a.",
+    "o.l.",
+    "obl.",
+    "off.",
+    "ofl.",
+    "on.",
+    "op.",
+    "org.",
+    "osv.",
+    "ovf.",
+    "p.",
+    "p.a.",
+    "p.g.a.",
+    "p.m.",
+    "p.t.",
+    "pga.",
+    "ph.d.",
+    "pkt.",
+    "pr.",
+    "pst.",
+    "pt.",
+    "red.anm.",
+    "ref.",
+    "res.",
+    "res.kap.",
+    "resp.",
+    "rv.",
+    "s.",
+    "s.d.",
+    "s.k.",
+    "s.u.",
+    "s.å.",
+    "sen.",
+    "sep.",
+    "siviling.",
+    "sms.",
+    "snr.",
+    "spm.",
+    "sr.",
+    "sst.",
+    "st.",
+    "st.meld.",
+    "st.prp.",
+    "stip.",
+    "stk.",
+    "stud.",
+    "sv.",
+    "såk.",
+    "sø.",
+    "t.d.",
+    "t.h.",
+    "t.o.m.",
+    "t.v.",
+    "temp.",
+    "ti.",
+    "tils.",
+    "tilsv.",
+    "tl;dr",
+    "tlf.",
+    "to.",
+    "ult.",
+    "utg.",
+    "v.",
+    "vedk.",
+    "vedr.",
+    "vg.",
+    "vgs.",
+    "vha.",
+    "vit.ass.",
+    "vn.",
+    "vol.",
+    "vs.",
+    "vsa.",
+    "§§",
+    "©NTB",
+    "årg.",
+    "årh.",
+]:
+    _exc[orth] = [{ORTH: orth}]
+
+# Dates
+for h in range(1, 31 + 1):
+    for period in ["."]:
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
+
+_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
+_exc.update(_custom_base_exc)
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -1683,6 +1683,12 @@ class Language:
         for proc in procs:
             proc.start()

+        # Close writing-end of channels. This is needed to avoid that reading
+        # from the channel blocks indefinitely when the worker closes the
+        # channel.
+        for tx in bytedocs_send_ch:
+            tx.close()
+
         # Cycle channels not to break the order of docs.
         # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
         byte_tuples = chain.from_iterable(
@@ -1705,8 +1711,23 @@ class Language:
                     # tell `sender` that one batch was consumed.
                     sender.step()
         finally:
+            # If we are stopping in an orderly fashion, the workers' queues
+            # are empty. Put the sentinel in their queues to signal that work
+            # is done, so that they can exit gracefully.
+            for q in texts_q:
+                q.put(_WORK_DONE_SENTINEL)
+
+            # Otherwise, we are stopping because the error handler raised an
+            # exception. The sentinel will be last to go out of the queue.
+            # To avoid doing unnecessary work or hanging on platforms that
+            # block on sending (Windows), we'll close our end of the channel.
+            # This signals to the worker that it can exit the next time it
+            # attempts to send data down the channel.
+            for r in bytedocs_recv_ch:
+                r.close()
+
             for proc in procs:
-                proc.terminate()
+                proc.join()

     def _link_components(self) -> None:
         """Register 'listeners' within pipeline components, to allow them to
@@ -2323,6 +2344,11 @@ def _apply_pipes(
     while True:
         try:
             texts_with_ctx = receiver.get()
+
+            # Stop working if we encounter the end-of-work sentinel.
+            if isinstance(texts_with_ctx, _WorkDoneSentinel):
+                return
+
             docs = (
                 ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
             )
@@ -2331,11 +2357,21 @@ def _apply_pipes(
             # Connection does not accept unpickable objects, so send list.
             byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
             padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
-            sender.send(byte_docs + padding)  # type: ignore[operator]
+            data: Sequence[Tuple[Optional[bytes], Optional[Any], Optional[bytes]]] = (
+                byte_docs + padding  # type: ignore[operator]
+            )
         except Exception:
             error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
             padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
-            sender.send(error_msg + padding)
+            data = error_msg + padding
+
+        try:
+            sender.send(data)
+        except BrokenPipeError:
+            # Parent has closed the pipe prematurely. This happens when a
+            # worker encounters an error and the error handler is set to
+            # stop processing.
+            return


 class _Sender:
@@ -2365,3 +2401,10 @@ class _Sender:
         if self.count >= self.chunk_size:
             self.count = 0
             self.send()
+
+
+class _WorkDoneSentinel:
+    pass
+
+
+_WORK_DONE_SENTINEL = _WorkDoneSentinel()
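The sentinel is what lets the parent call proc.join() instead of proc.terminate(): each worker drains its queue, sees the sentinel, and returns on its own. A condensed, standalone sketch of the pattern (hypothetical example, not spaCy code):

    import multiprocessing as mp


    class WorkDoneSentinel:
        pass


    WORK_DONE = WorkDoneSentinel()


    def worker(q):
        while True:
            item = q.get()
            if isinstance(item, WorkDoneSentinel):
                return  # exit gracefully instead of being terminated
            # ... process item ...


    if __name__ == "__main__":
        q = mp.Queue()
        proc = mp.Process(target=worker, args=(q,))
        proc.start()
        q.put("some work")
        q.put(WORK_DONE)  # signal: no more work is coming
        proc.join()  # safe: the worker exits on its own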
@@ -3,4 +3,4 @@ from .levenshtein import levenshtein
 from .matcher import Matcher
 from .phrasematcher import PhraseMatcher

-__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
+__all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"]
@ -1,21 +1,27 @@
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import List, Optional, cast
|
from typing import List, Optional, Tuple, cast
|
||||||
|
|
||||||
from thinc.api import (
|
from thinc.api import (
|
||||||
Dropout,
|
Dropout,
|
||||||
|
Gelu,
|
||||||
LayerNorm,
|
LayerNorm,
|
||||||
Linear,
|
Linear,
|
||||||
Logistic,
|
Logistic,
|
||||||
Maxout,
|
Maxout,
|
||||||
Model,
|
Model,
|
||||||
ParametricAttention,
|
ParametricAttention,
|
||||||
|
ParametricAttention_v2,
|
||||||
Relu,
|
Relu,
|
||||||
Softmax,
|
Softmax,
|
||||||
SparseLinear,
|
SparseLinear,
|
||||||
|
SparseLinear_v2,
|
||||||
chain,
|
chain,
|
||||||
clone,
|
clone,
|
||||||
concatenate,
|
concatenate,
|
||||||
list2ragged,
|
list2ragged,
|
||||||
|
reduce_first,
|
||||||
|
reduce_last,
|
||||||
|
reduce_max,
|
||||||
reduce_mean,
|
reduce_mean,
|
||||||
reduce_sum,
|
reduce_sum,
|
||||||
residual,
|
residual,
|
||||||
|
@ -25,9 +31,10 @@ from thinc.api import (
|
||||||
)
|
)
|
||||||
from thinc.layers.chain import init as init_chain
|
from thinc.layers.chain import init as init_chain
|
||||||
from thinc.layers.resizable import resize_linear_weighted, resize_model
|
from thinc.layers.resizable import resize_linear_weighted, resize_model
|
||||||
from thinc.types import Floats2d
|
from thinc.types import ArrayXd, Floats2d
|
||||||
|
|
||||||
from ...attrs import ORTH
|
from ...attrs import ORTH
|
||||||
|
from ...errors import Errors
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ..extract_ngrams import extract_ngrams
|
from ..extract_ngrams import extract_ngrams
|
||||||
|
@@ -47,39 +54,15 @@ def build_simple_cnn_text_classifier(
     outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
     is applied instead, so that outputs are in the range [0, 1].
     """
-    fill_defaults = {"b": 0, "W": 0}
-    with Model.define_operators({">>": chain}):
-        cnn = tok2vec >> list2ragged() >> reduce_mean()
-        nI = tok2vec.maybe_get_dim("nO")
-        if exclusive_classes:
-            output_layer = Softmax(nO=nO, nI=nI)
-            fill_defaults["b"] = NEG_VALUE
-            resizable_layer: Model = resizable(
-                output_layer,
-                resize_layer=partial(
-                    resize_linear_weighted, fill_defaults=fill_defaults
-                ),
-            )
-            model = cnn >> resizable_layer
-        else:
-            output_layer = Linear(nO=nO, nI=nI)
-            resizable_layer = resizable(
-                output_layer,
-                resize_layer=partial(
-                    resize_linear_weighted, fill_defaults=fill_defaults
-                ),
-            )
-            model = cnn >> resizable_layer >> Logistic()
-        model.set_ref("output_layer", output_layer)
-        model.attrs["resize_output"] = partial(
-            resize_and_set_ref,
-            resizable_layer=resizable_layer,
-        )
-    model.set_ref("tok2vec", tok2vec)
-    if nO is not None:
-        model.set_dim("nO", cast(int, nO))
-    model.attrs["multi_label"] = not exclusive_classes
-    return model
+    return build_reduce_text_classifier(
+        tok2vec=tok2vec,
+        exclusive_classes=exclusive_classes,
+        use_reduce_first=False,
+        use_reduce_last=False,
+        use_reduce_max=False,
+        use_reduce_mean=True,
+        nO=nO,
+    )
 
 
 def resize_and_set_ref(model, new_nO, resizable_layer):
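Note: `spacy.TextCatCNN.v2` now delegates to the new `spacy.TextCatReduce.v1` with only the mean reduction enabled. A minimal sketch of the equivalent config, assuming a blank English pipeline (the HashEmbedCNN values below are the stock defaults, not part of this diff):

    import spacy
    from thinc.api import Config

    cnn_equivalent_cfg = """
    [model]
    @architectures = "spacy.TextCatReduce.v1"
    exclusive_classes = true
    use_reduce_first = false
    use_reduce_last = false
    use_reduce_max = false
    use_reduce_mean = true

    [model.tok2vec]
    @architectures = "spacy.HashEmbedCNN.v2"
    pretrained_vectors = null
    width = 96
    depth = 4
    embed_size = 2000
    window_size = 1
    maxout_pieces = 3
    subword_features = true
    """

    nlp = spacy.blank("en")
    # Mean pooling over tok2vec output mirrors the old TextCatCNN behavior.
    textcat = nlp.add_pipe("textcat", config=Config().from_str(cnn_equivalent_cfg))
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")
    nlp.initialize()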
@@ -95,10 +78,48 @@ def build_bow_text_classifier(
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    return _build_bow_text_classifier(
+        exclusive_classes=exclusive_classes,
+        ngram_size=ngram_size,
+        no_output_layer=no_output_layer,
+        nO=nO,
+        sparse_linear=SparseLinear(nO=nO),
+    )
+
+
+@registry.architectures("spacy.TextCatBOW.v3")
+def build_bow_text_classifier_v3(
+    exclusive_classes: bool,
+    ngram_size: int,
+    no_output_layer: bool,
+    length: int = 262144,
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    if length < 1:
+        raise ValueError(Errors.E1056.format(length=length))
+
+    # Find k such that 2**(k-1) < length <= 2**k.
+    length = 2 ** (length - 1).bit_length()
+
+    return _build_bow_text_classifier(
+        exclusive_classes=exclusive_classes,
+        ngram_size=ngram_size,
+        no_output_layer=no_output_layer,
+        nO=nO,
+        sparse_linear=SparseLinear_v2(nO=nO, length=length),
+    )
+
+
+def _build_bow_text_classifier(
+    exclusive_classes: bool,
+    ngram_size: int,
+    no_output_layer: bool,
+    sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
+    nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
     fill_defaults = {"b": 0, "W": 0}
     with Model.define_operators({">>": chain}):
-        sparse_linear = SparseLinear(nO=nO)
         output_layer = None
         if not no_output_layer:
             fill_defaults["b"] = NEG_VALUE
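The rounding comment above can be checked directly. A quick sketch of the invariant (`round_up_pow2` is a hypothetical helper name, mirroring the expression used in `build_bow_text_classifier_v3`):

    def round_up_pow2(length: int) -> int:
        # Round the requested hash-table length up to the next power of two.
        return 2 ** (length - 1).bit_length()

    assert round_up_pow2(1) == 1
    assert round_up_pow2(3) == 4
    assert round_up_pow2(262144) == 262144  # 2**18 stays unchanged
    assert round_up_pow2(262145) == 524288  # rounded up to 2**19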
@@ -127,6 +148,9 @@ def build_text_classifier_v2(
     linear_model: Model[List[Doc], Floats2d],
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
+    # TODO: build the model with _build_parametric_attention_with_residual_nonlinear
+    # in spaCy v4. We don't do this in spaCy v3 to preserve model
+    # compatibility.
     exclusive_classes = not linear_model.attrs["multi_label"]
     with Model.define_operators({">>": chain, "|": concatenate}):
         width = tok2vec.maybe_get_dim("nO")
@@ -190,3 +214,145 @@ def build_text_classifier_lowdata(
         model = model >> Dropout(dropout)
         model = model >> Logistic()
     return model
+
+
+@registry.architectures("spacy.TextCatParametricAttention.v1")
+def build_textcat_parametric_attention_v1(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    exclusive_classes: bool,
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    width = tok2vec.maybe_get_dim("nO")
+    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
+        tok2vec=tok2vec,
+        nonlinear_layer=Maxout(nI=width, nO=width),
+        key_transform=Gelu(nI=width, nO=width),
+    )
+    with Model.define_operators({">>": chain}):
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO)
+        else:
+            output_layer = Linear(nO=nO) >> Logistic()
+        model = parametric_attention >> output_layer
+    if model.has_dim("nO") is not False and nO is not None:
+        model.set_dim("nO", cast(int, nO))
+    model.set_ref("output_layer", output_layer)
+    model.attrs["multi_label"] = not exclusive_classes
+
+    return model
+
+
+def _build_parametric_attention_with_residual_nonlinear(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    nonlinear_layer: Model[Floats2d, Floats2d],
+    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
+) -> Model[List[Doc], Floats2d]:
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.maybe_get_dim("nO")
+        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
+        norm_layer = LayerNorm(nI=width)
+        parametric_attention = (
+            tok2vec
+            >> list2ragged()
+            >> attention_layer
+            >> reduce_sum()
+            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
+        )
+
+        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
+
+        parametric_attention.set_ref("tok2vec", tok2vec)
+        parametric_attention.set_ref("attention_layer", attention_layer)
+        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
+        parametric_attention.set_ref("norm_layer", norm_layer)
+
+    return parametric_attention
+
+
+def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
+    tok2vec_width = get_tok2vec_width(model)
+    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
+    init_chain(model, X, Y)
+    return model
+
+
+@registry.architectures("spacy.TextCatReduce.v1")
+def build_reduce_text_classifier(
+    tok2vec: Model,
+    exclusive_classes: bool,
+    use_reduce_first: bool,
+    use_reduce_last: bool,
+    use_reduce_max: bool,
+    use_reduce_mean: bool,
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    """Build a model that classifies pooled `Doc` representations.
+
+    Pooling is performed using reductions. Reductions are concatenated when
+    multiple reductions are used.
+
+    tok2vec (Model): the tok2vec layer to pool over.
+    exclusive_classes (bool): Whether or not classes are mutually exclusive.
+    use_reduce_first (bool): Pool by using the hidden representation of the
+        first token of a `Doc`.
+    use_reduce_last (bool): Pool by using the hidden representation of the
+        last token of a `Doc`.
+    use_reduce_max (bool): Pool by taking the maximum values of the hidden
+        representations of a `Doc`.
+    use_reduce_mean (bool): Pool by taking the mean of all hidden
+        representations of a `Doc`.
+    nO (Optional[int]): Number of classes.
+    """
+
+    fill_defaults = {"b": 0, "W": 0}
+    reductions = []
+    if use_reduce_first:
+        reductions.append(reduce_first())
+    if use_reduce_last:
+        reductions.append(reduce_last())
+    if use_reduce_max:
+        reductions.append(reduce_max())
+    if use_reduce_mean:
+        reductions.append(reduce_mean())
+
+    if not len(reductions):
+        raise ValueError(Errors.E1057)
+
+    with Model.define_operators({">>": chain}):
+        cnn = tok2vec >> list2ragged() >> concatenate(*reductions)
+        nO_tok2vec = tok2vec.maybe_get_dim("nO")
+        nI = nO_tok2vec * len(reductions) if nO_tok2vec is not None else None
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO, nI=nI)
+            fill_defaults["b"] = NEG_VALUE
+            resizable_layer: Model = resizable(
+                output_layer,
+                resize_layer=partial(
+                    resize_linear_weighted, fill_defaults=fill_defaults
+                ),
+            )
+            model = cnn >> resizable_layer
+        else:
+            output_layer = Linear(nO=nO, nI=nI)
+            resizable_layer = resizable(
+                output_layer,
+                resize_layer=partial(
+                    resize_linear_weighted, fill_defaults=fill_defaults
+                ),
+            )
+            model = cnn >> resizable_layer >> Logistic()
+        model.set_ref("output_layer", output_layer)
+        model.attrs["resize_output"] = partial(
+            resize_and_set_ref,
+            resizable_layer=resizable_layer,
+        )
+    model.set_ref("tok2vec", tok2vec)
+    if nO is not None:
+        model.set_dim("nO", cast(int, nO))
+    model.attrs["multi_label"] = not exclusive_classes
+    return model
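A minimal usage sketch for the new parametric-attention architecture, resolving it from the registry. It assumes the stock `DEFAULT_TOK2VEC_MODEL` config from `spacy.pipeline.tok2vec`; `nO=3` is an arbitrary illustration value:

    from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
    from spacy.util import registry

    cfg = {
        "model": {
            "@architectures": "spacy.TextCatParametricAttention.v1",
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
            "exclusive_classes": True,
            "nO": 3,
        }
    }
    # registry.resolve() builds the Thinc model described by the config block.
    model = registry.resolve(cfg)["model"]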
@@ -22,6 +22,7 @@ from .trainable_pipe import TrainablePipe
 __all__ = [
     "AttributeRuler",
     "DependencyParser",
+    "EditTreeLemmatizer",
     "EntityLinker",
     "EntityRecognizer",
     "EntityRuler",
@@ -29,7 +29,7 @@ cdef class StateClass:
         return [self.B(i) for i in range(self.c.buffer_length())]
 
     @property
-    def token_vector_lenth(self):
+    def token_vector_length(self):
         return self.doc.tensor.shape[1]
 
     @property
@@ -36,8 +36,9 @@ maxout_pieces = 3
 depth = 2
 
 [model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """

@@ -45,16 +46,21 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
 
 single_label_bow_config = """
 [model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """
 
 single_label_cnn_config = """
 [model]
-@architectures = "spacy.TextCatCNN.v2"
+@architectures = "spacy.TextCatReduce.v1"
 exclusive_classes = true
+use_reduce_first = false
+use_reduce_last = false
+use_reduce_max = false
+use_reduce_mean = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -35,8 +35,9 @@ maxout_pieces = 3
 depth = 2
 
 [model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """

@@ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod
 
 multi_label_bow_config = """
 [model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false

@@ -52,8 +53,12 @@ no_output_layer = false
 
 multi_label_cnn_config = """
 [model]
-@architectures = "spacy.TextCatCNN.v2"
+@architectures = "spacy.TextCatReduce.v1"
 exclusive_classes = false
+use_reduce_first = false
+use_reduce_last = false
+use_reduce_max = false
+use_reduce_mean = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
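Since the v2 builders stay registered alongside the new ones, configs written against the older architecture names keep loading. A quick compatibility check:

    from spacy.util import registry

    for name in (
        "spacy.TextCatBOW.v2",
        "spacy.TextCatBOW.v3",
        "spacy.TextCatCNN.v2",
        "spacy.TextCatReduce.v1",
    ):
        registry.architectures.get(name)  # raises if the name is unregistered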
spacy/scorer.py (138 changed lines)

@@ -802,6 +802,140 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     }
 
 
+# The following implementation of trapezoid() is adapted from SciPy,
+# which is distributed under the New BSD License.
+# Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
+# See licenses/3rd_party_licenses.txt
+def trapezoid(y, x=None, dx=1.0, axis=-1):
+    r"""
+    Integrate along the given axis using the composite trapezoidal rule.
+
+    If `x` is provided, the integration happens in sequence along its
+    elements - they are not sorted.
+
+    Integrate `y` (`x`) along each 1d slice on the given axis, compute
+    :math:`\int y(x) dx`.
+    When `x` is specified, this integrates along the parametric curve,
+    computing :math:`\int_t y(t) dt =
+    \int_t y(t) \left.\frac{dx}{dt}\right|_{x=x(t)} dt`.
+
+    Parameters
+    ----------
+    y : array_like
+        Input array to integrate.
+    x : array_like, optional
+        The sample points corresponding to the `y` values. If `x` is None,
+        the sample points are assumed to be evenly spaced `dx` apart. The
+        default is None.
+    dx : scalar, optional
+        The spacing between sample points when `x` is None. The default is 1.
+    axis : int, optional
+        The axis along which to integrate.
+
+    Returns
+    -------
+    trapezoid : float or ndarray
+        Definite integral of `y` = n-dimensional array as approximated along
+        a single axis by the trapezoidal rule. If `y` is a 1-dimensional array,
+        then the result is a float. If `n` is greater than 1, then the result
+        is an `n`-1 dimensional array.
+
+    See Also
+    --------
+    cumulative_trapezoid, simpson, romb
+
+    Notes
+    -----
+    Image [2]_ illustrates trapezoidal rule -- y-axis locations of points
+    will be taken from `y` array, by default x-axis distances between
+    points will be 1.0, alternatively they can be provided with `x` array
+    or with `dx` scalar. Return value will be equal to combined area under
+    the red lines.
+
+    References
+    ----------
+    .. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule
+
+    .. [2] Illustration image:
+           https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png
+
+    Examples
+    --------
+    Use the trapezoidal rule on evenly spaced points:
+
+    >>> import numpy as np
+    >>> from scipy import integrate
+    >>> integrate.trapezoid([1, 2, 3])
+    4.0
+
+    The spacing between sample points can be selected by either the
+    ``x`` or ``dx`` arguments:
+
+    >>> integrate.trapezoid([1, 2, 3], x=[4, 6, 8])
+    8.0
+    >>> integrate.trapezoid([1, 2, 3], dx=2)
+    8.0
+
+    Using a decreasing ``x`` corresponds to integrating in reverse:
+
+    >>> integrate.trapezoid([1, 2, 3], x=[8, 6, 4])
+    -8.0
+
+    More generally ``x`` is used to integrate along a parametric curve. We can
+    estimate the integral :math:`\int_0^1 x^2 = 1/3` using:
+
+    >>> x = np.linspace(0, 1, num=50)
+    >>> y = x**2
+    >>> integrate.trapezoid(y, x)
+    0.33340274885464394
+
+    Or estimate the area of a circle, noting we repeat the sample which closes
+    the curve:
+
+    >>> theta = np.linspace(0, 2 * np.pi, num=1000, endpoint=True)
+    >>> integrate.trapezoid(np.cos(theta), x=np.sin(theta))
+    3.141571941375841
+
+    ``trapezoid`` can be applied along a specified axis to do multiple
+    computations in one call:
+
+    >>> a = np.arange(6).reshape(2, 3)
+    >>> a
+    array([[0, 1, 2],
+           [3, 4, 5]])
+    >>> integrate.trapezoid(a, axis=0)
+    array([1.5, 2.5, 3.5])
+    >>> integrate.trapezoid(a, axis=1)
+    array([2., 8.])
+    """
+    y = np.asanyarray(y)
+    if x is None:
+        d = dx
+    else:
+        x = np.asanyarray(x)
+        if x.ndim == 1:
+            d = np.diff(x)
+            # reshape to correct shape
+            shape = [1] * y.ndim
+            shape[axis] = d.shape[0]
+            d = d.reshape(shape)
+        else:
+            d = np.diff(x, axis=axis)
+    nd = y.ndim
+    slice1 = [slice(None)] * nd
+    slice2 = [slice(None)] * nd
+    slice1[axis] = slice(1, None)
+    slice2[axis] = slice(None, -1)
+    try:
+        ret = (d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0).sum(axis)
+    except ValueError:
+        # Operations didn't work, cast to ndarray
+        d = np.asarray(d)
+        y = np.asarray(y)
+        ret = np.add.reduce(d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0, axis)
+    return ret
+
+
 # The following implementation of roc_auc_score() is adapted from
 # scikit-learn, which is distributed under the New BSD License.
 # Copyright (c) 2007–2019 The scikit-learn developers.

@@ -1024,9 +1158,9 @@ def _auc(x, y):
     else:
         raise ValueError(Errors.E164.format(x=x))
 
-    area = direction * np.trapz(y, x)
+    area = direction * trapezoid(y, x)
     if isinstance(area, np.memmap):
-        # Reductions such as .sum used internally in np.trapz do not return a
+        # Reductions such as .sum used internally in trapezoid do not return a
         # scalar by default for numpy.memmap instances contrary to
         # regular numpy.ndarray instances.
        area = area.dtype.type(area)
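Background: `np.trapz` is deprecated in NumPy 2.0 (renamed to `numpy.trapezoid`), which is presumably why a SciPy-derived `trapezoid` is vendored here instead. A quick numeric check of the composite rule on an ROC-style curve, written out explicitly:

    import numpy as np

    fpr = np.array([0.0, 0.25, 0.5, 1.0])
    tpr = np.array([0.0, 0.75, 0.9, 1.0])
    # Sum of trapezoid areas between consecutive sample points.
    area = float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))
    print(area)  # 0.775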
@@ -162,6 +162,11 @@ def fi_tokenizer():
     return get_lang_class("fi")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def fo_tokenizer():
+    return get_lang_class("fo")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def fr_tokenizer():
     return get_lang_class("fr")().tokenizer

@@ -317,6 +322,11 @@ def nl_tokenizer():
     return get_lang_class("nl")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def nn_tokenizer():
+    return get_lang_class("nn")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def pl_tokenizer():
     return get_lang_class("pl")().tokenizer
@@ -731,3 +731,12 @@ def test_for_no_ent_sents():
     sents = list(doc.ents[0].sents)
     assert len(sents) == 1
     assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
+
+
+def test_span_api_richcmp_other(en_tokenizer):
+    doc1 = en_tokenizer("a b")
+    doc2 = en_tokenizer("b c")
+    assert not doc1[1:2] == doc1[1]
+    assert not doc1[1:2] == doc2[0]
+    assert not doc1[1:2] == doc2[0:1]
+    assert not doc1[0:1] == doc2
@@ -294,3 +294,12 @@ def test_missing_head_dep(en_vocab):
     assert aligned_heads[0] == ref_heads[0]
     assert aligned_deps[5] == ref_deps[5]
     assert aligned_heads[5] == ref_heads[5]
+
+
+def test_token_api_richcmp_other(en_tokenizer):
+    doc1 = en_tokenizer("a b")
+    doc2 = en_tokenizer("b c")
+    assert not doc1[1] == doc1[0:1]
+    assert not doc1[1] == doc2[1:2]
+    assert not doc1[1] == doc2[0]
+    assert not doc1[0] == doc2
spacy/tests/lang/fo/__init__.py (new file, 0 lines)
spacy/tests/lang/fo/test_tokenizer.py (new file, 26 lines)

@@ -0,0 +1,26 @@
+import pytest
+
+# examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
+# fmt: off
+FO_TOKEN_EXCEPTION_TESTS = [
+    (
+        "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ",
+        [
+            "Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".",
+        ],
+    ),
+    (
+        "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.",
+        [
+            "Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".",
+        ],
+    ),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS)
+def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens):
+    tokens = fo_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
spacy/tests/lang/nn/__init__.py (new file, 0 lines)
spacy/tests/lang/nn/test_tokenizer.py (new file, 38 lines)

@@ -0,0 +1,38 @@
+import pytest
+
+# examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
+# fmt: off
+NN_TOKEN_EXCEPTION_TESTS = [
+    (
+        "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.",
+        [
+            "Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".",
+        ],
+    ),
+    (
+        "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.",
+        [
+            "Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".",
+        ],
+    ),
+    (
+        "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.",
+        [
+            "Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".",
+        ],
+    ),
+    (
+        "Brukssesongen er frå nov. til mai, med ein topp i mars.",
+        [
+            "Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".",
+        ],
+    ),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS)
+def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens):
+    tokens = nn_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
@@ -203,7 +203,7 @@ def test_pipe_class_component_model():
             "@architectures": "spacy.TextCatEnsemble.v2",
             "tok2vec": DEFAULT_TOK2VEC_MODEL,
             "linear_model": {
-                "@architectures": "spacy.TextCatBOW.v2",
+                "@architectures": "spacy.TextCatBOW.v3",
                 "exclusive_classes": False,
                 "ngram_size": 1,
                 "no_output_layer": False,
@@ -414,7 +414,7 @@ def test_implicit_label(name, get_examples):
 @pytest.mark.parametrize(
     "name,textcat_config",
     [
-        # BOW
+        # BOW V1
         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),

@@ -451,14 +451,14 @@ def test_no_resize(name, textcat_config):
 @pytest.mark.parametrize(
     "name,textcat_config",
     [
-        # BOW
-        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
-        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
+        # BOW V3
+        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
+        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
         # CNN
-        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on

@@ -480,14 +480,14 @@ def test_resize(name, textcat_config):
 @pytest.mark.parametrize(
     "name,textcat_config",
     [
-        # BOW
-        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
-        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # CNN
-        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # BOW v3
+        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
+        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
+        # REDUCE
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on

@@ -693,12 +693,23 @@ def test_overfitting_IO_multi():
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
+        # BOW V3
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
         # ENSEMBLE V2
-        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
-        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2
+        # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # PARAMETRIC ATTENTION V1
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # REDUCE V1
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer):
     Token.set_extension("_test_token", default="t0")
     doc[1]._._test_token = "t1"
 
-    return doc
+    yield doc
+
+    Doc.remove_extension("_test_attr")
+    Doc.remove_extension("_test_prop")
+    Doc.remove_extension("_test_method")
+    Token.remove_extension("_test_token")
 
 
 def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
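The fixture change above relies on pytest's yield-fixture teardown. Sketch of the general pattern (names are illustrative):

    import pytest

    @pytest.fixture
    def resource():
        handle = {"ready": True}  # setup (illustrative)
        yield handle
        # Everything after the yield runs as teardown, even if the test fails,
        # which is what lets doc_w_attrs unregister its custom extensions.
        handle.clear()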
@@ -1061,3 +1061,8 @@ def test_debug_data_trainable_lemmatizer_not_annotated():
 
     data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
     assert data["no_lemma_annotations"] == 2
+
+
+def test_project_api_imports():
+    from spacy.cli import project_run
+    from spacy.cli.project.run import project_run  # noqa: F401, F811
@@ -214,9 +214,6 @@ def test_project_clone(options):
     assert (out / "README.md").is_file()
 
 
-@pytest.mark.skipif(
-    sys.version_info >= (3, 12), reason="Python 3.12+ not supported for remotes"
-)
 def test_project_push_pull(project_dir):
     proj = dict(SAMPLE_PROJECT)
     remote = "xyz"
|
||||||
|
|
||||||
def test_find_function_valid():
|
def test_find_function_valid():
|
||||||
# example of architecture in main code base
|
# example of architecture in main code base
|
||||||
function = "spacy.TextCatBOW.v2"
|
function = "spacy.TextCatBOW.v3"
|
||||||
result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
|
result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
|
||||||
assert f"Found registered function '{function}'" in result.stdout
|
assert f"Found registered function '{function}'" in result.stdout
|
||||||
assert "textcat.py" in result.stdout
|
assert "textcat.py" in result.stdout
|
||||||
|
@ -260,7 +257,7 @@ def test_find_function_valid():
|
||||||
|
|
||||||
def test_find_function_invalid():
|
def test_find_function_invalid():
|
||||||
# invalid registry
|
# invalid registry
|
||||||
function = "spacy.TextCatBOW.v2"
|
function = "spacy.TextCatBOW.v3"
|
||||||
registry = "foobar"
|
registry = "foobar"
|
||||||
result = CliRunner().invoke(
|
result = CliRunner().invoke(
|
||||||
app, ["find-function", function, "--registry", registry]
|
app, ["find-function", function, "--registry", registry]
|
||||||
|
|
|
@@ -2,7 +2,7 @@ import numpy
 import pytest
 
 from spacy import displacy
-from spacy.displacy.render import DependencyRenderer, EntityRenderer
+from spacy.displacy.render import DependencyRenderer, EntityRenderer, SpanRenderer
 from spacy.lang.en import English
 from spacy.lang.fa import Persian
 from spacy.tokens import Doc, Span

@@ -468,3 +468,23 @@ def test_issue12816(en_vocab) -> None:
     # Verify that the HTML tag is still escaped
     html = displacy.render(doc, style="span")
     assert "&lt;TEST&gt;" in html
+
+
+@pytest.mark.issue(13056)
+def test_displacy_span_stacking():
+    """Test whether span stacking works properly for multiple overlapping spans."""
+    spans = [
+        {"start_token": 2, "end_token": 5, "label": "SkillNC"},
+        {"start_token": 0, "end_token": 2, "label": "Skill"},
+        {"start_token": 1, "end_token": 3, "label": "Skill"},
+    ]
+    tokens = ["Welcome", "to", "the", "Bank", "of", "China", "."]
+    per_token_info = SpanRenderer._assemble_per_token_info(spans=spans, tokens=tokens)
+
+    assert len(per_token_info) == len(tokens)
+    assert all([len(per_token_info[i]["entities"]) == 1 for i in (0, 3, 4)])
+    assert all([len(per_token_info[i]["entities"]) == 2 for i in (1, 2)])
+    assert per_token_info[1]["entities"][0]["render_slot"] == 1
+    assert per_token_info[1]["entities"][1]["render_slot"] == 2
+    assert per_token_info[2]["entities"][0]["render_slot"] == 2
+    assert per_token_info[2]["entities"][1]["render_slot"] == 3
@@ -376,8 +376,9 @@ def test_util_dot_section():
     factory = "textcat"
 
     [components.textcat.model]
-    @architectures = "spacy.TextCatBOW.v2"
+    @architectures = "spacy.TextCatBOW.v3"
     exclusive_classes = true
+    length = 262144
     ngram_size = 1
     no_output_layer = false
     """

@@ -485,8 +486,8 @@ def test_to_ternary_int():
 
 def test_find_available_port():
     host = "0.0.0.0"
-    port = 5000
-    assert find_available_port(port, host) == port, "Port 5000 isn't free"
+    port = 5001
+    assert find_available_port(port, host) == port, "Port 5001 isn't free"
 
     from wsgiref.simple_server import demo_app, make_server
@@ -26,6 +26,7 @@ from spacy.ml.models import (
     build_Tok2Vec_model,
 )
 from spacy.ml.staticvectors import StaticVectors
+from spacy.util import registry
 
 
 def get_textcat_bow_kwargs():

@@ -284,3 +285,17 @@ def test_spancat_model_forward_backward(nO=5):
     Y, backprop = model((docs, spans), is_train=True)
     assert Y.shape == (spans.dataXd.shape[0], nO)
     backprop(Y)
+
+
+def test_textcat_reduce_invalid_args():
+    textcat_reduce = registry.architectures.get("spacy.TextCatReduce.v1")
+    tok2vec = make_test_tok2vec()
+    with pytest.raises(ValueError, match=r"must be used with at least one reduction"):
+        textcat_reduce(
+            tok2vec=tok2vec,
+            exclusive_classes=False,
+            use_reduce_first=False,
+            use_reduce_last=False,
+            use_reduce_max=False,
+            use_reduce_mean=False,
+        )
@@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
     assert tokens == explain_tokens
 
 
+def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
+    rules = {":]": [{"ORTH": ":]"}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+    )
+    text = ": ]"
+    tokens = [t.text for t in tokenizer(text)]
+    explain_tokens = [t[1] for t in tokenizer.explain(text)]
+    assert tokens == explain_tokens
+
+
 @hypothesis.strategies.composite
 def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
     """

@@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
     """
 
     tokenizer: Tokenizer = spacy.blank(lang).tokenizer
-    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
+    # Tokenizer.explain is not intended to handle whitespace or control
+    # characters in the same way as Tokenizer
+    sentence = re.sub(r"\s+", " ", sentence).strip()
+    tokens = [t.text for t in tokenizer(sentence)]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
@@ -730,9 +730,16 @@ cdef class Tokenizer:
             if i in spans_by_start:
                 span = spans_by_start[i]
                 exc = [d[ORTH] for d in special_cases[span.label_]]
-                for j, orth in enumerate(exc):
-                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
-                i += len(span)
+                # The phrase matcher can overmatch for tokens separated by
+                # spaces in the text but not in the underlying rule, so skip
+                # cases where the texts aren't identical
+                if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
+                    final_tokens.append(tokens[i])
+                    i += 1
+                else:
+                    for j, orth in enumerate(exc):
+                        final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                    i += len(span)
             else:
                 final_tokens.append(tokens[i])
                 i += 1
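A sketch of the behavior this guards: a special case whose rule text contains no space can still be phrase-matched across whitespace, and `explain()` previously reported the SPECIAL tokens where the tokenizer itself had not merged them. With the check above, both now agree (the `:]` rule is taken from the new test):

    import spacy

    nlp = spacy.blank("en")
    nlp.tokenizer.add_special_case(":]", [{"ORTH": ":]"}])

    text = ": ]"
    print([t.text for t in nlp.tokenizer(text)])        # the tokenizer's output
    print([t[1] for t in nlp.tokenizer.explain(text)])  # now matches the tokenizer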
@@ -5,4 +5,4 @@ from .span import Span
 from .span_group import SpanGroup
 from .token import Token
 
-__all__ = ["Doc", "Token", "Span", "SpanGroup", "DocBin", "MorphAnalysis"]
+__all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
@@ -42,7 +42,7 @@ class Doc:
     user_hooks: Dict[str, Callable[..., Any]]
     user_token_hooks: Dict[str, Callable[..., Any]]
     user_span_hooks: Dict[str, Callable[..., Any]]
-    tensor: np.ndarray[Any, np.dtype[np.float_]]
+    tensor: np.ndarray[Any, np.dtype[np.float64]]
     user_data: Dict[str, Any]
     has_unknown_spaces: bool
     _context: Any

@@ -125,7 +125,7 @@ class Doc:
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
         span_id: Union[int, str] = ...,
-    ) -> Span: ...
+    ) -> Optional[Span]: ...
     def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
     @property
     def has_vector(self) -> bool: ...

@@ -166,7 +166,7 @@ class Doc:
     ) -> Doc: ...
     def to_array(
         self, py_attr_ids: Union[int, str, List[Union[int, str]]]
-    ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
+    ) -> np.ndarray[Any, np.dtype[np.float64]]: ...
     @staticmethod
     def from_docs(
         docs: List[Doc],

@@ -179,15 +179,13 @@ class Doc:
         self, path: Union[str, Path], *, exclude: Iterable[str] = ...
     ) -> None: ...
     def from_disk(
-        self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str]] = ...
+        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
     ) -> Doc: ...
-    def to_bytes(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
-    def from_bytes(
-        self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str]] = ...
-    ) -> Doc: ...
-    def to_dict(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
+    def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
+    def from_bytes(self, bytes_data: bytes, *, exclude: Iterable[str] = ...) -> Doc: ...
+    def to_dict(self, *, exclude: Iterable[str] = ...) -> Dict[str, Any]: ...
     def from_dict(
-        self, msg: bytes, *, exclude: Union[List[str], Tuple[str]] = ...
+        self, msg: Dict[str, Any], *, exclude: Iterable[str] = ...
     ) -> Doc: ...
     def extend_tensor(self, tensor: Floats2d) -> None: ...
     def retokenize(self) -> Retokenizer: ...
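Context for the stub change: NumPy 2.0 removes the `np.float_` alias, so the annotations pin `np.float64` explicitly. A minimal sketch of code that type-checks against the updated stub:

    import numpy as np
    import spacy

    nlp = spacy.blank("en")
    doc = nlp("hello world")
    tensor: np.ndarray = doc.tensor  # annotated as np.dtype[np.float64] in the stub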
@@ -1326,7 +1326,7 @@ cdef class Doc:
 
         path (str / Path): A path to a directory. Paths may be either
             strings or `Path`-like objects.
-        exclude (list): String names of serialization fields to exclude.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
         RETURNS (Doc): The modified `Doc` object.
 
         DOCS: https://spacy.io/api/doc#from_disk

@@ -1339,7 +1339,7 @@ cdef class Doc:
     def to_bytes(self, *, exclude=tuple()):
         """Serialize, i.e. export the document contents to a binary string.
 
-        exclude (list): String names of serialization fields to exclude.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
             all annotations.

@@ -1351,7 +1351,7 @@ cdef class Doc:
         """Deserialize, i.e. import the document contents from a binary string.
 
         data (bytes): The string to load from.
-        exclude (list): String names of serialization fields to exclude.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
         RETURNS (Doc): Itself.
 
         DOCS: https://spacy.io/api/doc#from_bytes

@@ -1361,11 +1361,8 @@ cdef class Doc:
     def to_dict(self, *, exclude=tuple()):
         """Export the document contents to a dictionary for serialization.
 
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
-            all annotations.
-
-        DOCS: https://spacy.io/api/doc#to_bytes
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Dict[str, Any]): A dictionary representation of the `Doc`
         """
         array_head = Doc._get_array_attrs()
         strings = set()

@@ -1411,13 +1408,11 @@ cdef class Doc:
         return util.to_dict(serializers, exclude)
 
     def from_dict(self, msg, *, exclude=tuple()):
-        """Deserialize, i.e. import the document contents from a binary string.
+        """Deserialize the document contents from a dictionary representation.
 
-        data (bytes): The string to load from.
-        exclude (list): String names of serialization fields to exclude.
+        msg (Dict[str, Any]): The dictionary to load from.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
         RETURNS (Doc): Itself.
 
-        DOCS: https://spacy.io/api/doc#from_dict
         """
         if self.length != 0:
             raise ValueError(Errors.E033.format(length=self.length))
@@ -127,14 +127,17 @@ cdef class Span:
         self._vector = vector
         self._vector_norm = vector_norm
 
-    def __richcmp__(self, Span other, int op):
+    def __richcmp__(self, object other, int op):
         if other is None:
             if op == 0 or op == 1 or op == 2:
                 return False
             else:
                 return True
+        if not isinstance(other, Span):
+            return False
+        cdef Span other_span = other
         self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc)
-        other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.id, other.doc)
+        other_tuple = (other_span.c.start_char, other_span.c.end_char, other_span.c.label, other_span.c.kb_id, other_span.id, other_span.doc)
         # <
         if op == 0:
             return self_tuple < other_tuple
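Effect of the signature change, as a rough sketch: with the old `Span`-typed argument, Cython could reject comparisons against other types at the call boundary; the `object` signature plus the `isinstance` check makes cross-type equality simply return `False` (which is what the new `test_span_api_richcmp_other` asserts):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("a b")
    print(doc[0:1] == doc[0])  # False: Span compared with Token
    print(doc[0:1] == "a")     # False: Span compared with str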
@@ -53,7 +53,12 @@ class Token:
     def __bytes__(self) -> bytes: ...
     def __str__(self) -> str: ...
     def __repr__(self) -> str: ...
-    def __richcmp__(self, other: Token, op: int) -> bool: ...
+    def __lt__(self, other: Any) -> bool: ...
+    def __le__(self, other: Any) -> bool: ...
+    def __eq__(self, other: Any) -> bool: ...
+    def __ne__(self, other: Any) -> bool: ...
+    def __gt__(self, other: Any) -> bool: ...
+    def __ge__(self, other: Any) -> bool: ...
     @property
     def _(self) -> Underscore: ...
     def nbor(self, i: int = ...) -> Token: ...
@@ -139,17 +139,20 @@ cdef class Token:
     def __repr__(self):
         return self.__str__()

-    def __richcmp__(self, Token other, int op):
+    def __richcmp__(self, object other, int op):
         # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
         if other is None:
             if op in (0, 1, 2):
                 return False
             else:
                 return True
+        if not isinstance(other, Token):
+            return False
+        cdef Token other_token = other
         cdef Doc my_doc = self.doc
-        cdef Doc other_doc = other.doc
+        cdef Doc other_doc = other_token.doc
         my = self.idx
-        their = other.idx
+        their = other_token.idx
         if op == 0:
             return my < their
         elif op == 2:

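The hunks above change `__richcmp__` to accept any `object`: comparing a `Span` or `Token` against an unrelated type now returns `False` instead of failing on the typed Cython argument. A small sketch of the resulting behavior, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is big")

print(doc[0] == "New")         # False: a str is not a Token
print(doc[0:2] == "New York")  # False: a str is not a Span
print(doc[0:2] == doc[0:2])    # True: same boundaries, label and doc
```
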
@@ -16,3 +16,28 @@ from .iob_utils import (  # noqa: F401
     tags_to_entities,
 )
 from .loggers import console_logger  # noqa: F401
+
+__all__ = [
+    "Alignment",
+    "Corpus",
+    "Example",
+    "JsonlCorpus",
+    "PlainTextCorpus",
+    "biluo_tags_to_offsets",
+    "biluo_tags_to_spans",
+    "biluo_to_iob",
+    "create_copy_from_base_model",
+    "docs_to_json",
+    "dont_augment",
+    "iob_to_biluo",
+    "minibatch_by_padded_size",
+    "minibatch_by_words",
+    "offsets_to_biluo_tags",
+    "orth_variants_augmenter",
+    "read_json_file",
+    "remove_bilu_prefix",
+    "split_bilu_label",
+    "tags_to_entities",
+    "validate_get_examples",
+    "validate_examples",
+]

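Defining `__all__` makes the public surface of `spacy.training` explicit: a star import now exposes exactly the listed names. A quick sketch:

```python
# Only names listed in __all__ are pulled in by a star import:
from spacy.training import *  # noqa: F401,F403

print(Example, Corpus)  # listed in __all__, so available here
# console_logger is imported by the module but not listed in __all__,
# so it must still be imported explicitly:
from spacy.training import console_logger
```
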
@@ -1077,20 +1077,38 @@ def make_tempdir() -> Generator[Path, None, None]:


 def is_in_jupyter() -> bool:
-    """Check if user is running spaCy from a Jupyter notebook by detecting the
-    IPython kernel. Mainly used for the displaCy visualizer.
-    RETURNS (bool): True if in Jupyter, False if not.
+    """Check if user is running spaCy from a Jupyter or Colab notebook by
+    detecting the IPython kernel. Mainly used for the displaCy visualizer.
+    RETURNS (bool): True if in Jupyter/Colab, False if not.
     """
     # https://stackoverflow.com/a/39662359/6400719
+    # https://stackoverflow.com/questions/15411967
     try:
-        shell = get_ipython().__class__.__name__  # type: ignore[name-defined]
-        if shell == "ZMQInteractiveShell":
+        if get_ipython().__class__.__name__ == "ZMQInteractiveShell":  # type: ignore[name-defined]
             return True  # Jupyter notebook or qtconsole
+        if get_ipython().__class__.__module__ == "google.colab._shell":  # type: ignore[name-defined]
+            return True  # Colab notebook
     except NameError:
-        return False  # Probably standard Python interpreter
+        pass  # Probably standard Python interpreter
+    # additional check for Colab
+    try:
+        import google.colab
+
+        return True  # Colab notebook
+    except ImportError:
+        pass
     return False


+def is_in_interactive() -> bool:
+    """Check if user is running spaCy from an interactive Python
+    shell. Will return True in Jupyter notebooks too.
+    RETURNS (bool): True if in interactive mode, False if not.
+    """
+    # https://stackoverflow.com/questions/2356399/tell-if-python-is-in-interactive-mode
+    return hasattr(sys, "ps1") or hasattr(sys, "ps2")
+
+
 def get_object_name(obj: Any) -> str:
     """Get a human-readable name of a Python object, e.g. a pipeline component.

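Both detection tricks are plain Python and easy to verify outside spaCy. An isolated sketch of the same two checks (function names here are illustrative, not the spaCy API):

```python
import sys

def in_interactive_shell() -> bool:
    # The REPL (and IPython) define sys.ps1/sys.ps2; scripts run via
    # `python file.py` do not.
    return hasattr(sys, "ps1") or hasattr(sys, "ps2")

def in_colab() -> bool:
    # Colab environments ship the google.colab package; elsewhere the
    # import fails.
    try:
        import google.colab  # noqa: F401
        return True
    except ImportError:
        return False
```
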
@@ -78,16 +78,16 @@ subword features, and a
 [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer
 consisting of a CNN and a layer-normalized maxout activation function.

 | Name                 | Description |
-| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
 | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
 | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
 | `window_size`        | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
 | `maxout_pieces`      | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
 | `subword_features`   | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
 | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
 | **CREATES**          | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

 ### spacy.Tok2VecListener.v1 {id="Tok2VecListener"}

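The receptive-field claim in the `window_size` row is easy to sanity-check with a one-line worked example:

```python
# The top CNN layer sees depth * window_size * 2 + 1 tokens:
depth, window_size = 4, 2
print(depth * window_size * 2 + 1)  # 17, as stated in the table above
```
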
@@ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the
 > nO = null
 >
 > [model.linear_model]
-> @architectures = "spacy.TextCatBOW.v2"
+> @architectures = "spacy.TextCatBOW.v3"
 > exclusive_classes = true
+> length = 262144
 > ngram_size = 1
 > no_output_layer = false
 >

@@ -1017,54 +1018,15 @@ but used an internal `tok2vec` instead of taking it as argument:

 </Accordion>

-### spacy.TextCatCNN.v2 {id="TextCatCNN"}
+### spacy.TextCatBOW.v3 {id="TextCatBOW"}

 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TextCatCNN.v2"
+> @architectures = "spacy.TextCatBOW.v3"
-> exclusive_classes = false
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v2"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-A neural network model where token vectors are calculated using a CNN. The
-vectors are mean pooled and used as features in a feed-forward network. This
-architecture is usually less accurate than the ensemble, but runs faster.
-
-| Name                | Description |
-| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
-| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
-
-<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
-
-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
-
-</Accordion>
-
-### spacy.TextCatBOW.v2 {id="TextCatBOW"}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatBOW.v2"
 > exclusive_classes = false
+> length = 262144
 > ngram_size = 1
 > no_output_layer = false
 > nO = null

@@ -1078,17 +1040,108 @@ the others, but may not be as accurate, especially if texts are short.
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
 | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
 | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
+| `length`            | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~ |
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

-<Accordion title="spacy.TextCatBOW.v1 definition" spaced>
+<Accordion title="Previous versions of spacy.TextCatBOW" spaced>

-[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
+- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
+  new labels can be added to this component, even after training.
+- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
+  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
+  layer that only used a small number of the allocated parameters.
+- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
+  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.

 </Accordion>

+### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatParametricAttention.v1"
+> exclusive_classes = true
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v2"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v2"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v2"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
+> ```
+
+A neural network model that is built upon Tok2Vec and uses parametric attention
+to attend to tokens that are relevant to text classification.
+
+| Name                | Description |
+| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
+### spacy.TextCatReduce.v1 {id="TextCatReduce"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatReduce.v1"
+> exclusive_classes = false
+> use_reduce_first = false
+> use_reduce_last = false
+> use_reduce_max = false
+> use_reduce_mean = true
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A classifier that pools token hidden representations of each `Doc` using first,
+max or mean reduction and then applies a classification layer. Reductions are
+concatenated when multiple reductions are used.
+
+<Infobox variant="warning" title="Relation to TextCatCNN" id="TextCatCNN">
+
+`TextCatReduce` is a generalization of the older
+[`TextCatCNN`](/api/legacy#TextCatCNN_v2) model. `TextCatCNN` always uses a mean
+reduction, whereas `TextCatReduce` also supports first/max reductions.
+
+</Infobox>
+
+| Name                | Description |
+| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
+| `use_reduce_first`  | Pool by using the hidden representation of the first token of a `Doc`. ~~bool~~ |
+| `use_reduce_last`   | Pool by using the hidden representation of the last token of a `Doc`. ~~bool~~ |
+| `use_reduce_max`    | Pool by taking the maximum values of the hidden representations of a `Doc`. ~~bool~~ |
+| `use_reduce_mean`   | Pool by taking the mean of all hidden representations of a `Doc`. ~~bool~~ |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
 ## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"}

 ### spacy.SpanCategorizer.v1 {id="SpanCategorizer"}

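The `length` row above notes that non-power-of-two values are rounded up. A quick worked check of where a given setting actually lands:

```python
def round_up_to_power_of_two(n: int) -> int:
    # Mirrors the documented rounding of TextCatBOW's `length` argument.
    return 1 << (n - 1).bit_length()

print(round_up_to_power_of_two(262144))  # 262144: already a power of two
print(round_up_to_power_of_two(300000))  # 524288: rounded up
```
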
@@ -1268,20 +1268,21 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
 warmed up before any measurements are taken.

 ```cli
-$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
+$ python -m spacy benchmark speed [model] [data_path] [--code] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
 ```

 | Name                 | Description |
-| -------------------- | -------------------------------------------------------------------------------------------------------- |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ |
 | `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
+| `--code`, `-c`       | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~ |
 | `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~ |
 | `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
 | `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~ |
 | `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~ |
 | `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~ |
 | **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval. |

 ## apply {id="apply", version="3.5", tag="command"}

@@ -1296,6 +1297,9 @@ input formats are:

 When a directory is provided it is traversed recursively to collect all files.

+When loading a .spacy file, any potential annotations stored on the `Doc` that are not overwritten by the pipeline will be preserved.
+If you want to evaluate the pipeline on raw text only, make sure that the .spacy file does not contain any annotations.
+
 ```bash
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```

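To evaluate on raw text only, the simplest way to guarantee an annotation-free `.spacy` file is to build it from `nlp.make_doc` output. A sketch (the file name is illustrative):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
db = DocBin()
for text in ["First document.", "Second document."]:
    db.add(nlp.make_doc(text))  # tokenization only, no annotations
db.to_disk("raw_texts.spacy")   # safe input for `spacy apply`
```
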
@@ -400,6 +400,14 @@ identifiers are grouped by token. Instances of this class are typically assigned
 to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
 attribute.

+> #### Example
+>
+> ```python
+> # Get the last hidden layer output for "is" (token index 1)
+> doc = nlp("This is a text.")
+> tensors = doc._.trf_data.last_hidden_layer_state[1]
+> ```
+
 | Name          | Description |
 | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `all_outputs` | List of `Ragged` tensors that correspond to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |

@@ -20,10 +20,9 @@ An LLM component is implemented through the `LLMWrapper` class. It is accessible
 through a generic `llm`
 [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories)
 as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
-`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization` and
-`llm_entity_linker`.
-
-### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
+`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization`,
+`llm_entity_linker`, `llm_raw` and `llm_translation`. For these factories, the
+GPT-3-5 model from OpenAI is used by default, but this can be customized.

 > #### Example
 >

@@ -33,13 +32,18 @@ as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
 > llm = nlp.add_pipe("llm", config=config)
 >
 > # Construction via add_pipe with a task-specific factory and default GPT3.5 model
-> llm = nlp.add_pipe("llm-ner")
+> llm = nlp.add_pipe("llm_ner")
+>
+> # Construction via add_pipe with a task-specific factory and custom model
+> llm = nlp.add_pipe("llm_ner", config={"model": {"@llm_models": "spacy.Dolly.v1", "name": "dolly-v2-12b"}})
 >
 > # Construction from class
 > from spacy_llm.pipeline import LLMWrapper
 > llm = LLMWrapper(vocab=nlp.vocab, task=task, model=model, cache=cache, save_io=True)
 > ```

+### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
+
 Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

@@ -225,8 +229,8 @@ All tasks are registered in the `llm_tasks` registry.
 dataset across multiple storage units for easier processing and lookups. In
 `spacy-llm` we use this term (synonymously: "mapping") to describe the splitting
 up of prompts if they are too long for a model to handle, and "fusing"
-(synonymously: "reducing") to describe how the model responses for several shards
-are merged back together into a single document.
+(synonymously: "reducing") to describe how the model responses for several
+shards are merged back together into a single document.

 Prompts are broken up in a manner that _always_ keeps the prompt in the template
 intact, meaning that the instructions to the LLM will always stay complete. The

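The mapping/fusing terminology is essentially map-reduce over prompt shards. A toy illustration of the split (illustrative only, not the actual `spacy-llm` internals):

```python
def shard(text: str, budget: int) -> list[str]:
    # "Mapping": cut the document into pieces that fit the model's context
    # window; the prompt template itself stays intact for every shard.
    return [text[i : i + budget] for i in range(0, len(text), budget)]

def fuse(responses: list[str]) -> str:
    # "Reducing": merge the per-shard responses back into one result.
    return " ".join(responses)

pieces = shard("a very long document " * 500, budget=1000)
merged = fuse([f"response for shard {i}" for i in range(len(pieces))])
```
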
@@ -1133,6 +1137,25 @@ supports `.yml`, `.yaml`, `.json` and `.jsonl`.
 path = "textcat_examples.json"
 ```

+If you want to perform few-shot learning with a binary classifier (i. e. a text
+either should or should not be assigned to a given class), you can provide
+positive and negative examples with answers of "POS" or "NEG". "POS" means that
+this example should be assigned the class label defined in the configuration,
+"NEG" means it shouldn't. E. g. for spam classification:
+
+```json
+[
+    {
+        "text": "You won the lottery! Wire a fee of 200$ to be able to withdraw your winnings.",
+        "answer": "POS"
+    },
+    {
+        "text": "Your order #123456789 has arrived",
+        "answer": "NEG"
+    }
+]
+```
+
 ### REL {id="rel"}

 The REL task extracts relations between named entities.

@@ -1484,7 +1507,7 @@ These models all take the same parameters:
 > ```ini
 > [components.llm.model]
 > @llm_models = "spacy.Llama2.v1"
-> name = "llama2-7b-hf"
+> name = "Llama-2-7b-hf"
 > ```

 Currently, these models are provided as part of the core library:

@@ -162,7 +162,10 @@ network has an internal CNN Tok2Vec layer and uses attention.

 Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
 that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not
-yet support that.
+yet support that. `TextCatCNN` has been replaced by the more general
+[`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is
+identical to `TextCatReduce` with `use_reduce_mean=true`,
+`use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`.

 > #### Example Config
 >

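To verify the stated equivalence yourself, you can resolve the replacement architecture from the registry with the mean-only reduction settings. A sketch, assuming a spaCy version that registers `TextCatReduce.v1` (v3.7+):

```python
from thinc.api import Config
from spacy.util import registry

cfg = Config().from_str("""
[model]
@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = false
use_reduce_first = false
use_reduce_last = false
use_reduce_max = false
use_reduce_mean = true
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
""")
# Builds the same network a TextCatCNN.v2 config would have produced:
model = registry.resolve(cfg)["model"]
```
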
@@ -194,11 +197,58 @@ architecture is usually less accurate than the ensemble, but runs faster.
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

+### spacy.TextCatCNN.v2 {id="TextCatCNN_v2"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatCNN.v2"
+> exclusive_classes = false
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A neural network model where token vectors are calculated using a CNN. The
+vectors are mean pooled and used as features in a feed-forward network. This
+architecture is usually less accurate than the ensemble, but runs faster.
+
+`TextCatCNN` has been replaced by the more general
+[`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is
+identical to `TextCatReduce` with `use_reduce_mean=true`,
+`use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`.
+
+| Name                | Description |
+| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
+<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
+
+[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.
+
+</Accordion>
+
 ### spacy.TextCatBOW.v1 {id="TextCatBOW_v1"}

 Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
 that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
-yet support that.
+yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an
+erroneous sparse linear layer that only used a small number of the allocated
+parameters.

 > #### Example Config
 >

@@ -222,6 +272,33 @@ the others, but may not be as accurate, especially if texts are short.
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

+### spacy.TextCatBOW.v2 {id="TextCatBOW"}
+
+Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse
+linear layer that only used a small number of the allocated parameters.
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatBOW.v2"
+> exclusive_classes = false
+> ngram_size = 1
+> no_output_layer = false
+> nO = null
+> ```
+
+An n-gram "bag-of-words" model. This architecture should run much faster than
+the others, but may not be as accurate, especially if texts are short.
+
+| Name                | Description |
+| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
+| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}

 Identical to

@@ -89,6 +89,21 @@ architectures and their arguments and hyperparameters.
 | `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
 | `allow_overlap` <Tag variant="new">3.5.1</Tag>   | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |

+<Infobox variant="warning">
+
+If you set a non-default value for `spans_key`, you'll have to update
+`[training.score_weights]` as well so that weights are computed properly. E. g.
+for `spans_key == "myspankey"`, include this in your config:
+
+```ini
+[training.score_weights]
+spans_myspankey_f = 1.0
+spans_myspankey_p = 0.0
+spans_myspankey_r = 0.0
+```
+
+</Infobox>
+
 ```python
 %%GITHUB_SPACY/spacy/pipeline/spancat.py
 ```

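The weight names follow a fixed `spans_{key}_{metric}` pattern, so the required entries can be derived programmatically:

```python
spans_key = "myspankey"
score_weights = {
    f"spans_{spans_key}_f": 1.0,
    f"spans_{spans_key}_p": 0.0,
    f"spans_{spans_key}_r": 0.0,
}
print(score_weights)
# {'spans_myspankey_f': 1.0, 'spans_myspankey_p': 0.0, 'spans_myspankey_r': 0.0}
```
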
@@ -397,6 +397,17 @@ are wrapped into the
 by this class. Instances of this class are typically assigned to the
 [`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute.

+> #### Example
+>
+> ```python
+> # Get the last hidden layer output for "is" (token index 1)
+> doc = nlp("This is a text.")
+> indices = doc._.trf_data.align[1].data.flatten()
+> last_hidden_state = doc._.trf_data.model_output.last_hidden_state
+> dim = last_hidden_state.shape[-1]
+> tensors = last_hidden_state.reshape(-1, dim)[indices]
+> ```
+
 | Name     | Description |
 | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |

@@ -13,7 +13,7 @@ between `Doc` objects.
 <Infobox variant="warning">

 Note that a `Vocab` instance is not static. It increases in size as texts with
-new tokens are processed.
+new tokens are processed. Some models may have an empty vocab at initialization.

 </Infobox>

@@ -93,6 +93,7 @@ given string, you need to look it up in
 > #### Example
 >
 > ```python
+> nlp("I'm eating an apple")
 > apple = nlp.vocab.strings["apple"]
 > oov = nlp.vocab.strings["dskfodkfos"]
 > assert apple in nlp.vocab

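The two edits belong together: a freshly created pipeline may start with a near-empty vocab, and only processing text interns new strings. A minimal sketch:

```python
import spacy

nlp = spacy.blank("en")
before = len(nlp.vocab.strings)
nlp("I'm eating an apple")
assert len(nlp.vocab.strings) > before  # processing text grew the StringStore
```
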
(Binary image file updated; 6.8 KiB and identical dimensions before and after.)

@@ -108,12 +108,12 @@ In the `sm`/`md`/`lg` models:

 #### CNN/CPU pipelines with floret vectors

-The Finnish, Korean and Swedish `md` and `lg` pipelines use
-[floret vectors](/usage/v3-2#vectors) instead of default vectors. If you're
-running a trained pipeline on texts and working with [`Doc`](/api/doc) objects,
-you shouldn't notice any difference with floret vectors. With floret vectors no
-tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will
-return `False` for all tokens.
+The Croatian, Finnish, Korean, Slovenian, Swedish and Ukrainian `md` and `lg`
+pipelines use [floret vectors](/usage/v3-2#vectors) instead of default vectors.
+If you're running a trained pipeline on texts and working with [`Doc`](/api/doc)
+objects, you shouldn't notice any difference with floret vectors. With floret
+vectors no tokens are out-of-vocabulary, so
+[`Token.is_oov`](/api/token#attributes) will return `False` for all tokens.

 If you access vectors directly for similarity comparisons, there are a few
 differences because floret vectors don't include a fixed word list like the

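A quick way to see the floret behavior described above, assuming one of the listed pipelines (e.g. `fi_core_news_md`) is installed; the word is arbitrary:

```python
import spacy

nlp = spacy.load("fi_core_news_md")
doc = nlp("esimerkkisana")  # an arbitrary, likely-unseen word
print(doc[0].is_oov)        # False: floret covers every token
print(doc[0].vector[:3])    # the subword-based vector is still non-zero
```
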
@@ -132,10 +132,20 @@ vector keys for default vectors.

 ### Transformer pipeline design {id="design-trf"}

-In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
-all listen to the `transformer` component. The `attribute_ruler` and
+In the transformer (`trf`) pipelines, the `tagger`, `parser` and `ner` (if
+present) all listen to the `transformer` component. The `attribute_ruler` and
 `lemmatizer` have the same configuration as in the CNN models.

+For spaCy v3.0-v3.6, `trf` pipelines use
+[`spacy-transformers`](https://github.com/explosion/spacy-transformers) and the
+transformer output in `doc._.trf_data` is a
+[`TransformerData`](/api/transformer#transformerdata) object.
+
+For spaCy v3.7+, `trf` pipelines use
+[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers)
+and `doc._.trf_data` is a
+[`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object.
+
 ### Modifying the default pipeline {id="design-modify"}

 For faster processing, you may only want to run a subset of the components in a

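Because the type of `doc._.trf_data` now depends on which plugin built the pipeline, downstream code can branch on the class name. A sketch, assuming `en_core_web_trf` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("This is a text.")
# TransformerData for v3.0-v3.6 pipelines, DocTransformerOutput for v3.7+:
print(type(doc._.trf_data).__name__)
```
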
@@ -31,8 +31,6 @@ for ent in doc.ents:
 Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what
 our example sentence and its named entities look like:

-<Iframe
-  title="displaCy visualization of entities"
-  src="/images/displacy-ent1.html"
-  height={100}
-/>
+<Standalone height={120}>
+<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}><mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is looking at buying <mark style={{ background: '#feca74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>U.K. <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>GPE</span></mark> startup for <mark style={{ background: '#e4e7d2', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>$1 billion <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>MONEY</span></mark></div>
+</Standalone>

@@ -56,8 +56,7 @@ for token in doc:
 Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what
 our example sentence and its dependencies look like:

-<Iframe
-  title="displaCy visualization of dependencies and entities"
-  src="/images/displacy-long.html"
-  height={450}
-/>
+<ImageScrollable
+  src="/images/displacy-long.svg"
+  width={1975}
+/>

@@ -153,8 +153,9 @@ maxout_pieces = 3
 depth = 2

 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 ```

@@ -170,8 +171,9 @@ factory = "textcat"
 labels = []

 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 nO = null

@@ -290,11 +290,7 @@ for token in doc:
 | toward        | `prep`   | shift         | `NOUN`  | manufacturers |
 | manufacturers | `pobj`   | toward        | `ADP`   |               |

-<Iframe
-  title="displaCy visualization of dependencies and entities 2"
-  src="/images/displacy-long2.html"
-  height={450}
-/>
+<ImageScrollable src="/images/displacy-long2.svg" width={1275} />

 Because the syntactic relations form a tree, every word has **exactly one
 head**. You can therefore iterate over the arcs in the tree by iterating over

@@ -709,11 +705,9 @@ doc = nlp(text)
 displacy.serve(doc, style="ent")
 ```

-<Iframe
-  title="displaCy visualizer for entities"
-  src="/images/displacy-ent2.html"
-  height={180}
-/>
+<Standalone height={180}>
+<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div>
+</Standalone>

 ## Entity Linking {id="entity-linking"}

@@ -723,6 +717,10 @@ identifier from a knowledge base (KB). You can create your own
 [`KnowledgeBase`](/api/kb) and [train](/usage/training) a new
 [`EntityLinker`](/api/entitylinker) using that custom knowledge base.

+As an example of how to define a KnowledgeBase and train an entity linker model,
+see [`this tutorial`](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson)
+using [spaCy projects](/usage/projects).
+
 ### Accessing entity identifiers {id="entity-linking-accessing",model="entity linking"}

 The annotated KB identifier is accessible as either a hash value or as a string,

@@ -733,6 +731,7 @@ object, or the `ent_kb_id` and `ent_kb_id_` attributes of a
 ```python
 import spacy

+# "my_custom_el_pipeline" is assumed to be a custom NLP pipeline that was trained and serialized to disk
 nlp = spacy.load("my_custom_el_pipeline")
 doc = nlp("Ada Lovelace was born in London")

@@ -1328,8 +1328,9 @@ labels = []
 # This function is created and then passed to the "textcat" component as
 # the argument "model"
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false

@@ -1144,10 +1144,9 @@ relations and tokens we want to match:
 > displacy.serve(doc)
 > ```

-<Iframe
-  title="displaCy visualization of dependencies"
-  src="/images/displacy-dep-founded.html"
-  height={450}
-/>
+<ImageScrollable
+  src="/images/displacy-dep-founded.svg"
+  width={925}
+/>

 The relations we're interested in are:

@@ -405,7 +405,7 @@ available to spaCy, all you need to do is install the package in your
 environment:

 ```bash
-$ python setup.py develop
+$ python -m pip install .
 ```

 spaCy is now able to create the pipeline component `"snek"` – even though you

@@ -586,11 +586,9 @@ After installing the package, the custom colors will be used when visualizing
 text with `displacy`. Whenever the label `SNEK` is assigned, it will be
 displayed in `#3dff74`.

-<Iframe
-  title="displaCy visualization of entities"
-  src="/images/displacy-ent-snek.html"
-  height={100}
-/>
+<Standalone height={100}>
+<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>🌱🌿 <mark style={{ background: '#3dff74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>🐍 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>SNEK</span></mark> ____ 🌳🌲 ____ <mark style={{ background: '#cfc5ff', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>👨🌾 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>HUMAN</span></mark> 🏘️</div>
+</Standalone>

 ## Saving, loading and distributing trained pipelines {id="models"}

@@ -675,7 +673,7 @@ $ python -m spacy package ./en_example_pipeline ./packages
 ```

 This command will create a pipeline package directory and will run
-`python setup.py sdist` in that directory to create a binary `.whl` file or
+`python -m build` in that directory to create a binary `.whl` file or
 `.tar.gz` archive of your package that can be installed using `pip install`.
 Installing the binary wheel is usually more efficient.

@@ -77,11 +77,9 @@ doc.spans["custom"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
displacy.serve(doc, style="span", options={"spans_key": "custom"})
```

-<Iframe
-  title="displaCy visualizer for overlapping spans"
-  src="/images/displacy-span.html"
-  height={180}
-/>
+<Standalone height={100}>
+<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div>
+</Standalone>

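To see what feeds that visualization, here is a minimal runnable sketch of the snippet the hunk above edits. It assumes a blank English pipeline, since tokenization is all the visualizer needs here:

```python
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China.")

# Overlapping spans are stored in doc.spans, here under a custom key
# instead of the default "sc"
doc.spans["custom"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]

# spans_key tells the span visualizer which span group to draw
displacy.serve(doc, style="span", options={"spans_key": "custom"})
```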
## Additional features and improvements

@@ -119,11 +119,9 @@ doc = nlp(text)
displacy.serve(doc, style="ent")
```

-<Iframe
-  title="displaCy visualizer for entities"
-  src="/images/displacy-ent2.html"
-  height={180}
-/>
+<Standalone height={180}>
+<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div>
+</Standalone>

The entity visualizer lets you customize the following `options`:

@@ -148,11 +146,9 @@ use the `colors` setting to add your own colors for them.
> displacy.serve(doc, style="ent", options=options)
> ```

-<Iframe
-  title="displaCy visualizer for entities (custom styling)"
-  src="/images/displacy-ent-custom.html"
-  height={225}
-/>
+<Standalone height={225}>
+<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>But <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is starting from behind. The company made a late push into hardware, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Siri, available on iPhones, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Amazon <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer adoption.</div>
+</Standalone>

The above example uses a little trick: Since the background color values are
added as the `background` style attribute, you can use any
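For reference, `options` along these lines produce the gradient styling shown above — a sketch that assumes a `doc` whose ORG entities are already set:

```python
from spacy import displacy

# The color value is written into the mark's `background` style attribute
# verbatim, so any valid CSS background works, including gradients
options = {
    "ents": ["ORG"],
    "colors": {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"},
}
displacy.serve(doc, style="ent", options=options)
```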
@@ -197,11 +193,9 @@ doc.spans["sc"] = [
displacy.serve(doc, style="span")
```

-<Iframe
-  title="displaCy visualizer for overlapping spans"
-  src="/images/displacy-span.html"
-  height={180}
-/>
+<Standalone height={100}>
+<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div>
+</Standalone>

The span visualizer lets you customize the following `options`:

@@ -223,11 +217,9 @@ specify which one displaCy should use with `spans_key` (`sc` is the default).
> displacy.serve(doc, style="span", options=options)
> ```

-<Iframe
-  title="displaCy visualizer for spans (custom spans_key)"
-  src="/images/displacy-span-custom.html"
-  height={225}
-/>
+<Standalone height={100}>
+<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#ddd', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#ddd', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>BANK</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span>.</div>
+</Standalone>

## Using displaCy in Jupyter notebooks {id="jupyter"}

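A minimal sketch of the notebook workflow, assuming a processed `doc`: `displacy.render` detects that it is running in Jupyter and displays the markup inline, and the `jupyter` keyword argument forces the behavior either way.

```python
from spacy import displacy

# Displays inline in the notebook; jupyter=False would instead return
# the rendered markup as a string for embedding elsewhere
displacy.render(doc, style="ent", jupyter=True)
```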
@@ -103,6 +103,10 @@
        "has_examples": true,
        "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"]
    },
+    {
+        "code": "fo",
+        "name": "Faroese"
+    },
    {
        "code": "fr",
        "name": "French",
@@ -290,6 +294,12 @@
        "example": "Dit is een zin.",
        "has_examples": true
    },
+    {
+        "code": "nn",
+        "name": "Norwegian Nynorsk",
+        "example": "Det er ein meir enn i same periode i fjor.",
+        "has_examples": true
+    },
    {
        "code": "pl",
        "name": "Polish",
@@ -9,14 +9,9 @@
    { "text": "Models & Languages", "url": "/usage/models" },
    { "text": "Facts & Figures", "url": "/usage/facts-figures" },
    { "text": "spaCy 101", "url": "/usage/spacy-101" },
-    { "text": "New in v3.0", "url": "/usage/v3" },
-    { "text": "New in v3.1", "url": "/usage/v3-1" },
-    { "text": "New in v3.2", "url": "/usage/v3-2" },
-    { "text": "New in v3.3", "url": "/usage/v3-3" },
-    { "text": "New in v3.4", "url": "/usage/v3-4" },
-    { "text": "New in v3.5", "url": "/usage/v3-5" },
+    { "text": "New in v3.7", "url": "/usage/v3-7" },
    { "text": "New in v3.6", "url": "/usage/v3-6" },
-    { "text": "New in v3.7", "url": "/usage/v3-7" }
+    { "text": "New in v3.5", "url": "/usage/v3-5" }
]
},
{
@@ -66,6 +66,10 @@
    {
        "text": "Stack Overflow",
        "url": "http://stackoverflow.com/questions/tagged/spacy"
+    },
+    {
+        "text": "Merchandise",
+        "url": "https://explosion.ai/merch"
    }
]
},
@@ -4500,6 +4500,23 @@
        "website": "https://nlp.unibuc.ro/people/snisioi.html"
    },
    "category": ["pipeline", "training", "models"]
+},
+{
+    "id": "redfield-spacy-nodes",
+    "title": "Redfield NLP Nodes for KNIME",
+    "slogan": "Makes the functionality of the spaCy library available in KNIME Analytics Platform.",
+    "description": "This extension provides nodes that make the functionality of the spaCy library available in the [KNIME Analytics Platform](https://www.knime.com/).",
+    "github": "Redfield-AB/Spacy-Nodes",
+    "url": "https://redfield.ai/spacy-redfield/",
+    "thumb": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/redfield_logo_100x100.png",
+    "image": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/screen1.png",
+    "author": "Redfield AB",
+    "author_links": {
+        "twitter": "Redfield_AB",
+        "github": "Redfield-AB",
+        "website": "https://redfield.ai"
+    },
+    "category": ["standalone"]
}
],
@@ -1,80 +0,0 @@
-<div
-class="entities"
-style="
-line-height: 2.5;
-font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
-'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
-font-size: 18px;
-"
->But
-<mark
-class="entity"
-style="
-background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->Google
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->ORG</span
-></mark
->is starting from behind. The company made a late push into hardware, and
-<mark
-class="entity"
-style="
-background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->Apple
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->ORG</span
-></mark
->’s Siri, available on iPhones, and
-<mark
-class="entity"
-style="
-background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->Amazon
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->ORG</span
-></mark
->’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer
-adoption.</div
->
@@ -1,59 +0,0 @@
-<div
-class="entities"
-style="
-line-height: 2.5;
-font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
-'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
-font-size: 16px;
-"
->
-🌱🌿
-<mark
-class="entity"
-style="
-background: #3dff74;
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->🐍
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->SNEK</span
-></mark
->
-____ 🌳🌲 ____
-<mark
-class="entity"
-style="
-background: #cfc5ff;
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->👨🌾
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->HUMAN</span
-></mark
->
-🏘️
-</div>
@@ -1,84 +0,0 @@
-<div
-class="entities"
-style="
-line-height: 2.5;
-font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
-'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
-font-size: 16px;
-"
->
-<mark
-class="entity"
-style="
-background: #7aecec;
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->
-Apple
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->ORG</span
->
-</mark>
-is looking at buying
-<mark
-class="entity"
-style="
-background: #feca74;
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->
-U.K.
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->GPE</span
->
-</mark>
-startup for
-<mark
-class="entity"
-style="
-background: #e4e7d2;
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->
-$1 billion
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->MONEY</span
->
-</mark>
-</div>
@@ -1,86 +0,0 @@
-<div
-class="entities"
-style="
-line-height: 2.5;
-font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
-'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
-font-size: 18px;
-"
->
-When
-<mark
-class="entity"
-style="
-background: #aa9cfc;
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->
-Sebastian Thrun
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->PERSON</span
->
-</mark>
-started working on self-driving cars at
-<mark
-class="entity"
-style="
-background: #7aecec;
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->
-Google
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->ORG</span
->
-</mark>
-in
-<mark
-class="entity"
-style="
-background: #bfe1d9;
-padding: 0.45em 0.6em;
-margin: 0 0.25em;
-line-height: 1;
-border-radius: 0.35em;
-"
->
-2007
-<span
-style="
-font-size: 0.8em;
-font-weight: bold;
-line-height: 1;
-border-radius: 0.35em;
-text-transform: uppercase;
-vertical-align: middle;
-margin-left: 0.5rem;
-"
->DATE</span
->
-</mark>
-, few people outside of the company took him seriously.
-</div>
212
website/public/images/displacy-long2.svg
Normal file
@@ -0,0 +1,212 @@
+<svg
+xmlns="http://www.w3.org/2000/svg"
+xmlns:xlink="http://www.w3.org/1999/xlink"
+id="0"
+class="displacy"
+width="1275"
+height="399.5"
+style="
+max-width: none;
+height: 399.5px;
+color: #000000;
+background: #ffffff;
+font-family: Arial;
+"
+>
+<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
+<tspan class="displacy-word" fill="currentColor" x="50">Autonomous</tspan>
+<tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">ADJ</tspan>
+</text>
+
+<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
+<tspan class="displacy-word" fill="currentColor" x="225">cars</tspan>
+<tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">NOUN</tspan>
+</text>
+
+<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
+<tspan class="displacy-word" fill="currentColor" x="400">shift</tspan>
+<tspan class="displacy-tag" dy="2em" fill="currentColor" x="400">VERB</tspan>
+</text>
+
+<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
+<tspan class="displacy-word" fill="currentColor" x="575">insurance</tspan>
+<tspan class="displacy-tag" dy="2em" fill="currentColor" x="575">NOUN</tspan>
+</text>
+
+<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
+<tspan class="displacy-word" fill="currentColor" x="750">liability</tspan>
+<tspan class="displacy-tag" dy="2em" fill="currentColor" x="750">NOUN</tspan>
+</text>
+
+<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
+<tspan class="displacy-word" fill="currentColor" x="925">toward</tspan>
+<tspan class="displacy-tag" dy="2em" fill="currentColor" x="925">ADP</tspan>
+</text>
+
+<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
+<tspan class="displacy-word" fill="currentColor" x="1100">manufacturers</tspan>
+<tspan class="displacy-tag" dy="2em" fill="currentColor" x="1100">NOUN</tspan>
+</text>
+
+<g class="displacy-arrow">
+<path
+class="displacy-arc"
+id="arrow-0-0"
+stroke-width="2px"
+d="M70,264.5 C70,177.0 215.0,177.0 215.0,264.5"
+fill="none"
+stroke="currentColor"
+></path>
+<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
+<textpath
+xlink:href="#arrow-0-0"
+class="displacy-label"
+startOffset="50%"
+fill="currentColor"
+text-anchor="middle"
+>
+amod
+</textpath>
+</text>
+<path
+class="displacy-arrowhead"
+d="M70,266.5 L62,254.5 78,254.5"
+fill="currentColor"
+></path>
+</g>
+
+<g class="displacy-arrow">
+<path
+class="displacy-arc"
+id="arrow-0-1"
+stroke-width="2px"
+d="M245,264.5 C245,177.0 390.0,177.0 390.0,264.5"
+fill="none"
+stroke="currentColor"
+></path>
+<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
+<textpath
+xlink:href="#arrow-0-1"
+class="displacy-label"
+startOffset="50%"
+fill="currentColor"
+text-anchor="middle"
+>
+nsubj
+</textpath>
+</text>
+<path
+class="displacy-arrowhead"
+d="M245,266.5 L237,254.5 253,254.5"
+fill="currentColor"
+></path>
+</g>
+
+<g class="displacy-arrow">
+<path
+class="displacy-arc"
+id="arrow-0-2"
+stroke-width="2px"
+d="M595,264.5 C595,177.0 740.0,177.0 740.0,264.5"
+fill="none"
+stroke="currentColor"
+></path>
+<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
+<textpath
+xlink:href="#arrow-0-2"
+class="displacy-label"
+startOffset="50%"
+fill="currentColor"
+text-anchor="middle"
+>
+compound
+</textpath>
+</text>
+<path
+class="displacy-arrowhead"
+d="M595,266.5 L587,254.5 603,254.5"
+fill="currentColor"
+></path>
+</g>
+
+<g class="displacy-arrow">
+<path
+class="displacy-arc"
+id="arrow-0-3"
+stroke-width="2px"
+d="M420,264.5 C420,89.5 745.0,89.5 745.0,264.5"
+fill="none"
+stroke="currentColor"
+></path>
+<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
+<textpath
+xlink:href="#arrow-0-3"
+class="displacy-label"
+startOffset="50%"
+fill="currentColor"
+text-anchor="middle"
+>
+dobj
+</textpath>
+</text>
+<path
+class="displacy-arrowhead"
+d="M745.0,266.5 L753.0,254.5 737.0,254.5"
+fill="currentColor"
+></path>
+</g>
+
+<g class="displacy-arrow">
+<path
+class="displacy-arc"
+id="arrow-0-4"
+stroke-width="2px"
+d="M420,264.5 C420,2.0 925.0,2.0 925.0,264.5"
+fill="none"
+stroke="currentColor"
+></path>
+<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
+<textpath
+xlink:href="#arrow-0-4"
+class="displacy-label"
+startOffset="50%"
+fill="currentColor"
+text-anchor="middle"
+>
+prep
+</textpath>
+</text>
+<path
+class="displacy-arrowhead"
+d="M925.0,266.5 L933.0,254.5 917.0,254.5"
+fill="currentColor"
+></path>
+</g>
+
+<g class="displacy-arrow">
+<path
+class="displacy-arc"
+id="arrow-0-5"
+stroke-width="2px"
+d="M945,264.5 C945,177.0 1090.0,177.0 1090.0,264.5"
+fill="none"
+stroke="currentColor"
+></path>
+<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
+<textpath
+xlink:href="#arrow-0-5"
+class="displacy-label"
+startOffset="50%"
+fill="currentColor"
+text-anchor="middle"
+>
+pobj
+</textpath>
+</text>
+<path
+class="displacy-arrowhead"
+d="M1090.0,266.5 L1098.0,254.5 1082.0,254.5"
+fill="currentColor"
+></path>
+</g>
+</svg>
@@ -1,84 +0,0 @@
-<div
-class="spans"
-style="
-line-height: 2.5;
-font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
-'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
-font-size: 18px;
-direction: ltr;
-"
->
-Welcome to the
-<span style="font-weight: bold; display: inline-block; position: relative">
-Bank
-<span
-style="
-background: #ddd;
-top: 40px;
-height: 4px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-</span>
-<span
-style="
-background: #ddd;
-top: 40px;
-height: 4px;
-border-top-left-radius: 3px;
-border-bottom-left-radius: 3px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-<span
-style="
-background: #ddd;
-color: #000;
-top: -0.5em;
-padding: 2px 3px;
-position: absolute;
-font-size: 0.6em;
-font-weight: bold;
-line-height: 1;
-border-radius: 3px;
-"
->
-BANK
-</span>
-</span>
-</span>
-<span style="font-weight: bold; display: inline-block; position: relative">
-of
-<span
-style="
-background: #ddd;
-top: 40px;
-height: 4px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-</span>
-</span>
-<span style="font-weight: bold; display: inline-block; position: relative">
-China
-
-<span
-style="
-background: #ddd;
-top: 40px;
-height: 4px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-</span>
-</span>
-.
-</div>
@@ -1,123 +0,0 @@
-<div
-class="spans"
-style="
-line-height: 2.5;
-direction: ltr;
-font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
-'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
-font-size: 18px;
-"
->
-Welcome to the
-<span style="font-weight: bold; display: inline-block; position: relative">
-Bank
-<span
-style="
-background: #7aecec;
-top: 40px;
-height: 4px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-</span>
-<span
-style="
-background: #7aecec;
-top: 40px;
-height: 4px;
-border-top-left-radius: 3px;
-border-bottom-left-radius: 3px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-<span
-style="
-background: #7aecec;
-color: #000;
-top: -0.5em;
-padding: 2px 3px;
-position: absolute;
-font-size: 0.6em;
-font-weight: bold;
-line-height: 1;
-border-radius: 3px;
-"
->
-ORG
-</span>
-</span>
-</span>
-<span style="font-weight: bold; display: inline-block; position: relative">
-of
-
-<span
-style="
-background: #7aecec;
-top: 40px;
-height: 4px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-</span>
-</span>
-<span style="font-weight: bold; display: inline-block; position: relative">
-China
-<span
-style="
-background: #7aecec;
-top: 40px;
-height: 4px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-</span>
-<span
-style="
-background: #feca74;
-top: 57px;
-height: 4px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-</span>
-<span
-style="
-background: #feca74;
-top: 57px;
-height: 4px;
-border-top-left-radius: 3px;
-border-bottom-left-radius: 3px;
-left: -1px;
-width: calc(100% + 2px);
-position: absolute;
-"
->
-<span
-style="
-background: #feca74;
-color: #000;
-top: -0.5em;
-padding: 2px 3px;
-position: absolute;
-font-size: 0.6em;
-font-weight: bold;
-line-height: 1;
-border-radius: 3px;
-"
->
-GPE
-</span>
-</span>
-</span>
-.
-</div>
@@ -107,6 +107,22 @@ const Image = ({ src, alt, title, href, ...props }) => {
    )
}

+const ImageScrollable = ({ src, alt, width, ...props }) => {
+    return (
+        <figure className={classNames(classes.standalone, classes.scrollable)}>
+            <img className={classes['image-scrollable']} src={src} alt={alt} width={width} height="auto" />
+        </figure>
+    )
+}
+
+const Standalone = ({ height, children, ...props }) => {
+    return (
+        <figure className={classes.standalone} style={{ height }}>
+            {children}
+        </figure>
+    )
+}
+
const ImageFill = ({ image, ...props }) => {
    return (
        <span
@@ -137,4 +153,4 @@ const GoogleSheet = ({ id, link, height, button = 'View full table' }) => {
    )
}

-export { YouTube, SoundCloud, Iframe, Image, ImageFill, GoogleSheet }
+export { YouTube, SoundCloud, Iframe, Image, ImageFill, ImageScrollable, GoogleSheet, Standalone }
@@ -13,7 +13,7 @@ import Aside from './components/aside'
import Button from './components/button'
import Tag from './components/tag'
import Grid from './components/grid'
-import { YouTube, SoundCloud, Iframe, Image, GoogleSheet } from './components/embed'
+import { YouTube, SoundCloud, Iframe, Image, ImageScrollable, GoogleSheet, Standalone } from './components/embed'
import Project from './widgets/project'
import { Integration, IntegrationLogo } from './widgets/integration.js'
import { Logos, Colors, Patterns } from './widgets/styleguide'
@@ -90,6 +90,8 @@
     * For regular img elements it is not possible to pass properties
     */
    Image,
+    ImageScrollable,
+    Standalone,

    Label,
    Logos,