Merge branch 'master' into spacy.io

This commit is contained in:
Adriane Boyd 2022-07-12 07:51:10 +02:00
commit 1087cea1ef
97 changed files with 1690 additions and 495 deletions

View File

@ -27,7 +27,6 @@ steps:
- script: python -m mypy spacy - script: python -m mypy spacy
displayName: 'Run mypy' displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.10')
- task: DeleteFiles@1 - task: DeleteFiles@1
inputs: inputs:
@ -41,7 +40,7 @@ steps:
- bash: | - bash: |
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
${{ parameters.prefix }} python -m pip install dist/$SDIST ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
displayName: "Install from sdist" displayName: "Install from sdist"
- script: | - script: |
@ -64,12 +63,12 @@ steps:
displayName: "Run GPU tests" displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true) condition: eq(${{ parameters.gpu }}, true)
- script: | # - script: |
python -m spacy download ca_core_news_sm # python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md # python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" # python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI' # displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8') # condition: eq(variables['python_version'], '3.8')
- script: | - script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@ -93,17 +92,17 @@ steps:
displayName: 'Test train CLI' displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8') condition: eq(variables['python_version'], '3.8')
- script: | # - script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" # python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir # PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI' # displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8') # condition: eq(variables['python_version'], '3.8')
#
- script: | # - script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" # python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 # python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning' # displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8') # condition: eq(variables['python_version'], '3.8')
- script: | - script: |
python .github/validate_universe_json.py website/meta/universe.json python .github/validate_universe_json.py website/meta/universe.json
@ -111,7 +110,7 @@ steps:
condition: eq(variables['python_version'], '3.8') condition: eq(variables['python_version'], '3.8')
- script: | - script: |
${{ parameters.prefix }} python -m pip install thinc-apple-ops ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
${{ parameters.prefix }} python -m pytest --pyargs spacy ${{ parameters.prefix }} python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops" displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9')) condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))

67
.github/spacy_universe_alert.py vendored Normal file
View File

@ -0,0 +1,67 @@
import os
import sys
import json
from datetime import datetime
from slack_sdk.web.client import WebClient
# Posts a Slack alert describing a pull request. The GitHub Actions context is
# expected as a JSON string in the first command-line argument and the Slack
# bot token in the SLACK_BOT_TOKEN environment variable.

# Slack channel that receives the alert.
CHANNEL = "#alerts-universe"
# Bot token read from the environment; an unset or empty value is rejected below
# instead of being handed to the Slack client as a bogus token.
SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN", "")
# Layout used to parse the pull request's "created_at" / "updated_at" values.
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
# Suffix that turns a pull request URL into its patch URL.
PATCH_SUFFIX = ".patch"


def _format_timestamp(value):
    """Parse a DATETIME_FORMAT timestamp string and render it with "%c"."""
    return datetime.strptime(value, DATETIME_FORMAT).strftime("%c")


def _escape_mrkdwn(text):
    """Escape the three characters Slack treats as control characters.

    The pull request title is written by an arbitrary contributor. Without
    escaping, a ">" in the title would close the surrounding <url|text> link
    and anything after it (e.g. a "<!channel>" mention) would be interpreted
    by Slack. "&" must be replaced first so the other entities stay intact.
    """
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")


# Fail early with a readable message rather than with an authentication error
# or an IndexError further down.
if not SLACK_TOKEN:
    sys.exit("SLACK_BOT_TOKEN is not set; cannot send the spaCy universe alert.")
if len(sys.argv) < 2:
    sys.exit("Usage: python spacy_universe_alert.py '<GitHub context as JSON>'")

client = WebClient(SLACK_TOKEN)
github_context = json.loads(sys.argv[1])

event = github_context["event"]
pull_request = event["pull_request"]

pr_title = _escape_mrkdwn(pull_request["title"])

# Derive the PR link from the patch URL by dropping only a trailing ".patch";
# a plain str.replace would also rewrite ".patch" anywhere else in the URL.
pr_link = pull_request["patch_url"]
if pr_link.endswith(PATCH_SUFFIX):
    pr_link = pr_link[: -len(PATCH_SUFFIX)]

# NOTE(review): this uses the event's "sender", which may differ from the user
# who opened the pull request for some event actions - confirm this is intended.
pr_author_url = event["sender"]["html_url"]
# The last path segment of the profile URL is used as the display name.
pr_author_name = pr_author_url.rstrip("/").rsplit("/", 1)[-1]

pr_created_at = _format_timestamp(pull_request["created_at"])
pr_updated_at = _format_timestamp(pull_request["updated_at"])

blocks = [
    {
        "type": "section",
        "text": {
            "type": "mrkdwn",
            "text": "📣 New spaCy Universe Project Alert ✨",
        },
    },
    {
        "type": "section",
        "fields": [
            {
                "type": "mrkdwn",
                "text": f"*Pull Request:*\n<{pr_link}|{pr_title}>",
            },
            {
                "type": "mrkdwn",
                "text": f"*Author:*\n<{pr_author_url}|{pr_author_name}>",
            },
            {
                "type": "mrkdwn",
                "text": f"*Created at:*\n {pr_created_at}",
            },
            {
                "type": "mrkdwn",
                "text": f"*Last Updated:*\n {pr_updated_at}",
            },
        ],
    },
]

# "text" is passed alongside "blocks" as the plain-text version of the message.
client.chat_postMessage(
    channel=CHANNEL,
    text="spaCy universe project PR alert",
    blocks=blocks,
)

View File

@ -0,0 +1,30 @@
# Workflow that sends a Slack alert when a pull request touches the spaCy
# universe metadata file.
name: spaCy universe project alert
on:
# NOTE(review): pull_request_target is presumably used so that the Slack secret
# is available for pull requests coming from forks - confirm.
pull_request_target:
# Only pull requests that change this file trigger the workflow.
paths:
- "website/meta/universe.json"
jobs:
build:
runs-on: ubuntu-latest
steps:
# Debugging aid: prints the full GitHub context as JSON to the job log.
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
# PR_NUMBER is exported here but not referenced by this step's command.
PR_NUMBER: ${{github.event.number}}
run: |
echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1
- uses: actions/setup-python@v1
# Installs the pinned Slack client packages and runs the alert script, passing
# the GitHub context JSON as its first command-line argument.
- name: Install Bernadette app dependency and send an alert
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
GITHUB_CONTEXT: ${{ toJson(github) }}
# CHANNEL is only echoed by the command below.
CHANNEL: "#alerts-universe"
run: |
pip install slack-sdk==3.17.2 aiohttp==3.8.1
echo "$CHANNEL"
python .github/spacy_universe_alert.py "$GITHUB_CONTEXT"

View File

@ -1,4 +1,4 @@
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh
include LICENSE include LICENSE
include README.md include README.md
include pyproject.toml include pyproject.toml

View File

@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license. open-source software, released under the MIT license.
💫 **Version 3.2 out now!** 💫 **Version 3.4.0 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases) [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

View File

@ -1,6 +1,8 @@
# build version constraints for use with wheelwright + multibuild # build version constraints for use with wheelwright + multibuild
numpy==1.15.0; python_version<='3.7' numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.17.3; python_version=='3.8' numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9' numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10' numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11' numpy; python_version>='3.11'

View File

@ -455,6 +455,10 @@ Regression tests are tests that refer to bugs reported in specific issues. They
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file. The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
### Testing Cython Code
If you're developing Cython code (`.pyx` files), those extensions will need to be built before the test runner can test that code - otherwise it's going to run the tests with stale code from the last time the extension was built. You can build the extensions locally with `python setup.py build_ext -i`.
### Constructing objects and state ### Constructing objects and state
Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation. Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation.

View File

@ -5,8 +5,7 @@ requires = [
"cymem>=2.0.2,<2.1.0", "cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0", "preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.14,<8.1.0", "thinc>=8.1.0,<8.2.0",
"blis>=0.4.0,<0.8.0",
"pathy", "pathy",
"numpy>=1.15.0", "numpy>=1.15.0",
] ]

View File

@ -3,8 +3,7 @@ spacy-legacy>=3.0.9,<3.1.0
spacy-loggers>=1.0.0,<2.0.0 spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.14,<8.1.0 thinc>=8.1.0,<8.2.0
blis>=0.4.0,<0.8.0
ml_datasets>=0.2.0,<0.3.0 ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0 wasabi>=0.9.1,<1.1.0
@ -16,7 +15,7 @@ pathy>=0.3.5
numpy>=1.15.0 numpy>=1.15.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
jinja2 jinja2
langcodes>=3.2.0,<4.0.0 langcodes>=3.2.0,<4.0.0
# Official Python utilities # Official Python utilities
@ -31,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0 mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0 flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0 hypothesis>=3.27.0,<7.0.0
mypy==0.910 mypy>=0.910,<0.970; platform_machine!='aarch64'
types-dataclasses>=0.1.3; python_version < "3.7" types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1 types-mock>=0.1.1
types-requests types-requests

View File

@ -38,7 +38,7 @@ setup_requires =
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
thinc>=8.0.14,<8.1.0 thinc>=8.1.0,<8.2.0
install_requires = install_requires =
# Our libraries # Our libraries
spacy-legacy>=3.0.9,<3.1.0 spacy-legacy>=3.0.9,<3.1.0
@ -46,8 +46,7 @@ install_requires =
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.14,<8.1.0 thinc>=8.1.0,<8.2.0
blis>=0.4.0,<0.8.0
wasabi>=0.9.1,<1.1.0 wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
@ -57,7 +56,7 @@ install_requires =
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
numpy>=1.15.0 numpy>=1.15.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
jinja2 jinja2
# Official Python utilities # Official Python utilities
setuptools setuptools
@ -104,8 +103,12 @@ cuda114 =
cupy-cuda114>=5.0.0b4,<11.0.0 cupy-cuda114>=5.0.0b4,<11.0.0
cuda115 = cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0 cupy-cuda115>=5.0.0b4,<11.0.0
cuda116 =
cupy-cuda116>=5.0.0b4,<11.0.0
cuda117 =
cupy-cuda117>=5.0.0b4,<11.0.0
apple = apple =
thinc-apple-ops>=0.0.4,<1.0.0 thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies # Language tokenizers with external dependencies
ja = ja =
sudachipy>=0.5.2,!=0.6.1 sudachipy>=0.5.2,!=0.6.1

View File

@ -126,6 +126,8 @@ class build_ext_options:
class build_ext_subclass(build_ext, build_ext_options): class build_ext_subclass(build_ext, build_ext_options):
def build_extensions(self): def build_extensions(self):
if self.parallel is None and os.environ.get("SPACY_NUM_BUILD_JOBS") is not None:
self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS"))
build_ext_options.build_options(self) build_ext_options.build_options(self)
build_ext.build_extensions(self) build_ext.build_extensions(self)
@ -206,7 +208,11 @@ def setup_package():
for name in MOD_NAMES: for name in MOD_NAMES:
mod_path = name.replace(".", "/") + ".pyx" mod_path = name.replace(".", "/") + ".pyx"
ext = Extension( ext = Extension(
name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"] name,
[mod_path],
language="c++",
include_dirs=include_dirs,
extra_compile_args=["-std=c++11"],
) )
ext_modules.append(ext) ext_modules.append(ext)
print("Cythonizing sources") print("Cythonizing sources")

View File

@ -32,6 +32,7 @@ def load(
*, *,
vocab: Union[Vocab, bool] = True, vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = util.SimpleFrozenList(), disable: Iterable[str] = util.SimpleFrozenList(),
enable: Iterable[str] = util.SimpleFrozenList(),
exclude: Iterable[str] = util.SimpleFrozenList(), exclude: Iterable[str] = util.SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language: ) -> Language:
@ -42,6 +43,8 @@ def load(
disable (Iterable[str]): Names of pipeline components to disable. Disabled disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe. enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
@ -49,7 +52,12 @@ def load(
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
return util.load_model( return util.load_model(
name, vocab=vocab, disable=disable, exclude=exclude, config=config name,
vocab=vocab,
disable=disable,
enable=enable,
exclude=exclude,
config=config,
) )

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.3.1" __version__ = "3.4.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects" __projects__ = "https://github.com/explosion/projects"

View File

@ -12,7 +12,7 @@ from click.parser import split_arg_string
from typer.main import get_command from typer.main import get_command
from contextlib import contextmanager from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError, require_gpu from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import has_cupy, gpu_is_available from thinc.util import gpu_is_available
from configparser import InterpolationError from configparser import InterpolationError
import os import os
@ -462,6 +462,23 @@ def git_sparse_checkout(repo, subpath, dest, branch):
shutil.move(str(source_path), str(dest)) shutil.move(str(source_path), str(dest))
def git_repo_branch_exists(repo: str, branch: str) -> bool:
    """Check with 'git ls-remote' whether a branch can be found on a repo.

    repo (str): URL of the repo to query.
    branch (str): Name of the branch to look for on that repo.
    RETURNS (bool): True if 'git ls-remote' printed anything for repo/branch.
    """
    # Called before shelling out to git, as in the other git helpers.
    get_git_version()
    # `git ls-remote --exit-code` is deliberately not used: `run_command`
    # already deals with the `returncode`. Instead the check relies on stdout
    # being '' when the requested branch doesn't exist.
    result = run_command(f"git ls-remote {repo} {branch}", capture=True)
    return result.stdout != ""
def get_git_version( def get_git_version(
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]: ) -> Tuple[int, int]:
@ -554,5 +571,5 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
require_gpu(use_gpu) require_gpu(use_gpu)
else: else:
local_msg.info("Using CPU") local_msg.info("Using CPU")
if has_cupy and gpu_is_available(): if gpu_is_available():
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0") local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")

View File

@ -10,7 +10,7 @@ import math
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli from ._util import import_code, debug_cli
from ..training import Example from ..training import Example, remove_bilu_prefix
from ..training.initialize import get_sourced_components from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals import nonproj
@ -361,7 +361,7 @@ def debug_data(
if label != "-" if label != "-"
] ]
labels_with_counts = _format_labels(labels_with_counts, counts=True) labels_with_counts = _format_labels(labels_with_counts, counts=True)
msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose) msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
missing_labels = model_labels - labels missing_labels = model_labels - labels
if missing_labels: if missing_labels:
msg.warn( msg.warn(
@ -758,9 +758,9 @@ def _compile_gold(
# "Illegal" whitespace entity # "Illegal" whitespace entity
data["ws_ents"] += 1 data["ws_ents"] += 1
if label.startswith(("B-", "U-")): if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1] combined_label = remove_bilu_prefix(label)
data["ner"][combined_label] += 1 data["ner"][combined_label] += 1
if sent_starts[i] == True and label.startswith(("I-", "L-")): if sent_starts[i] and label.startswith(("I-", "L-")):
data["boundary_cross_ents"] += 1 data["boundary_cross_ents"] += 1
elif label == "-": elif label == "-":
data["ner"]["-"] += 1 data["ner"]["-"] += 1
@ -908,7 +908,7 @@ def _get_examples_without_label(
for eg in data: for eg in data:
if component == "ner": if component == "ner":
labels = [ labels = [
label.split("-")[1] remove_bilu_prefix(label)
for label in eg.get_aligned_ner() for label in eg.get_aligned_ner()
if label not in ("O", "-", None) if label not in ("O", "-", None)
] ]

View File

@ -10,6 +10,7 @@ from jinja2 import Template
from .. import util from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
from ._util import string_to_list, import_code from ._util import string_to_list, import_code
@ -24,16 +25,30 @@ class Optimizations(str, Enum):
accuracy = "accuracy" accuracy = "accuracy"
class InitValues:
    """
    Single place that holds the default values for initialization, so that
    init_config_cli() (initialization via the CLI) and init_config()
    (initialization via Python) stay synchronized.
    """

    # Boolean switches, all off by default.
    force_overwrite = False
    pretraining = False
    gpu = False
    # Optimization target used when none is given.
    optimize = Optimizations.efficiency
    # Default pipeline components; stored as a frozen list.
    pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
    # Default language code.
    lang = "en"
@init_cli.command("config") @init_cli.command("config")
def init_config_cli( def init_config_cli(
# fmt: off # fmt: off
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"), lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."), gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"), force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
# fmt: on # fmt: on
): ):
""" """
@ -133,11 +148,11 @@ def fill_config(
def init_config( def init_config(
*, *,
lang: str, lang: str = InitValues.lang,
pipeline: List[str], pipeline: List[str] = InitValues.pipeline,
optimize: str, optimize: str = InitValues.optimize,
gpu: bool, gpu: bool = InitValues.gpu,
pretraining: bool = False, pretraining: bool = InitValues.pretraining,
silent: bool = True, silent: bool = True,
) -> Config: ) -> Config:
msg = Printer(no_print=silent) msg = Printer(no_print=silent)

View File

@ -7,11 +7,11 @@ import re
from ... import about from ... import about
from ...util import ensure_path from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version from .._util import git_checkout, get_git_version, git_repo_branch_exists
DEFAULT_REPO = about.__projects__ DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
DEFAULT_BRANCH = "master" DEFAULT_BRANCHES = ["main", "master"]
@project_cli.command("clone") @project_cli.command("clone")
@ -20,7 +20,7 @@ def project_clone_cli(
name: str = Arg(..., help="The name of the template to clone"), name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False), dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"), repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"), branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.") sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
# fmt: on # fmt: on
): ):
@ -33,9 +33,25 @@ def project_clone_cli(
""" """
if dest is None: if dest is None:
dest = Path.cwd() / Path(name).parts[-1] dest = Path.cwd() / Path(name).parts[-1]
if repo == DEFAULT_REPO and branch is None:
branch = DEFAULT_PROJECTS_BRANCH
if branch is None: if branch is None:
# If it's a user repo, we want to default to other branch for default_branch in DEFAULT_BRANCHES:
branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH if git_repo_branch_exists(repo, default_branch):
branch = default_branch
break
if branch is None:
default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
msg.fail(
"No branch provided and attempted default "
f"branches {default_branches_msg} do not exist.",
exits=1,
)
else:
if not git_repo_branch_exists(repo, branch):
msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
assert isinstance(branch, str)
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout) project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
@ -61,9 +77,9 @@ def project_clone(
try: try:
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout) git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'" err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
msg.fail(err, exits=1) msg.fail(err, exits=1)
msg.good(f"Cloned '{name}' from {repo_name}", project_dir) msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
if not (project_dir / PROJECT_FILE).exists(): if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory") msg.warn(f"No {PROJECT_FILE} found in directory")
else: else:

View File

@ -64,8 +64,11 @@ class SpanRenderer:
# Set up how the text and labels will be rendered # Set up how the text and labels will be rendered
self.direction = DEFAULT_DIR self.direction = DEFAULT_DIR
self.lang = DEFAULT_LANG self.lang = DEFAULT_LANG
# These values are in px
self.top_offset = options.get("top_offset", 40) self.top_offset = options.get("top_offset", 40)
self.top_offset_step = options.get("top_offset_step", 17) # This is how far under the top offset the span labels appear
self.span_label_offset = options.get("span_label_offset", 20)
self.offset_step = options.get("top_offset_step", 17)
# Set up which templates will be used # Set up which templates will be used
template = options.get("template") template = options.get("template")
@ -127,26 +130,56 @@ class SpanRenderer:
title (str / None): Document title set in Doc.user_data['title']. title (str / None): Document title set in Doc.user_data['title'].
""" """
per_token_info = [] per_token_info = []
# we must sort so that we can correctly describe when spans need to "stack"
# which is determined by their start token, then span length (longer spans on top),
# then break any remaining ties with the span label
spans = sorted(
spans,
key=lambda s: (
s["start_token"],
-(s["end_token"] - s["start_token"]),
s["label"],
),
)
for s in spans:
# this is the vertical 'slot' that the span will be rendered in
# vertical_position = span_label_offset + (offset_step * (slot - 1))
s["render_slot"] = 0
for idx, token in enumerate(tokens): for idx, token in enumerate(tokens):
# Identify if a token belongs to a Span (and which) and if it's a # Identify if a token belongs to a Span (and which) and if it's a
# start token of said Span. We'll use this for the final HTML render # start token of said Span. We'll use this for the final HTML render
token_markup: Dict[str, Any] = {} token_markup: Dict[str, Any] = {}
token_markup["text"] = token token_markup["text"] = token
concurrent_spans = 0
entities = [] entities = []
for span in spans: for span in spans:
ent = {} ent = {}
if span["start_token"] <= idx < span["end_token"]: if span["start_token"] <= idx < span["end_token"]:
concurrent_spans += 1
span_start = idx == span["start_token"]
ent["label"] = span["label"] ent["label"] = span["label"]
ent["is_start"] = True if idx == span["start_token"] else False ent["is_start"] = span_start
if span_start:
# When the span starts, we need to know how many other
# spans are on the 'span stack' and will be rendered.
# This value becomes the vertical render slot for this entire span
span["render_slot"] = concurrent_spans
ent["render_slot"] = span["render_slot"]
kb_id = span.get("kb_id", "") kb_id = span.get("kb_id", "")
kb_url = span.get("kb_url", "#") kb_url = span.get("kb_url", "#")
ent["kb_link"] = ( ent["kb_link"] = (
TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else "" TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
) )
entities.append(ent) entities.append(ent)
else:
# We don't specifically need to do this since we loop
# over tokens and spans sorted by their start_token,
# so we'll never use a span again after the last token it appears in,
# but if we were to use these spans again we'd want to make sure
# this value was reset correctly.
span["render_slot"] = 0
token_markup["entities"] = entities token_markup["entities"] = entities
per_token_info.append(token_markup) per_token_info.append(token_markup)
markup = self._render_markup(per_token_info) markup = self._render_markup(per_token_info)
markup = TPL_SPANS.format(content=markup, dir=self.direction) markup = TPL_SPANS.format(content=markup, dir=self.direction)
if title: if title:
@ -157,12 +190,24 @@ class SpanRenderer:
"""Render the markup from per-token information""" """Render the markup from per-token information"""
markup = "" markup = ""
for token in per_token_info: for token in per_token_info:
entities = sorted(token["entities"], key=lambda d: d["label"]) entities = sorted(token["entities"], key=lambda d: d["render_slot"])
if entities: # Whitespace tokens disrupt the vertical space (no line height) so that the
# span indicators get misaligned. We don't render them as individual
# tokens anyway, so we'll just not display a span indicator either.
is_whitespace = token["text"].strip() == ""
if entities and not is_whitespace:
slices = self._get_span_slices(token["entities"]) slices = self._get_span_slices(token["entities"])
starts = self._get_span_starts(token["entities"]) starts = self._get_span_starts(token["entities"])
total_height = (
self.top_offset
+ self.span_label_offset
+ (self.offset_step * (len(entities) - 1))
)
markup += self.span_template.format( markup += self.span_template.format(
text=token["text"], span_slices=slices, span_starts=starts text=token["text"],
span_slices=slices,
span_starts=starts,
total_height=total_height,
) )
else: else:
markup += escape_html(token["text"] + " ") markup += escape_html(token["text"] + " ")
@ -171,10 +216,18 @@ class SpanRenderer:
def _get_span_slices(self, entities: List[Dict]) -> str: def _get_span_slices(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span slices""" """Get the rendered markup of all Span slices"""
span_slices = [] span_slices = []
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): for entity in entities:
# rather than iterate over multiples of offset_step, we use entity['render_slot']
# to determine the vertical position, since that tells where
# the span starts vertically so we can extend it horizontally,
# past other spans that might have already ended
color = self.colors.get(entity["label"].upper(), self.default_color) color = self.colors.get(entity["label"].upper(), self.default_color)
top_offset = self.top_offset + (
self.offset_step * (entity["render_slot"] - 1)
)
span_slice = self.span_slice_template.format( span_slice = self.span_slice_template.format(
bg=color, top_offset=self.top_offset + step bg=color,
top_offset=top_offset,
) )
span_slices.append(span_slice) span_slices.append(span_slice)
return "".join(span_slices) return "".join(span_slices)
@ -182,12 +235,15 @@ class SpanRenderer:
def _get_span_starts(self, entities: List[Dict]) -> str: def _get_span_starts(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span start tokens""" """Get the rendered markup of all Span start tokens"""
span_starts = [] span_starts = []
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): for entity in entities:
color = self.colors.get(entity["label"].upper(), self.default_color) color = self.colors.get(entity["label"].upper(), self.default_color)
top_offset = self.top_offset + (
self.offset_step * (entity["render_slot"] - 1)
)
span_start = ( span_start = (
self.span_start_template.format( self.span_start_template.format(
bg=color, bg=color,
top_offset=self.top_offset + step, top_offset=top_offset,
label=entity["label"], label=entity["label"],
kb_link=entity["kb_link"], kb_link=entity["kb_link"],
) )

View File

@ -67,7 +67,7 @@ TPL_SPANS = """
""" """
TPL_SPAN = """ TPL_SPAN = """
<span style="font-weight: bold; display: inline-block; position: relative;"> <span style="font-weight: bold; display: inline-block; position: relative; height: {total_height}px;">
{text} {text}
{span_slices} {span_slices}
{span_starts} {span_starts}

View File

@ -1,4 +1,5 @@
import warnings import warnings
from .compat import Literal
class ErrorsWithCodes(type): class ErrorsWithCodes(type):
@ -26,7 +27,10 @@ def setup_default_warnings():
filter_warning("once", error_msg="[W114]") filter_warning("once", error_msg="[W114]")
def filter_warning(action: str, error_msg: str): def filter_warning(
action: Literal["default", "error", "ignore", "always", "module", "once"],
error_msg: str,
):
"""Customize how spaCy should handle a certain warning. """Customize how spaCy should handle a certain warning.
error_msg (str): e.g. "W006", or a full error message error_msg (str): e.g. "W006", or a full error message
@ -205,6 +209,9 @@ class Warnings(metaclass=ErrorsWithCodes):
"Only the last span group will be loaded under " "Only the last span group will be loaded under "
"Doc.spans['{group_name}']. Skipping span group with values: " "Doc.spans['{group_name}']. Skipping span group with values: "
"{group_values}") "{group_values}")
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
"is a Cython extension type.")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
@ -928,6 +935,10 @@ class Errors(metaclass=ErrorsWithCodes):
E1040 = ("Doc.from_json requires all tokens to have the same attributes. " E1040 = ("Doc.from_json requires all tokens to have the same attributes. "
"Some tokens do not contain annotation for: {partial_attrs}") "Some tokens do not contain annotation for: {partial_attrs}")
E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}") E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}")
E1042 = ("Function was called with `{arg1}`={arg1_values} and "
"`{arg2}`={arg2_values} but these arguments are conflicting.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.")
# Deprecated model shortcuts, only used in errors and warnings # Deprecated model shortcuts, only used in errors and warnings

View File

@ -93,14 +93,14 @@ cdef class KnowledgeBase:
self.vocab = vocab self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def initialize_entities(self, int64_t nr_entities): def _initialize_entities(self, int64_t nr_entities):
self._entry_index = PreshMap(nr_entities + 1) self._entry_index = PreshMap(nr_entities + 1)
self._entries = entry_vec(nr_entities + 1) self._entries = entry_vec(nr_entities + 1)
def initialize_vectors(self, int64_t nr_entities): def _initialize_vectors(self, int64_t nr_entities):
self._vectors_table = float_matrix(nr_entities + 1) self._vectors_table = float_matrix(nr_entities + 1)
def initialize_aliases(self, int64_t nr_aliases): def _initialize_aliases(self, int64_t nr_aliases):
self._alias_index = PreshMap(nr_aliases + 1) self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1)
@ -155,8 +155,8 @@ cdef class KnowledgeBase:
raise ValueError(Errors.E140) raise ValueError(Errors.E140)
nr_entities = len(set(entity_list)) nr_entities = len(set(entity_list))
self.initialize_entities(nr_entities) self._initialize_entities(nr_entities)
self.initialize_vectors(nr_entities) self._initialize_vectors(nr_entities)
i = 0 i = 0
cdef KBEntryC entry cdef KBEntryC entry
@ -388,9 +388,9 @@ cdef class KnowledgeBase:
nr_entities = header[0] nr_entities = header[0]
nr_aliases = header[1] nr_aliases = header[1]
entity_vector_length = header[2] entity_vector_length = header[2]
self.initialize_entities(nr_entities) self._initialize_entities(nr_entities)
self.initialize_vectors(nr_entities) self._initialize_vectors(nr_entities)
self.initialize_aliases(nr_aliases) self._initialize_aliases(nr_aliases)
self.entity_vector_length = entity_vector_length self.entity_vector_length = entity_vector_length
def deserialize_vectors(b): def deserialize_vectors(b):
@ -512,8 +512,8 @@ cdef class KnowledgeBase:
cdef int64_t entity_vector_length cdef int64_t entity_vector_length
reader.read_header(&nr_entities, &entity_vector_length) reader.read_header(&nr_entities, &entity_vector_length)
self.initialize_entities(nr_entities) self._initialize_entities(nr_entities)
self.initialize_vectors(nr_entities) self._initialize_vectors(nr_entities)
self.entity_vector_length = entity_vector_length self.entity_vector_length = entity_vector_length
# STEP 1: load entity vectors # STEP 1: load entity vectors
@ -552,7 +552,7 @@ cdef class KnowledgeBase:
# STEP 3: load aliases # STEP 3: load aliases
cdef int64_t nr_aliases cdef int64_t nr_aliases
reader.read_alias_length(&nr_aliases) reader.read_alias_length(&nr_aliases)
self.initialize_aliases(nr_aliases) self._initialize_aliases(nr_aliases)
cdef int64_t nr_candidates cdef int64_t nr_candidates
cdef vector[int64_t] entry_indices cdef vector[int64_t] entry_indices

View File

@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...attrs import LANG from ...attrs import LANG
from ...util import update_exc from ...util import update_exc
@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
stop_words = STOP_WORDS stop_words = STOP_WORDS
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Bulgarian(Language): class Bulgarian(Language):

View File

@ -258,6 +258,10 @@ ALPHA = group_chars(
ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased)
ALPHA_UPPER = group_chars(_upper + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased)
_combining_diacritics = r"\u0300-\u036f"
COMBINING_DIACRITICS = _combining_diacritics
_units = ( _units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft " "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
"kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb " "kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "

View File

@ -35,7 +35,7 @@ for pron in ["i"]:
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, NORM: pron}, {ORTH: orth, NORM: pron},
{ORTH: "m", "tenspect": 1, "number": 1}, {ORTH: "m"},
] ]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
@ -139,20 +139,21 @@ for pron in ["he", "she", "it"]:
# W-words, relative pronouns, prepositions etc. # W-words, relative pronouns, prepositions etc.
for word in [ for word, morph in [
"who", ("who", None),
"what", ("what", None),
"when", ("when", None),
"where", ("where", None),
"why", ("why", None),
"how", ("how", None),
"there", ("there", None),
"that", ("that", "Number=Sing|Person=3"),
"this", ("this", "Number=Sing|Person=3"),
"these", ("these", "Number=Plur|Person=3"),
"those", ("those", "Number=Plur|Person=3"),
]: ]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
if morph != "Number=Plur|Person=3":
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'s", NORM: "'s"}, {ORTH: "'s", NORM: "'s"},
@ -182,6 +183,7 @@ for word in [
{ORTH: "ve", NORM: "have"}, {ORTH: "ve", NORM: "have"},
] ]
if morph != "Number=Sing|Person=3":
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'re", NORM: "are"}, {ORTH: "'re", NORM: "are"},

View File

@ -1,5 +1,5 @@
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
] ]
) )
# Some languages, e.g. those written with the Cyrillic alphabet, permit the use
# of combining diacritics to mark stressed syllables in words where stress is
# distinctive. Such languages should use the COMBINING_DIACRITICS_TOKENIZER_*
# suffix and infix regex lists below in place of the standard ones.
#
# Every extra pattern uses a lookbehind for "letter + combining diacritic", so
# the punctuation is split off even when the preceding letter carries a mark.
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = [
    *TOKENIZER_SUFFIXES,
    # Period directly after an accented letter.
    r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
]

COMBINING_DIACRITICS_TOKENIZER_INFIXES = [
    *TOKENIZER_INFIXES,
    # Period between an accented lowercase letter and an uppercase letter/quote.
    r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
    ),
    # Comma between an accented letter and another letter.
    r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
    # Hyphen-like character between an accented letter and another letter.
    r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
        a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
    ),
    # One of : < > = / between an accented letter and another letter.
    r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
]

View File

@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer from .lemmatizer import RussianLemmatizer
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Russian(Language): class Russian(Language):

View File

@ -6,6 +6,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import UkrainianLemmatizer from .lemmatizer import UkrainianLemmatizer
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
@ -13,6 +15,8 @@ class UkrainianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Ukrainian(Language): class Ukrainian(Language):

View File

@ -1,4 +1,4 @@
from typing import Iterator, Optional, Any, Dict, Callable, Iterable from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Collection
from typing import Union, Tuple, List, Set, Pattern, Sequence from typing import Union, Tuple, List, Set, Pattern, Sequence
from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
@ -1694,6 +1694,7 @@ class Language:
*, *,
vocab: Union[Vocab, bool] = True, vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = SimpleFrozenList(), disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(),
meta: Dict[str, Any] = SimpleFrozenDict(), meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True, auto_fill: bool = True,
@ -1708,6 +1709,8 @@ class Language:
disable (Iterable[str]): Names of pipeline components to disable. disable (Iterable[str]): Names of pipeline components to disable.
Disabled pipes will be loaded but they won't be run unless you Disabled pipes will be loaded but they won't be run unless you
explicitly enable them by calling nlp.enable_pipe. explicitly enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. exclude (Iterable[str]): Names of pipeline components to exclude.
Excluded components won't be loaded. Excluded components won't be loaded.
meta (Dict[str, Any]): Meta overrides for nlp.meta. meta (Dict[str, Any]): Meta overrides for nlp.meta.
@ -1861,8 +1864,15 @@ class Language:
# Restore the original vocab after sourcing if necessary # Restore the original vocab after sourcing if necessary
if vocab_b is not None: if vocab_b is not None:
nlp.vocab.from_bytes(vocab_b) nlp.vocab.from_bytes(vocab_b)
disabled_pipes = [*config["nlp"]["disabled"], *disable]
# Resolve disabled/enabled settings.
disabled_pipes = cls._resolve_component_status(
[*config["nlp"]["disabled"], *disable],
[*config["nlp"].get("enabled", []), *enable],
config["nlp"]["pipeline"],
)
nlp._disabled = set(p for p in disabled_pipes if p not in exclude) nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.batch_size = config["nlp"]["batch_size"] nlp.batch_size = config["nlp"]["batch_size"]
nlp.config = filled if auto_fill else config nlp.config = filled if auto_fill else config
if after_pipeline_creation is not None: if after_pipeline_creation is not None:
@ -2014,6 +2024,42 @@ class Language:
serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
util.to_disk(path, serializers, exclude) util.to_disk(path, serializers, exclude)
@staticmethod
def _resolve_component_status(
    disable: Iterable[str], enable: Iterable[str], pipe_names: Collection[str]
) -> Tuple[str, ...]:
    """Check that `disable` and `enable` are consistent and resolve them to a
    single collection of disabled components. Raises an error in case of
    inconsistency.

    disable (Iterable[str]): Names of pipeline components to disable. A single
        name may be passed as a plain string; None is treated as empty.
    enable (Iterable[str]): Names of pipeline components to enable. If any are
        given, every name in `pipe_names` that is not listed here is disabled.
        A single name may be passed as a plain string.
    pipe_names (Collection[str]): Names of all pipeline components.
    RETURNS (Tuple[str, ...]): Names of the components to disable. Follows the
        order of `pipe_names` if `enable` is given, otherwise the order of
        `disable`.
    RAISES (ValueError): If both `enable` and `disable` are given and they do
        not describe the same set of disabled components (E1042).
    """
    # A bare string is one component name, not an iterable of characters:
    # without this, `"n" in "ner"` would be a substring test.
    # Materialise both arguments so one-shot iterables can be read repeatedly.
    disable = [disable] if isinstance(disable, str) else list(disable or ())
    enable = [enable] if isinstance(enable, str) else list(enable or ())

    if not enable:
        return tuple(disable)

    enabled = set(enable)
    to_disable = [pipe_name for pipe_name in pipe_names if pipe_name not in enabled]
    # Compare as sets: the two settings conflict only if they name different
    # components, not if they differ in order or container type.
    if disable and set(disable) != set(to_disable):
        raise ValueError(
            Errors.E1042.format(
                arg1="enable",
                arg2="disable",
                arg1_values=enable,
                arg2_values=disable,
            )
        )

    return tuple(to_disable)
def from_disk( def from_disk(
self, self,
path: Union[str, Path], path: Union[str, Path],

View File

@ -85,7 +85,7 @@ class Table(OrderedDict):
value: The value to set. value: The value to set.
""" """
key = get_string_id(key) key = get_string_id(key)
OrderedDict.__setitem__(self, key, value) OrderedDict.__setitem__(self, key, value) # type:ignore[assignment]
self.bloom.add(key) self.bloom.add(key)
def set(self, key: Union[str, int], value: Any) -> None: def set(self, key: Union[str, int], value: Any) -> None:
@ -104,7 +104,7 @@ class Table(OrderedDict):
RETURNS: The value. RETURNS: The value.
""" """
key = get_string_id(key) key = get_string_id(key)
return OrderedDict.__getitem__(self, key) return OrderedDict.__getitem__(self, key) # type:ignore[index]
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any: def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
"""Get the value for a given key. String keys will be hashed. """Get the value for a given key. String keys will be hashed.
@ -114,7 +114,7 @@ class Table(OrderedDict):
RETURNS: The value. RETURNS: The value.
""" """
key = get_string_id(key) key = get_string_id(key)
return OrderedDict.get(self, key, default) return OrderedDict.get(self, key, default) # type:ignore[arg-type]
def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override] def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override]
"""Check whether a key is in the table. String keys will be hashed. """Check whether a key is in the table. String keys will be hashed.

View File

@ -90,6 +90,10 @@ cdef class Matcher:
'?': Make the pattern optional, by allowing it to match 0 or 1 times. '?': Make the pattern optional, by allowing it to match 0 or 1 times.
'+': Require the pattern to match 1 or more times. '+': Require the pattern to match 1 or more times.
'*': Allow the pattern to match zero or more times. '*': Allow the pattern to match zero or more times.
'{n}': Require the pattern to match exactly _n_ times.
'{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
'{n,}': Require the pattern to match at least _n_ times.
'{,m}': Require the pattern to match at most _m_ times.
The + and * operators return all possible matches (not just the greedy The + and * operators return all possible matches (not just the greedy
ones). However, the "greedy" argument can filter the final matches ones). However, the "greedy" argument can filter the final matches
@ -1004,8 +1008,29 @@ def _get_operators(spec):
return (ONE,) return (ONE,)
elif spec["OP"] in lookup: elif spec["OP"] in lookup:
return lookup[spec["OP"]] return lookup[spec["OP"]]
#Min_max {n,m}
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
# {n} --> {n,n} exactly n ONE,(n)
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
# {,m} --> {0,m} min of zero, max of m ZERO_ONE,(m)
# {n,} --> {n,∞} min of n, max of inf ONE,(n),ZERO_PLUS
min_max = spec["OP"][1:-1]
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
n, m = min_max.split(",")
#1. Either n or m is a blank string and the other is numeric --> isdecimal
#2. Both are numeric and n <= m
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
# if n is empty string, zero would be used
head = tuple(ONE for __ in range(int(n or 0)))
tail = tuple(ZERO_ONE for __ in range(int(m) - int(n or 0))) if m else (ZERO_PLUS,)
return head + tail
else: else:
keys = ", ".join(lookup.keys()) keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys)) raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))

View File

@ -22,9 +22,11 @@ def forward(model, X, is_train):
nP = model.get_dim("nP") nP = model.get_dim("nP")
nI = model.get_dim("nI") nI = model.get_dim("nI")
W = model.get_param("W") W = model.get_param("W")
Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True) # Preallocate array for layer output, including padding.
Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf)) Yf[0] = model.get_param("pad")
def backward(dY_ids): def backward(dY_ids):
# This backprop is particularly tricky, because we get back a different # This backprop is particularly tricky, because we get back a different

View File

@ -1,9 +1,14 @@
from functools import partial from typing import Type, Callable, Dict, TYPE_CHECKING, List, Optional, Set
from typing import Type, Callable, TYPE_CHECKING import functools
import inspect
import types
import warnings
from thinc.layers import with_nvtx_range from thinc.layers import with_nvtx_range
from thinc.model import Model, wrap_model_recursive from thinc.model import Model, wrap_model_recursive
from thinc.util import use_nvtx_range
from ..errors import Warnings
from ..util import registry from ..util import registry
if TYPE_CHECKING: if TYPE_CHECKING:
@ -11,29 +16,106 @@ if TYPE_CHECKING:
from ..language import Language # noqa: F401 from ..language import Language # noqa: F401
@registry.callbacks("spacy.models_with_nvtx_range.v1") DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
def create_models_with_nvtx_range( "pipe",
forward_color: int = -1, backprop_color: int = -1 "predict",
) -> Callable[["Language"], "Language"]: "set_annotations",
def models_with_nvtx_range(nlp): "update",
"rehearse",
"get_loss",
"initialize",
"begin_update",
"finish_update",
"update",
]
def models_with_nvtx_range(nlp, forward_color: int, backprop_color: int):
pipes = [ pipes = [
pipe pipe
for _, pipe in nlp.components for _, pipe in nlp.components
if hasattr(pipe, "is_trainable") and pipe.is_trainable if hasattr(pipe, "is_trainable") and pipe.is_trainable
] ]
# We need process all models jointly to avoid wrapping callbacks twice. seen_models: Set[int] = set()
models = Model( for pipe in pipes:
"wrap_with_nvtx_range", for node in pipe.model.walk():
forward=lambda model, X, is_train: ..., if id(node) in seen_models:
layers=[pipe.model for pipe in pipes], continue
) seen_models.add(id(node))
for node in models.walk():
with_nvtx_range( with_nvtx_range(
node, forward_color=forward_color, backprop_color=backprop_color node, forward_color=forward_color, backprop_color=backprop_color
) )
return nlp return nlp
return models_with_nvtx_range
@registry.callbacks("spacy.models_with_nvtx_range.v1")
def create_models_with_nvtx_range(
    forward_color: int = -1, backprop_color: int = -1
) -> Callable[["Language"], "Language"]:
    """Create a callback that applies `models_with_nvtx_range` to a pipeline.

    forward_color (int): Passed through as `forward_color`.
    backprop_color (int): Passed through as `backprop_color`.
    RETURNS (Callable[[Language], Language]): `models_with_nvtx_range` with
        both colors pre-bound, so it only needs the `nlp` object.
    """
    color_settings = {
        "forward_color": forward_color,
        "backprop_color": backprop_color,
    }
    return functools.partial(models_with_nvtx_range, **color_settings)
def nvtx_range_wrapper_for_pipe_method(self, func, *args, **kwargs):
    """Call `func(*args, **kwargs)` and return its result, inside an NVTX range
    labelled "<self.name> <func.__name__>".

    self: The object the wrapper is bound to; only its `name` is read.
    func (Callable): The callable to invoke.
    RETURNS: Whatever `func` returns.
    """
    # functools.partial objects are invoked directly, without a range (they
    # also carry no `__name__` to label one with). Presumably these are
    # methods that were already wrapped -- confirm against callers.
    if isinstance(func, functools.partial):
        return func(*args, **kwargs)

    range_label = f"{self.name} {func.__name__}"
    with use_nvtx_range(range_label):
        return func(*args, **kwargs)
def pipes_with_nvtx_range(
    nlp, additional_pipe_functions: Optional[Dict[str, List[str]]]
):
    """Wrap selected methods of every pipeline component so that each call
    goes through `nvtx_range_wrapper_for_pipe_method`.

    nlp: The pipeline object; its `components` are iterated as
        (name, pipe) pairs and the pipes are modified in place.
    additional_pipe_functions (Optional[Dict[str, List[str]]]): Extra method
        names to wrap, keyed by `pipe.name`, in addition to
        DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS.
    RETURNS: The same `nlp` object.

    Warns with W121 if an explicitly requested method does not exist on the
    pipe, and with W122 if the wrapped method cannot be set on the pipe.
    """
    for _, pipe in nlp.components:
        if additional_pipe_functions:
            extra_funcs = additional_pipe_functions.get(pipe.name, [])
        else:
            extra_funcs = []

        # Drop repeated names (keeping first-seen order) so that no method is
        # wrapped twice; the default list itself can contain repeats.
        method_names = dict.fromkeys(
            [*DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS, *extra_funcs]
        )
        for name in method_names:
            func = getattr(pipe, name, None)
            if func is None:
                # Missing default methods are expected; only warn about
                # methods the user explicitly asked for.
                if name in extra_funcs:
                    warnings.warn(Warnings.W121.format(method=name, pipe=pipe.name))
                continue

            wrapped_func = functools.partial(
                types.MethodType(nvtx_range_wrapper_for_pipe_method, pipe), func
            )

            # Try to preserve the original function signature. This is best
            # effort (not every callable is introspectable), but it must not
            # swallow KeyboardInterrupt/SystemExit like a bare `except` would.
            try:
                wrapped_func.__signature__ = inspect.signature(func)  # type: ignore
            except Exception:
                pass

            try:
                setattr(
                    pipe,
                    name,
                    wrapped_func,
                )
            except AttributeError:
                warnings.warn(Warnings.W122.format(method=name, pipe=pipe.name))

    return nlp
@registry.callbacks("spacy.models_and_pipes_with_nvtx_range.v1")
def create_models_and_pipes_with_nvtx_range(
    forward_color: int = -1,
    backprop_color: int = -1,
    additional_pipe_functions: Optional[Dict[str, List[str]]] = None,
) -> Callable[["Language"], "Language"]:
    """Create a callback that applies `models_with_nvtx_range` and then
    `pipes_with_nvtx_range` to a pipeline.

    forward_color (int): Passed to `models_with_nvtx_range`.
    backprop_color (int): Passed to `models_with_nvtx_range`.
    additional_pipe_functions (Optional[Dict[str, List[str]]]): Passed to
        `pipes_with_nvtx_range`.
    RETURNS (Callable[[Language], Language]): The callback; it returns the
        result of `pipes_with_nvtx_range`.
    """

    def annotate_models_and_pipes(nlp):
        # Models first, then pipe methods -- same order as before.
        with_models = models_with_nvtx_range(nlp, forward_color, backprop_color)
        return pipes_with_nvtx_range(with_models, additional_pipe_functions)

    return annotate_models_and_pipes

View File

@ -23,7 +23,7 @@ def build_nel_encoder(
((tok2vec >> list2ragged()) & build_span_maker()) ((tok2vec >> list2ragged()) & build_span_maker())
>> extract_spans() >> extract_spans()
>> reduce_mean() >> reduce_mean()
>> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type] >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore
>> output_layer >> output_layer
) )
model.set_ref("output_layer", output_layer) model.set_ref("output_layer", output_layer)

View File

@ -72,7 +72,7 @@ def build_tb_parser_model(
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain( tok2vec = chain(
tok2vec, tok2vec,
cast(Model[List["Floats2d"], Floats2d], list2array()), list2array(),
Linear(hidden_width, t2v_width), Linear(hidden_width, t2v_width),
) )
tok2vec.set_dim("nO", hidden_width) tok2vec.set_dim("nO", hidden_width)

View File

@ -1,5 +1,5 @@
from typing import Optional, List, cast
from functools import partial from functools import partial
from typing import Optional, List
from thinc.types import Floats2d from thinc.types import Floats2d
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
@ -59,7 +59,8 @@ def build_simple_cnn_text_classifier(
resizable_layer=resizable_layer, resizable_layer=resizable_layer,
) )
model.set_ref("tok2vec", tok2vec) model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nO) # type: ignore # TODO: remove type ignore once Thinc has been updated if nO is not None:
model.set_dim("nO", cast(int, nO))
model.attrs["multi_label"] = not exclusive_classes model.attrs["multi_label"] = not exclusive_classes
return model return model
@ -85,7 +86,7 @@ def build_bow_text_classifier(
if not no_output_layer: if not no_output_layer:
fill_defaults["b"] = NEG_VALUE fill_defaults["b"] = NEG_VALUE
output_layer = softmax_activation() if exclusive_classes else Logistic() output_layer = softmax_activation() if exclusive_classes else Logistic()
resizable_layer = resizable( # type: ignore[var-annotated] resizable_layer: Model[Floats2d, Floats2d] = resizable(
sparse_linear, sparse_linear,
resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults), resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults),
) )
@ -93,7 +94,8 @@ def build_bow_text_classifier(
model = with_cpu(model, model.ops) model = with_cpu(model, model.ops)
if output_layer: if output_layer:
model = model >> with_cpu(output_layer, output_layer.ops) model = model >> with_cpu(output_layer, output_layer.ops)
model.set_dim("nO", nO) # type: ignore[arg-type] if nO is not None:
model.set_dim("nO", cast(int, nO))
model.set_ref("output_layer", sparse_linear) model.set_ref("output_layer", sparse_linear)
model.attrs["multi_label"] = not exclusive_classes model.attrs["multi_label"] = not exclusive_classes
model.attrs["resize_output"] = partial( model.attrs["resize_output"] = partial(
@ -129,8 +131,8 @@ def build_text_classifier_v2(
output_layer = Linear(nO=nO, nI=nO_double) >> Logistic() output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
model = (linear_model | cnn_model) >> output_layer model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec) model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False: if model.has_dim("nO") is not False and nO is not None:
model.set_dim("nO", nO) # type: ignore[arg-type] model.set_dim("nO", cast(int, nO))
model.set_ref("output_layer", linear_model.get_ref("output_layer")) model.set_ref("output_layer", linear_model.get_ref("output_layer"))
model.set_ref("attention_layer", attention_layer) model.set_ref("attention_layer", attention_layer)
model.set_ref("maxout_layer", maxout_layer) model.set_ref("maxout_layer", maxout_layer)
@ -164,7 +166,7 @@ def build_text_classifier_lowdata(
>> list2ragged() >> list2ragged()
>> ParametricAttention(width) >> ParametricAttention(width)
>> reduce_sum() >> reduce_sum()
>> residual(Relu(width, width)) ** 2 # type: ignore[arg-type] >> residual(Relu(width, width)) ** 2
>> Linear(nO, width) >> Linear(nO, width)
) )
if dropout: if dropout:

View File

@ -1,5 +1,5 @@
from typing import Optional, List, Union, cast from typing import Optional, List, Union, cast
from thinc.types import Floats2d, Ints2d, Ragged from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
from thinc.api import chain, clone, concatenate, with_array, with_padded from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
@ -159,7 +159,7 @@ def MultiHashEmbed(
embeddings = [make_hash_embed(i) for i in range(len(attrs))] embeddings = [make_hash_embed(i) for i in range(len(attrs))]
concat_size = width * (len(embeddings) + include_static_vectors) concat_size = width * (len(embeddings) + include_static_vectors)
max_out: Model[Ragged, Ragged] = with_array( max_out: Model[Ragged, Ragged] = with_array(
Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True) # type: ignore Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)
) )
if include_static_vectors: if include_static_vectors:
feature_extractor: Model[List[Doc], Ragged] = chain( feature_extractor: Model[List[Doc], Ragged] = chain(
@ -173,7 +173,7 @@ def MultiHashEmbed(
StaticVectors(width, dropout=0.0), StaticVectors(width, dropout=0.0),
), ),
max_out, max_out,
cast(Model[Ragged, List[Floats2d]], ragged2list()), ragged2list(),
) )
else: else:
model = chain( model = chain(
@ -181,7 +181,7 @@ def MultiHashEmbed(
cast(Model[List[Ints2d], Ragged], list2ragged()), cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)), with_array(concatenate(*embeddings)),
max_out, max_out,
cast(Model[Ragged, List[Floats2d]], ragged2list()), ragged2list(),
) )
return model return model
@ -232,12 +232,12 @@ def CharacterEmbed(
feature_extractor: Model[List[Doc], Ragged] = chain( feature_extractor: Model[List[Doc], Ragged] = chain(
FeatureExtractor([feature]), FeatureExtractor([feature]),
cast(Model[List[Ints2d], Ragged], list2ragged()), cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), # type: ignore with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), # type: ignore[misc]
) )
max_out: Model[Ragged, Ragged] max_out: Model[Ragged, Ragged]
if include_static_vectors: if include_static_vectors:
max_out = with_array( max_out = with_array(
Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) # type: ignore Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
) )
model = chain( model = chain(
concatenate( concatenate(
@ -246,11 +246,11 @@ def CharacterEmbed(
StaticVectors(width, dropout=0.0), StaticVectors(width, dropout=0.0),
), ),
max_out, max_out,
cast(Model[Ragged, List[Floats2d]], ragged2list()), ragged2list(),
) )
else: else:
max_out = with_array( max_out = with_array(
Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) # type: ignore Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
) )
model = chain( model = chain(
concatenate( concatenate(
@ -258,7 +258,7 @@ def CharacterEmbed(
feature_extractor, feature_extractor,
), ),
max_out, max_out,
cast(Model[Ragged, List[Floats2d]], ragged2list()), ragged2list(),
) )
return model return model
@ -289,10 +289,10 @@ def MaxoutWindowEncoder(
normalize=True, normalize=True,
), ),
) )
model = clone(residual(cnn), depth) # type: ignore[arg-type] model = clone(residual(cnn), depth)
model.set_dim("nO", width) model.set_dim("nO", width)
receptive_field = window_size * depth receptive_field = window_size * depth
return with_array(model, pad=receptive_field) # type: ignore[arg-type] return with_array(model, pad=receptive_field)
@registry.architectures("spacy.MishWindowEncoder.v2") @registry.architectures("spacy.MishWindowEncoder.v2")
@ -313,9 +313,9 @@ def MishWindowEncoder(
expand_window(window_size=window_size), expand_window(window_size=window_size),
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
) )
model = clone(residual(cnn), depth) # type: ignore[arg-type] model = clone(residual(cnn), depth)
model.set_dim("nO", width) model.set_dim("nO", width)
return with_array(model) # type: ignore[arg-type] return with_array(model)
@registry.architectures("spacy.TorchBiLSTMEncoder.v1") @registry.architectures("spacy.TorchBiLSTMEncoder.v1")

View File

@ -1,4 +1,5 @@
from libc.string cimport memset, memcpy from libc.string cimport memset, memcpy
from thinc.backends.cblas cimport CBlas
from ..typedefs cimport weight_t, hash_t from ..typedefs cimport weight_t, hash_t
from ..pipeline._parser_internals._state cimport StateC from ..pipeline._parser_internals._state cimport StateC
@ -38,7 +39,7 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
cdef void free_activations(const ActivationsC* A) nogil cdef void free_activations(const ActivationsC* A) nogil
cdef void predict_states(ActivationsC* A, StateC** states, cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil const WeightsC* W, SizesC n) nogil
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil

View File

@ -4,11 +4,11 @@ from libc.math cimport exp
from libc.string cimport memset, memcpy from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc from libc.stdlib cimport calloc, free, realloc
from thinc.backends.linalg cimport Vec, VecVec from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy from thinc.backends.cblas cimport saxpy, sgemm
import numpy import numpy
import numpy.random import numpy.random
from thinc.api import Model, CupyOps, NumpyOps from thinc.api import Model, CupyOps, NumpyOps, get_ops
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors
@ -91,7 +91,7 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
A._curr_size = n.states A._curr_size = n.states
cdef void predict_states(ActivationsC* A, StateC** states, cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil: const WeightsC* W, SizesC n) nogil:
cdef double one = 1.0 cdef double one = 1.0
resize_activations(A, n) resize_activations(A, n)
@ -99,7 +99,7 @@ cdef void predict_states(ActivationsC* A, StateC** states,
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats) states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float)) memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float)) memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
sum_state_features(A.unmaxed, sum_state_features(cblas, A.unmaxed,
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces) W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
for i in range(n.states): for i in range(n.states):
VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces], VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
@ -113,12 +113,10 @@ cdef void predict_states(ActivationsC* A, StateC** states,
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float)) memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
else: else:
# Compute hidden-to-output # Compute hidden-to-output
blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE, sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
n.states, n.classes, n.hiddens, one, 1.0, <const float *>A.hiddens, n.hiddens,
<float*>A.hiddens, n.hiddens, 1, <const float *>W.hidden_weights, n.hiddens,
<float*>W.hidden_weights, n.hiddens, 1, 0.0, A.scores, n.classes)
one,
<float*>A.scores, n.classes, 1)
# Add bias # Add bias
for i in range(n.states): for i in range(n.states):
VecVec.add_i(&A.scores[i*n.classes], VecVec.add_i(&A.scores[i*n.classes],
@ -135,7 +133,7 @@ cdef void predict_states(ActivationsC* A, StateC** states,
A.scores[i*n.classes+j] = min_ A.scores[i*n.classes+j] = min_
cdef void sum_state_features(float* output, cdef void sum_state_features(CBlas cblas, float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil: const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i cdef int idx, b, f, i
cdef const float* feature cdef const float* feature
@ -150,9 +148,7 @@ cdef void sum_state_features(float* output,
else: else:
idx = token_ids[f] * id_stride + f*O idx = token_ids[f] * id_stride + f*O
feature = &cached[idx] feature = &cached[idx]
blis.cy.axpyv(blis.cy.NO_CONJUGATE, O, one, saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
<float*>feature, 1,
&output[b*O], 1)
token_ids += F token_ids += F
@ -443,9 +439,15 @@ cdef class precompute_hiddens:
# - Output from backward on GPU # - Output from backward on GPU
bp_hiddens = self._bp_hiddens bp_hiddens = self._bp_hiddens
cdef CBlas cblas
if isinstance(self.ops, CupyOps):
cblas = NUMPY_OPS.cblas()
else:
cblas = self.ops.cblas()
feat_weights = self.get_feat_weights() feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids cdef int[:, ::1] ids = token_ids
sum_state_features(<float*>state_vector.data, sum_state_features(cblas, <float*>state_vector.data,
feat_weights, &ids[0,0], feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP) token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector += self.bias state_vector += self.bias

View File

@ -40,17 +40,15 @@ def forward(
if not token_count: if not token_count:
return _handle_empty(model.ops, model.get_dim("nO")) return _handle_empty(model.ops, model.get_dim("nO"))
key_attr: int = model.attrs["key_attr"] key_attr: int = model.attrs["key_attr"]
keys: Ints1d = model.ops.flatten( keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
cast(Sequence, [doc.to_array(key_attr) for doc in docs])
)
vocab: Vocab = docs[0].vocab vocab: Vocab = docs[0].vocab
W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
if vocab.vectors.mode == Mode.default: if vocab.vectors.mode == Mode.default:
V = cast(Floats2d, model.ops.asarray(vocab.vectors.data)) V = model.ops.asarray(vocab.vectors.data)
rows = vocab.vectors.find(keys=keys) rows = vocab.vectors.find(keys=keys)
V = model.ops.as_contig(V[rows]) V = model.ops.as_contig(V[rows])
elif vocab.vectors.mode == Mode.floret: elif vocab.vectors.mode == Mode.floret:
V = cast(Floats2d, vocab.vectors.get_batch(keys)) V = vocab.vectors.get_batch(keys)
V = model.ops.as_contig(V) V = model.ops.as_contig(V)
else: else:
raise RuntimeError(Errors.E896) raise RuntimeError(Errors.E896)
@ -62,9 +60,7 @@ def forward(
# Convert negative indices to 0-vectors # Convert negative indices to 0-vectors
# TODO: more options for UNK tokens # TODO: more options for UNK tokens
vectors_data[rows < 0] = 0 vectors_data[rows < 0] = 0
output = Ragged( output = Ragged(vectors_data, model.ops.asarray1i([len(doc) for doc in docs]))
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
)
mask = None mask = None
if is_train: if is_train:
mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
@ -77,7 +73,9 @@ def forward(
model.inc_grad( model.inc_grad(
"W", "W",
model.ops.gemm( model.ops.gemm(
cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True cast(Floats2d, d_output.data),
cast(Floats2d, model.ops.as_contig(V)),
trans1=True,
), ),
) )
return [] return []

View File

@ -10,6 +10,7 @@ from ...strings cimport hash_string
from ...structs cimport TokenC from ...structs cimport TokenC
from ...tokens.doc cimport Doc, set_children_from_heads from ...tokens.doc cimport Doc, set_children_from_heads
from ...tokens.token cimport MISSING_DEP from ...tokens.token cimport MISSING_DEP
from ...training import split_bilu_label
from ...training.example cimport Example from ...training.example cimport Example
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC, ArcC from ._state cimport StateC, ArcC
@ -687,7 +688,7 @@ cdef class ArcEager(TransitionSystem):
return self.c[name_or_id] return self.c[name_or_id]
name = name_or_id name = name_or_id
if '-' in name: if '-' in name:
move_str, label_str = name.split('-', 1) move_str, label_str = split_bilu_label(name)
label = self.strings[label_str] label = self.strings[label_str]
else: else:
move_str = name move_str = name

View File

@ -13,6 +13,7 @@ from ...typedefs cimport weight_t, attr_t
from ...lexeme cimport Lexeme from ...lexeme cimport Lexeme
from ...attrs cimport IS_SPACE from ...attrs cimport IS_SPACE
from ...structs cimport TokenC, SpanC from ...structs cimport TokenC, SpanC
from ...training import split_bilu_label
from ...training.example cimport Example from ...training.example cimport Example
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
@ -182,7 +183,7 @@ cdef class BiluoPushDown(TransitionSystem):
if name == '-' or name == '' or name is None: if name == '-' or name == '' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0) return Transition(clas=0, move=MISSING, label=0, score=0)
elif '-' in name: elif '-' in name:
move_str, label_str = name.split('-', 1) move_str, label_str = split_bilu_label(name)
# Deprecated, hacky way to denote 'not this entity' # Deprecated, hacky way to denote 'not this entity'
if label_str.startswith('!'): if label_str.startswith('!'):
raise ValueError(Errors.E869.format(label=name)) raise ValueError(Errors.E869.format(label=name))

View File

@ -0,0 +1,11 @@
#ifndef NONPROJ_HH
#define NONPROJ_HH
#include <stdexcept>
#include <string>
// Throw a std::domain_error whose what() text is `msg`.
//
// Declared `inline` because the definition lives in a header: without it,
// every translation unit that includes nonproj.hh emits its own external
// definition, and linking two such units fails with a duplicate-symbol
// error (One Definition Rule).
inline void raise_domain_error(std::string const &msg) {
    throw std::domain_error(msg);
}
#endif // NONPROJ_HH

View File

@ -0,0 +1,4 @@
from libcpp.string cimport string
# Expose the C++ helper defined in nonproj.hh to Cython code.
# `nogil` lets it be called from nogil sections; `except +` tells Cython to
# translate a C++ exception thrown by the call into a Python exception
# (per the Cython docs std::domain_error maps to ValueError — confirm against
# the Cython version in use).
cdef extern from "nonproj.hh":
cdef void raise_domain_error(const string& msg) nogil except +

View File

@ -4,10 +4,13 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme. scheme.
""" """
from copy import copy from copy import copy
from cython.operator cimport preincrement as incr, dereference as deref
from libc.limits cimport INT_MAX from libc.limits cimport INT_MAX
from libc.stdlib cimport abs from libc.stdlib cimport abs
from libcpp cimport bool from libcpp cimport bool
from libcpp.string cimport string, to_string
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libcpp.unordered_set cimport unordered_set
from ...tokens.doc cimport Doc, set_children_from_heads from ...tokens.doc cimport Doc, set_children_from_heads
@ -49,7 +52,7 @@ def is_nonproj_arc(tokenid, heads):
return _is_nonproj_arc(tokenid, c_heads) return _is_nonproj_arc(tokenid, c_heads)
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil: cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil except *:
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
# if there is a token k, h < k < d such that h is not # if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d # an ancestor of k. Same for h -> d, h > d
@ -65,25 +68,49 @@ cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
else: else:
start, end = (tokenid+1, head) start, end = (tokenid+1, head)
for k in range(start, end): for k in range(start, end):
if _has_head_as_ancestor(k, head, heads): if not _has_head_as_ancestor(k, head, heads):
continue
else: # head not in ancestors: d -> h is non-projective
return True return True
return False return False
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil: cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil except *:
ancestor = tokenid ancestor = tokenid
cnt = 0 cdef unordered_set[int] seen_tokens
while cnt < heads.size(): seen_tokens.insert(ancestor)
while True:
# Reached the head or a disconnected node
if heads[ancestor] == head or heads[ancestor] < 0: if heads[ancestor] == head or heads[ancestor] < 0:
return True return True
# Reached the root
if heads[ancestor] == ancestor:
return False
ancestor = heads[ancestor] ancestor = heads[ancestor]
cnt += 1 result = seen_tokens.insert(ancestor)
# Found cycle
if not result.second:
raise_domain_error(heads_to_string(heads))
return False return False
cdef string heads_to_string(const vector[int]& heads) nogil:
    # Build the text used to report a cycle: a fixed prefix followed by the
    # head indices, comma-separated and wrapped in square brackets, e.g.
    # "Found cycle in dependency graph: [1, 2, 0]".
    cdef string message
    cdef size_t idx

    message.append("Found cycle in dependency graph: [")
    # FIXME: Rewrite using ostringstream when available in Cython.
    for idx in range(heads.size()):
        # Separator goes before every element except the first.
        if idx > 0:
            message.append(", ")
        message.append(to_string(heads[idx]))
    message.append("]")

    return message
def is_nonproj_tree(heads): def is_nonproj_tree(heads):
cdef vector[int] c_heads = _heads_to_c(heads) cdef vector[int] c_heads = _heads_to_c(heads)
# a tree is non-projective if at least one arc is non-projective # a tree is non-projective if at least one arc is non-projective
@ -176,11 +203,12 @@ def get_smallest_nonproj_arc_slow(heads):
return _get_smallest_nonproj_arc(c_heads) return _get_smallest_nonproj_arc(c_heads)
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil: cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil except -2:
# return the smallest non-proj arc or None # return the smallest non-proj arc or None
# where size is defined as the distance between dep and head # where size is defined as the distance between dep and head
# and ties are broken left to right # and ties are broken left to right
cdef int smallest_size = INT_MAX cdef int smallest_size = INT_MAX
# -1 means it's already projective.
cdef int smallest_np_arc = -1 cdef int smallest_np_arc = -1
cdef int size cdef int size
cdef int tokenid cdef int tokenid

View File

@ -12,6 +12,7 @@ from ..language import Language
from ._parser_internals import nonproj from ._parser_internals import nonproj
from ._parser_internals.nonproj import DELIMITER from ._parser_internals.nonproj import DELIMITER
from ..scorer import Scorer from ..scorer import Scorer
from ..training import remove_bilu_prefix
from ..util import registry from ..util import registry
@ -314,7 +315,7 @@ cdef class DependencyParser(Parser):
# Get the labels from the model by looking at the available moves # Get the labels from the model by looking at the available moves
for move in self.move_names: for move in self.move_names:
if "-" in move: if "-" in move:
label = move.split("-")[1] label = remove_bilu_prefix(move)
if DELIMITER in label: if DELIMITER in label:
label = label.split(DELIMITER)[1] label = label.split(DELIMITER)[1]
labels.add(label) labels.add(label)

View File

@ -138,7 +138,7 @@ class EditTreeLemmatizer(TrainablePipe):
truths.append(eg_truths) truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths) # type: ignore d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss): if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name)) raise ValueError(Errors.E910.format(name=self.name))

View File

@ -56,6 +56,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"overwrite": True, "overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True, "use_gold_ents": True,
"threshold": None,
}, },
default_score_weights={ default_score_weights={
"nel_micro_f": 1.0, "nel_micro_f": 1.0,
@ -77,6 +78,7 @@ def make_entity_linker(
overwrite: bool, overwrite: bool,
scorer: Optional[Callable], scorer: Optional[Callable],
use_gold_ents: bool, use_gold_ents: bool,
threshold: Optional[float] = None,
): ):
"""Construct an EntityLinker component. """Construct an EntityLinker component.
@ -91,6 +93,10 @@ def make_entity_linker(
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
""" """
if not model.attrs.get("include_span_maker", False): if not model.attrs.get("include_span_maker", False):
@ -121,6 +127,7 @@ def make_entity_linker(
overwrite=overwrite, overwrite=overwrite,
scorer=scorer, scorer=scorer,
use_gold_ents=use_gold_ents, use_gold_ents=use_gold_ents,
threshold=threshold,
) )
@ -156,6 +163,7 @@ class EntityLinker(TrainablePipe):
overwrite: bool = BACKWARD_OVERWRITE, overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score, scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool, use_gold_ents: bool,
threshold: Optional[float] = None,
) -> None: ) -> None:
"""Initialize an entity linker. """Initialize an entity linker.
@ -174,9 +182,20 @@ class EntityLinker(TrainablePipe):
Scorer.score_links. Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations. component must provide entity annotations.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
DOCS: https://spacy.io/api/entitylinker#init DOCS: https://spacy.io/api/entitylinker#init
""" """
if threshold is not None and not (0 <= threshold <= 1):
raise ValueError(
Errors.E1043.format(
range_start=0,
range_end=1,
value=threshold,
)
)
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
self.name = name self.name = name
@ -192,6 +211,7 @@ class EntityLinker(TrainablePipe):
self.kb = empty_kb(entity_vector_length)(self.vocab) self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer self.scorer = scorer
self.use_gold_ents = use_gold_ents self.use_gold_ents = use_gold_ents
self.threshold = threshold
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will """Define the KB of this pipe by providing a function that will
@ -355,7 +375,7 @@ class EntityLinker(TrainablePipe):
keep_ents.append(eidx) keep_ents.append(eidx)
eidx += 1 eidx += 1
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") entity_encodings = self.model.ops.asarray2f(entity_encodings, dtype="float32")
selected_encodings = sentence_encodings[keep_ents] selected_encodings = sentence_encodings[keep_ents]
# if there are no matches, short circuit # if there are no matches, short circuit
@ -368,13 +388,12 @@ class EntityLinker(TrainablePipe):
method="get_loss", msg="gold entities do not match up" method="get_loss", msg="gold entities do not match up"
) )
raise RuntimeError(err) raise RuntimeError(err)
# TODO: fix typing issue here gradients = self.distance.get_grad(selected_encodings, entity_encodings)
gradients = self.distance.get_grad(selected_encodings, entity_encodings) # type: ignore
# to match the input size, we need to give a zero gradient for items not in the kb # to match the input size, we need to give a zero gradient for items not in the kb
out = self.model.ops.alloc2f(*sentence_encodings.shape) out = self.model.ops.alloc2f(*sentence_encodings.shape)
out[keep_ents] = gradients out[keep_ents] = gradients
loss = self.distance.get_loss(selected_encodings, entity_encodings) # type: ignore loss = self.distance.get_loss(selected_encodings, entity_encodings)
loss = loss / len(entity_encodings) loss = loss / len(entity_encodings)
return float(loss), out return float(loss), out
@ -391,18 +410,21 @@ class EntityLinker(TrainablePipe):
self.validate_kb() self.validate_kb()
entity_count = 0 entity_count = 0
final_kb_ids: List[str] = [] final_kb_ids: List[str] = []
xp = self.model.ops.xp
if not docs: if not docs:
return final_kb_ids return final_kb_ids
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
if len(doc) == 0:
continue
sentences = [s for s in doc.sents] sentences = [s for s in doc.sents]
if len(doc) > 0:
# Looping through each entity (TODO: rewrite) # Looping through each entity (TODO: rewrite)
for ent in doc.ents: for ent in doc.ents:
sent = ent.sent sent_index = sentences.index(ent.sent)
sent_index = sentences.index(sent)
assert sent_index >= 0 assert sent_index >= 0
if self.incl_context:
# get n_neighbour sentences, clipped to the length of the document # get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents) start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
@ -410,8 +432,6 @@ class EntityLinker(TrainablePipe):
end_token = sentences[end_sentence].end end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc() sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined) # currently, the context is the same for each entity in a sentence (should be refined)
xp = self.model.ops.xp
if self.incl_context:
sentence_encoding = self.model.predict([sent_doc])[0] sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t) sentence_norm = xp.linalg.norm(sentence_encoding_t)
@ -424,9 +444,8 @@ class EntityLinker(TrainablePipe):
if not candidates: if not candidates:
# no prediction possible for this entity - setting to NIL # no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL) final_kb_ids.append(self.NIL)
elif len(candidates) == 1: elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate # shortcut for efficiency reasons: take the 1 candidate
# TODO: thresholding
final_kb_ids.append(candidates[0].entity_) final_kb_ids.append(candidates[0].entity_)
else: else:
random.shuffle(candidates) random.shuffle(candidates)
@ -455,10 +474,11 @@ class EntityLinker(TrainablePipe):
if sims.shape != prior_probs.shape: if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161) raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims) scores = prior_probs + sims - (prior_probs * sims)
# TODO: thresholding final_kb_ids.append(
best_index = scores.argmax().item() candidates[scores.argmax().item()].entity_
best_candidate = candidates[best_index] if self.threshold is None or scores.max() >= self.threshold
final_kb_ids.append(best_candidate.entity_) else EntityLinker.NIL
)
if not (len(final_kb_ids) == entity_count): if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format( err = Errors.E147.format(
method="predict", msg="result variables not of equal length" method="predict", msg="result variables not of equal length"

View File

@ -159,10 +159,8 @@ class EntityRuler(Pipe):
self._require_patterns() self._require_patterns()
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W036") warnings.filterwarnings("ignore", message="\\[W036")
matches = cast( matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
List[Tuple[int, int, int]],
list(self.matcher(doc)) + list(self.phrase_matcher(doc)),
)
final_matches = set( final_matches = set(
[(m_id, start, end) for m_id, start, end in matches if start != end] [(m_id, start, end) for m_id, start, end in matches if start != end]
) )

View File

@ -7,7 +7,7 @@ from pathlib import Path
from itertools import islice from itertools import islice
import srsly import srsly
import random import random
from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import CosineDistance, Model, Optimizer
from thinc.api import set_dropout_rate from thinc.api import set_dropout_rate
import warnings import warnings
@ -20,7 +20,7 @@ from ...language import Language
from ...vocab import Vocab from ...vocab import Vocab
from ...training import Example, validate_examples, validate_get_examples from ...training import Example, validate_examples, validate_get_examples
from ...errors import Errors, Warnings from ...errors import Errors, Warnings
from ...util import SimpleFrozenList, registry from ...util import SimpleFrozenList
from ... import util from ... import util
from ...scorer import Scorer from ...scorer import Scorer
@ -70,7 +70,6 @@ class EntityLinker_v1(TrainablePipe):
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links. Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init DOCS: https://spacy.io/api/entitylinker#init
""" """
self.vocab = vocab self.vocab = vocab
@ -213,15 +212,14 @@ class EntityLinker_v1(TrainablePipe):
if kb_id: if kb_id:
entity_encoding = self.kb.get_vector(kb_id) entity_encoding = self.kb.get_vector(kb_id)
entity_encodings.append(entity_encoding) entity_encodings.append(entity_encoding)
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") entity_encodings = self.model.ops.asarray2f(entity_encodings)
if sentence_encodings.shape != entity_encodings.shape: if sentence_encodings.shape != entity_encodings.shape:
err = Errors.E147.format( err = Errors.E147.format(
method="get_loss", msg="gold entities do not match up" method="get_loss", msg="gold entities do not match up"
) )
raise RuntimeError(err) raise RuntimeError(err)
# TODO: fix typing issue here gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore loss = self.distance.get_loss(sentence_encodings, entity_encodings)
loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore
loss = loss / len(entity_encodings) loss = loss / len(entity_encodings)
return float(loss), gradients return float(loss), gradients
@ -273,7 +271,6 @@ class EntityLinker_v1(TrainablePipe):
final_kb_ids.append(self.NIL) final_kb_ids.append(self.NIL)
elif len(candidates) == 1: elif len(candidates) == 1:
# shortcut for efficiency reasons: take the 1 candidate # shortcut for efficiency reasons: take the 1 candidate
# TODO: thresholding
final_kb_ids.append(candidates[0].entity_) final_kb_ids.append(candidates[0].entity_)
else: else:
random.shuffle(candidates) random.shuffle(candidates)
@ -302,7 +299,6 @@ class EntityLinker_v1(TrainablePipe):
if sims.shape != prior_probs.shape: if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161) raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims) scores = prior_probs + sims - (prior_probs * sims)
# TODO: thresholding
best_index = scores.argmax().item() best_index = scores.argmax().item()
best_candidate = candidates[best_index] best_candidate = candidates[best_index]
final_kb_ids.append(best_candidate.entity_) final_kb_ids.append(best_candidate.entity_)

View File

@ -6,10 +6,10 @@ from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language from ..language import Language
from ..scorer import get_ner_prf, PRFScore from ..scorer import get_ner_prf, PRFScore
from ..util import registry from ..util import registry
from ..training import remove_bilu_prefix
default_model_config = """ default_model_config = """
@ -242,7 +242,7 @@ cdef class EntityRecognizer(Parser):
def labels(self): def labels(self):
# Get the labels from the model by looking at the available moves, e.g. # Get the labels from the model by looking at the available moves, e.g.
# B-PERSON, I-PERSON, L-PERSON, U-PERSON # B-PERSON, I-PERSON, L-PERSON, U-PERSON
labels = set(move.split("-")[1] for move in self.move_names labels = set(remove_bilu_prefix(move) for move in self.move_names
if move[0] in ("B", "I", "L", "U")) if move[0] in ("B", "I", "L", "U"))
return tuple(sorted(labels)) return tuple(sorted(labels))

View File

@ -75,7 +75,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
if spans: if spans:
assert spans[-1].ndim == 2, spans[-1].shape assert spans[-1].ndim == 2, spans[-1].shape
lengths.append(length) lengths.append(length)
lengths_array = cast(Ints1d, ops.asarray(lengths, dtype="i")) lengths_array = ops.asarray1i(lengths)
if len(spans) > 0: if len(spans) > 0:
output = Ragged(ops.xp.vstack(spans), lengths_array) output = Ragged(ops.xp.vstack(spans), lengths_array)
else: else:

View File

@ -192,7 +192,7 @@ class TextCategorizer(TrainablePipe):
if not any(len(doc) for doc in docs): if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs. # Handle cases where there are no tokens in any docs.
tensors = [doc.tensor for doc in docs] tensors = [doc.tensor for doc in docs]
xp = get_array_module(tensors) xp = self.model.ops.xp
scores = xp.zeros((len(list(docs)), len(self.labels))) scores = xp.zeros((len(list(docs)), len(self.labels)))
return scores return scores
scores = self.model.predict(docs) scores = self.model.predict(docs)

View File

@ -1,4 +1,5 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.backends.cblas cimport CBlas
from ..vocab cimport Vocab from ..vocab cimport Vocab
from .trainable_pipe cimport TrainablePipe from .trainable_pipe cimport TrainablePipe
@ -12,7 +13,7 @@ cdef class Parser(TrainablePipe):
cdef readonly TransitionSystem moves cdef readonly TransitionSystem moves
cdef public object _multitasks cdef public object _multitasks
cdef void _parseC(self, StateC** states, cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil WeightsC weights, SizesC sizes) nogil
cdef void c_transition_batch(self, StateC** states, const float* scores, cdef void c_transition_batch(self, StateC** states, const float* scores,

View File

@ -9,7 +9,7 @@ from libc.stdlib cimport calloc, free
import random import random
import srsly import srsly
from thinc.api import set_dropout_rate, CupyOps from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
import numpy.random import numpy.random
import numpy import numpy
@ -30,6 +30,9 @@ from ..errors import Errors, Warnings
from .. import util from .. import util
NUMPY_OPS = NumpyOps()
cdef class Parser(TrainablePipe): cdef class Parser(TrainablePipe):
""" """
Base class of the DependencyParser and EntityRecognizer. Base class of the DependencyParser and EntityRecognizer.
@ -259,6 +262,12 @@ cdef class Parser(TrainablePipe):
def greedy_parse(self, docs, drop=0.): def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states cdef vector[StateC*] states
cdef StateClass state cdef StateClass state
ops = self.model.ops
cdef CBlas cblas
if isinstance(ops, CupyOps):
cblas = NUMPY_OPS.cblas()
else:
cblas = ops.cblas()
self._ensure_labels_are_added(docs) self._ensure_labels_are_added(docs)
set_dropout_rate(self.model, drop) set_dropout_rate(self.model, drop)
batch = self.moves.init_batch(docs) batch = self.moves.init_batch(docs)
@ -269,8 +278,7 @@ cdef class Parser(TrainablePipe):
states.push_back(state.c) states.push_back(state.c)
sizes = get_c_sizes(model, states.size()) sizes = get_c_sizes(model, states.size())
with nogil: with nogil:
self._parseC(&states[0], self._parseC(cblas, &states[0], weights, sizes)
weights, sizes)
model.clear_memory() model.clear_memory()
del model del model
return batch return batch
@ -297,14 +305,13 @@ cdef class Parser(TrainablePipe):
del model del model
return list(batch) return list(batch)
cdef void _parseC(self, StateC** states, cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil: WeightsC weights, SizesC sizes) nogil:
cdef int i, j cdef int i, j
cdef vector[StateC*] unfinished cdef vector[StateC*] unfinished
cdef ActivationsC activations = alloc_activations(sizes) cdef ActivationsC activations = alloc_activations(sizes)
while sizes.states >= 1: while sizes.states >= 1:
predict_states(&activations, predict_states(cblas, &activations, states, &weights, sizes)
states, &weights, sizes)
# Validate actions, argmax, take action. # Validate actions, argmax, take action.
self.c_transition_batch(states, self.c_transition_batch(states,
activations.scores, sizes.classes, sizes.states) activations.scores, sizes.classes, sizes.states)

View File

@ -3,12 +3,13 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal from .compat import Literal
from enum import Enum from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
from pydantic.main import ModelMetaclass from pydantic.main import ModelMetaclass
from thinc.api import Optimizer, ConfigValidationError, Model from thinc.api import Optimizer, ConfigValidationError, Model
from thinc.config import Promise from thinc.config import Promise
from collections import defaultdict from collections import defaultdict
import inspect import inspect
import re
from .attrs import NAMES from .attrs import NAMES
from .lookups import Lookups from .lookups import Lookups
@ -104,7 +105,7 @@ def get_arg_model(
sig_args[param.name] = (annotation, default) sig_args[param.name] = (annotation, default)
is_strict = strict and not has_variable is_strict = strict and not has_variable
sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment] sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment]
return create_model(name, **sig_args) # type: ignore[arg-type, return-value] return create_model(name, **sig_args) # type: ignore[call-overload, arg-type, return-value]
def validate_init_settings( def validate_init_settings(
@ -198,13 +199,18 @@ class TokenPatternNumber(BaseModel):
return v return v
class TokenPatternOperator(str, Enum): class TokenPatternOperatorSimple(str, Enum):
plus: StrictStr = StrictStr("+") plus: StrictStr = StrictStr("+")
start: StrictStr = StrictStr("*") star: StrictStr = StrictStr("*")
question: StrictStr = StrictStr("?") question: StrictStr = StrictStr("?")
exclamation: StrictStr = StrictStr("!") exclamation: StrictStr = StrictStr("!")
class TokenPatternOperatorMinMax(ConstrainedStr):
    # Constrained string for curly-brace quantifier operators. The pattern
    # accepts "{n}", "{n,}", "{n,m}" and "{,m}", where n and m are runs of
    # decimal digits; anything else (e.g. "{,}", "{a}", "{1, 3}") is rejected.
    # A raw string is required so that "\d" reaches the regex engine verbatim
    # instead of being parsed as an (invalid) str escape sequence.
    regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
StringValue = Union[TokenPatternString, StrictStr] StringValue = Union[TokenPatternString, StrictStr]
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[ UnderscoreValue = Union[

View File

@ -26,4 +26,4 @@ cdef class StringStore:
cdef public PreshMap _map cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string) cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)

View File

@ -14,6 +14,13 @@ from .symbols import NAMES as SYMBOLS_BY_INT
from .errors import Errors from .errors import Errors
from . import util from . import util
# Attempt to store `key` in `out_hash[0]` and report whether that worked.
# Not the prettiest approach, but it is faster than checking
# `isinstance(key, numbers.Integral)` first.
cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
    try:
        out_hash[0] = key
    except:
        # Any failure during the assignment means the key is not usable as a hash.
        return False
    return True
def get_string_id(key): def get_string_id(key):
"""Get a string ID, handling the reserved symbols correctly. If the key is """Get a string ID, handling the reserved symbols correctly. If the key is
@ -22,15 +29,27 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be This function optimises for convenience over performance, so shouldn't be
used in tight loops. used in tight loops.
""" """
if not isinstance(key, str): cdef hash_t str_hash
return key if isinstance(key, str):
elif key in SYMBOLS_BY_STR: if len(key) == 0:
return SYMBOLS_BY_STR[key]
elif not key:
return 0 return 0
symbol = SYMBOLS_BY_STR.get(key, None)
if symbol is not None:
return symbol
else: else:
chars = key.encode("utf8") chars = key.encode("utf8")
return hash_utf8(chars, len(chars)) return hash_utf8(chars, len(chars))
elif _try_coerce_to_hash(key, &str_hash):
# Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types
# such as those implemented by numpy are not inadvertently used
# downstream (as these are internally implemented as custom PyObjects
# whose comparison operators can incur a significant overhead).
return str_hash
else:
# TODO: Raise an error instead
return key
cpdef hash_t hash_string(str string) except 0: cpdef hash_t hash_string(str string) except 0:
@ -110,24 +129,32 @@ cdef class StringStore:
string_or_id (bytes, str or uint64): The value to encode. string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved. Returns (str / uint64): The value to be retrieved.
""" """
if isinstance(string_or_id, str) and len(string_or_id) == 0: cdef hash_t str_hash
return 0 cdef Utf8Str* utf8str = NULL
elif string_or_id == 0:
return ""
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
if isinstance(string_or_id, str): if isinstance(string_or_id, str):
key = hash_string(string_or_id) if len(string_or_id) == 0:
return key return 0
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id)) # Return early if the string is found in the symbols LUT.
return key symbol = SYMBOLS_BY_STR.get(string_or_id, None)
elif string_or_id < len(SYMBOLS_BY_INT): if symbol is not None:
return SYMBOLS_BY_INT[string_or_id] return symbol
else: else:
key = string_or_id return hash_string(string_or_id)
utf8str = <Utf8Str*>self._map.get(key) elif isinstance(string_or_id, bytes):
return hash_utf8(string_or_id, len(string_or_id))
elif _try_coerce_to_hash(string_or_id, &str_hash):
if str_hash == 0:
return ""
elif str_hash < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[str_hash]
else:
utf8str = <Utf8Str*>self._map.get(str_hash)
else:
# TODO: Raise an error instead
utf8str = <Utf8Str*>self._map.get(string_or_id)
if utf8str is NULL: if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id)) raise KeyError(Errors.E018.format(hash_value=string_or_id))
else: else:
@ -153,19 +180,22 @@ cdef class StringStore:
string (str): The string to add. string (str): The string to add.
RETURNS (uint64): The string's hash value. RETURNS (uint64): The string's hash value.
""" """
cdef hash_t str_hash
if isinstance(string, str): if isinstance(string, str):
if string in SYMBOLS_BY_STR: if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string] return SYMBOLS_BY_STR[string]
key = hash_string(string)
self.intern_unicode(string) string = string.encode("utf8")
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
elif isinstance(string, bytes): elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR: if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string] return SYMBOLS_BY_STR[string]
key = hash_utf8(string, len(string)) str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string)) self._intern_utf8(string, len(string), &str_hash)
else: else:
raise TypeError(Errors.E017.format(value_type=type(string))) raise TypeError(Errors.E017.format(value_type=type(string)))
return key return str_hash
def __len__(self): def __len__(self):
"""The number of strings in the store. """The number of strings in the store.
@ -174,30 +204,29 @@ cdef class StringStore:
""" """
return self.keys.size() return self.keys.size()
def __contains__(self, string not None): def __contains__(self, string_or_id not None):
"""Check whether a string is in the store. """Check whether a string or ID is in the store.
string (str): The string to check. string_or_id (str or int): The string to check.
RETURNS (bool): Whether the store contains the string. RETURNS (bool): Whether the store contains the string.
""" """
cdef hash_t key cdef hash_t str_hash
if isinstance(string, int) or isinstance(string, long): if isinstance(string_or_id, str):
if string == 0: if len(string_or_id) == 0:
return True return True
key = string elif string_or_id in SYMBOLS_BY_STR:
elif len(string) == 0:
return True return True
elif string in SYMBOLS_BY_STR: str_hash = hash_string(string_or_id)
return True elif _try_coerce_to_hash(string_or_id, &str_hash):
elif isinstance(string, str): pass
key = hash_string(string)
else: else:
string = string.encode("utf8") # TODO: Raise an error instead
key = hash_utf8(string, len(string)) return self._map.get(string_or_id) is not NULL
if key < len(SYMBOLS_BY_INT):
if str_hash < len(SYMBOLS_BY_INT):
return True return True
else: else:
return self._map.get(key) is not NULL return self._map.get(str_hash) is not NULL
def __iter__(self): def __iter__(self):
"""Iterate over the strings in the store, in order. """Iterate over the strings in the store, in order.
@ -272,13 +301,13 @@ cdef class StringStore:
cdef const Utf8Str* intern_unicode(self, str py_string): cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8") cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string)) return self._intern_utf8(byte_string, len(byte_string), NULL)
@cython.final @cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length): cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
# TODO: This function's API/behaviour is an unholy mess... # TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = hash_utf8(utf8_string, length) cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
cdef Utf8Str* value = <Utf8Str*>self._map.get(key) cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL: if value is not NULL:
return value return value

View File

@ -1,5 +1,11 @@
import pytest import pytest
from spacy.util import get_lang_class from spacy.util import get_lang_class
from hypothesis import settings
# Functionally disable deadline settings for tests
# to prevent spurious test failures in CI builds.
settings.register_profile("no_deadlines", deadline=2 * 60 * 1000) # in ms
settings.load_profile("no_deadlines")
def pytest_addoption(parser): def pytest_addoption(parser):

View File

@ -0,0 +1,8 @@
import pytest
def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer):
    """The period after a word ending in a combining diacritic is its own token."""
    doc = bg_tokenizer("Ня̀маше яйца̀. Ня̀маше яйца̀.")
    # Second token is the accented word, third is the split-off period.
    assert doc[1].text == "яйца̀"
    assert doc[2].text == "."

View File

@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1] tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms # 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop assert tok.is_stop
@pytest.mark.issue(10699)
@pytest.mark.parametrize("text", ["theses", "thisre"])
def test_issue10699(en_tokenizer, text):
    """Regression test for issue 10699: 'theses' and 'thisre' must not be
    treated as contractions by the English tokenizer exceptions, i.e. each
    word stays a single token.
    """
    assert len(en_tokenizer(text)) == 1

View File

@ -1,3 +1,4 @@
from string import punctuation
import pytest import pytest
@ -122,3 +123,36 @@ def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
text = "(Раз, два, три, проверка)." text = "(Раз, два, три, проверка)."
tokens = ru_tokenizer(text) tokens = ru_tokenizer(text)
assert tokens[len(tokens) - 1].text == "." assert tokens[len(tokens) - 1].text == "."
@pytest.mark.parametrize(
    "text",
    [
        "рекоменду́я подда́ть жару́. Самого́ Баргамота",
        "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́. САМОГО́ БАРГАМОТА",
        "рекоменду̍я подда̍ть жару̍.Самого̍ Баргамота",
        "рекоменду̍я подда̍ть жару̍.'Самого̍ Баргамота",
        "рекоменду̍я подда̍ть жару̍,самого̍ Баргамота",
        "рекоменду̍я подда̍ть жару̍:самого̍ Баргамота",
        "рекоменду̍я подда̍ть жару̍. самого̍ Баргамота",
        "рекоменду̍я подда̍ть жару̍, самого̍ Баргамота",
        "рекоменду̍я подда̍ть жару̍: самого̍ Баргамота",
        "рекоменду̍я подда̍ть жару̍-самого̍ Баргамота",
    ],
)
def test_ru_tokenizer_handles_final_diacritics(ru_tokenizer, text):
    """The third word keeps its stress mark and the following punctuation is split off."""
    doc = ru_tokenizer(text)
    # Accepted spellings of the third token across the parametrized inputs.
    accented_forms = ("жару́", "ЖАРУ́", "жару̍")
    assert doc[2].text in accented_forms
    # The token right after it must be a piece of ASCII punctuation.
    assert doc[3].text in punctuation
@pytest.mark.parametrize(
    "text",
    [
        "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́.САМОГО́ БАРГАМОТА",
        "рекоменду̍я подда̍ть жару́.самого́ Баргамота",
    ],
)
def test_ru_tokenizer_handles_final_diacritic_and_period(ru_tokenizer, text):
    """In these inputs the period between the two accented words is not split:
    the third token, lower-cased, is the whole "word.word" string.
    """
    third_token = ru_tokenizer(text)[2]
    assert third_token.text.lower() == "жару́.самого́"

View File

@ -140,3 +140,10 @@ def test_uk_tokenizer_splits_bracket_period(uk_tokenizer):
text = "(Раз, два, три, проверка)." text = "(Раз, два, три, проверка)."
tokens = uk_tokenizer(text) tokens = uk_tokenizer(text)
assert tokens[len(tokens) - 1].text == "." assert tokens[len(tokens) - 1].text == "."
def test_uk_tokenizer_handles_final_diacritics(uk_tokenizer):
    """The period after a word ending in a combining diacritic is its own token."""
    doc = uk_tokenizer("Хлібі́в не було́. Хлібі́в не було́.")
    # Third token is the accented word, fourth is the split-off period.
    assert doc[2].text == "було́"
    assert doc[3].text == "."

View File

@ -476,6 +476,17 @@ def test_matcher_extension_set_membership(en_vocab):
assert len(matches) == 0 assert len(matches) == 0
@pytest.mark.xfail(reason="IN predicate must handle sequence values in extensions")
def test_matcher_extension_in_set_predicate(en_vocab):
    """An "IN" predicate on a custom extension whose value is a list is
    expected to produce one match here; currently marked as an expected failure.
    """
    ext_matcher = Matcher(en_vocab)
    Token.set_extension("ext", default=[])
    ext_matcher.add("M", [[{"_": {"ext": {"IN": ["A", "C"]}}}]])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    # Only the first token gets a non-default extension value.
    doc[0]._.ext = ["A", "B"]
    found = ext_matcher(doc)
    assert len(found) == 1
def test_matcher_basic_check(en_vocab): def test_matcher_basic_check(en_vocab):
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
# Potential mistake: pass in pattern instead of list of patterns # Potential mistake: pass in pattern instead of list of patterns
@ -669,3 +680,38 @@ def test_matcher_ent_iob_key(en_vocab):
assert matches[0] == "Maria" assert matches[0] == "Maria"
assert matches[1] == "Maria Esperanza" assert matches[1] == "Maria Esperanza"
assert matches[2] == "Esperanza" assert matches[2] == "Esperanza"
def test_matcher_min_max_operator(en_vocab):
    """Check how many matches each curly-brace quantifier of the "OP" key
    yields for the token "foo" over a fixed document.
    """
    words = ["foo", "bar", "foo", "foo", "bar", "foo", "foo", "foo", "bar", "bar"]
    doc = Doc(en_vocab, words=words)
    # (operator, expected number of matches over `doc`)
    expectations = [
        ("{3}", 1),  # exactly n matches {n}
        ("{2,}", 4),  # at least n matches {n,}
        ("{,2}", 9),  # at most m matches {,m}
        ("{2,3}", 4),  # at least n and at most m matches {n,m}
    ]
    for operator, expected_count in expectations:
        # Each operator is checked with its own freshly built matcher.
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{"ORTH": "foo", "OP": operator}]])
        matched_texts = [doc[start:end].text for _, start, end in matcher(doc)]
        assert len(matched_texts) == expected_count

View File

@ -699,6 +699,10 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
("aaaa", "a a a a a?", [0, 1, 2, 3]), ("aaaa", "a a a a a?", [0, 1, 2, 3]),
("aaab", "a+ a b", [0, 0, 1, 2]), ("aaab", "a+ a b", [0, 0, 1, 2]),
("aaab", "a+ a+ b", [0, 0, 1, 2]), ("aaab", "a+ a+ b", [0, 0, 1, 2]),
("aaab", "a{2,} b", [0, 0, 0, 1]),
("aaab", "a{,3} b", [0, 0, 0, 1]),
("aaab", "a{2} b", [0, 0, 1]),
("aaab", "a{2,3} b", [0, 0, 0, 1]),
] ]
for string, pattern_str, result in cases: for string, pattern_str, result in cases:
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
@ -711,6 +715,8 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
pattern.append({"ORTH": part[0], "OP": "*"}) pattern.append({"ORTH": part[0], "OP": "*"})
elif part.endswith("?"): elif part.endswith("?"):
pattern.append({"ORTH": part[0], "OP": "?"}) pattern.append({"ORTH": part[0], "OP": "?"})
elif part.endswith("}"):
pattern.append({"ORTH": part[0], "OP": part[1:]})
else: else:
pattern.append({"ORTH": part}) pattern.append({"ORTH": part})
matcher.add("PATTERN", [pattern], greedy="LONGEST") matcher.add("PATTERN", [pattern], greedy="LONGEST")
@ -722,7 +728,7 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
assert expected == result, (string, pattern_str, s, e, n_matches) assert expected == result, (string, pattern_str, s, e, n_matches)
def test_matcher_with_alignments_nongreedy(en_vocab): def test_matcher_with_alignments_non_greedy(en_vocab):
cases = [ cases = [
(0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]), (0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
(1, "baab", "b a* b", [[0, 1, 1, 2]]), (1, "baab", "b a* b", [[0, 1, 1, 2]]),
@ -752,6 +758,10 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
(15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]), (15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
(16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]), (16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
(17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]), (17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
(18, "aaab", "a{2,} b", [[0, 0, 1], [0, 0, 0, 1]]),
(19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
(20, "aaab", "a{2} b", [[0, 0, 1]]),
(21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
] ]
for case_id, string, pattern_str, results in cases: for case_id, string, pattern_str, results in cases:
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
@ -764,6 +774,8 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
pattern.append({"ORTH": part[0], "OP": "*"}) pattern.append({"ORTH": part[0], "OP": "*"})
elif part.endswith("?"): elif part.endswith("?"):
pattern.append({"ORTH": part[0], "OP": "?"}) pattern.append({"ORTH": part[0], "OP": "?"})
elif part.endswith("}"):
pattern.append({"ORTH": part[0], "OP": part[1:]})
else: else:
pattern.append({"ORTH": part}) pattern.append({"ORTH": part})

View File

@ -14,6 +14,14 @@ TEST_PATTERNS = [
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([{"ENT_IOB": "foo"}], 1, 1), ([{"ENT_IOB": "foo"}], 1, 1),
([1, 2, 3], 3, 1), ([1, 2, 3], 3, 1),
([{"TEXT": "foo", "OP": "{,}"}], 1, 1),
([{"TEXT": "foo", "OP": "{,4}4"}], 1, 1),
([{"TEXT": "foo", "OP": "{a,3}"}], 1, 1),
([{"TEXT": "foo", "OP": "{a}"}], 1, 1),
([{"TEXT": "foo", "OP": "{,a}"}], 1, 1),
([{"TEXT": "foo", "OP": "{1,2,3}"}], 1, 1),
([{"TEXT": "foo", "OP": "{1, 3}"}], 1, 1),
([{"TEXT": "foo", "OP": "{-2}"}], 1, 1),
# Bad patterns flagged outside of Matcher # Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
# Bad patterns not flagged with minimal checks # Bad patterns not flagged with minimal checks
@ -38,6 +46,7 @@ TEST_PATTERNS = [
([{"SENT_START": True}], 0, 0), ([{"SENT_START": True}], 0, 0),
([{"ENT_ID": "STRING"}], 0, 0), ([{"ENT_ID": "STRING"}], 0, 0),
([{"ENT_KB_ID": "STRING"}], 0, 0), ([{"ENT_KB_ID": "STRING"}], 0, 0),
([{"TEXT": "ha", "OP": "{3}"}], 0, 0),
] ]

View File

@ -10,7 +10,7 @@ from spacy.lang.it import Italian
from spacy.language import Language from spacy.language import Language
from spacy.lookups import Lookups from spacy.lookups import Lookups
from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.training import Example, iob_to_biluo from spacy.training import Example, iob_to_biluo, split_bilu_label
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
import logging import logging
@ -110,6 +110,9 @@ def test_issue2385():
# maintain support for iob2 format # maintain support for iob2 format
tags3 = ("B-PERSON", "I-PERSON", "B-PERSON") tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"] assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
# ensure it works with hyphens in the name
tags4 = ("B-MULTI-PERSON", "I-MULTI-PERSON", "B-MULTI-PERSON")
assert iob_to_biluo(tags4) == ["B-MULTI-PERSON", "L-MULTI-PERSON", "U-MULTI-PERSON"]
@pytest.mark.issue(2800) @pytest.mark.issue(2800)
@ -154,6 +157,24 @@ def test_issue3209():
assert ner2.move_names == move_names assert ner2.move_names == move_names
def test_labels_from_BILUO():
    """A label that itself contains a hyphen must be recovered intact from
    the BILUO move names of the NER component.
    """
    label = "LARGE-ANIMAL"
    nlp = English()
    ner = nlp.add_pipe("ner")
    ner.add_label(label)
    nlp.initialize()
    # "O" followed by one move per B/I/L/U prefix for the single label.
    expected_moves = ["O"] + [f"{prefix}-{label}" for prefix in "BILU"]
    assert ner.move_names == expected_moves
    assert set(ner.labels) == {label}
@pytest.mark.issue(4267) @pytest.mark.issue(4267)
def test_issue4267(): def test_issue4267():
"""Test that running an entity_ruler after ner gives consistent results""" """Test that running an entity_ruler after ner gives consistent results"""
@ -298,7 +319,7 @@ def test_oracle_moves_missing_B(en_vocab):
elif tag == "O": elif tag == "O":
moves.add_action(move_types.index("O"), "") moves.add_action(move_types.index("O"), "")
else: else:
action, label = tag.split("-") action, label = split_bilu_label(tag)
moves.add_action(move_types.index("B"), label) moves.add_action(move_types.index("B"), label)
moves.add_action(move_types.index("I"), label) moves.add_action(move_types.index("I"), label)
moves.add_action(move_types.index("L"), label) moves.add_action(move_types.index("L"), label)
@ -324,7 +345,7 @@ def test_oracle_moves_whitespace(en_vocab):
elif tag == "O": elif tag == "O":
moves.add_action(move_types.index("O"), "") moves.add_action(move_types.index("O"), "")
else: else:
action, label = tag.split("-") action, label = split_bilu_label(tag)
moves.add_action(move_types.index(action), label) moves.add_action(move_types.index(action), label)
moves.get_oracle_sequence(example) moves.get_oracle_sequence(example)

View File

@ -49,7 +49,9 @@ def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree
assert contains_cycle(multirooted_tree) is None assert contains_cycle(multirooted_tree) is None
def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree): def test_parser_is_nonproj_arc(
cyclic_tree, nonproj_tree, partial_tree, multirooted_tree
):
assert is_nonproj_arc(0, nonproj_tree) is False assert is_nonproj_arc(0, nonproj_tree) is False
assert is_nonproj_arc(1, nonproj_tree) is False assert is_nonproj_arc(1, nonproj_tree) is False
assert is_nonproj_arc(2, nonproj_tree) is False assert is_nonproj_arc(2, nonproj_tree) is False
@ -62,15 +64,23 @@ def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree):
assert is_nonproj_arc(7, partial_tree) is False assert is_nonproj_arc(7, partial_tree) is False
assert is_nonproj_arc(17, multirooted_tree) is False assert is_nonproj_arc(17, multirooted_tree) is False
assert is_nonproj_arc(16, multirooted_tree) is True assert is_nonproj_arc(16, multirooted_tree) is True
with pytest.raises(
ValueError, match=r"Found cycle in dependency graph: \[1, 2, 2, 4, 5, 3, 2\]"
):
is_nonproj_arc(6, cyclic_tree)
def test_parser_is_nonproj_tree( def test_parser_is_nonproj_tree(
proj_tree, nonproj_tree, partial_tree, multirooted_tree proj_tree, cyclic_tree, nonproj_tree, partial_tree, multirooted_tree
): ):
assert is_nonproj_tree(proj_tree) is False assert is_nonproj_tree(proj_tree) is False
assert is_nonproj_tree(nonproj_tree) is True assert is_nonproj_tree(nonproj_tree) is True
assert is_nonproj_tree(partial_tree) is False assert is_nonproj_tree(partial_tree) is False
assert is_nonproj_tree(multirooted_tree) is True assert is_nonproj_tree(multirooted_tree) is True
with pytest.raises(
ValueError, match=r"Found cycle in dependency graph: \[1, 2, 2, 4, 5, 3, 2\]"
):
is_nonproj_tree(cyclic_tree)
def test_parser_pseudoprojectivity(en_vocab): def test_parser_pseudoprojectivity(en_vocab):
@ -84,8 +94,10 @@ def test_parser_pseudoprojectivity(en_vocab):
tree = [1, 2, 2] tree = [1, 2, 2]
nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2] nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1] nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
cyclic_tree = [1, 2, 2, 4, 5, 3, 2]
labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"] labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"] labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
cyclic_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "punct"]
# fmt: on # fmt: on
assert nonproj.decompose("X||Y") == ("X", "Y") assert nonproj.decompose("X||Y") == ("X", "Y")
assert nonproj.decompose("X") == ("X", "") assert nonproj.decompose("X") == ("X", "")
@ -97,6 +109,8 @@ def test_parser_pseudoprojectivity(en_vocab):
assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10 assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10
# fmt: off # fmt: off
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
with pytest.raises(ValueError, match=r'Found cycle in dependency graph: \[1, 2, 2, 4, 5, 3, 2\]'):
nonproj.projectivize(cyclic_tree, cyclic_labels)
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2] assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
"nsubj", "acl||dobj", "punct"] "nsubj", "acl||dobj", "punct"]

View File

@ -1,4 +1,4 @@
from typing import Callable, Iterable from typing import Callable, Iterable, Dict, Any
import pytest import pytest
from numpy.testing import assert_equal from numpy.testing import assert_equal
@ -207,7 +207,7 @@ def test_no_entities():
nlp.add_pipe("sentencizer", first=True) nlp.add_pipe("sentencizer", first=True)
# this will run the pipeline on the examples and shouldn't crash # this will run the pipeline on the examples and shouldn't crash
results = nlp.evaluate(train_examples) nlp.evaluate(train_examples)
def test_partial_links(): def test_partial_links():
@ -1063,7 +1063,7 @@ def test_no_gold_ents(patterns):
"entity_linker", config={"use_gold_ents": False}, last=True "entity_linker", config={"use_gold_ents": False}, last=True
) )
entity_linker.set_kb(create_kb) entity_linker.set_kb(create_kb)
assert entity_linker.use_gold_ents == False assert entity_linker.use_gold_ents is False
optimizer = nlp.initialize(get_examples=lambda: train_examples) optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(2): for i in range(2):
@ -1074,7 +1074,7 @@ def test_no_gold_ents(patterns):
nlp.add_pipe("sentencizer", first=True) nlp.add_pipe("sentencizer", first=True)
# this will run the pipeline on the examples and shouldn't crash # this will run the pipeline on the examples and shouldn't crash
results = nlp.evaluate(train_examples) nlp.evaluate(train_examples)
@pytest.mark.issue(9575) @pytest.mark.issue(9575)
@ -1114,4 +1114,61 @@ def test_tokenization_mismatch():
nlp.update(train_examples, sgd=optimizer, losses=losses) nlp.update(train_examples, sgd=optimizer, losses=losses)
nlp.add_pipe("sentencizer", first=True) nlp.add_pipe("sentencizer", first=True)
results = nlp.evaluate(train_examples) nlp.evaluate(train_examples)
# fmt: off
@pytest.mark.parametrize(
    "meet_threshold,config",
    [
        (False, {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
        (True, {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
    ],
)
# fmt: on
def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
    """Tests abstention threshold.
    meet_threshold (bool): Whether to configure NEL setup so that confidence threshold is met.
    config (Dict[str, Any]): NEL architecture config.
    """
    nlp = English()
    nlp.add_pipe("sentencizer")
    text = "Mahler's Symphony No. 8 was beautiful."
    entities = [(0, 6, "PERSON")]
    links = {(0, 6): {"Q7304": 1.0}}
    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
    entity_id = "Q7304"
    doc = nlp(text)
    train_examples = [
        Example.from_dict(
            doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
        )
    ]

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=3)
        mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
        # The alias prior is 1 when the threshold is supposed to be met and
        # 0.01 otherwise; the linker below is configured with threshold 0.99.
        mykb.add_alias(
            alias="Mahler",
            entities=[entity_id],
            probabilities=[1 if meet_threshold else 0.01],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe(
        "entity_linker",
        last=True,
        config={"threshold": 0.99, "model": config},
    )
    entity_linker.set_kb(create_kb)  # type: ignore
    nlp.initialize(get_examples=lambda: train_examples)

    # Add a custom rule-based component to mimic NER
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns([{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}])  # type: ignore
    doc = nlp(text)

    assert len(doc.ents) == 1
    # The parentheses matter: without them the statement parses as
    # `assert (kb_id_ == entity_id) if meet_threshold else EntityLinker.NIL`,
    # which for meet_threshold=False only asserts the truthiness of the NIL
    # constant and never inspects the predicted KB id.
    expected_kb_id = entity_id if meet_threshold else EntityLinker.NIL
    assert doc.ents[0].kb_id_ == expected_kb_id

View File

@ -491,7 +491,6 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
ruler.remove_by_id("nepattern") ruler.remove_by_id("nepattern")
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) @pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory): def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")

View File

@ -4,13 +4,14 @@ import numpy
import pytest import pytest
from thinc.api import get_current_ops from thinc.api import get_current_ops
import spacy
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.en.syntax_iterators import noun_chunks from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.language import Language from spacy.language import Language
from spacy.pipeline import TrainablePipe from spacy.pipeline import TrainablePipe
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.training import Example from spacy.training import Example
from spacy.util import SimpleFrozenList, get_arg_names from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
from spacy.vocab import Vocab from spacy.vocab import Vocab
@ -602,3 +603,52 @@ def test_update_with_annotates():
assert results[component] == "".join(eg.predicted.text for eg in examples) assert results[component] == "".join(eg.predicted.text for eg in examples)
for component in components - set(components_to_annotate): for component in components - set(components_to_annotate):
assert results[component] == "" assert results[component] == ""
def test_load_disable_enable() -> None:
    """
    Checks that spacy.load() honours the `disable` and `enable` arguments.

    A small English pipeline is written to a temporary directory and reloaded
    with only `disable`, with only `enable`, with a consistent combination of
    both, and finally with an inconsistent combination that must raise a
    ValueError.
    """
    source_nlp = English()
    source_nlp.add_pipe("sentencizer")
    source_nlp.add_pipe("tagger")
    source_nlp.add_pipe("parser")

    with make_tempdir() as model_dir:
        source_nlp.to_disk(model_dir)
        to_disable = ["parser", "tagger"]
        to_enable = ["tagger", "parser"]

        # Only `disable` given: every listed component must be disabled.
        nlp = spacy.load(model_dir, disable=to_disable)
        for name in to_disable:
            assert name in nlp.disabled

        # Only `enable` given: a component is disabled exactly when it was
        # not listed in `enable`.
        nlp = spacy.load(model_dir, enable=to_enable)
        for name in nlp.component_names:
            assert (name in nlp.disabled) is (name not in to_enable)

        # Consistent combination: `disable` is the complement of `enable`,
        # computed from the pipeline loaded in the previous step.
        complement = [
            name for name in nlp.component_names if name not in to_enable
        ]
        nlp = spacy.load(model_dir, enable=to_enable, disable=complement)
        for name in nlp.component_names:
            assert (name in nlp.disabled) is (name not in to_enable)

        # Inconsistent combination: "parser" is both enabled and disabled.
        with pytest.raises(ValueError):
            spacy.load(model_dir, enable=to_enable, disable=["parser"])

View File

@ -589,6 +589,7 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3] assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_download_compatibility(): def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__) spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False spec.prereleases = False
@ -599,6 +600,7 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version) assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_validate_compatibility_table(): def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__) spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False spec.prereleases = False

View File

@ -671,13 +671,38 @@ def test_gold_ner_missing_tags(en_tokenizer):
def test_projectivize(en_tokenizer): def test_projectivize(en_tokenizer):
doc = en_tokenizer("He pretty quickly walks away") doc = en_tokenizer("He pretty quickly walks away")
heads = [3, 2, 3, 0, 2] heads = [3, 2, 3, 3, 2]
deps = ["dep"] * len(heads) deps = ["dep"] * len(heads)
example = Example.from_dict(doc, {"heads": heads, "deps": deps}) example = Example.from_dict(doc, {"heads": heads, "deps": deps})
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True) proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False) nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
assert proj_heads == [3, 2, 3, 0, 3] assert proj_heads == [3, 2, 3, 3, 3]
assert nonproj_heads == [3, 2, 3, 0, 2] assert nonproj_heads == [3, 2, 3, 3, 2]
# Test single token documents
doc = en_tokenizer("Conrail")
heads = [0]
deps = ["dep"]
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
assert proj_heads == heads
assert proj_labels == deps
# Test documents with no alignments
doc_a = Doc(
doc.vocab, words=["Double-Jointed"], spaces=[False], deps=["ROOT"], heads=[0]
)
doc_b = Doc(
doc.vocab,
words=["Double", "-", "Jointed"],
spaces=[True, True, True],
deps=["amod", "punct", "ROOT"],
heads=[2, 2, 2],
)
example = Example(doc_a, doc_b)
proj_heads, proj_deps = example.get_aligned_parse(projectivize=True)
assert proj_heads == [None]
assert proj_deps == [None]
def test_iob_to_biluo(): def test_iob_to_biluo():

View File

@ -5,6 +5,7 @@ import srsly
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.util import make_tempdir # noqa: F401 from spacy.util import make_tempdir # noqa: F401
from spacy.training import split_bilu_label
from thinc.api import get_current_ops from thinc.api import get_current_ops
@ -40,7 +41,7 @@ def apply_transition_sequence(parser, doc, sequence):
desired state.""" desired state."""
for action_name in sequence: for action_name in sequence:
if "-" in action_name: if "-" in action_name:
move, label = action_name.split("-") move, label = split_bilu_label(action_name)
parser.add_label(label) parser.add_label(label)
with parser.step_through(doc) as stepwise: with parser.step_through(doc) as stepwise:
for transition in sequence: for transition in sequence:

View File

@ -1,6 +1,7 @@
import pytest import pytest
import numpy import numpy
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab
from ..util import get_cosine, add_vecs_to_vocab from ..util import get_cosine, add_vecs_to_vocab
@ -71,7 +72,6 @@ def test_vectors_similarity_DD(vocab, vectors):
def test_vectors_similarity_TD(vocab, vectors): def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors [(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2]) doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc.similarity(doc[0]), float) assert isinstance(doc.similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc), float) assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc) assert doc.similarity(doc[0]) == doc[0].similarity(doc)
@ -80,9 +80,8 @@ def test_vectors_similarity_TD(vocab, vectors):
def test_vectors_similarity_TS(vocab, vectors): def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors [(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2]) doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc[:2].similarity(doc[0]), float) assert isinstance(doc[:2].similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc[-2]), float) assert isinstance(doc[0].similarity(doc[:2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
@ -91,3 +90,21 @@ def test_vectors_similarity_DS(vocab, vectors):
doc = Doc(vocab, words=[word1, word2]) doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc.similarity(doc[:2]), float) assert isinstance(doc.similarity(doc[:2]), float)
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
def test_vectors_similarity_no_vectors():
    """Every Doc/Token/Span similarity call on a fresh Vocab must emit a
    UserWarning, whichever of the two objects is the receiver."""
    vocab = Vocab()
    doc1 = Doc(vocab, words=["a", "b"])
    doc2 = Doc(vocab, words=["c", "d", "e"])
    # (receiver, argument) pairs: Doc, Token and Span on either side.
    pairs = [
        (doc1, doc2),
        (doc1, doc2[1]),
        (doc1, doc2[:2]),
        (doc2, doc1),
        (doc2[1], doc1),
        (doc2[:2], doc1),
    ]
    for receiver, argument in pairs:
        with pytest.warns(UserWarning):
            receiver.similarity(argument)

View File

@ -318,7 +318,6 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
@pytest.mark.parametrize("text", [["apple", "orange", "juice"]]) @pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
def test_vectors_span_span_similarity(vocab, text): def test_vectors_span_span_similarity(vocab, text):
doc = Doc(vocab, words=text) doc = Doc(vocab, words=text)
with pytest.warns(UserWarning):
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0 assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0
@ -326,7 +325,6 @@ def test_vectors_span_span_similarity(vocab, text):
@pytest.mark.parametrize("text", [["apple", "orange", "juice"]]) @pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
def test_vectors_span_doc_similarity(vocab, text): def test_vectors_span_doc_similarity(vocab, text):
doc = Doc(vocab, words=text) doc = Doc(vocab, words=text)
with pytest.warns(UserWarning):
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
assert -1.0 < doc[0:2].similarity(doc) < 1.0 assert -1.0 < doc[0:2].similarity(doc) < 1.0

View File

@ -607,6 +607,7 @@ cdef class Doc:
if self.vocab.vectors.n_keys == 0: if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Doc")) warnings.warn(Warnings.W007.format(obj="Doc"))
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
if not self.has_vector or not other.has_vector:
warnings.warn(Warnings.W008.format(obj="Doc")) warnings.warn(Warnings.W008.format(obj="Doc"))
return 0.0 return 0.0
vector = self.vector vector = self.vector
@ -627,7 +628,7 @@ cdef class Doc:
if "has_vector" in self.user_hooks: if "has_vector" in self.user_hooks:
return self.user_hooks["has_vector"](self) return self.user_hooks["has_vector"](self)
elif self.vocab.vectors.size: elif self.vocab.vectors.size:
return True return any(token.has_vector for token in self)
elif self.tensor.size: elif self.tensor.size:
return True return True
else: else:

View File

@ -354,6 +354,7 @@ cdef class Span:
if self.vocab.vectors.n_keys == 0: if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Span")) warnings.warn(Warnings.W007.format(obj="Span"))
if self.vector_norm == 0.0 or other.vector_norm == 0.0: if self.vector_norm == 0.0 or other.vector_norm == 0.0:
if not self.has_vector or not other.has_vector:
warnings.warn(Warnings.W008.format(obj="Span")) warnings.warn(Warnings.W008.format(obj="Span"))
return 0.0 return 0.0
vector = self.vector vector = self.vector

View File

@ -206,6 +206,7 @@ cdef class Token:
if self.vocab.vectors.n_keys == 0: if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Token")) warnings.warn(Warnings.W007.format(obj="Token"))
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
if not self.has_vector or not other.has_vector:
warnings.warn(Warnings.W008.format(obj="Token")) warnings.warn(Warnings.W008.format(obj="Token"))
return 0.0 return 0.0
vector = self.vector vector = self.vector

View File

@ -5,6 +5,7 @@ from .augment import dont_augment, orth_variants_augmenter # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
from .iob_utils import split_bilu_label, remove_bilu_prefix # noqa: F401
from .gold_io import docs_to_json, read_json_file # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
from .loggers import console_logger # noqa: F401 from .loggers import console_logger # noqa: F401

View File

@ -1,33 +1,39 @@
from typing import List from typing import List
from ..errors import Errors from ..errors import Errors
import numpy import numpy
from libc.stdint cimport int32_t
cdef class AlignmentArray: cdef class AlignmentArray:
"""AlignmentArray is similar to Thinc's Ragged with two simplifications: """AlignmentArray is similar to Thinc's Ragged with two simplifications:
indexing returns numpy arrays and this type can only be used for CPU arrays. indexing returns numpy arrays and this type can only be used for CPU arrays.
However, these changes make AlginmentArray more efficient for indexing in a However, these changes make AlignmentArray more efficient for indexing in a
tight loop.""" tight loop."""
__slots__ = [] __slots__ = []
def __init__(self, alignment: List[List[int]]): def __init__(self, alignment: List[List[int]]):
self._lengths = None
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i")
cdef int data_len = 0 cdef int data_len = 0
cdef int outer_len cdef int outer_len
cdef int idx cdef int idx
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32')
cdef int32_t* starts_ends_ptr = <int32_t*>self._starts_ends.data
for idx, outer in enumerate(alignment): for idx, outer in enumerate(alignment):
outer_len = len(outer) outer_len = len(outer)
self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len
data_len += outer_len data_len += outer_len
self._data = numpy.empty(data_len, dtype="i") self._lengths = None
self._data = numpy.empty(data_len, dtype="int32")
idx = 0 idx = 0
cdef int32_t* data_ptr = <int32_t*>self._data.data
for outer in alignment: for outer in alignment:
for inner in outer: for inner in outer:
self._data[idx] = inner data_ptr[idx] = inner
idx += 1 idx += 1
def __getitem__(self, idx): def __getitem__(self, idx):

View File

@ -3,10 +3,10 @@ from typing import Optional
import random import random
import itertools import itertools
from functools import partial from functools import partial
from pydantic import BaseModel, StrictStr
from ..util import registry from ..util import registry
from .example import Example from .example import Example
from .iob_utils import split_bilu_label
if TYPE_CHECKING: if TYPE_CHECKING:
from ..language import Language # noqa: F401 from ..language import Language # noqa: F401
@ -278,10 +278,8 @@ def make_whitespace_variant(
ent_prev = doc_dict["entities"][position - 1] ent_prev = doc_dict["entities"][position - 1]
ent_next = doc_dict["entities"][position] ent_next = doc_dict["entities"][position]
if "-" in ent_prev and "-" in ent_next: if "-" in ent_prev and "-" in ent_next:
ent_iob_prev = ent_prev.split("-")[0] ent_iob_prev, ent_type_prev = split_bilu_label(ent_prev)
ent_type_prev = ent_prev.split("-", 1)[1] ent_iob_next, ent_type_next = split_bilu_label(ent_next)
ent_iob_next = ent_next.split("-")[0]
ent_type_next = ent_next.split("-", 1)[1]
if ( if (
ent_iob_prev in ("B", "I") ent_iob_prev in ("B", "I")
and ent_iob_next in ("I", "L") and ent_iob_next in ("I", "L")

View File

@ -9,11 +9,11 @@ from ..tokens.span import Span
from ..attrs import IDS from ..attrs import IDS
from .alignment import Alignment from .alignment import Alignment
from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
from .iob_utils import biluo_tags_to_spans from .iob_utils import biluo_tags_to_spans, remove_bilu_prefix
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals import nonproj
from ..tokens.token cimport MISSING_DEP from ..tokens.token cimport MISSING_DEP
from ..util import logger, to_ternary_int from ..util import logger, to_ternary_int, all_equal
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@ -151,50 +151,127 @@ cdef class Example:
self._y_sig = y_sig self._y_sig = y_sig
return self._cached_alignment return self._cached_alignment
def _get_aligned_vectorized(self, align, gold_values):
    """Fast path for Doc attributes/fields that are predominantly a single
    value, i.e., TAG, POS, MORPH.

    align: Mapping from a predicted token index to the indices of the gold
        tokens aligned to it.
    gold_values: Array of gold values, indexed by gold token.
    RETURNS (list): One entry per predicted token; None where the token has
        no aligned gold token or the aligned gold values disagree.
    """
    # Bucket the predicted tokens by how many gold tokens they align to:
    # exactly one (direct copy) or more than one (needs collapsing).
    # Tokens with no aligned gold token go into neither bucket.
    one_to_one_gold = []
    one_to_one_pred = []
    one_to_many_gold = []
    one_to_many_pred = []
    for pred_i, token in enumerate(self.predicted):
        gold_indices = align[token.i]
        n_aligned = len(gold_indices)
        if n_aligned == 1:
            one_to_one_gold.append(gold_indices.item())
            one_to_one_pred.append(pred_i)
        elif n_aligned > 1:
            one_to_many_gold.append(gold_indices)
            one_to_many_pred.append(pred_i)

    # One-to-one alignments are copied straight into the output array.
    output = numpy.full(len(self.predicted), None)
    output[one_to_one_pred] = gold_values[one_to_one_gold].squeeze()

    # One-to-many alignments collapse to a single value if all aligned gold
    # tokens share it, and to None in all other cases.
    collapsed = []
    for gold_indices in one_to_many_gold:
        candidates = gold_values[gold_indices]
        if all_equal(candidates):
            collapsed.append(candidates[0].item())
        else:
            collapsed.append(None)
    output[one_to_many_pred] = collapsed

    return output.tolist()
def _get_aligned_non_vectorized(self, align, gold_values):
    """Slower path for fields that return multiple values (resulting in
    ragged arrays that cannot be vectorized trivially).

    align: Mapping from a predicted token index to the indices of the gold
        tokens aligned to it.
    gold_values: Array of gold values, indexed by gold token.
    RETURNS (list): One entry per predicted token; None where the token has
        no aligned gold value or the aligned gold values disagree.
    """
    output = [None] * len(self.predicted)

    for token in self.predicted:
        aligned_gold_i = align[token.i]
        values = gold_values[aligned_gold_i].ravel()
        if len(values) == 1:
            output[token.i] = values.item()
        elif len(values) > 1 and all_equal(values):
            # If all aligned tokens have the same value, use it.
            # The explicit length guard keeps unaligned tokens (no values)
            # at None instead of depending on what all_equal returns for an
            # empty sequence and then indexing values[0].
            output[token.i] = values[0].item()

    return output
def get_aligned(self, field, as_string=False): def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute.""" """Return an aligned array for a token attribute."""
align = self.alignment.x2y align = self.alignment.x2y
gold_values = self.reference.to_array([field])
if len(gold_values.shape) == 1:
output = self._get_aligned_vectorized(align, gold_values)
else:
output = self._get_aligned_non_vectorized(align, gold_values)
vocab = self.reference.vocab vocab = self.reference.vocab
gold_values = self.reference.to_array([field])
output = [None] * len(self.predicted)
for token in self.predicted:
values = gold_values[align[token.i]]
values = values.ravel()
if len(values) == 0:
output[token.i] = None
elif len(values) == 1:
output[token.i] = values[0]
elif len(set(list(values))) == 1:
# If all aligned tokens have the same value, use it.
output[token.i] = values[0]
else:
output[token.i] = None
if as_string and field not in ["ENT_IOB", "SENT_START"]: if as_string and field not in ["ENT_IOB", "SENT_START"]:
output = [vocab.strings[o] if o is not None else o for o in output] output = [vocab.strings[o] if o is not None else o for o in output]
return output return output
def get_aligned_parse(self, projectivize=True): def get_aligned_parse(self, projectivize=True):
cand_to_gold = self.alignment.x2y cand_to_gold = self.alignment.x2y
gold_to_cand = self.alignment.y2x gold_to_cand = self.alignment.y2x
aligned_heads = [None] * self.x.length
aligned_deps = [None] * self.x.length
has_deps = [token.has_dep() for token in self.y]
has_heads = [token.has_head() for token in self.y]
heads = [token.head.i for token in self.y] heads = [token.head.i for token in self.y]
deps = [token.dep_ for token in self.y] deps = [token.dep_ for token in self.y]
if projectivize: if projectivize:
proj_heads, proj_deps = nonproj.projectivize(heads, deps) proj_heads, proj_deps = nonproj.projectivize(heads, deps)
has_deps = [token.has_dep() for token in self.y]
has_heads = [token.has_head() for token in self.y]
# ensure that missing data remains missing # ensure that missing data remains missing
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)] heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)] deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
for cand_i in range(self.x.length):
if cand_to_gold.lengths[cand_i] == 1: # Select all candidate tokens that are aligned to a single gold token.
gold_i = cand_to_gold[cand_i][0] c2g_single_toks = numpy.where(cand_to_gold.lengths == 1)[0]
if gold_to_cand.lengths[heads[gold_i]] == 1:
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]][0]) # Fetch all aligned gold token indices.
aligned_deps[cand_i] = deps[gold_i] if c2g_single_toks.shape == cand_to_gold.lengths.shape:
return aligned_heads, aligned_deps # This is the most likely case.
gold_i = cand_to_gold[:]
else:
gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0], otypes='i')(c2g_single_toks)
# Fetch indices of all gold heads for the aligned gold tokens.
heads = numpy.asarray(heads, dtype='i')
gold_head_i = heads[gold_i]
# Select all gold tokens that are heads of the previously selected
# gold tokens (and are aligned to a single candidate token).
g2c_len_heads = gold_to_cand.lengths[gold_head_i]
g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0], otypes='i')(gold_head_i[g2c_len_heads]).squeeze()
# Update head/dep alignments with the above.
aligned_heads = numpy.full((self.x.length), None)
aligned_heads[c2g_single_toks[g2c_len_heads]] = g2c_i
deps = numpy.asarray(deps)
aligned_deps = numpy.full((self.x.length), None)
aligned_deps[c2g_single_toks] = deps[gold_i]
return aligned_heads.tolist(), aligned_deps.tolist()
def get_aligned_sent_starts(self): def get_aligned_sent_starts(self):
"""Get list of SENT_START attributes aligned to the predicted tokenization. """Get list of SENT_START attributes aligned to the predicted tokenization.
@ -519,7 +596,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
else: else:
ent_iobs.append(iob_tag.split("-")[0]) ent_iobs.append(iob_tag.split("-")[0])
if iob_tag.startswith("I") or iob_tag.startswith("B"): if iob_tag.startswith("I") or iob_tag.startswith("B"):
ent_types.append(iob_tag.split("-", 1)[1]) ent_types.append(remove_bilu_prefix(iob_tag))
else: else:
ent_types.append("") ent_types.append("")
return ent_iobs, ent_types return ent_iobs, ent_types

View File

@ -1,4 +1,4 @@
from typing import List, Dict, Tuple, Iterable, Union, Iterator from typing import List, Dict, Tuple, Iterable, Union, Iterator, cast
import warnings import warnings
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
@ -218,6 +218,14 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
return entities return entities
def split_bilu_label(label: str) -> Tuple[str, str]:
    """Split a label at its first hyphen only, so that any further hyphens
    stay in the second part (e.g. "B-PER" -> "B" and "PER")."""
    parts = label.split("-", 1)
    return cast(Tuple[str, str], parts)
def remove_bilu_prefix(label: str) -> str:
    """Return everything after the first hyphen of the label
    (e.g. "B-PER" -> "PER"). A label without a hyphen raises IndexError."""
    parts = label.split("-", 1)
    return parts[1]
# Fallbacks to make backwards-compat easier # Fallbacks to make backwards-compat easier
offsets_from_biluo_tags = biluo_tags_to_offsets offsets_from_biluo_tags = biluo_tags_to_offsets
spans_from_biluo_tags = biluo_tags_to_spans spans_from_biluo_tags = biluo_tags_to_spans

View File

@ -1,6 +1,6 @@
from typing import List, Mapping, NoReturn, Union, Dict, Any, Set from typing import List, Mapping, NoReturn, Union, Dict, Any, Set, cast
from typing import Optional, Iterable, Callable, Tuple, Type from typing import Optional, Iterable, Callable, Tuple, Type
from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING from typing import Iterator, Pattern, Generator, TYPE_CHECKING
from types import ModuleType from types import ModuleType
import os import os
import importlib import importlib
@ -12,7 +12,6 @@ from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
from thinc.api import ConfigValidationError, Model from thinc.api import ConfigValidationError, Model
import functools import functools
import itertools import itertools
import numpy.random
import numpy import numpy
import srsly import srsly
import catalogue import catalogue
@ -294,7 +293,7 @@ def find_matching_language(lang: str) -> Optional[str]:
# Find out which language modules we have # Find out which language modules we have
possible_languages = [] possible_languages = []
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore[attr-defined]
code = modinfo.name code = modinfo.name
if code == "xx": if code == "xx":
# Temporarily make 'xx' into a valid language code # Temporarily make 'xx' into a valid language code
@ -391,7 +390,8 @@ def get_module_path(module: ModuleType) -> Path:
""" """
if not hasattr(module, "__module__"): if not hasattr(module, "__module__"):
raise ValueError(Errors.E169.format(module=repr(module))) raise ValueError(Errors.E169.format(module=repr(module)))
return Path(sys.modules[module.__module__].__file__).parent file_path = Path(cast(os.PathLike, sys.modules[module.__module__].__file__))
return file_path.parent
def load_model( def load_model(
@ -399,6 +399,7 @@ def load_model(
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(), disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
@ -408,11 +409,19 @@ def load_model(
vocab (Vocab / True): Optional vocab to pass in on initialization. If True, vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created. a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. disable (Iterable[str]): Names of pipeline components to disable.
enable (Iterable[str]): Names of pipeline components to enable. All others will be disabled.
exclude (Iterable[str]): Names of pipeline components to exclude.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config} kwargs = {
"vocab": vocab,
"disable": disable,
"enable": enable,
"exclude": exclude,
"config": config,
}
if isinstance(name, str): # name or string path if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model if name.startswith("blank:"): # shortcut for blank model
return get_lang_class(name.replace("blank:", ""))() return get_lang_class(name.replace("blank:", ""))()
@ -432,6 +441,7 @@ def load_model_from_package(
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(), disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
@ -443,6 +453,8 @@ def load_model_from_package(
disable (Iterable[str]): Names of pipeline components to disable. Disabled disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe. enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
@ -450,7 +462,7 @@ def load_model_from_package(
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
cls = importlib.import_module(name) cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config) # type: ignore[attr-defined] return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined]
def load_model_from_path( def load_model_from_path(
@ -459,6 +471,7 @@ def load_model_from_path(
meta: Optional[Dict[str, Any]] = None, meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(), disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
@ -472,6 +485,8 @@ def load_model_from_path(
disable (Iterable[str]): Names of pipeline components to disable. Disabled disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe. enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
@ -486,7 +501,12 @@ def load_model_from_path(
overrides = dict_to_dot(config) overrides = dict_to_dot(config)
config = load_config(config_path, overrides=overrides) config = load_config(config_path, overrides=overrides)
nlp = load_model_from_config( nlp = load_model_from_config(
config, vocab=vocab, disable=disable, exclude=exclude, meta=meta config,
vocab=vocab,
disable=disable,
enable=enable,
exclude=exclude,
meta=meta,
) )
return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
@ -497,6 +517,7 @@ def load_model_from_config(
meta: Dict[str, Any] = SimpleFrozenDict(), meta: Dict[str, Any] = SimpleFrozenDict(),
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(), disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(),
auto_fill: bool = False, auto_fill: bool = False,
validate: bool = True, validate: bool = True,
@ -511,6 +532,8 @@ def load_model_from_config(
disable (Iterable[str]): Names of pipeline components to disable. Disabled disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe. enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded. components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults. auto_fill (bool): Whether to auto-fill config with missing defaults.
@ -529,6 +552,7 @@ def load_model_from_config(
config, config,
vocab=vocab, vocab=vocab,
disable=disable, disable=disable,
enable=enable,
exclude=exclude, exclude=exclude,
auto_fill=auto_fill, auto_fill=auto_fill,
validate=validate, validate=validate,
@ -593,6 +617,7 @@ def load_model_from_init_py(
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(), disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
@ -604,6 +629,8 @@ def load_model_from_init_py(
disable (Iterable[str]): Names of pipeline components to disable. Disabled disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe. enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
@ -621,6 +648,7 @@ def load_model_from_init_py(
vocab=vocab, vocab=vocab,
meta=meta, meta=meta,
disable=disable, disable=disable,
enable=enable,
exclude=exclude, exclude=exclude,
config=config, config=config,
) )
@ -878,7 +906,7 @@ def get_package_path(name: str) -> Path:
# Here we're importing the module just to find it. This is worryingly # Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package. # indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(name) pkg = importlib.import_module(name)
return Path(pkg.__file__).parent return Path(cast(Union[str, os.PathLike], pkg.__file__)).parent
def replace_model_node(model: Model, target: Model, replacement: Model) -> None: def replace_model_node(model: Model, target: Model, replacement: Model) -> None:
@ -1684,7 +1712,14 @@ def packages_distributions() -> Dict[str, List[str]]:
it's not available in the builtin importlib.metadata. it's not available in the builtin importlib.metadata.
""" """
pkg_to_dist = defaultdict(list) pkg_to_dist = defaultdict(list)
for dist in importlib_metadata.distributions(): # type: ignore[attr-defined] for dist in importlib_metadata.distributions():
for pkg in (dist.read_text("top_level.txt") or "").split(): for pkg in (dist.read_text("top_level.txt") or "").split():
pkg_to_dist[pkg].append(dist.metadata["Name"]) pkg_to_dist[pkg].append(dist.metadata["Name"])
return dict(pkg_to_dist) return dict(pkg_to_dist)
def all_equal(iterable):
    """Check whether every element of an iterable equals every other one.

    iterable: Any iterable of mutually comparable items.
    RETURNS (bool): True if the input is empty or consists of a single run
        of equal elements, False otherwise.
    """
    runs = itertools.groupby(iterable)
    # Consume the first run of equal elements, if there is one at all.
    next(runs, None)
    # A second run only exists if some element differs from its predecessor.
    for _ in runs:
        return False
    return True

View File

@ -336,10 +336,10 @@ cdef class Vectors:
xp = get_array_module(self.data) xp = get_array_module(self.data)
if key is not None: if key is not None:
key = get_string_id(key) key = get_string_id(key)
return self.key2row.get(key, -1) return self.key2row.get(int(key), -1)
elif keys is not None: elif keys is not None:
keys = [get_string_id(key) for key in keys] keys = [get_string_id(key) for key in keys]
rows = [self.key2row.get(key, -1.) for key in keys] rows = [self.key2row.get(int(key), -1) for key in keys]
return xp.asarray(rows, dtype="i") return xp.asarray(rows, dtype="i")
else: else:
row2key = {row: key for key, row in self.key2row.items()} row2key = {row: key for key, row in self.key2row.items()}

View File

@ -47,12 +47,13 @@ architectures and their arguments and hyperparameters.
> "model": DEFAULT_NEL_MODEL, > "model": DEFAULT_NEL_MODEL,
> "entity_vector_length": 64, > "entity_vector_length": 64,
> "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'}, > "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
> "threshold": None,
> } > }
> nlp.add_pipe("entity_linker", config=config) > nlp.add_pipe("entity_linker", config=config)
> ``` > ```
| Setting | Description | | Setting | Description |
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | | `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | | `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
@ -63,6 +64,7 @@ architectures and their arguments and hyperparameters.
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
```python ```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@ -96,7 +98,7 @@ custom knowledge base, you should either call
[`initialize`](/api/entitylinker#initialize) call. [`initialize`](/api/entitylinker#initialize) call.
| Name | Description | | Name | Description |
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
@ -109,6 +111,7 @@ custom knowledge base, you should either call
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | | `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
## EntityLinker.\_\_call\_\_ {#call tag="method"} ## EntityLinker.\_\_call\_\_ {#call tag="method"}

View File

@ -59,15 +59,20 @@ matched:
> [ > [
> {"POS": "ADJ", "OP": "*"}, > {"POS": "ADJ", "OP": "*"},
> {"POS": "NOUN", "OP": "+"}, > {"POS": "NOUN", "OP": "+"},
> {"POS": "PROPN", "OP": "{2}"}
> ] > ]
> ``` > ```
| OP | Description | | OP | Description |
| --- | ---------------------------------------------------------------- | |---------|------------------------------------------------------------------------|
| `!` | Negate the pattern, by requiring it to match exactly 0 times. | | `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. | | `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match 0 or more times. | | `*` | Allow the pattern to match 0 or more times. |
| `{n}` | Require the pattern to match exactly _n_ times. |
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
| `{n,}` | Require the pattern to match at least _n_ times. |
| `{,m}` | Require the pattern to match at most _m_ times. |
Token patterns can also map to a **dictionary of properties** instead of a Token patterns can also map to a **dictionary of properties** instead of a
single value to indicate whether the expected value is a member of a list or how single value to indicate whether the expected value is a member of a list or how

View File

@ -51,6 +51,7 @@ specified separately using the new `exclude` keyword argument.
| _keyword-only_ | | | _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `enable` | Names of pipeline components to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | | **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |

View File

@ -130,8 +130,8 @@ grateful to use the work of Chainer's [CuPy](https://cupy.chainer.org) module,
which provides a numpy-compatible interface for GPU arrays. which provides a numpy-compatible interface for GPU arrays.
spaCy can be installed for a CUDA-compatible GPU by specifying `spacy[cuda]`, spaCy can be installed for a CUDA-compatible GPU by specifying `spacy[cuda]`,
`spacy[cuda102]`, `spacy[cuda112]`, `spacy[cuda113]`, etc. If you know your `spacy[cuda102]`, `spacy[cuda112]`, `spacy[cuda113]`, etc. If you know your CUDA
CUDA version, using the more explicit specifier allows CuPy to be installed via version, using the more explicit specifier allows CuPy to be installed via
wheel, saving some compilation time. The specifiers should install wheel, saving some compilation time. The specifiers should install
[`cupy`](https://cupy.chainer.org). [`cupy`](https://cupy.chainer.org).
@ -195,29 +195,73 @@ How to install compilers and related build tools:
[Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/) [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
that matches the version that was used to compile your Python interpreter. that matches the version that was used to compile your Python interpreter.
#### Using build constraints when compiling from source
If you install spaCy from source or with `pip` for platforms where there are no
binary wheels on PyPI, you may need to use build constraints if any package in
your environment requires an older version of `numpy`.
If `numpy` gets downgraded from the most recent release at any point after
you've compiled `spacy`, you might see an error that looks like this:
```none
numpy.ndarray size changed, may indicate binary incompatibility.
```
To fix this, create a new virtual environment and install `spacy` and all of its
dependencies using build constraints.
[Build constraints](https://pip.pypa.io/en/stable/user_guide/#constraints-files)
specify an older version of `numpy` that is only used while compiling `spacy`,
and then your runtime environment can use any newer version of `numpy` and still
be compatible. In addition, use `--no-cache-dir` to ignore any previously cached
wheels so that all relevant packages are recompiled from scratch:
```shell
PIP_CONSTRAINT=https://raw.githubusercontent.com/explosion/spacy/master/build-constraints.txt \
pip install spacy --no-cache-dir
```
Our build constraints currently specify the oldest supported `numpy` available
on PyPI for `x86_64` and `aarch64`. Depending on your platform and environment,
you may want to customize the specific versions of `numpy`. For other platforms,
you can have a look at SciPy's
[`oldest-supported-numpy`](https://github.com/scipy/oldest-supported-numpy/blob/main/setup.cfg)
package to see what the oldest recommended versions of `numpy` are.
(_Warning_: don't use `pip install -c constraints.txt` instead of
`PIP_CONSTRAINT`, since this isn't applied to the isolated build environments.)
#### Additional options for developers {#source-developers} #### Additional options for developers {#source-developers}
Some additional options may be useful for spaCy developers who are editing the Some additional options may be useful for spaCy developers who are editing the
source code and recompiling frequently. source code and recompiling frequently.
- Install in editable mode. Changes to `.py` files will be reflected as soon as - Install in editable mode. Changes to `.py` files will be reflected as soon
the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require as the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will
the `pip install` or `python setup.py build_ext` command below to be run require the `pip install` command below to be run again. Before installing in
again. Before installing in editable mode, be sure you have removed any editable mode, be sure you have removed any previous installs with
previous installs with `pip uninstall spacy`, which you may need to run `pip uninstall spacy`, which you may need to run multiple times to remove all
multiple times to remove all traces of earlier installs. traces of earlier installs.
```bash ```bash
$ pip install -r requirements.txt $ pip install -r requirements.txt
$ pip install --no-build-isolation --editable . $ pip install --no-build-isolation --editable .
``` ```
- Build in parallel using `N` CPUs to speed up compilation and then install in - Build in parallel. Starting in v3.4.0, you can specify the number of
editable mode: build jobs with the environment variable `SPACY_NUM_BUILD_JOBS`:
```bash ```bash
$ pip install -r requirements.txt $ pip install -r requirements.txt
$ python setup.py build_ext --inplace -j N $ SPACY_NUM_BUILD_JOBS=4 pip install --no-build-isolation --editable .
```
- For editable mode and parallel builds with `python setup.py` instead of `pip`
(no longer recommended):
```bash
$ pip install -r requirements.txt
$ python setup.py build_ext --inplace -j 4
$ python setup.py develop $ python setup.py develop
``` ```

View File

@ -362,6 +362,18 @@ nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
nlp.enable_pipe("tagger") nlp.enable_pipe("tagger")
``` ```
In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is
set, all components except for those in `enable` are disabled.
```python
# Load the complete pipeline, but disable all components except for tok2vec and tagger
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"])
# Has the same effect, as NER is already not part of the enabled set of components
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"], disable=["ner"])
# Will raise an error, as the sets of enabled and disabled components are conflicting
nlp = spacy.load("en_core_web_sm", enable=["ner"], disable=["ner"])
```
<Infobox variant="warning" title="Changed in v3.0"> <Infobox variant="warning" title="Changed in v3.0">
As of v3.0, the `disable` keyword argument specifies components to load but As of v3.0, the `disable` keyword argument specifies components to load but

View File

@ -375,11 +375,15 @@ scoped quantifiers instead, you can build those behaviors with `on_match`
callbacks. callbacks.
| OP | Description | | OP | Description |
| --- | ---------------------------------------------------------------- | |---------|------------------------------------------------------------------------|
| `!` | Negate the pattern, by requiring it to match exactly 0 times. | | `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. | | `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match zero or more times. | | `*` | Allow the pattern to match zero or more times. |
| `{n}` | Require the pattern to match exactly _n_ times. |
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
| `{n,}` | Require the pattern to match at least _n_ times. |
| `{,m}` | Require the pattern to match at most _m_ times. |
> #### Example > #### Example
> >

143
website/docs/usage/v3-4.md Normal file
View File

@ -0,0 +1,143 @@
---
title: What's New in v3.4
teaser: New features and how to upgrade
menu:
- ['New Features', 'features']
- ['Upgrading Notes', 'upgrading']
---
## New features {#features hidden="true"}
spaCy v3.4 brings typing and speed improvements along with new vectors for
English CNN pipelines and new trained pipelines for Croatian. This release also
includes prebuilt linux aarch64 wheels for all spaCy dependencies distributed by
Explosion.
### Typing improvements {#typing}
spaCy v3.4 supports pydantic v1.9 and mypy 0.950+ through extensive updates to
types in Thinc v8.1.
### Speed improvements {#speed}
- For the parser, use C `saxpy`/`sgemm` provided by the `Ops` implementation in
order to use Accelerate through `thinc-apple-ops`.
- Improved speed of vector lookups.
- Improved speed for `Example.get_aligned_parse` and `Example.get_aligned`.
## Additional features and improvements
- Min/max `{n,m}` operator for `Matcher` patterns.
- Language updates:
- Improve tokenization for Cyrillic combining diacritics.
- Improve English tokenizer exceptions for contractions with
this/that/these/those.
- Updated `spacy project clone` to try both `main` and `master` branches by
default.
- Added confidence threshold for named entity linker.
- Improved handling of Typer optional default values for `init_config_cli`.
- Added cycle detection in parser projectivization methods.
- Added counts for NER labels in `debug data`.
- Support for adding NVTX ranges to `TrainablePipe` components.
- Support env variable `SPACY_NUM_BUILD_JOBS` to specify the number of build
jobs to run in parallel with `pip`.
## Trained pipelines {#pipelines}
### New trained pipelines {#new-pipelines}
v3.4 introduces new CPU/CNN pipelines for Croatian, which use the trainable
lemmatizer and [floret vectors](https://github.com/explosion/floret). Due to the
use of [Bloom embeddings](https://explosion.ai/blog/bloom-embeddings) and
subwords, the pipelines have compact vectors with no out-of-vocabulary words.
| Package | UPOS | Parser LAS | NER F |
| ----------------------------------------------- | ---: | ---------: | ----: |
| [`hr_core_news_sm`](/models/hr#hr_core_news_sm) | 96.6 | 77.5 | 76.1 |
| [`hr_core_news_md`](/models/hr#hr_core_news_md) | 97.3 | 80.1 | 81.8 |
| [`hr_core_news_lg`](/models/hr#hr_core_news_lg) | 97.5 | 80.4 | 83.0 |
### Pipeline updates {#pipeline-updates}
All CNN pipelines have been extended with whitespace augmentation.
The English CNN pipelines have new word vectors:
| Package | Model Version | TAG | Parser LAS | NER F |
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0 | 97.2 | 90.3 | 85.5 |
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0 | 97.4 | 90.1 | 85.3 |
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
## Notes about upgrading from v3.3 {#upgrading}
### Doc.has_vector
`Doc.has_vector` now matches `Token.has_vector` and `Span.has_vector`: it
returns `True` if at least one token in the doc has a vector rather than
checking only whether the vocab contains vectors.
### Using trained pipelines with floret vectors
If you're using a trained pipeline for Croatian, Finnish, Korean or Swedish with
new texts and working with `Doc` objects, you shouldn't notice any difference
between floret vectors and default vectors.
If you use vectors for similarity comparisons, there are a few differences,
mainly because a floret pipeline doesn't include any kind of frequency-based
word list similar to the list of in-vocabulary vector keys with default vectors.
- If your workflow iterates over the vector keys, you should use an external
word list instead:
```diff
- lexemes = [nlp.vocab[orth] for orth in nlp.vocab.vectors]
+ lexemes = [nlp.vocab[word] for word in external_word_list]
```
- `Vectors.most_similar` is not supported because there's no fixed list of
vectors to compare your vectors to.
### Pipeline package version compatibility {#version-compat}
> #### Using legacy implementations
>
> In spaCy v3, you'll still be able to load and reference legacy implementations
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
> components or architectures change and newer versions are available in the
> core library.
When you're loading a pipeline package trained with an earlier version of spaCy
v3, you will see a warning telling you that the pipeline may be incompatible.
This doesn't necessarily have to be true, but we recommend running your
pipelines against your test suite or evaluation data to make sure there are no
unexpected results.
If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).
If you've trained your own custom pipeline and you've confirmed that it's still
working as expected, you can update the spaCy version requirements in the
[`meta.json`](/api/data-formats#meta):
```diff
- "spacy_version": ">=3.3.0,<3.4.0",
+ "spacy_version": ">=3.3.0,<3.5.0",
```
### Updating v3.3 configs
To update a config from spaCy v3.3 with the new v3.4 settings, run
[`init fill-config`](/api/cli#init-fill-config):
```cli
$ python -m spacy init fill-config config-v3.3.cfg config-v3.4.cfg
```
In many cases ([`spacy train`](/api/cli#train),
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
automatically, but you'll need to fill in the new settings to run
[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).

View File

@ -162,7 +162,12 @@
{ {
"code": "hr", "code": "hr",
"name": "Croatian", "name": "Croatian",
"has_examples": true "has_examples": true,
"models": [
"hr_core_news_sm",
"hr_core_news_md",
"hr_core_news_lg"
]
}, },
{ {
"code": "hsb", "code": "hsb",

View File

@ -12,7 +12,9 @@
{ "text": "New in v3.0", "url": "/usage/v3" }, { "text": "New in v3.0", "url": "/usage/v3" },
{ "text": "New in v3.1", "url": "/usage/v3-1" }, { "text": "New in v3.1", "url": "/usage/v3-1" },
{ "text": "New in v3.2", "url": "/usage/v3-2" }, { "text": "New in v3.2", "url": "/usage/v3-2" },
{ "text": "New in v3.3", "url": "/usage/v3-3" } { "text": "New in v3.2", "url": "/usage/v3-2" },
{ "text": "New in v3.3", "url": "/usage/v3-3" },
{ "text": "New in v3.4", "url": "/usage/v3-4" }
] ]
}, },
{ {

View File

@ -2983,6 +2983,7 @@
"from pysbd.utils import PySBDFactory", "from pysbd.utils import PySBDFactory",
"", "",
"nlp = spacy.blank('en')", "nlp = spacy.blank('en')",
"# Caution: works with spaCy<=2.x.x",
"nlp.add_pipe(PySBDFactory(nlp))", "nlp.add_pipe(PySBDFactory(nlp))",
"", "",
"doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')", "doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')",

View File

@ -120,8 +120,8 @@ const AlertSpace = ({ nightly, legacy }) => {
} }
const navAlert = ( const navAlert = (
<Link to="/usage/v3-3" hidden> <Link to="/usage/v3-4" hidden>
<strong>💥 Out now:</strong> spaCy v3.3 <strong>💥 Out now:</strong> spaCy v3.4
</Link> </Link>
) )

View File

@ -24,6 +24,8 @@ const CUDA = {
'11.3': 'cuda113', '11.3': 'cuda113',
'11.4': 'cuda114', '11.4': 'cuda114',
'11.5': 'cuda115', '11.5': 'cuda115',
'11.6': 'cuda116',
'11.7': 'cuda117',
} }
const LANG_EXTRAS = ['ja'] // only for languages with models const LANG_EXTRAS = ['ja'] // only for languages with models