Merge remote-tracking branch 'upstream/v4' into store-activations

This commit is contained in:
Daniël de Kok 2022-08-01 09:30:04 +02:00
commit 288d27e17e
77 changed files with 1471 additions and 675 deletions

View File

@ -27,7 +27,6 @@ steps:
- script: python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.10')
- task: DeleteFiles@1
inputs:
@ -41,7 +40,7 @@ steps:
- bash: |
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
${{ parameters.prefix }} python -m pip install dist/$SDIST
${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
@ -64,12 +63,12 @@ steps:
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)
# - script: |
# python -m spacy download ca_core_news_sm
# python -m spacy download ca_core_news_md
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
# displayName: 'Test download CLI'
# condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@ -93,17 +92,17 @@ steps:
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
# - script: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
# displayName: 'Test assemble CLI'
# condition: eq(variables['python_version'], '3.8')
#
# - script: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
# displayName: 'Test assemble CLI vectors warning'
# condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
- script: |
python .github/validate_universe_json.py website/meta/universe.json

67
.github/spacy_universe_alert.py vendored Normal file
View File

@ -0,0 +1,67 @@
# Posts a Slack message describing a pull request. The GitHub Actions context
# is expected as a JSON string in the first command-line argument.
import os
import sys
import json
from datetime import datetime
from slack_sdk.web.client import WebClient
# Slack channel that receives the alert.
CHANNEL = "#alerts-universe"
# The bot token is read from the environment. When SLACK_BOT_TOKEN is unset,
# the placeholder string below is handed to WebClient instead.
# NOTE(review): presumably the Slack call then fails on authentication - confirm.
SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN", "ENV VAR not available!")
# Layout used to parse the pull request's "created_at" / "updated_at" values.
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
client = WebClient(SLACK_TOKEN)
# sys.argv[1] must hold the JSON-serialised context: a missing argument raises
# IndexError and malformed JSON raises json.JSONDecodeError.
github_context = json.loads(sys.argv[1])
event = github_context['event']
pr_title = event['pull_request']["title"]
# The link is the PR's patch URL with every ".patch" occurrence removed.
pr_link = event['pull_request']["patch_url"].replace(".patch", "")
# The "author" shown in the alert is taken from the event's sender entry.
pr_author_url = event['sender']["html_url"]
# Last "/"-separated segment of the sender's profile URL.
pr_author_name = pr_author_url.rsplit('/')[-1]
pr_created_at_dt = datetime.strptime(
event['pull_request']["created_at"],
DATETIME_FORMAT
)
# "%c" formats the timestamp using the locale's date and time representation.
pr_created_at = pr_created_at_dt.strftime("%c")
pr_updated_at_dt = datetime.strptime(
event['pull_request']["updated_at"],
DATETIME_FORMAT
)
pr_updated_at = pr_updated_at_dt.strftime("%c")
# Message layout: a header section followed by one section with four
# mrkdwn fields (pull request link, author link, created and updated times).
blocks = [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "📣 New spaCy Universe Project Alert ✨"
}
},
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": f"*Pull Request:*\n<{pr_link}|{pr_title}>"
},
{
"type": "mrkdwn",
"text": f"*Author:*\n<{pr_author_url}|{pr_author_name}>"
},
{
"type": "mrkdwn",
"text": f"*Created at:*\n {pr_created_at}"
},
{
"type": "mrkdwn",
"text": f"*Last Updated:*\n {pr_updated_at}"
}
]
}
]
# Send the alert; `text` is passed alongside the structured `blocks`.
client.chat_postMessage(
channel=CHANNEL,
text="spaCy universe project PR alert",
blocks=blocks
)

View File

@ -0,0 +1,30 @@
# Workflow that runs .github/spacy_universe_alert.py for pull requests that
# touch website/meta/universe.json.
name: spaCy universe project alert
# NOTE(review): pull_request_target runs in the base repository's context,
# which is presumably why the Slack secret is usable for fork PRs - confirm.
on:
pull_request_target:
paths:
- "website/meta/universe.json"
jobs:
build:
runs-on: ubuntu-latest
steps:
# Prints the serialised github context to the job log. PR_NUMBER is defined
# for this step but its script does not reference it.
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
PR_NUMBER: ${{github.event.number}}
run: |
echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1
- uses: actions/setup-python@v1
# Installs the pinned slack-sdk and aiohttp packages, echoes the channel name,
# then passes the github context JSON to the script as its first argument.
- name: Install Bernadette app dependency and send an alert
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
GITHUB_CONTEXT: ${{ toJson(github) }}
CHANNEL: "#alerts-universe"
run: |
pip install slack-sdk==3.17.2 aiohttp==3.8.1
echo "$CHANNEL"
python .github/spacy_universe_alert.py "$GITHUB_CONTEXT"

View File

@ -271,7 +271,8 @@ except: # noqa: E722
### Python conventions
All Python code must be written **compatible with Python 3.6+**.
All Python code must be written **compatible with Python 3.6+**. More detailed
code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
#### I/O and handling paths

View File

@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
💫 **Version 3.3.1 out now!**
💫 **Version 3.4.0 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

View File

@ -1,6 +1,8 @@
# build version constraints for use with wheelwright + multibuild
numpy==1.15.0; python_version<='3.7'
numpy==1.17.3; python_version=='3.8'
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'

View File

@ -16,21 +16,41 @@ To summon the robot, write a github comment on the issue/PR you wish to test. Th
Some things to note:
* The `@explosion-bot please` must be the beginning of the command - you cannot add anything in front of this or else the robot won't know how to parse it. Adding anything at the end aside from the test name will also confuse the robot, so keep it simple!
* The command name (such as `test_gpu`) must be one of the tests that the bot knows how to run. The available commands are documented in the bot's [workflow config](https://github.com/explosion/spaCy/blob/master/.github/workflows/explosionbot.yml#L26) and must match exactly one of the commands listed there.
* The robot can't do multiple things at once, so if you want it to run multiple tests, you'll have to summon it with one comment per test.
* For the `test_gpu` command, you can specify an optional thinc branch (from the spaCy repo) or a spaCy branch (from the thinc repo) with either the `--thinc-branch` or `--spacy-branch` flags. By default, the bot will pull in the PR branch from the repo where the command was issued, and the main branch of the other repository. However, if you need to run against another branch, you can say (for example):
- The `@explosion-bot please` must be the beginning of the command - you cannot add anything in front of this or else the robot won't know how to parse it. Adding anything at the end aside from the test name will also confuse the robot, so keep it simple!
- The command name (such as `test_gpu`) must be one of the tests that the bot knows how to run. The available commands are documented in the bot's [workflow config](https://github.com/explosion/spaCy/blob/master/.github/workflows/explosionbot.yml#L26) and must match exactly one of the commands listed there.
- The robot can't do multiple things at once, so if you want it to run multiple tests, you'll have to summon it with one comment per test.
```
@explosion-bot please test_gpu --thinc-branch develop
```
You can also specify a branch from an unmerged PR:
```
@explosion-bot please test_gpu --thinc-branch refs/pull/633/head
```
### Examples
- Execute spaCy slow GPU tests with a custom thinc branch from a spaCy PR:
```
@explosion-bot please test_slow_gpu --thinc-branch <branch_name>
```
`branch_name` can either be a named branch, e.g: `develop`, or an unmerged PR, e.g: `refs/pull/<pr_number>/head`.
- Execute spaCy Transformers GPU tests from a spaCy PR:
```
@explosion-bot please test_gpu --run-on spacy-transformers --run-on-branch master --spacy-branch current_pr
```
This will launch the GPU pipeline for the `spacy-transformers` repo on its `master` branch, using the current spaCy PR's branch to build spaCy. The name of the repository passed to `--run-on` is case-sensitive, e.g: use `spaCy` instead of `spacy`.
- General info about supported commands.
```
@explosion-bot please info
```
- Help text for a specific command
```
@explosion-bot please <command> --help
```
## Troubleshooting
If the robot isn't responding to commands as expected, you can check its logs in the [Github Action](https://github.com/explosion/spaCy/actions/workflows/explosionbot.yml).
If the robot isn't responding to commands as expected, you can check its logs in the [Github Action](https://github.com/explosion/spaCy/actions/workflows/explosionbot.yml).
For each command sent to the bot, there should be a run of the `explosion-bot` workflow. In the `Install and run explosion-bot` step, towards the end of the logs you should see info about the configuration that the bot was run with, as well as any errors that the bot encountered.

View File

@ -5,7 +5,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.1.0.dev3,<8.2.0",
"thinc>=8.1.0,<8.2.0",
"pathy",
"numpy>=1.15.0",
]

View File

@ -3,7 +3,7 @@ spacy-legacy>=3.0.9,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.1.0.dev3,<8.2.0
thinc>=8.1.0,<8.2.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0
@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.910,<=0.960
mypy>=0.910,<0.970; platform_machine!='aarch64'
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-requests

View File

@ -38,7 +38,7 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.1.0.dev3,<8.2.0
thinc>=8.1.0,<8.2.0
install_requires =
# Our libraries
spacy-legacy>=3.0.9,<3.1.0
@ -46,7 +46,7 @@ install_requires =
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.1.0.dev3,<8.2.0
thinc>=8.1.0,<8.2.0
wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
@ -103,6 +103,10 @@ cuda114 =
cupy-cuda114>=5.0.0b4,<11.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0
cuda116 =
cupy-cuda116>=5.0.0b4,<11.0.0
cuda117 =
cupy-cuda117>=5.0.0b4,<11.0.0
apple =
thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies
@ -110,7 +114,7 @@ ja =
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
natto-py>=0.9.0
th =
pythainlp>=2.0

View File

@ -126,6 +126,8 @@ class build_ext_options:
class build_ext_subclass(build_ext, build_ext_options):
def build_extensions(self):
if self.parallel is None and os.environ.get("SPACY_NUM_BUILD_JOBS") is not None:
self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS"))
build_ext_options.build_options(self)
build_ext.build_extensions(self)
@ -206,7 +208,11 @@ def setup_package():
for name in MOD_NAMES:
mod_path = name.replace(".", "/") + ".pyx"
ext = Extension(
name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"]
name,
[mod_path],
language="c++",
include_dirs=include_dirs,
extra_compile_args=["-std=c++11"],
)
ext_modules.append(ext)
print("Cythonizing sources")

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.4.0"
__version__ = "3.4.1"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -462,6 +462,23 @@ def git_sparse_checkout(repo, subpath, dest, branch):
shutil.move(str(source_path), str(dest))
def git_repo_branch_exists(repo: str, branch: str) -> bool:
    """Ask the remote, via 'git ls-remote', whether a branch is present.

    repo (str): URL to get repo.
    branch (str): Branch on repo to check.
    RETURNS (bool): True if repo:branch exists.
    """
    # Called before the remote query, as in the original implementation.
    get_git_version()
    # `git ls-remote --exit-code` is deliberately not used: `run_command`
    # takes care of the return code itself, so the check relies on stdout
    # being the empty string when the requested branch does not exist.
    result = run_command(f"git ls-remote {repo} {branch}", capture=True)
    return result.stdout != ""
def get_git_version(
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:

View File

@ -61,7 +61,7 @@ def pretrain_cli(
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir()
output_dir.mkdir(parents=True)
msg.good(f"Created output directory: {output_dir}")
# Save non-interpolated config
raw_config.to_disk(output_dir / "config.cfg")

View File

@ -7,11 +7,11 @@ import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version
from .._util import git_checkout, get_git_version, git_repo_branch_exists
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
DEFAULT_BRANCH = "master"
DEFAULT_BRANCHES = ["main", "master"]
@project_cli.command("clone")
@ -20,7 +20,7 @@ def project_clone_cli(
name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
# fmt: on
):
@ -33,9 +33,25 @@ def project_clone_cli(
"""
if dest is None:
dest = Path.cwd() / Path(name).parts[-1]
if repo == DEFAULT_REPO and branch is None:
branch = DEFAULT_PROJECTS_BRANCH
if branch is None:
# If it's a user repo, we want to default to other branch
branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH
for default_branch in DEFAULT_BRANCHES:
if git_repo_branch_exists(repo, default_branch):
branch = default_branch
break
if branch is None:
default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
msg.fail(
"No branch provided and attempted default "
f"branches {default_branches_msg} do not exist.",
exits=1,
)
else:
if not git_repo_branch_exists(repo, branch):
msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
assert isinstance(branch, str)
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
@ -61,9 +77,9 @@ def project_clone(
try:
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
msg.fail(err, exits=1)
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory")
else:

View File

@ -64,8 +64,11 @@ class SpanRenderer:
# Set up how the text and labels will be rendered
self.direction = DEFAULT_DIR
self.lang = DEFAULT_LANG
# These values are in px
self.top_offset = options.get("top_offset", 40)
self.top_offset_step = options.get("top_offset_step", 17)
# This is how far under the top offset the span labels appear
self.span_label_offset = options.get("span_label_offset", 20)
self.offset_step = options.get("top_offset_step", 17)
# Set up which templates will be used
template = options.get("template")
@ -127,26 +130,56 @@ class SpanRenderer:
title (str / None): Document title set in Doc.user_data['title'].
"""
per_token_info = []
# we must sort so that we can correctly describe when spans need to "stack"
# which is determined by their start token, then span length (longer spans on top),
# then break any remaining ties with the span label
spans = sorted(
spans,
key=lambda s: (
s["start_token"],
-(s["end_token"] - s["start_token"]),
s["label"],
),
)
for s in spans:
# this is the vertical 'slot' that the span will be rendered in
# vertical_position = span_label_offset + (offset_step * (slot - 1))
s["render_slot"] = 0
for idx, token in enumerate(tokens):
# Identify if a token belongs to a Span (and which) and if it's a
# start token of said Span. We'll use this for the final HTML render
token_markup: Dict[str, Any] = {}
token_markup["text"] = token
concurrent_spans = 0
entities = []
for span in spans:
ent = {}
if span["start_token"] <= idx < span["end_token"]:
concurrent_spans += 1
span_start = idx == span["start_token"]
ent["label"] = span["label"]
ent["is_start"] = True if idx == span["start_token"] else False
ent["is_start"] = span_start
if span_start:
# When the span starts, we need to know how many other
# spans are on the 'span stack' and will be rendered.
# This value becomes the vertical render slot for this entire span
span["render_slot"] = concurrent_spans
ent["render_slot"] = span["render_slot"]
kb_id = span.get("kb_id", "")
kb_url = span.get("kb_url", "#")
ent["kb_link"] = (
TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
)
entities.append(ent)
else:
# We don't specifically need to do this since we loop
# over tokens and spans sorted by their start_token,
# so we'll never use a span again after the last token it appears in,
# but if we were to use these spans again we'd want to make sure
# this value was reset correctly.
span["render_slot"] = 0
token_markup["entities"] = entities
per_token_info.append(token_markup)
markup = self._render_markup(per_token_info)
markup = TPL_SPANS.format(content=markup, dir=self.direction)
if title:
@ -157,12 +190,24 @@ class SpanRenderer:
"""Render the markup from per-token information"""
markup = ""
for token in per_token_info:
entities = sorted(token["entities"], key=lambda d: d["label"])
if entities:
entities = sorted(token["entities"], key=lambda d: d["render_slot"])
# Whitespace tokens disrupt the vertical space (no line height) so that the
# span indicators get misaligned. We don't render them as individual
# tokens anyway, so we'll just not display a span indicator either.
is_whitespace = token["text"].strip() == ""
if entities and not is_whitespace:
slices = self._get_span_slices(token["entities"])
starts = self._get_span_starts(token["entities"])
total_height = (
self.top_offset
+ self.span_label_offset
+ (self.offset_step * (len(entities) - 1))
)
markup += self.span_template.format(
text=token["text"], span_slices=slices, span_starts=starts
text=token["text"],
span_slices=slices,
span_starts=starts,
total_height=total_height,
)
else:
markup += escape_html(token["text"] + " ")
@ -171,10 +216,18 @@ class SpanRenderer:
def _get_span_slices(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span slices"""
span_slices = []
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
for entity in entities:
# rather than iterate over multiples of offset_step, we use entity['render_slot']
# to determine the vertical position, since that tells where
# the span starts vertically so we can extend it horizontally,
# past other spans that might have already ended
color = self.colors.get(entity["label"].upper(), self.default_color)
top_offset = self.top_offset + (
self.offset_step * (entity["render_slot"] - 1)
)
span_slice = self.span_slice_template.format(
bg=color, top_offset=self.top_offset + step
bg=color,
top_offset=top_offset,
)
span_slices.append(span_slice)
return "".join(span_slices)
@ -182,12 +235,15 @@ class SpanRenderer:
def _get_span_starts(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span start tokens"""
span_starts = []
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
for entity in entities:
color = self.colors.get(entity["label"].upper(), self.default_color)
top_offset = self.top_offset + (
self.offset_step * (entity["render_slot"] - 1)
)
span_start = (
self.span_start_template.format(
bg=color,
top_offset=self.top_offset + step,
top_offset=top_offset,
label=entity["label"],
kb_link=entity["kb_link"],
)

View File

@ -67,7 +67,7 @@ TPL_SPANS = """
"""
TPL_SPAN = """
<span style="font-weight: bold; display: inline-block; position: relative;">
<span style="font-weight: bold; display: inline-block; position: relative; height: {total_height}px;">
{text}
{span_slices}
{span_starts}

View File

@ -209,6 +209,9 @@ class Warnings(metaclass=ErrorsWithCodes):
"Only the last span group will be loaded under "
"Doc.spans['{group_name}']. Skipping span group with values: "
"{group_values}")
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
"is a Cython extension type.")
W400 = ("Activation '{activation}' is unknown for pipe '{pipe_name}'")
@ -935,6 +938,8 @@ class Errors(metaclass=ErrorsWithCodes):
E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}")
E1042 = ("Function was called with `{arg1}`={arg1_values} and "
"`{arg2}`={arg2_values} but these arguments are conflicting.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.")
E1400 = ("store_activations attribute must be set to List[str] or bool")

View File

@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
stop_words = STOP_WORDS
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Bulgarian(Language):

View File

@ -258,6 +258,10 @@ ALPHA = group_chars(
ALPHA_LOWER = group_chars(_lower + _uncased)
ALPHA_UPPER = group_chars(_upper + _uncased)
_combining_diacritics = r"\u0300-\u036f"
COMBINING_DIACRITICS = _combining_diacritics
_units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
"kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "

View File

@ -1,5 +1,5 @@
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
# Some languages e.g. written with the Cyrillic alphabet permit the use of diacritics
# to mark stressed syllables in words where stress is distinctive. Such languages
# should use the COMBINING_DIACRITICS... suffix and infix regex lists in
# place of the standard ones.
# Suffixes: a copy of the standard list plus one pattern that matches a period
# directly preceded by an alphabetic character and a combining diacritic.
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [
r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
]
# Infixes: a copy of the standard list plus four patterns. Each one requires an
# alphabetic character followed by a combining diacritic immediately before
# the matched punctuation.
COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
# Period after a lowercase letter + diacritic, before an uppercase letter or quote.
r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
),
# Comma between a letter + diacritic and a following letter.
r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
# Any of the HYPHENS alternatives between a letter + diacritic and a letter.
r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
),
# One of : < > = / between a letter + diacritic and a letter.
r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
]

View File

@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Russian(Language):

View File

@ -6,6 +6,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import UkrainianLemmatizer
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
@ -13,6 +15,8 @@ class UkrainianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Ukrainian(Language):

View File

@ -82,6 +82,10 @@ cdef class DependencyMatcher:
"$-": self._imm_left_sib,
"$++": self._right_sib,
"$--": self._left_sib,
">++": self._right_child,
">--": self._left_child,
"<++": self._right_parent,
"<--": self._left_parent,
}
def __reduce__(self):
@ -423,6 +427,22 @@ cdef class DependencyMatcher:
def _left_sib(self, doc, node):
    # Siblings are the children of this token's head; keep only those
    # positioned before `node`.
    siblings = doc[node].head.children
    return [doc[sibling.i] for sibling in siblings if sibling.i < node]
def _right_child(self, doc, node):
    # Children of the token at `node` whose index is greater than `node`.
    kids = doc[node].children
    return [doc[kid.i] for kid in kids if kid.i > node]
def _left_child(self, doc, node):
    # Children of the token at `node` whose index is smaller than `node`.
    kids = doc[node].children
    return [doc[kid.i] for kid in kids if kid.i < node]
def _right_parent(self, doc, node):
    # The head counts as a "right parent" only when its index is greater
    # than `node`; otherwise there is nothing to return.
    if not doc[node].head.i > node:
        return []
    return [doc[node].head]
def _left_parent(self, doc, node):
    # The head counts as a "left parent" only when its index is smaller
    # than `node`; otherwise there is nothing to return.
    if not doc[node].head.i < node:
        return []
    return [doc[node].head]
def _normalize_key(self, key):
if isinstance(key, str):
return self.vocab.strings.add(key)

View File

@ -86,10 +86,14 @@ cdef class Matcher:
is a dictionary mapping attribute IDs to values, and optionally a
quantifier operator under the key "op". The available quantifiers are:
'!': Negate the pattern, by requiring it to match exactly 0 times.
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
'+': Require the pattern to match 1 or more times.
'*': Allow the pattern to zero or more times.
'!': Negate the pattern, by requiring it to match exactly 0 times.
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
'+': Require the pattern to match 1 or more times.
'*': Allow the pattern to match zero or more times.
'{n}': Require the pattern to match exactly _n_ times.
'{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
'{n,}': Require the pattern to match at least _n_ times.
'{,m}': Require the pattern to match at most _m_ times.
The + and * operators return all possible matches (not just the greedy
ones). However, the "greedy" argument can filter the final matches
@ -1005,8 +1009,29 @@ def _get_operators(spec):
return (ONE,)
elif spec["OP"] in lookup:
return lookup[spec["OP"]]
#Min_max {n,m}
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
# {n} --> {n,n} exactly n ONE,(n)
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
# {,m} --> {0,m} min of zero, max of m ZERO_ONE,(m)
# {n,} --> {n,∞} min of n, max of inf ONE,(n),ZERO_PLUS
min_max = spec["OP"][1:-1]
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
n, m = min_max.split(",")
#1. Either n or m is a blank string and the other is numeric -->isdigit
#2. Both are numeric and n <= m
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
# if n is empty string, zero would be used
head = tuple(ONE for __ in range(int(n or 0)))
tail = tuple(ZERO_ONE for __ in range(int(m) - int(n or 0))) if m else (ZERO_PLUS,)
return head + tail
else:
keys = ", ".join(lookup.keys())
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))

View File

@ -26,7 +26,11 @@ def forward(model, X, is_train):
Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
Yf[0] = model.get_param("pad")
# Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
# change its shape to (nF, nO, nP) without breaking existing models. So
# we'll squeeze the first dimension here.
Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
def backward(dY_ids):
# This backprop is particularly tricky, because we get back a different

View File

@ -1,9 +1,14 @@
from functools import partial
from typing import Type, Callable, TYPE_CHECKING
from typing import Type, Callable, Dict, TYPE_CHECKING, List, Optional, Set
import functools
import inspect
import types
import warnings
from thinc.layers import with_nvtx_range
from thinc.model import Model, wrap_model_recursive
from thinc.util import use_nvtx_range
from ..errors import Warnings
from ..util import registry
if TYPE_CHECKING:
@ -11,29 +16,106 @@ if TYPE_CHECKING:
from ..language import Language # noqa: F401
@registry.callbacks("spacy.models_with_nvtx_range.v1")
def create_models_with_nvtx_range(
forward_color: int = -1, backprop_color: int = -1
) -> Callable[["Language"], "Language"]:
def models_with_nvtx_range(nlp):
pipes = [
pipe
for _, pipe in nlp.components
if hasattr(pipe, "is_trainable") and pipe.is_trainable
]
# Names of the pipe methods that get an NVTX range wrapper by default.
# Each name appears exactly once so that no method is wrapped twice.
DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
    "pipe",
    "predict",
    "set_annotations",
    "update",
    "rehearse",
    "get_loss",
    "initialize",
    "begin_update",
    "finish_update",
]
# We need process all models jointly to avoid wrapping callbacks twice.
models = Model(
"wrap_with_nvtx_range",
forward=lambda model, X, is_train: ...,
layers=[pipe.model for pipe in pipes],
)
for node in models.walk():
def models_with_nvtx_range(nlp, forward_color: int, backprop_color: int):
    """Apply `with_nvtx_range` to every model node of the trainable pipes.

    nlp: Pipeline whose `components` are iterated as (name, pipe) pairs.
    forward_color (int): Forwarded to `with_nvtx_range` as `forward_color`.
    backprop_color (int): Forwarded to `with_nvtx_range` as `backprop_color`.
    RETURNS: The `nlp` object that was passed in.
    """
    # Only components that expose a truthy `is_trainable` are considered.
    trainable_pipes = []
    for _, component in nlp.components:
        if hasattr(component, "is_trainable") and component.is_trainable:
            trainable_pipes.append(component)

    # Nodes are tracked by identity so that a node reachable from more than
    # one pipe is only passed to `with_nvtx_range` once.
    visited: Set[int] = set()
    for component in trainable_pipes:
        for layer in component.model.walk():
            layer_id = id(layer)
            if layer_id not in visited:
                visited.add(layer_id)
                # The return value is discarded; presumably the node is
                # instrumented in place -- confirm against thinc.
                with_nvtx_range(
                    layer,
                    forward_color=forward_color,
                    backprop_color=backprop_color,
                )

    return nlp
@registry.callbacks("spacy.models_with_nvtx_range.v1")
def create_models_with_nvtx_range(
    forward_color: int = -1, backprop_color: int = -1
) -> Callable[["Language"], "Language"]:
    """Build the callback registered as "spacy.models_with_nvtx_range.v1".

    forward_color (int): Bound as the `forward_color` keyword argument.
    backprop_color (int): Bound as the `backprop_color` keyword argument.
    RETURNS (Callable[[Language], Language]): A `functools.partial` of
        `models_with_nvtx_range` that only needs the `nlp` object.
    """
    color_kwargs = {
        "forward_color": forward_color,
        "backprop_color": backprop_color,
    }
    return functools.partial(models_with_nvtx_range, **color_kwargs)
def nvtx_range_wrapper_for_pipe_method(self, func, *args, **kwargs):
    """Call `func` with the given arguments, inside an NVTX range if needed.

    self: Object providing the `name` used in the range label.
    func: Callable to invoke with `*args` and `**kwargs`.
    RETURNS: Whatever `func` returns.
    """
    if not isinstance(func, functools.partial):
        # The range is labelled "<pipe name> <function name>".
        with use_nvtx_range(f"{self.name} {func.__name__}"):
            return func(*args, **kwargs)

    # A `functools.partial` is called directly, without opening a range.
    return func(*args, **kwargs)
def pipes_with_nvtx_range(
    nlp, additional_pipe_functions: Optional[Dict[str, List[str]]]
):
    """Wrap selected methods of every pipeline component in an NVTX range.

    For each component, the methods named in
    `DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS` plus any extras listed for that
    component are replaced by a wrapper around
    `nvtx_range_wrapper_for_pipe_method`.

    nlp: Pipeline whose `components` are iterated as (name, pipe) pairs.
    additional_pipe_functions (Optional[Dict[str, List[str]]]): Extra method
        names to wrap, keyed by pipe name. May be None or empty.
    RETURNS: The `nlp` object that was passed in.

    Warns with W121 when an explicitly requested extra method does not exist
    on the pipe, and with W122 when the wrapper cannot be set on the pipe.
    """
    for _, pipe in nlp.components:
        if additional_pipe_functions:
            extra_funcs = additional_pipe_functions.get(pipe.name, [])
        else:
            extra_funcs = []

        for name in DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS + extra_funcs:
            func = getattr(pipe, name, None)
            if func is None:
                # Missing default methods are skipped silently; only methods
                # the user asked for explicitly produce a warning.
                if name in extra_funcs:
                    warnings.warn(Warnings.W121.format(method=name, pipe=pipe.name))
                continue

            wrapped_func = functools.partial(
                types.MethodType(nvtx_range_wrapper_for_pipe_method, pipe), func
            )

            # Try to preserve the original function signature. This is
            # best-effort, so ordinary failures are ignored, but interpreter
            # exit signals (KeyboardInterrupt, SystemExit) must propagate.
            try:
                wrapped_func.__signature__ = inspect.signature(func)  # type: ignore
            except Exception:
                pass

            try:
                setattr(
                    pipe,
                    name,
                    wrapped_func,
                )
            except AttributeError:
                warnings.warn(Warnings.W122.format(method=name, pipe=pipe.name))

    return nlp
@registry.callbacks("spacy.models_and_pipes_with_nvtx_range.v1")
def create_models_and_pipes_with_nvtx_range(
forward_color: int = -1,
backprop_color: int = -1,
additional_pipe_functions: Optional[Dict[str, List[str]]] = None,
) -> Callable[["Language"], "Language"]:
def inner(nlp):
nlp = models_with_nvtx_range(nlp, forward_color, backprop_color)
nlp = pipes_with_nvtx_range(nlp, additional_pipe_functions)
return nlp
return models_with_nvtx_range
return inner

View File

@ -347,6 +347,7 @@ cdef class precompute_hiddens:
cdef bint _is_synchronized
cdef public object ops
cdef public object numpy_ops
cdef public object _cpu_ops
cdef np.ndarray _features
cdef np.ndarray _cached
cdef np.ndarray bias
@ -377,6 +378,7 @@ cdef class precompute_hiddens:
self.nO = cached.shape[2]
self.ops = lower_model.ops
self.numpy_ops = NumpyOps()
self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
assert activation in (None, "relu", "maxout")
self.activation = activation
self._is_synchronized = False
@ -439,11 +441,7 @@ cdef class precompute_hiddens:
# - Output from backward on GPU
bp_hiddens = self._bp_hiddens
cdef CBlas cblas
if isinstance(self.ops, CupyOps):
cblas = get_ops("cpu").cblas()
else:
cblas = self.ops.cblas()
cdef CBlas cblas = self._cpu_ops.cblas()
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids

View File

@ -1,23 +1,41 @@
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport numpy as np
from libc.stdint cimport uint64_t
from libc.stdint cimport uint32_t, uint64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.memory cimport shared_ptr
from .structs cimport MorphAnalysisC
from .strings cimport StringStore
from .typedefs cimport attr_t, hash_t
cdef cppclass Feature:
hash_t field
hash_t value
__init__():
this.field = 0
this.value = 0
cdef cppclass MorphAnalysisC:
hash_t key
vector[Feature] features
__init__():
this.key = 0
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef PreshMap tags # Keyed by hash, value is pointer to tag
cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
cdef int insert(self, MorphAnalysisC tag) except -1
cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
cdef void _intern_morph_tag(self, hash_t tag_key, feats)
cdef hash_t _add(self, features)
cdef str _normalize_features(self, features)
cdef str get_morph_str(self, hash_t morph_key)
cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
cdef list list_features(const MorphAnalysisC* morph)
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil

View File

@ -1,10 +1,10 @@
# cython: infer_types
import numpy
import warnings
from typing import Union, Tuple, List, Dict, Optional
from cython.operator cimport dereference as deref
from libcpp.memory cimport shared_ptr
from .attrs cimport POS
from .parts_of_speech import IDS as POS_IDS
from .errors import Warnings
from . import symbols
@ -24,134 +24,187 @@ cdef class Morphology:
EMPTY_MORPH = symbols.NAMES[symbols._]
def __init__(self, StringStore strings):
self.mem = Pool()
self.strings = strings
self.tags = PreshMap()
def __reduce__(self):
tags = set([self.get(self.strings[s]) for s in self.strings])
tags -= set([""])
return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
def add(self, features):
# Look up the analysis stored under `tag_hash` in `self.tags`.
# Returns the stored shared_ptr when the key is present; otherwise returns a
# default-constructed shared_ptr, which callers test for with `if tag:`.
cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
match = self.tags.find(tag_hash)
if match != self.tags.const_end():
return deref(match).second
else:
# Key not present: hand back an empty pointer rather than raising.
return shared_ptr[MorphAnalysisC]()
def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
attr_key = self.strings.as_string(attr_key)
attr_value = self.strings.as_string(attr_value)
# Preserve multiple values as a list
if self.VALUE_SEP in attr_value:
values = attr_value.split(self.VALUE_SEP)
values.sort()
attr_value = values
else:
warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
return None
return attr_key, attr_value
def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
if not feats or feats == self.EMPTY_MORPH:
return {}
out = []
for feat in feats.split(self.FEATURE_SEP):
field, values = feat.split(self.FIELD_SEP, 1)
normalized_attr = self._normalize_attr(field, values)
if normalized_attr is None:
continue
out.append((normalized_attr[0], normalized_attr[1]))
out.sort(key=lambda x: x[0])
return dict(out)
def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
out = []
for field, values in feats.items():
normalized_attr = self._normalize_attr(field, values)
if normalized_attr is None:
continue
out.append((normalized_attr[0], normalized_attr[1]))
out.sort(key=lambda x: x[0])
return dict(out)
def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
norm_feats_string = self.FEATURE_SEP.join([
self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
for field, values in feats.items()
])
return norm_feats_string or self.EMPTY_MORPH
cdef hash_t _add(self, features):
"""Insert a morphological analysis in the morphology table, if not
already present. The morphological analysis may be provided in the UD
FEATS format as a string or in the tag map dict format.
Returns the hash of the new analysis.
"""
cdef MorphAnalysisC* tag_ptr
cdef hash_t tag_hash = 0
cdef shared_ptr[MorphAnalysisC] tag
if isinstance(features, str):
if features == "":
features = self.EMPTY_MORPH
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
if tag_ptr != NULL:
return tag_ptr.key
features = self.feats_to_dict(features)
if not isinstance(features, dict):
tag_hash = self.strings[features]
tag = self._lookup_tag(tag_hash)
if tag:
return deref(tag).key
features = self._str_to_normalized_feat_dict(features)
elif isinstance(features, dict):
features = self._dict_to_normalized_feat_dict(features)
else:
warnings.warn(Warnings.W100.format(feature=features))
features = {}
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# intified ("Field", "Field=Value") pairs
field_feature_pairs = []
for field in sorted(string_features):
values = string_features[field]
for value in values.split(self.VALUE_SEP):
field_feature_pairs.append((
self.strings.add(field),
self.strings.add(field + self.FIELD_SEP + value),
))
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
# the hash key for the tag is either the hash of the normalized UFEATS
# string or the hash of an empty placeholder
norm_feats_string = self.normalize_features(features)
tag.key = self.strings.add(norm_feats_string)
self.insert(tag)
return tag.key
norm_feats_string = self._normalized_feat_dict_to_str(features)
tag_hash = self.strings.add(norm_feats_string)
tag = self._lookup_tag(tag_hash)
if tag:
return deref(tag).key
def normalize_features(self, features):
self._intern_morph_tag(tag_hash, features)
return tag_hash
# Build a MorphAnalysisC for `feats` and store it in `self.tags` under `tag_key`.
# `feats` maps field strings to either a single value string or a list of
# value strings. Every field and every "field<FIELD_SEP>value" string is added
# to `self.strings`; the resulting keys become the (field, value) members of
# the stored features. An existing entry for `tag_key` is overwritten.
cdef void _intern_morph_tag(self, hash_t tag_key, feats):
# intified ("Field", "Field=Value") pairs where fields with multiple values have
# been split into individual tuples, e.g.:
# [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
# ("Field2", "Field2=Value3")]
field_feature_pairs = []
# Feat dict is normalized at this point.
for field, values in feats.items():
field_key = self.strings.add(field)
if isinstance(values, list):
for value in values:
value_key = self.strings.add(field + self.FIELD_SEP + value)
field_feature_pairs.append((field_key, value_key))
else:
# We could box scalar values into a list and use a common
# code path to generate features but that incurs a small
# but measurable allocation/iteration overhead (as this
# branch is taken often enough).
value_key = self.strings.add(field + self.FIELD_SEP + values)
field_feature_pairs.append((field_key, value_key))
num_features = len(field_feature_pairs)
# Allocate the analysis behind a shared_ptr and size its feature vector
# up front so the entries can be filled in by index below.
cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
deref(tag).key = tag_key
deref(tag).features.resize(num_features)
for i in range(num_features):
deref(tag).features[i].field = field_feature_pairs[i][0]
deref(tag).features[i].value = field_feature_pairs[i][1]
self.tags[tag_key] = tag
# Return the string for the analysis stored under `morph_key`.
# Returns "" when no analysis is stored under that key; otherwise looks up
# the analysis' own key in `self.strings`.
cdef str get_morph_str(self, hash_t morph_key):
cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
if not tag:
return ""
else:
return self.strings[deref(tag).key]
# Return the shared_ptr to the analysis stored under `morph_key`. This simply
# delegates to `_lookup_tag`, so the result is whatever that lookup returns.
cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
return self._lookup_tag(morph_key)
cdef str _normalize_features(self, features):
"""Create a normalized FEATS string from a features string or dict.
features (Union[dict, str]): Features as dict or UFEATS string.
RETURNS (str): Features as normalized UFEATS string.
"""
if isinstance(features, str):
features = self.feats_to_dict(features)
if not isinstance(features, dict):
features = self._str_to_normalized_feat_dict(features)
elif isinstance(features, dict):
features = self._dict_to_normalized_feat_dict(features)
else:
warnings.warn(Warnings.W100.format(feature=features))
features = {}
features = self.normalize_attrs(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# normalized UFEATS string with sorted fields and values
norm_feats_string = self.FEATURE_SEP.join(sorted([
self.FIELD_SEP.join([field, values])
for field, values in string_features.items()
]))
return norm_feats_string or self.EMPTY_MORPH
def normalize_attrs(self, attrs):
"""Convert attrs dict so that POS is always by ID, other features are
by string. Values separated by VALUE_SEP are sorted.
"""
out = {}
attrs = dict(attrs)
for key, value in attrs.items():
# convert POS value to ID
if key == POS or (isinstance(key, str) and key.upper() == "POS"):
if isinstance(value, str) and value.upper() in POS_IDS:
value = POS_IDS[value.upper()]
elif isinstance(value, int) and value not in POS_IDS.values():
warnings.warn(Warnings.W100.format(feature={key: value}))
continue
out[POS] = value
# accept any string or ID fields and values and convert to strings
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
key = self.strings.as_string(key)
value = self.strings.as_string(value)
# sort values
if self.VALUE_SEP in value:
value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
out[key] = value
else:
warnings.warn(Warnings.W100.format(feature={key: value}))
return out
return self._normalized_feat_dict_to_str(features)
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
"""Creates a MorphAnalysisC from a list of intified
("Field", "Field=Value") tuples where fields with multiple values have
been split into individual tuples, e.g.:
[("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
("Field2", "Field2=Value3")]
"""
cdef MorphAnalysisC tag
tag.length = len(field_feature_pairs)
if tag.length > 0:
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
for i, (field, feature) in enumerate(field_feature_pairs):
tag.fields[i] = field
tag.features[i] = feature
return tag
def add(self, features):
return self._add(features)
cdef int insert(self, MorphAnalysisC tag) except -1:
cdef hash_t key = tag.key
if self.tags.get(key) == NULL:
tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
tag_ptr[0] = tag
self.tags.set(key, <void*>tag_ptr)
def get(self, morph_key):
return self.get_morph_str(morph_key)
def get(self, hash_t morph):
tag = <MorphAnalysisC*>self.tags.get(morph)
if tag == NULL:
return ""
else:
return self.strings[tag.key]
def normalize_features(self, features):
return self._normalize_features(features)
@staticmethod
def feats_to_dict(feats):
def feats_to_dict(feats, *, sort_values=True):
if not feats or feats == Morphology.EMPTY_MORPH:
return {}
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
out = {}
for feat in feats.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP, 1)
if sort_values:
values = values.split(Morphology.VALUE_SEP)
values.sort()
values = Morphology.VALUE_SEP.join(values)
out[field] = values
return out
@staticmethod
def dict_to_feats(feats_dict):
@ -160,34 +213,34 @@ cdef class Morphology:
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
cdef int i
for i in range(morph.length):
if morph.features[i] == feature:
for i in range(deref(morph).features.size()):
if deref(morph).features[i].value == feature:
return True
return False
cdef list list_features(const MorphAnalysisC* morph):
cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
cdef int i
features = []
for i in range(morph.length):
features.append(morph.features[i])
for i in range(deref(morph).features.size()):
features.append(deref(morph).features[i].value)
return features
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
n = get_n_by_field(<uint64_t*>results.data, morph, field)
return results[:n]
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
cdef int n_results = 0
cdef int i
for i in range(morph.length):
if morph.fields[i] == field:
results[n_results] = morph.features[i]
for i in range(deref(morph).features.size()):
if deref(morph).features[i].field == field:
results[n_results] = deref(morph).features[i].value
n_results += 1
return n_results

View File

@ -61,6 +61,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
"threshold": None,
"store_activations": False,
},
default_score_weights={
@ -83,6 +84,7 @@ def make_entity_linker(
overwrite: bool,
scorer: Optional[Callable],
use_gold_ents: bool,
threshold: Optional[float] = None,
store_activations: Union[bool, List[str]],
):
"""Construct an EntityLinker component.
@ -98,6 +100,10 @@ def make_entity_linker(
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
store_activations (Union[bool, List[str]]): Model activations to store in
Doc when annotating. supported activations are: "ents" and "scores".
"""
@ -130,6 +136,7 @@ def make_entity_linker(
overwrite=overwrite,
scorer=scorer,
use_gold_ents=use_gold_ents,
threshold=threshold,
store_activations=store_activations,
)
@ -166,6 +173,7 @@ class EntityLinker(TrainablePipe):
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool,
threshold: Optional[float] = None,
store_activations=False,
) -> None:
"""Initialize an entity linker.
@ -185,9 +193,20 @@ class EntityLinker(TrainablePipe):
Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
DOCS: https://spacy.io/api/entitylinker#init
"""
if threshold is not None and not (0 <= threshold <= 1):
raise ValueError(
Errors.E1043.format(
range_start=0,
range_end=1,
value=threshold,
)
)
self.vocab = vocab
self.model = model
self.name = name
@ -203,6 +222,7 @@ class EntityLinker(TrainablePipe):
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
self.use_gold_ents = use_gold_ents
self.threshold = threshold
self.store_activations = store_activations
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
@ -450,10 +470,12 @@ class EntityLinker(TrainablePipe):
self._add_activations(
doc_scores, doc_scores_lens, doc_ents, [0.0], [0]
)
elif len(candidates) == 1:
elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate
# TODO: thresholding
final_kb_ids.append(candidates[0].entity_)
self._add_activations(
doc_scores, doc_scores_lens, doc_ents, [1.0], [candidates[0].entity_]
)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
@ -481,7 +503,11 @@ class EntityLinker(TrainablePipe):
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
# TODO: thresholding
final_kb_ids.append(
candidates[scores.argmax().item()].entity_
if self.threshold is None or scores.max() >= self.threshold
else EntityLinker.NIL
)
self._add_activations(
doc_scores,
doc_scores_lens,
@ -489,9 +515,6 @@ class EntityLinker(TrainablePipe):
scores,
[c.entity for c in candidates],
)
best_index = scores.argmax().item()
best_candidate = candidates[best_index]
final_kb_ids.append(best_candidate.entity_)
self._add_doc_activations(
docs_scores, docs_ents, doc_scores, doc_scores_lens, doc_ents
)

View File

@ -7,7 +7,7 @@ from pathlib import Path
from itertools import islice
import srsly
import random
from thinc.api import CosineDistance, Model, Optimizer, Config
from thinc.api import CosineDistance, Model, Optimizer
from thinc.api import set_dropout_rate
import warnings
@ -20,7 +20,7 @@ from ...language import Language
from ...vocab import Vocab
from ...training import Example, validate_examples, validate_get_examples
from ...errors import Errors, Warnings
from ...util import SimpleFrozenList, registry
from ...util import SimpleFrozenList
from ... import util
from ...scorer import Scorer
@ -70,7 +70,6 @@ class EntityLinker_v1(TrainablePipe):
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
self.vocab = vocab
@ -272,7 +271,6 @@ class EntityLinker_v1(TrainablePipe):
final_kb_ids.append(self.NIL)
elif len(candidates) == 1:
# shortcut for efficiency reasons: take the 1 candidate
# TODO: thresholding
final_kb_ids.append(candidates[0].entity_)
else:
random.shuffle(candidates)
@ -301,7 +299,6 @@ class EntityLinker_v1(TrainablePipe):
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
# TODO: thresholding
best_index = scores.argmax().item()
best_candidate = candidates[best_index]
final_kb_ids.append(best_candidate.entity_)

View File

@ -139,8 +139,8 @@ class Morphologizer(Tagger):
@property
def labels(self):
"""RETURNS (Tuple[str]): The labels currently added to the component."""
return tuple(self.cfg["labels_morph"].keys())
"""RETURNS (Iterable[str]): The labels currently added to the component."""
return self.cfg["labels_morph"].keys()
@property
def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
@ -163,7 +163,7 @@ class Morphologizer(Tagger):
# normalize label
norm_label = self.vocab.morphology.normalize_features(label)
# extract separate POS and morph tags
label_dict = Morphology.feats_to_dict(label)
label_dict = Morphology.feats_to_dict(label, sort_values=False)
pos = label_dict.get(self.POS_FEAT, "")
if self.POS_FEAT in label_dict:
label_dict.pop(self.POS_FEAT)
@ -201,7 +201,7 @@ class Morphologizer(Tagger):
continue
morph = str(token.morph)
# create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph)
morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@ -218,7 +218,7 @@ class Morphologizer(Tagger):
for i, token in enumerate(example.reference):
pos = token.pos_
morph = str(token.morph)
morph_dict = Morphology.feats_to_dict(morph)
morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@ -244,7 +244,10 @@ class Morphologizer(Tagger):
cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
cdef bint extend = self.cfg["extend"]
labels = self.labels
# We require random access for the upcoming ops, so we need
# to allocate a compatible container out of the iterable.
labels = tuple(self.labels)
for i, doc in enumerate(docs):
doc.activations[self.name] = {}
for activation in self.store_activations:
@ -253,20 +256,20 @@ class Morphologizer(Tagger):
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
morph = labels[tag_id]
morph = labels[int(tag_id)]
# set morph
if doc.c[j].morph == 0 or overwrite or extend:
if overwrite and extend:
# morphologizer morph overwrites any existing features
# while extending
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
elif extend:
# existing features are preserved and any new features
# are added
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
else:
# clobber
@ -286,7 +289,7 @@ class Morphologizer(Tagger):
DOCS: https://spacy.io/api/morphologizer#get_loss
"""
validate_examples(examples, "Morphologizer.get_loss")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
truths = []
for eg in examples:
eg_truths = []
@ -307,7 +310,7 @@ class Morphologizer(Tagger):
label = None
# Otherwise, generate the combined label
else:
label_dict = Morphology.feats_to_dict(morph)
label_dict = Morphology.feats_to_dict(morph, sort_values=False)
if pos:
label_dict[self.POS_FEAT] = pos
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]

View File

@ -205,7 +205,7 @@ class TextCategorizer(TrainablePipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
tensors = [doc.tensor for doc in docs]
xp = get_array_module(tensors)
xp = self.model.ops.xp
scores = xp.zeros((len(list(docs)), len(self.labels)))
return scores
scores = self.model.predict(docs)

View File

@ -12,6 +12,7 @@ cdef class Parser(TrainablePipe):
cdef public object _rehearsal_model
cdef readonly TransitionSystem moves
cdef public object _multitasks
cdef object _cpu_ops
cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil

View File

@ -9,7 +9,7 @@ from libc.stdlib cimport calloc, free
import random
import srsly
from thinc.api import get_ops, set_dropout_rate, CupyOps
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
from thinc.extra.search cimport Beam
import numpy.random
import numpy
@ -30,6 +30,9 @@ from ..errors import Errors, Warnings
from .. import util
NUMPY_OPS = NumpyOps()
cdef class Parser(TrainablePipe):
"""
Base class of the DependencyParser and EntityRecognizer.
@ -120,6 +123,7 @@ cdef class Parser(TrainablePipe):
self._rehearsal_model = None
self.scorer = scorer
self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments"""
@ -259,12 +263,7 @@ cdef class Parser(TrainablePipe):
def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states
cdef StateClass state
ops = self.model.ops
cdef CBlas cblas
if isinstance(ops, CupyOps):
cblas = get_ops("cpu").cblas()
else:
cblas = ops.cblas()
cdef CBlas cblas = self._cpu_ops.cblas()
self._ensure_labels_are_added(docs)
set_dropout_rate(self.model, drop)
batch = self.moves.init_batch(docs)

View File

@ -3,12 +3,13 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
from pydantic.main import ModelMetaclass
from thinc.api import Optimizer, ConfigValidationError, Model
from thinc.config import Promise
from collections import defaultdict
import inspect
import re
from .attrs import NAMES
from .lookups import Lookups
@ -198,13 +199,18 @@ class TokenPatternNumber(BaseModel):
return v
class TokenPatternOperator(str, Enum):
class TokenPatternOperatorSimple(str, Enum):
plus: StrictStr = StrictStr("+")
start: StrictStr = StrictStr("*")
star: StrictStr = StrictStr("*")
question: StrictStr = StrictStr("?")
exclamation: StrictStr = StrictStr("!")
class TokenPatternOperatorMinMax(ConstrainedStr):
    # Accepts the quantifier forms "{n}", "{n,m}", "{n,}" and "{,m}", where n
    # and m are digit sequences. The pattern is a raw string so that "\d"
    # reaches the regex engine as written instead of being an invalid string
    # escape (which triggers a DeprecationWarning/SyntaxWarning).
    regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
StringValue = Union[TokenPatternString, StrictStr]
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[

View File

@ -26,4 +26,4 @@ cdef class StringStore:
cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)

View File

@ -14,6 +14,13 @@ from .symbols import NAMES as SYMBOLS_BY_INT
from .errors import Errors
from . import util
# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
# Attempts to assign `key` to `out_hash[0]`. Returns True when the assignment
# succeeds (the coerced value is then available through `out_hash`) and False
# when it raises.
# NOTE(review): the bare `except` catches every exception type, not only
# conversion failures -- confirm this is intended.
cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
try:
out_hash[0] = key
return True
except:
return False
def get_string_id(key):
"""Get a string ID, handling the reserved symbols correctly. If the key is
@ -22,15 +29,27 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be
used in tight loops.
"""
if not isinstance(key, str):
return key
elif key in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[key]
elif not key:
return 0
cdef hash_t str_hash
if isinstance(key, str):
if len(key) == 0:
return 0
symbol = SYMBOLS_BY_STR.get(key, None)
if symbol is not None:
return symbol
else:
chars = key.encode("utf8")
return hash_utf8(chars, len(chars))
elif _try_coerce_to_hash(key, &str_hash):
# Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types
# such as those implemented by numpy are not inadvertently used
# downstream (as these are internally implemented as custom PyObjects
# whose comparison operators can incur a significant overhead).
return str_hash
else:
chars = key.encode("utf8")
return hash_utf8(chars, len(chars))
# TODO: Raise an error instead
return key
cpdef hash_t hash_string(str string) except 0:
@ -110,28 +129,36 @@ cdef class StringStore:
string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved.
"""
if isinstance(string_or_id, str) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return ""
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
cdef hash_t str_hash
cdef Utf8Str* utf8str = NULL
if isinstance(string_or_id, str):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id))
return key
elif string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
else:
key = string_or_id
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
if len(string_or_id) == 0:
return 0
# Return early if the string is found in the symbols LUT.
symbol = SYMBOLS_BY_STR.get(string_or_id, None)
if symbol is not None:
return symbol
else:
return decode_Utf8Str(utf8str)
return hash_string(string_or_id)
elif isinstance(string_or_id, bytes):
return hash_utf8(string_or_id, len(string_or_id))
elif _try_coerce_to_hash(string_or_id, &str_hash):
if str_hash == 0:
return ""
elif str_hash < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[str_hash]
else:
utf8str = <Utf8Str*>self._map.get(str_hash)
else:
# TODO: Raise an error instead
utf8str = <Utf8Str*>self._map.get(string_or_id)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value."""
@ -153,19 +180,22 @@ cdef class StringStore:
string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
cdef hash_t str_hash
if isinstance(string, str):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
self.intern_unicode(string)
string = string.encode("utf8")
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_utf8(string, len(string))
self._intern_utf8(string, len(string))
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
else:
raise TypeError(Errors.E017.format(value_type=type(string)))
return key
return str_hash
def __len__(self):
"""The number of strings in the store.
@ -174,30 +204,29 @@ cdef class StringStore:
"""
return self.keys.size()
def __contains__(self, string not None):
"""Check whether a string is in the store.
def __contains__(self, string_or_id not None):
"""Check whether a string or ID is in the store.
string (str): The string to check.
string_or_id (str or int): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
if isinstance(string, int) or isinstance(string, long):
if string == 0:
cdef hash_t str_hash
if isinstance(string_or_id, str):
if len(string_or_id) == 0:
return True
key = string
elif len(string) == 0:
return True
elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, str):
key = hash_string(string)
elif string_or_id in SYMBOLS_BY_STR:
return True
str_hash = hash_string(string_or_id)
elif _try_coerce_to_hash(string_or_id, &str_hash):
pass
else:
string = string.encode("utf8")
key = hash_utf8(string, len(string))
if key < len(SYMBOLS_BY_INT):
# TODO: Raise an error instead
return self._map.get(string_or_id) is not NULL
if str_hash < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(key) is not NULL
return self._map.get(str_hash) is not NULL
def __iter__(self):
"""Iterate over the strings in the store, in order.
@ -272,13 +301,13 @@ cdef class StringStore:
cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string))
return self._intern_utf8(byte_string, len(byte_string), NULL)
@cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
# TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = hash_utf8(utf8_string, length)
cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL:
return value

View File

@ -58,14 +58,6 @@ cdef struct TokenC:
hash_t ent_id
cdef struct MorphAnalysisC:
hash_t key
int length
attr_t* fields
attr_t* features
# Internal struct, for storage and disambiguation of entities.
cdef struct KBEntryC:

View File

@ -1,5 +1,11 @@
import pytest
from spacy.util import get_lang_class
from hypothesis import settings
# Functionally disable deadline settings for tests
# to prevent spurious test failures in CI builds.
settings.register_profile("no_deadlines", deadline=2 * 60 * 1000) # in ms
settings.load_profile("no_deadlines")
def pytest_addoption(parser):

View File

@ -0,0 +1,8 @@
import pytest
def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer):
text = "Ня̀маше яйца̀. Ня̀маше яйца̀."
tokens = bg_tokenizer(text)
assert tokens[1].text == "яйца̀"
assert tokens[2].text == "."

View File

@ -1,3 +1,4 @@
from string import punctuation
import pytest
@ -122,3 +123,36 @@ def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
text = "(Раз, два, три, проверка)."
tokens = ru_tokenizer(text)
assert tokens[len(tokens) - 1].text == "."
@pytest.mark.parametrize(
"text",
[
"рекоменду́я подда́ть жару́. Самого́ Баргамота",
"РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́. САМОГО́ БАРГАМОТА",
"рекоменду̍я подда̍ть жару̍.Самого̍ Баргамота",
"рекоменду̍я подда̍ть жару̍.'Самого̍ Баргамота",
"рекоменду̍я подда̍ть жару̍,самого̍ Баргамота",
"рекоменду̍я подда̍ть жару̍:самого̍ Баргамота",
"рекоменду̍я подда̍ть жару̍. самого̍ Баргамота",
"рекоменду̍я подда̍ть жару̍, самого̍ Баргамота",
"рекоменду̍я подда̍ть жару̍: самого̍ Баргамота",
"рекоменду̍я подда̍ть жару̍-самого̍ Баргамота",
],
)
def test_ru_tokenizer_handles_final_diacritics(ru_tokenizer, text):
tokens = ru_tokenizer(text)
assert tokens[2].text in ("жару́", "ЖАРУ́", "жару̍")
assert tokens[3].text in punctuation
@pytest.mark.parametrize(
"text",
[
"РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́.САМОГО́ БАРГАМОТА",
"рекоменду̍я подда̍ть жару́.самого́ Баргамота",
],
)
def test_ru_tokenizer_handles_final_diacritic_and_period(ru_tokenizer, text):
tokens = ru_tokenizer(text)
assert tokens[2].text.lower() == "жару́.самого́"

View File

@ -140,3 +140,10 @@ def test_uk_tokenizer_splits_bracket_period(uk_tokenizer):
text = "(Раз, два, три, проверка)."
tokens = uk_tokenizer(text)
assert tokens[len(tokens) - 1].text == "."
def test_uk_tokenizer_handles_final_diacritics(uk_tokenizer):
text = "Хлібі́в не було́. Хлібі́в не було́."
tokens = uk_tokenizer(text)
assert tokens[2].text == "було́"
assert tokens[3].text == "."

View File

@ -316,6 +316,20 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
("the", "brown", "$--", 0),
("brown", "the", "$--", 1),
("brown", "brown", "$--", 0),
("quick", "fox", "<++", 1),
("quick", "over", "<++", 0),
("over", "jumped", "<++", 0),
("the", "fox", "<++", 2),
("brown", "fox", "<--", 0),
("fox", "jumped", "<--", 0),
("fox", "over", "<--", 1),
("jumped", "over", ">++", 1),
("fox", "lazy", ">++", 0),
("over", "the", ">++", 0),
("brown", "fox", ">--", 0),
("fox", "brown", ">--", 1),
("jumped", "fox", ">--", 1),
("fox", "the", ">--", 2),
],
)
def test_dependency_matcher_ops(en_vocab, doc, left, right, op, num_matches):

View File

@ -687,3 +687,38 @@ def test_matcher_ent_iob_key(en_vocab):
assert matches[0] == "Maria"
assert matches[1] == "Maria Esperanza"
assert matches[2] == "Esperanza"
def test_matcher_min_max_operator(en_vocab):
    """Check how many matches each ``{...}`` quantifier operator yields.

    A fresh ``Matcher`` is built for every operator and run against the same
    ten-token document; only the number of returned matches is asserted.
    """
    words = ["foo", "bar", "foo", "foo", "bar", "foo", "foo", "foo", "bar", "bar"]
    doc = Doc(en_vocab, words=words)

    # (operator, expected number of matches on `doc`)
    expectations = [
        ("{3}", 1),  # Exactly n matches {n}
        ("{2,}", 4),  # At least n matches {n,}
        ("{,2}", 9),  # At most m matches {,m}
        ("{2,3}", 4),  # At least n matches and most m matches {n,m}
    ]

    for operator, expected_count in expectations:
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{"ORTH": "foo", "OP": operator}]])
        matched_texts = [doc[start:end].text for _, start, end in matcher(doc)]
        assert len(matched_texts) == expected_count

View File

@ -699,6 +699,10 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
("aaaa", "a a a a a?", [0, 1, 2, 3]),
("aaab", "a+ a b", [0, 0, 1, 2]),
("aaab", "a+ a+ b", [0, 0, 1, 2]),
("aaab", "a{2,} b", [0, 0, 0, 1]),
("aaab", "a{,3} b", [0, 0, 0, 1]),
("aaab", "a{2} b", [0, 0, 1]),
("aaab", "a{2,3} b", [0, 0, 0, 1]),
]
for string, pattern_str, result in cases:
matcher = Matcher(en_vocab)
@ -711,6 +715,8 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
pattern.append({"ORTH": part[0], "OP": "*"})
elif part.endswith("?"):
pattern.append({"ORTH": part[0], "OP": "?"})
elif part.endswith("}"):
pattern.append({"ORTH": part[0], "OP": part[1:]})
else:
pattern.append({"ORTH": part})
matcher.add("PATTERN", [pattern], greedy="LONGEST")
@ -722,7 +728,7 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
assert expected == result, (string, pattern_str, s, e, n_matches)
def test_matcher_with_alignments_nongreedy(en_vocab):
def test_matcher_with_alignments_non_greedy(en_vocab):
cases = [
(0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
(1, "baab", "b a* b", [[0, 1, 1, 2]]),
@ -752,6 +758,10 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
(15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
(16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
(17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
(18, "aaab", "a{2,} b", [[0, 0, 1], [0, 0, 0, 1]]),
(19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
(20, "aaab", "a{2} b", [[0, 0, 1]]),
(21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
]
for case_id, string, pattern_str, results in cases:
matcher = Matcher(en_vocab)
@ -764,6 +774,8 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
pattern.append({"ORTH": part[0], "OP": "*"})
elif part.endswith("?"):
pattern.append({"ORTH": part[0], "OP": "?"})
elif part.endswith("}"):
pattern.append({"ORTH": part[0], "OP": part[1:]})
else:
pattern.append({"ORTH": part})

View File

@ -14,6 +14,14 @@ TEST_PATTERNS = [
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([{"ENT_IOB": "foo"}], 1, 1),
([1, 2, 3], 3, 1),
([{"TEXT": "foo", "OP": "{,}"}], 1, 1),
([{"TEXT": "foo", "OP": "{,4}4"}], 1, 1),
([{"TEXT": "foo", "OP": "{a,3}"}], 1, 1),
([{"TEXT": "foo", "OP": "{a}"}], 1, 1),
([{"TEXT": "foo", "OP": "{,a}"}], 1, 1),
([{"TEXT": "foo", "OP": "{1,2,3}"}], 1, 1),
([{"TEXT": "foo", "OP": "{1, 3}"}], 1, 1),
([{"TEXT": "foo", "OP": "{-2}"}], 1, 1),
# Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
# Bad patterns not flagged with minimal checks
@ -38,6 +46,7 @@ TEST_PATTERNS = [
([{"SENT_START": True}], 0, 0),
([{"ENT_ID": "STRING"}], 0, 0),
([{"ENT_KB_ID": "STRING"}], 0, 0),
([{"TEXT": "ha", "OP": "{3}"}], 0, 0),
]

View File

@ -1,4 +1,4 @@
from typing import Callable, Iterable, cast
from typing import Callable, Iterable, Dict, Any, cast
import pytest
from numpy.testing import assert_equal
@ -208,7 +208,7 @@ def test_no_entities():
nlp.add_pipe("sentencizer", first=True)
# this will run the pipeline on the examples and shouldn't crash
results = nlp.evaluate(train_examples)
nlp.evaluate(train_examples)
def test_partial_links():
@ -1064,7 +1064,7 @@ def test_no_gold_ents(patterns):
"entity_linker", config={"use_gold_ents": False}, last=True
)
entity_linker.set_kb(create_kb)
assert entity_linker.use_gold_ents == False
assert entity_linker.use_gold_ents is False
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(2):
@ -1075,7 +1075,7 @@ def test_no_gold_ents(patterns):
nlp.add_pipe("sentencizer", first=True)
# this will run the pipeline on the examples and shouldn't crash
results = nlp.evaluate(train_examples)
nlp.evaluate(train_examples)
@pytest.mark.issue(9575)
@ -1115,7 +1115,64 @@ def test_tokenization_mismatch():
nlp.update(train_examples, sgd=optimizer, losses=losses)
nlp.add_pipe("sentencizer", first=True)
results = nlp.evaluate(train_examples)
nlp.evaluate(train_examples)
# fmt: off
@pytest.mark.parametrize(
    "meet_threshold,config",
    [
        (False, {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
        (True, {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
    ],
)
# fmt: on
def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
    """Tests abstention threshold.

    meet_threshold (bool): Whether to configure NEL setup so that confidence threshold is met.
    config (Dict[str, Any]): NEL architecture config.
    """
    nlp = English()
    nlp.add_pipe("sentencizer")
    text = "Mahler's Symphony No. 8 was beautiful."
    entities = [(0, 6, "PERSON")]
    links = {(0, 6): {"Q7304": 1.0}}
    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
    entity_id = "Q7304"
    doc = nlp(text)
    train_examples = [
        Example.from_dict(
            doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
        )
    ]

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=3)
        mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
        # The alias prior probability is the knob that decides whether the
        # configured threshold (0.99, see below) is supposed to be met.
        mykb.add_alias(
            alias="Mahler",
            entities=[entity_id],
            probabilities=[1 if meet_threshold else 0.01],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe(
        "entity_linker",
        last=True,
        config={"threshold": 0.99, "model": config},
    )
    entity_linker.set_kb(create_kb)  # type: ignore
    nlp.initialize(get_examples=lambda: train_examples)

    # Add a custom rule-based component to mimic NER
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns([{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}])  # type: ignore
    doc = nlp(text)

    assert len(doc.ents) == 1
    # Compute the expected ID first: `assert a == b if cond else c` parses as
    # `assert ((a == b) if cond else c)`, which would merely assert the
    # truthiness of `EntityLinker.NIL` when the threshold is not met.
    expected_kb_id = entity_id if meet_threshold else EntityLinker.NIL
    assert doc.ents[0].kb_id_ == expected_kb_id
def test_store_activations():

View File

@ -589,7 +589,6 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@ -600,7 +599,6 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False

View File

@ -60,12 +60,11 @@ def test_readers():
assert isinstance(extra_corpus, Callable)
# TODO: enable IMDB test once Stanford servers are back up and running
@pytest.mark.slow
@pytest.mark.parametrize(
"reader,additional_config",
[
# ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
],

View File

@ -679,6 +679,31 @@ def test_projectivize(en_tokenizer):
assert proj_heads == [3, 2, 3, 3, 3]
assert nonproj_heads == [3, 2, 3, 3, 2]
# Test single token documents
doc = en_tokenizer("Conrail")
heads = [0]
deps = ["dep"]
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
assert proj_heads == heads
assert proj_labels == deps
# Test documents with no alignments
doc_a = Doc(
doc.vocab, words=["Double-Jointed"], spaces=[False], deps=["ROOT"], heads=[0]
)
doc_b = Doc(
doc.vocab,
words=["Double", "-", "Jointed"],
spaces=[True, True, True],
deps=["amod", "punct", "ROOT"],
heads=[2, 2, 2],
)
example = Example(doc_a, doc_b)
proj_heads, proj_deps = example.get_aligned_parse(projectivize=True)
assert proj_heads == [None]
assert proj_deps == [None]
def test_iob_to_biluo():
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]

View File

@ -1,6 +1,7 @@
import pytest
import numpy
from spacy.tokens import Doc
from spacy.vocab import Vocab
from ..util import get_cosine, add_vecs_to_vocab
@ -71,19 +72,17 @@ def test_vectors_similarity_DD(vocab, vectors):
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc.similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
assert isinstance(doc.similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc[:2].similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc[-2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
assert isinstance(doc[:2].similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc[:2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
def test_vectors_similarity_DS(vocab, vectors):
@ -91,3 +90,21 @@ def test_vectors_similarity_DS(vocab, vectors):
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc.similarity(doc[:2]), float)
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
def test_vectors_similarity_no_vectors():
    """Every Doc/Token/Span similarity call on a bare ``Vocab()`` must warn.

    Each (left, right) pair is compared in its own ``pytest.warns`` context so
    that a missing ``UserWarning`` is detected per combination.
    """
    vocab = Vocab()
    doc1 = Doc(vocab, words=["a", "b"])
    doc2 = Doc(vocab, words=["c", "d", "e"])

    # Doc vs. Doc, Doc vs. Token, Doc vs. Span, in both directions.
    comparisons = [
        (doc1, doc2),
        (doc1, doc2[1]),
        (doc1, doc2[:2]),
        (doc2, doc1),
        (doc2[1], doc1),
        (doc2[:2], doc1),
    ]
    for left, right in comparisons:
        with pytest.warns(UserWarning):
            left.similarity(right)

View File

@ -318,17 +318,15 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
def test_vectors_span_span_similarity(vocab, text):
doc = Doc(vocab, words=text)
with pytest.warns(UserWarning):
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
def test_vectors_span_doc_similarity(vocab, text):
doc = Doc(vocab, words=text)
with pytest.warns(UserWarning):
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
assert -1.0 < doc[0:2].similarity(doc) < 1.0
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
assert -1.0 < doc[0:2].similarity(doc) < 1.0
@pytest.mark.parametrize(

View File

@ -608,7 +608,8 @@ cdef class Doc:
if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Doc"))
if self.vector_norm == 0 or other.vector_norm == 0:
warnings.warn(Warnings.W008.format(obj="Doc"))
if not self.has_vector or not other.has_vector:
warnings.warn(Warnings.W008.format(obj="Doc"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
@ -628,7 +629,7 @@ cdef class Doc:
if "has_vector" in self.user_hooks:
return self.user_hooks["has_vector"](self)
elif self.vocab.vectors.size:
return True
return any(token.has_vector for token in self)
elif self.tensor.size:
return True
else:

View File

@ -1,9 +1,12 @@
from ..vocab cimport Vocab
from ..typedefs cimport hash_t
from ..structs cimport MorphAnalysisC
from ..morphology cimport MorphAnalysisC
from libcpp.memory cimport shared_ptr
cdef class MorphAnalysis:
cdef readonly Vocab vocab
cdef readonly hash_t key
cdef MorphAnalysisC c
cdef shared_ptr[MorphAnalysisC] c
cdef void _init_c(self, hash_t key)

View File

@ -5,7 +5,12 @@ from ..errors import Errors
from ..morphology import Morphology
from ..vocab cimport Vocab
from ..typedefs cimport hash_t, attr_t
from ..morphology cimport list_features, check_feature, get_by_field
from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
from libcpp.memory cimport shared_ptr
from cython.operator cimport dereference as deref
cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
cdef class MorphAnalysis:
@ -13,39 +18,38 @@ cdef class MorphAnalysis:
def __init__(self, Vocab vocab, features=dict()):
self.vocab = vocab
self.key = self.vocab.morphology.add(features)
analysis = <const MorphAnalysisC*>self.vocab.morphology.tags.get(self.key)
if analysis is not NULL:
self.c = analysis[0]
self._init_c(self.key)
cdef void _init_c(self, hash_t key):
cdef shared_ptr[MorphAnalysisC] analysis = self.vocab.morphology.get_morph_c(key)
if analysis:
self.c = analysis
else:
memset(&self.c, 0, sizeof(self.c))
self.c = EMPTY_MORPH_TAG
@classmethod
def from_id(cls, Vocab vocab, hash_t key):
"""Create a morphological analysis from a given ID."""
cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab)
cdef MorphAnalysis morph = MorphAnalysis(vocab)
morph.vocab = vocab
morph.key = key
analysis = <const MorphAnalysisC*>vocab.morphology.tags.get(key)
if analysis is not NULL:
morph.c = analysis[0]
else:
memset(&morph.c, 0, sizeof(morph.c))
morph._init_c(key)
return morph
def __contains__(self, feature):
"""Test whether the morphological analysis contains some feature."""
cdef attr_t feat_id = self.vocab.strings.as_int(feature)
return check_feature(&self.c, feat_id)
return check_feature(self.c, feat_id)
def __iter__(self):
"""Iterate over the features in the analysis."""
cdef attr_t feature
for feature in list_features(&self.c):
for feature in list_features(self.c):
yield self.vocab.strings[feature]
def __len__(self):
"""The number of features in the analysis."""
return self.c.length
return deref(self.c).features.size()
def __hash__(self):
return self.key
@ -61,7 +65,7 @@ cdef class MorphAnalysis:
def get(self, field):
"""Retrieve feature values by field."""
cdef attr_t field_id = self.vocab.strings.as_int(field)
cdef np.ndarray results = get_by_field(&self.c, field_id)
cdef np.ndarray results = get_by_field(self.c, field_id)
features = [self.vocab.strings[result] for result in results]
return [f.split(Morphology.FIELD_SEP)[1] for f in features]
@ -69,7 +73,7 @@ cdef class MorphAnalysis:
"""Produce a json serializable representation as a UD FEATS-style
string.
"""
morph_string = self.vocab.strings[self.c.key]
morph_string = self.vocab.strings[deref(self.c).key]
if morph_string == self.vocab.morphology.EMPTY_MORPH:
return ""
return morph_string

View File

@ -120,6 +120,10 @@ class Span:
ent_id: int
ent_id_: str
@property
def id(self) -> int: ...
@property
def id_(self) -> str: ...
@property
def orth_(self) -> str: ...
@property
def lemma_(self) -> str: ...

View File

@ -365,7 +365,8 @@ cdef class Span:
if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Span"))
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
warnings.warn(Warnings.W008.format(obj="Span"))
if not self.has_vector or not other.has_vector:
warnings.warn(Warnings.W008.format(obj="Span"))
return 0.0
vector = self.vector
xp = get_array_module(vector)

View File

@ -22,6 +22,7 @@ from .. import parts_of_speech
from ..errors import Errors, Warnings
from ..attrs import IOB_STRINGS
from .underscore import Underscore, get_ext_args
from cython.operator cimport dereference as deref
cdef class Token:
@ -206,7 +207,8 @@ cdef class Token:
if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Token"))
if self.vector_norm == 0 or other.vector_norm == 0:
warnings.warn(Warnings.W008.format(obj="Token"))
if not self.has_vector or not other.has_vector:
warnings.warn(Warnings.W008.format(obj="Token"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
@ -230,7 +232,7 @@ cdef class Token:
# Check that the morph has the same vocab
if self.vocab != morph.vocab:
raise ValueError(Errors.E1013)
self.c.morph = morph.c.key
self.c.morph = deref(morph.c).key
def set_morph(self, features):
cdef hash_t key

View File

@ -249,9 +249,9 @@ cdef class Example:
# Fetch all aligned gold token indices.
if c2g_single_toks.shape == cand_to_gold.lengths.shape:
# This is the most likely case.
gold_i = cand_to_gold[:].squeeze()
gold_i = cand_to_gold[:]
else:
gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze()
gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0], otypes='i')(c2g_single_toks)
# Fetch indices of all gold heads for the aligned gold tokens.
heads = numpy.asarray(heads, dtype='i')
@ -261,7 +261,7 @@ cdef class Example:
# gold tokens (and are aligned to a single candidate token).
g2c_len_heads = gold_to_cand.lengths[gold_head_i]
g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze()
g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0], otypes='i')(gold_head_i[g2c_len_heads]).squeeze()
# Update head/dep alignments with the above.
aligned_heads = numpy.full((self.x.length), None)

View File

@ -336,10 +336,10 @@ cdef class Vectors:
xp = get_array_module(self.data)
if key is not None:
key = get_string_id(key)
return self.key2row.get(key, -1)
return self.key2row.get(int(key), -1)
elif keys is not None:
keys = [get_string_id(key) for key in keys]
rows = [self.key2row.get(key, -1) for key in keys]
rows = [self.key2row.get(int(key), -1) for key in keys]
return xp.asarray(rows, dtype="i")
else:
row2key = {row: key for key, row in self.key2row.items()}

View File

@ -62,7 +62,7 @@ of relations, see the usage guide on
</Infobox>
### Operators
### Operators {#operators}
The following operators are supported by the `DependencyMatcher`, most of which
come directly from
@ -82,6 +82,11 @@ come directly from
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
## DependencyMatcher.\_\_init\_\_ {#init tag="method"}

View File

@ -47,23 +47,25 @@ architectures and their arguments and hyperparameters.
> "model": DEFAULT_NEL_MODEL,
> "entity_vector_length": 64,
> "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
> "threshold": None,
> }
> nlp.add_pipe("entity_linker", config=config)
> ```
| Setting | Description |
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `store_activations` | Store activations in `Doc` when annotating. Supported activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
| Setting | Description |
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents`                          | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                   |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `store_activations` | Store activations in `Doc` when annotating. Supported activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@ -96,20 +98,21 @@ custom knowledge base, you should either call
[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
[`initialize`](/api/entitylinker#initialize) call.
| Name | Description |
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| Name | Description |
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
## EntityLinker.\_\_call\_\_ {#call tag="method"}

View File

@ -118,7 +118,7 @@ shortcut for this and instantiate the component using its string name and
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| mode | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
| overwrite | Whether to overwrite existing lemmas. ~~bool~ |
| overwrite | Whether to overwrite existing lemmas. ~~bool~~ |
## Lemmatizer.\_\_call\_\_ {#call tag="method"}

View File

@ -59,15 +59,20 @@ matched:
> [
> {"POS": "ADJ", "OP": "*"},
> {"POS": "NOUN", "OP": "+"},
> {"POS": "PROPN", "OP": "{2}"}
> ]
> ```
| OP | Description |
| --- | ---------------------------------------------------------------- |
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match 0 or more times. |
| OP | Description |
|---------|------------------------------------------------------------------------|
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match 0 or more times. |
| `{n}` | Require the pattern to match exactly _n_ times. |
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
| `{n,}` | Require the pattern to match at least _n_ times. |
| `{,m}` | Require the pattern to match at most _m_ times. |
Token patterns can also map to a **dictionary of properties** instead of a
single value to indicate whether the expected value is a member of a list or how
@ -194,7 +199,7 @@ will be overwritten.
> [{"LOWER": "hello"}, {"LOWER": "world"}],
> [{"ORTH": "Google"}, {"ORTH": "Maps"}]
> ]
> matcher.add("TEST_PATTERNS", patterns)
> matcher.add("TEST_PATTERNS", patterns, on_match=on_match)
> doc = nlp("HELLO WORLD on Google Maps.")
> matches = matcher(doc)
> ```

View File

@ -402,7 +402,7 @@ coarse-grained POS as the feature `POS`.
| Name | Description |
| ----------- | ------------------------------------------------------ |
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
| **RETURNS** | The labels added to the component. ~~Iterable[str]~~ |
## Morphologizer.label_data {#label_data tag="property" new="3"}

View File

@ -130,8 +130,8 @@ grateful to use the work of Chainer's [CuPy](https://cupy.chainer.org) module,
which provides a numpy-compatible interface for GPU arrays.
spaCy can be installed for a CUDA-compatible GPU by specifying `spacy[cuda]`,
`spacy[cuda102]`, `spacy[cuda112]`, `spacy[cuda113]`, etc. If you know your
CUDA version, using the more explicit specifier allows CuPy to be installed via
`spacy[cuda102]`, `spacy[cuda112]`, `spacy[cuda113]`, etc. If you know your CUDA
version, using the more explicit specifier allows CuPy to be installed via
wheel, saving some compilation time. The specifiers should install
[`cupy`](https://cupy.chainer.org).
@ -195,29 +195,73 @@ How to install compilers and related build tools:
[Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
that matches the version that was used to compile your Python interpreter.
#### Using build constraints when compiling from source
If you install spaCy from source or with `pip` for platforms where there are not
binary wheels on PyPI, you may need to use build constraints if any package in
your environment requires an older version of `numpy`.
If `numpy` gets downgraded from the most recent release at any point after
you've compiled `spacy`, you might see an error that looks like this:
```none
numpy.ndarray size changed, may indicate binary incompatibility.
```
To fix this, create a new virtual environment and install `spacy` and all of its
dependencies using build constraints.
[Build constraints](https://pip.pypa.io/en/stable/user_guide/#constraints-files)
specify an older version of `numpy` that is only used while compiling `spacy`,
and then your runtime environment can use any newer version of `numpy` and still
be compatible. In addition, use `--no-cache-dir` to ignore any previously cached
wheels so that all relevant packages are recompiled from scratch:
```shell
PIP_CONSTRAINT=https://raw.githubusercontent.com/explosion/spacy/master/build-constraints.txt \
pip install spacy --no-cache-dir
```
Our build constraints currently specify the oldest supported `numpy` available
on PyPI for `x86_64` and `aarch64`. Depending on your platform and environment,
you may want to customize the specific versions of `numpy`. For other platforms,
you can have a look at SciPy's
[`oldest-supported-numpy`](https://github.com/scipy/oldest-supported-numpy/blob/main/setup.cfg)
package to see what the oldest recommended versions of `numpy` are.
(_Warning_: don't use `pip install -c constraints.txt` instead of
`PIP_CONSTRAINT`, since this isn't applied to the isolated build environments.)
#### Additional options for developers {#source-developers}
Some additional options may be useful for spaCy developers who are editing the
source code and recompiling frequently.
- Install in editable mode. Changes to `.py` files will be reflected as soon as
the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require
the `pip install` or `python setup.py build_ext` command below to be run
again. Before installing in editable mode, be sure you have removed any
previous installs with `pip uninstall spacy`, which you may need to run
multiple times to remove all traces of earlier installs.
- Install in editable mode. Changes to `.py` files will be reflected as soon
as the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will
require the `pip install` command below to be run again. Before installing in
editable mode, be sure you have removed any previous installs with
`pip uninstall spacy`, which you may need to run multiple times to remove all
traces of earlier installs.
```bash
$ pip install -r requirements.txt
$ pip install --no-build-isolation --editable .
```
- Build in parallel using `N` CPUs to speed up compilation and then install in
editable mode:
- Build in parallel. Starting in v3.4.0, you can specify the number of
build jobs with the environment variable `SPACY_NUM_BUILD_JOBS`:
```bash
$ pip install -r requirements.txt
$ python setup.py build_ext --inplace -j N
$ SPACY_NUM_BUILD_JOBS=4 pip install --no-build-isolation --editable .
```
- For editable mode and parallel builds with `python setup.py` instead of `pip`
(no longer recommended):
```bash
$ pip install -r requirements.txt
$ python setup.py build_ext --inplace -j 4
$ python setup.py develop
```

View File

@ -374,12 +374,16 @@ punctuation marks, or specify optional tokens. Note that there are no nested or
scoped quantifiers – instead, you can build those behaviors with `on_match`
callbacks.
| OP | Description |
| --- | ---------------------------------------------------------------- |
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match zero or more times. |
| OP | Description |
|---------|------------------------------------------------------------------------|
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match zero or more times. |
| `{n}` | Require the pattern to match exactly _n_ times. |
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
| `{n,}` | Require the pattern to match at least _n_ times. |
| `{,m}` | Require the pattern to match at most _m_ times. |
> #### Example
>

143
website/docs/usage/v3-4.md Normal file
View File

@ -0,0 +1,143 @@
---
title: What's New in v3.4
teaser: New features and how to upgrade
menu:
- ['New Features', 'features']
- ['Upgrading Notes', 'upgrading']
---
## New features {#features hidden="true"}
spaCy v3.4 brings typing and speed improvements along with new vectors for
English CNN pipelines and new trained pipelines for Croatian. This release also
includes prebuilt Linux aarch64 wheels for all spaCy dependencies distributed by
Explosion.
### Typing improvements {#typing}
spaCy v3.4 supports pydantic v1.9 and mypy 0.950+ through extensive updates to
types in Thinc v8.1.
### Speed improvements {#speed}
- For the parser, use C `saxpy`/`sgemm` provided by the `Ops` implementation in
order to use Accelerate through `thinc-apple-ops`.
- Improved speed of vector lookups.
- Improved speed for `Example.get_aligned_parse` and `Example.get_aligned`.
## Additional features and improvements
- Min/max `{n,m}` operator for `Matcher` patterns.
- Language updates:
- Improve tokenization for Cyrillic combining diacritics.
- Improve English tokenizer exceptions for contractions with
this/that/these/those.
- Updated `spacy project clone` to try both `main` and `master` branches by
default.
- Added confidence threshold for named entity linker.
- Improved handling of Typer optional default values for `init_config_cli`.
- Added cycle detection in parser projectivization methods.
- Added counts for NER labels in `debug data`.
- Support for adding NVTX ranges to `TrainablePipe` components.
- Support env variable `SPACY_NUM_BUILD_JOBS` to specify the number of build
jobs to run in parallel with `pip`.
## Trained pipelines {#pipelines}
### New trained pipelines {#new-pipelines}
v3.4 introduces new CPU/CNN pipelines for Croatian, which use the trainable
lemmatizer and [floret vectors](https://github.com/explosion/floret). Due to the
use of [Bloom embeddings](https://explosion.ai/blog/bloom-embeddings) and
subwords, the pipelines have compact vectors with no out-of-vocabulary words.
| Package | UPOS | Parser LAS | NER F |
| ----------------------------------------------- | ---: | ---------: | ----: |
| [`hr_core_news_sm`](/models/hr#hr_core_news_sm) | 96.6 | 77.5 | 76.1 |
| [`hr_core_news_md`](/models/hr#hr_core_news_md) | 97.3 | 80.1 | 81.8 |
| [`hr_core_news_lg`](/models/hr#hr_core_news_lg) | 97.5 | 80.4 | 83.0 |
### Pipeline updates {#pipeline-updates}
All CNN pipelines have been extended with whitespace augmentation.
The English CNN pipelines have new word vectors:
| Package | Model Version | TAG | Parser LAS | NER F |
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0 | 97.2 | 90.3 | 85.5 |
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0 | 97.4 | 90.1 | 85.3 |
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
## Notes about upgrading from v3.3 {#upgrading}
### Doc.has_vector
`Doc.has_vector` now matches `Token.has_vector` and `Span.has_vector`: it
returns `True` if at least one token in the doc has a vector rather than
checking only whether the vocab contains vectors.
### Using trained pipelines with floret vectors
If you're using a trained pipeline for Croatian, Finnish, Korean or Swedish with
new texts and working with `Doc` objects, you shouldn't notice any difference
between floret vectors and default vectors.
If you use vectors for similarity comparisons, there are a few differences,
mainly because a floret pipeline doesn't include any kind of frequency-based
word list similar to the list of in-vocabulary vector keys with default vectors.
- If your workflow iterates over the vector keys, you should use an external
word list instead:
```diff
- lexemes = [nlp.vocab[orth] for orth in nlp.vocab.vectors]
+ lexemes = [nlp.vocab[word] for word in external_word_list]
```
- `Vectors.most_similar` is not supported because there's no fixed list of
vectors to compare your vectors to.
### Pipeline package version compatibility {#version-compat}
> #### Using legacy implementations
>
> In spaCy v3, you'll still be able to load and reference legacy implementations
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
> components or architectures change and newer versions are available in the
> core library.
When you're loading a pipeline package trained with an earlier version of spaCy
v3, you will see a warning telling you that the pipeline may be incompatible.
This doesn't necessarily have to be true, but we recommend running your
pipelines against your test suite or evaluation data to make sure there are no
unexpected results.
If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).
If you've trained your own custom pipeline and you've confirmed that it's still
working as expected, you can update the spaCy version requirements in the
[`meta.json`](/api/data-formats#meta):
```diff
- "spacy_version": ">=3.3.0,<3.4.0",
+ "spacy_version": ">=3.3.0,<3.5.0",
```
### Updating v3.3 configs
To update a config from spaCy v3.3 with the new v3.4 settings, run
[`init fill-config`](/api/cli#init-fill-config):
```cli
$ python -m spacy init fill-config config-v3.3.cfg config-v3.4.cfg
```
In many cases ([`spacy train`](/api/cli#train),
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
automatically, but you'll need to fill in the new settings to run
[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).

View File

@ -162,7 +162,12 @@
{
"code": "hr",
"name": "Croatian",
"has_examples": true
"has_examples": true,
"models": [
"hr_core_news_sm",
"hr_core_news_md",
"hr_core_news_lg"
]
},
{
"code": "hsb",

View File

@ -12,7 +12,9 @@
{ "text": "New in v3.0", "url": "/usage/v3" },
{ "text": "New in v3.1", "url": "/usage/v3-1" },
{ "text": "New in v3.2", "url": "/usage/v3-2" },
{ "text": "New in v3.3", "url": "/usage/v3-3" }
{ "text": "New in v3.3", "url": "/usage/v3-3" },
{ "text": "New in v3.4", "url": "/usage/v3-4" }
]
},
{

View File

@ -28,7 +28,7 @@
},
"binderUrl": "explosion/spacy-io-binder",
"binderBranch": "spacy.io",
"binderVersion": "3.0",
"binderVersion": "3.4",
"sections": [
{ "id": "usage", "title": "Usage Documentation", "theme": "blue" },
{ "id": "models", "title": "Models Documentation", "theme": "blue" },

View File

@ -22,6 +22,7 @@
"## Set parameter `extra_info` to `True` and check also span._.description, span._.src_description, span._.normal_term, span._.other_ids"
],
"category": ["models", "pipeline"],
"image": "https://raw.githubusercontent.com/Lucaterre/spacyfishing/main/docs/spacyfishing-logo-resized.png",
"tags": ["NER", "NEL"],
"author": "Lucas Terriel",
"author_links": {
@ -544,6 +545,37 @@
"website": "https://koaning.io"
}
},
{
"id": "bertopic",
"title": "BERTopic",
"slogan": "Leveraging BERT and c-TF-IDF to create easily interpretable topics.",
"description": "BERTopic is a topic modeling technique that leverages embedding models and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions. BERTopic supports guided, (semi-) supervised, hierarchical, and dynamic topic modeling.",
"github": "maartengr/bertopic",
"pip": "bertopic",
"thumb": "https://i.imgur.com/Rx2LfBm.png",
"image": "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/topic_visualization.gif",
"code_example": [
"import spacy",
"from bertopic import BERTopic",
"from sklearn.datasets import fetch_20newsgroups",
"",
"docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']",
"nlp = spacy.load('en_core_web_md', exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])",
"",
"topic_model = BERTopic(embedding_model=nlp)",
"topics, probs = topic_model.fit_transform(docs)",
"",
"fig = topic_model.visualize_topics()",
"fig.show()"
],
"category": ["visualizers", "training"],
"author": "Maarten Grootendorst",
"author_links": {
"twitter": "maartengr",
"github": "maartengr",
"website": "https://maartengrootendorst.com"
}
},
{
"id": "tokenwiser",
"title": "tokenwiser",
@ -749,43 +781,6 @@
"category": ["standalone", "research"],
"tags": ["pytorch"]
},
{
"id": "NeuroNER",
"title": "NeuroNER",
"slogan": "Named-entity recognition using neural networks",
"github": "Franck-Dernoncourt/NeuroNER",
"category": ["models"],
"pip": "pyneuroner[cpu]",
"code_example": [
"from neuroner import neuromodel",
"nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)"
],
"tags": ["standalone"]
},
{
"id": "NLPre",
"title": "NLPre",
"slogan": "Natural Language Preprocessing Library for health data and more",
"github": "NIHOPA/NLPre",
"pip": "nlpre",
"code_example": [
"from nlpre import titlecaps, dedash, identify_parenthetical_phrases",
"from nlpre import replace_acronyms, replace_from_dictionary",
"ABBR = identify_parenthetical_phrases()(text)",
"parsers = [dedash(), titlecaps(), replace_acronyms(ABBR),",
" replace_from_dictionary(prefix='MeSH_')]",
"for f in parsers:",
" text = f(text)",
"print(text)"
],
"category": ["scientific", "biomedical"],
"author": "Travis Hoppe",
"author_links": {
"github": "thoppe",
"twitter": "metasemantic",
"website": "http://thoppe.github.io/"
}
},
{
"id": "Chatterbot",
"title": "Chatterbot",
@ -888,78 +883,6 @@
"github": "shigapov"
}
},
{
"id": "spacy_hunspell",
"slogan": "Add spellchecking and spelling suggestions to your spaCy pipeline using Hunspell",
"description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [Hunspell](http://hunspell.github.io) support for spellchecking.",
"github": "tokestermw/spacy_hunspell",
"pip": "spacy_hunspell",
"code_example": [
"import spacy",
"from spacy_hunspell import spaCyHunSpell",
"",
"nlp = spacy.load('en_core_web_sm')",
"hunspell = spaCyHunSpell(nlp, 'mac')",
"nlp.add_pipe(hunspell)",
"doc = nlp('I can haz cheezeburger.')",
"haz = doc[2]",
"haz._.hunspell_spell # False",
"haz._.hunspell_suggest # ['ha', 'haze', 'hazy', 'has', 'hat', 'had', 'hag', 'ham', 'hap', 'hay', 'haw', 'ha z']"
],
"author": "Motoki Wu",
"author_links": {
"github": "tokestermw",
"twitter": "plusepsilon"
},
"category": ["pipeline"],
"tags": ["spellcheck"]
},
{
"id": "spacy_grammar",
"slogan": "Language Tool style grammar handling with spaCy",
"description": "This packages leverages the [Matcher API](https://spacy.io/docs/usage/rule-based-matching) in spaCy to quickly match on spaCy tokens not dissimilar to regex. It reads a `grammar.yml` file to load up custom patterns and returns the results inside `Doc`, `Span`, and `Token`. It is extensible through adding rules to `grammar.yml` (though currently only the simple string matching is implemented).",
"github": "tokestermw/spacy_grammar",
"code_example": [
"import spacy",
"from spacy_grammar.grammar import Grammar",
"",
"nlp = spacy.load('en')",
"grammar = Grammar(nlp)",
"nlp.add_pipe(grammar)",
"doc = nlp('I can haz cheeseburger.')",
"doc._.has_grammar_error # True"
],
"author": "Motoki Wu",
"author_links": {
"github": "tokestermw",
"twitter": "plusepsilon"
},
"category": ["pipeline"]
},
{
"id": "spacy_kenlm",
"slogan": "KenLM extension for spaCy 2.0",
"github": "tokestermw/spacy_kenlm",
"pip": "spacy_kenlm",
"code_example": [
"import spacy",
"from spacy_kenlm import spaCyKenLM",
"",
"nlp = spacy.load('en_core_web_sm')",
"spacy_kenlm = spaCyKenLM() # default model from test.arpa",
"nlp.add_pipe(spacy_kenlm)",
"doc = nlp('How are you?')",
"doc._.kenlm_score # doc score",
"doc[:2]._.kenlm_score # span score",
"doc[2]._.kenlm_score # token score"
],
"author": "Motoki Wu",
"author_links": {
"github": "tokestermw",
"twitter": "plusepsilon"
},
"category": ["pipeline"]
},
{
"id": "spacy_readability",
"slogan": "Add text readability meta data to Doc objects",
@ -1028,34 +951,6 @@
},
"category": ["pipeline"]
},
{
"id": "spacy-lookup",
"slogan": "A powerful entity matcher for very large dictionaries, using the FlashText module",
"description": "spaCy v2.0 extension and pipeline component for adding Named Entities metadata to `Doc` objects. Detects Named Entities using dictionaries. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_entity`, `._.entity_type`, `._.has_entities` and `._.entities`. Named Entities are matched using the python module `flashtext`, and looked up in the data provided by different dictionaries.",
"github": "mpuig/spacy-lookup",
"pip": "spacy-lookup",
"code_example": [
"import spacy",
"from spacy_lookup import Entity",
"",
"nlp = spacy.load('en')",
"entity = Entity(keywords_list=['python', 'product manager', 'java platform'])",
"nlp.add_pipe(entity, last=True)",
"",
"doc = nlp(\"I am a product manager for a java and python.\")",
"assert doc._.has_entities == True",
"assert doc[0]._.is_entity == False",
"assert doc[3]._.entity_desc == 'product manager'",
"assert doc[3]._.is_entity == True",
"",
"print([(token.text, token._.canonical) for token in doc if token._.is_entity])"
],
"author": "Marc Puig",
"author_links": {
"github": "mpuig"
},
"category": ["pipeline"]
},
{
"id": "spacy-iwnlp",
"slogan": "German lemmatization with IWNLP",
@ -1257,6 +1152,46 @@
"category": ["pipeline", "models", "training"],
"tags": ["pipeline", "models", "transformers"]
},
{
"id": "asent",
"title": "Asent",
"slogan": "Fast, flexible and transparent sentiment analysis",
"description": "Asent is a rule-based sentiment analysis library for Python made using spaCy. It is inspired by VADER, but uses a more modular ruleset, that allows the user to change e.g. the method for finding negations. Furthermore it includes visualisers to visualize the model predictions, making the model easily interpretable.",
"github": "kennethenevoldsen/asent",
"pip": "aseny",
"code_example": [
"import spacy",
"import asent",
"",
"# load spacy pipeline",
"nlp = spacy.blank('en')",
"nlp.add_pipe('sentencizer')",
"",
"# add the rule-based sentiment model",
"nlp.add_pipe('asent_en_v1')",
"",
"# try an example",
"text = 'I am not very happy, but I am also not especially sad'",
"doc = nlp(text)",
"",
"# print polarity of document, scaled to be between -1, and 1",
"print(doc._.polarity)",
"# neg=0.0 neu=0.631 pos=0.369 compound=0.7526",
"",
"# Naturally, a simple score can be quite unsatisfying, thus Asent implements a series of visualizer to interpret the results:",
"asent.visualize(doc, style='prediction')",
" # or",
"asent.visualize(doc[:5], style='analysis')"
],
"thumb": "https://github.com/KennethEnevoldsen/asent/raw/main/docs/img/logo_black_font.png?raw=true",
"author": "Kenneth Enevoldsen",
"author_links": {
"github": "KennethEnevoldsen",
"website": "https://www.kennethenevoldsen.com"
},
"category": ["pipeline", "models"],
"tags": ["pipeline", "models", "sentiment"]
},
{
"id": "textdescriptives",
"title": "TextDescriptives",
@ -1322,21 +1257,6 @@
"github": "huggingface"
}
},
{
"id": "spacy-vis",
"slogan": "A visualisation tool for spaCy using Hierplane",
"description": "A visualiser for spaCy annotations. This visualisation uses the [Hierplane](https://allenai.github.io/hierplane/) Library to render the dependency parse from spaCy's models. It also includes visualisation of entities and POS tags within nodes.",
"github": "DeNeutoy/spacy-vis",
"url": "http://spacyvis.allennlp.org/spacy-parser",
"thumb": "https://i.imgur.com/DAG9QFd.jpg",
"image": "https://raw.githubusercontent.com/DeNeutoy/spacy-vis/master/img/example.gif",
"author": "Mark Neumann",
"author_links": {
"twitter": "MarkNeumannnn",
"github": "DeNeutoy"
},
"category": ["visualizers"]
},
{
"id": "matcher-explorer",
"title": "Rule-based Matcher Explorer",
@ -2340,29 +2260,6 @@
"youtube": "8u57WSXVpmw",
"category": ["videos"]
},
{
"id": "adam_qas",
"title": "ADAM: Question Answering System",
"slogan": "A question answering system that extracts answers from Wikipedia to questions posed in natural language.",
"github": "5hirish/adam_qas",
"pip": "qas",
"code_example": [
"git clone https://github.com/5hirish/adam_qas.git",
"cd adam_qas",
"pip install -r requirements.txt",
"python -m qas.adam 'When was linux kernel version 4.0 released ?'"
],
"code_language": "bash",
"thumb": "https://shirishkadam.files.wordpress.com/2018/04/mini_alleviate.png",
"author": "Shirish Kadam",
"author_links": {
"twitter": "5hirish",
"github": "5hirish",
"website": "https://shirishkadam.com/"
},
"category": ["standalone"],
"tags": ["question-answering", "elasticsearch"]
},
{
"id": "self-attentive-parser",
"title": "Berkeley Neural Parser",
@ -2460,20 +2357,6 @@
"category": ["nonpython"],
"tags": ["javascript"]
},
{
"id": "spacy-raspberry",
"title": "spacy-raspberry",
"slogan": "64bit Raspberry Pi image for spaCy and neuralcoref",
"github": "boehm-e/spacy-raspberry",
"thumb": "https://i.imgur.com/VCJMrE6.png",
"image": "https://raw.githubusercontent.com/boehm-e/spacy-raspberry/master/imgs/preview.png",
"author": "Erwan Boehm",
"author_links": {
"github": "boehm-e"
},
"category": ["apis"],
"tags": ["raspberrypi"]
},
{
"id": "spacy-wordnet",
"title": "spacy-wordnet",
@ -2544,35 +2427,6 @@
"category": ["standalone", "pipeline"],
"tags": ["linguistics", "computational linguistics", "conll", "conll-u"]
},
{
"id": "spacy-langdetect",
"title": "spacy-langdetect",
"slogan": "A fully customizable language detection pipeline for spaCy",
"description": "This module allows you to add language detection capabilites to your spaCy pipeline. Also supports custom language detectors!",
"pip": "spacy-langdetect",
"code_example": [
"import spacy",
"from spacy_langdetect import LanguageDetector",
"nlp = spacy.load('en')",
"nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)",
"text = 'This is an english text.'",
"doc = nlp(text)",
"# document level language detection. Think of it like average language of the document!",
"print(doc._.language)",
"# sentence level language detection",
"for sent in doc.sents:",
" print(sent, sent._.language)"
],
"code_language": "python",
"author": "Abhijit Balaji",
"author_links": {
"github": "Abhijit-2592",
"website": "https://abhijit-2592.github.io/"
},
"github": "Abhijit-2592/spacy-langdetect",
"category": ["pipeline"],
"tags": ["language-detection"]
},
{
"id": "ludwig",
"title": "Ludwig",
@ -2873,7 +2727,7 @@
"slogan": "Information extraction from English and German texts based on predicate logic",
"github": "explosion/holmes-extractor",
"url": "https://github.com/explosion/holmes-extractor",
"description": "Holmes is a Python 3 library that supports a number of use cases involving information extraction from English and German texts, including chatbot, structural extraction, topic matching and supervised document classification. There is a [website demonstrating intelligent search based on topic matching](https://demo.holmes.prod.demos.explosion.services).",
"description": "Holmes is a Python 3 library that supports a number of use cases involving information extraction from English and German texts, including chatbot, structural extraction, topic matching and supervised document classification. There is a [website demonstrating intelligent search based on topic matching](https://holmes-demo.explosion.services).",
"pip": "holmes-extractor",
"category": ["pipeline", "standalone"],
"tags": ["chatbots", "text-processing"],
@ -3071,35 +2925,6 @@
],
"author": "Stefan Daniel Dumitrescu, Andrei-Marius Avram"
},
{
"id": "num_fh",
"title": "Numeric Fused-Head",
"slogan": "Numeric Fused-Head Identificaiton and Resolution in English",
"description": "This package provide a wrapper for the Numeric Fused-Head in English. It provides another information layer on numbers that refer to another entity which is not obvious from the syntactic tree.",
"github": "yanaiela/num_fh",
"pip": "num_fh",
"category": ["pipeline", "research"],
"code_example": [
"import spacy",
"from num_fh import NFH",
"nlp = spacy.load('en_core_web_sm')",
"nfh = NFH(nlp)",
"nlp.add_pipe(nfh, first=False)",
"doc = nlp(\"I told you two, that only one of them is the one who will get 2 or 3 icecreams\")",
"",
"assert doc[16]._.is_nfh == True",
"assert doc[18]._.is_nfh == False",
"assert doc[3]._.is_deter_nfh == True",
"assert doc[16]._.is_deter_nfh == False",
"assert len(doc._.nfh) == 4"
],
"author": "Yanai Elazar",
"author_links": {
"github": "yanaiela",
"twitter": "yanaiela",
"website": "https://yanaiela.github.io"
}
},
{
"id": "Healthsea",
"title": "Healthsea",
@ -3190,6 +3015,7 @@
"from pysbd.utils import PySBDFactory",
"",
"nlp = spacy.blank('en')",
"# Caution: works with spaCy<=2.x.x",
"nlp.add_pipe(PySBDFactory(nlp))",
"",
"doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')",
@ -4109,6 +3935,21 @@
},
"category": ["biomedical", "scientific", "research", "pipeline"],
"tags": ["clinical"]
},
{
"id": "sent-pattern",
"title": "English Interpretation Sentence Pattern",
"slogan": "English interpretation for accurate translation from English to Japanese",
"description": "This package categorizes English sentences into one of five basic sentence patterns and identifies the subject, verb, object, and other components. The five basic sentence patterns are based on C. T. Onions's Advanced English Syntax and are frequently used when teaching English in Japan.",
"github": "lll-lll-lll-lll/sent-pattern",
"pip": "sent-pattern",
"author": "Shunpei Nakayama",
"author_links": {
"twitter": "ExZ79575296",
"github": "lll-lll-lll-lll"
},
"category": ["pipeline"],
"tags": ["interpretation", "ja"]
}
],

View File

@ -120,8 +120,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
<Link to="/usage/v3-3" hidden>
<strong>💥 Out now:</strong> spaCy v3.3
<Link to="/usage/v3-4" hidden>
<strong>💥 Out now:</strong> spaCy v3.4
</Link>
)

View File

@ -142,10 +142,10 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
The Universe database is open-source and collected in a simple JSON file.
For more details on the formats and available fields, see the documentation.
Looking for inspiration for your own spaCy plugin or extension? Check out the
<Link to={github() + '/labels/project%20idea'} hideIcon ws>
<InlineCode>project idea</InlineCode>
<Link to={"https://github.com/explosion/spaCy/discussions/categories/new-features-project-ideas/"} hideIcon ws>
project idea
</Link>
label on the issue tracker.
section in Discussions.
</p>
<InlineList>

View File

@ -24,6 +24,8 @@ const CUDA = {
'11.3': 'cuda113',
'11.4': 'cuda114',
'11.5': 'cuda115',
'11.6': 'cuda116',
'11.7': 'cuda117',
}
const LANG_EXTRAS = ['ja'] // only for languages with models