mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-08 06:04:57 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
1087cea1ef
41
.github/azure-steps.yml
vendored
41
.github/azure-steps.yml
vendored
|
@ -27,7 +27,6 @@ steps:
|
|||
|
||||
- script: python -m mypy spacy
|
||||
displayName: 'Run mypy'
|
||||
condition: ne(variables['python_version'], '3.10')
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
|
@ -41,7 +40,7 @@ steps:
|
|||
|
||||
- bash: |
|
||||
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
${{ parameters.prefix }} python -m pip install dist/$SDIST
|
||||
${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
|
||||
displayName: "Install from sdist"
|
||||
|
||||
- script: |
|
||||
|
@ -64,12 +63,12 @@ steps:
|
|||
displayName: "Run GPU tests"
|
||||
condition: eq(${{ parameters.gpu }}, true)
|
||||
|
||||
- script: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
# - script: |
|
||||
# python -m spacy download ca_core_news_sm
|
||||
# python -m spacy download ca_core_news_md
|
||||
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
# displayName: 'Test download CLI'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
|
@ -93,17 +92,17 @@ steps:
|
|||
displayName: 'Test train CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
displayName: 'Test assemble CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
displayName: 'Test assemble CLI vectors warning'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
# - script: |
|
||||
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
# displayName: 'Test assemble CLI'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
#
|
||||
# - script: |
|
||||
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
# displayName: 'Test assemble CLI vectors warning'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
||||
|
@ -111,7 +110,7 @@ steps:
|
|||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python -m pip install thinc-apple-ops
|
||||
${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
|
||||
${{ parameters.prefix }} python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests with thinc-apple-ops"
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
|
||||
|
|
67
.github/spacy_universe_alert.py
vendored
Normal file
67
.github/spacy_universe_alert.py
vendored
Normal file
|
@ -0,0 +1,67 @@
|
|||
"""Send a Slack alert describing a pull request that touches the spaCy universe.

The GitHub Actions context (the JSON produced by `toJson(github)`) is read
from the first command-line argument, or from the GITHUB_CONTEXT environment
variable when no argument is given. Pull request details are extracted from
it and posted to Slack as a Block Kit message.

Environment variables:
    SLACK_BOT_TOKEN: token used to authenticate the Slack client (required).
    CHANNEL: channel to post to; defaults to "#alerts-universe".
    GITHUB_CONTEXT: fallback source for the GitHub context JSON.
"""
import os
import sys
import json
from datetime import datetime

from slack_sdk.web.client import WebClient

DEFAULT_CHANNEL = "#alerts-universe"
# The workflow exports CHANNEL; honour it and fall back to the historical
# hard-coded channel when it is unset or empty.
CHANNEL = os.environ.get("CHANNEL") or DEFAULT_CHANNEL
SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
# Timestamp layout of the "created_at" / "updated_at" fields parsed below.
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
PATCH_SUFFIX = ".patch"


def _format_timestamp(raw: str) -> str:
    """Convert a DATETIME_FORMAT timestamp into the locale's "%c" representation."""
    return datetime.strptime(raw, DATETIME_FORMAT).strftime("%c")


def _strip_patch_suffix(url: str) -> str:
    """Return the URL without a trailing ".patch", leaving other URLs untouched."""
    if url.endswith(PATCH_SUFFIX):
        return url[: -len(PATCH_SUFFIX)]
    return url


# Fail early with a readable message instead of sending a placeholder token
# to Slack and failing later with an authentication error.
if not SLACK_TOKEN:
    sys.exit("SLACK_BOT_TOKEN is not set; cannot send the spaCy universe alert.")

raw_context = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("GITHUB_CONTEXT")
if not raw_context:
    sys.exit(
        "No GitHub context provided: pass it as the first argument "
        "or set GITHUB_CONTEXT."
    )

client = WebClient(SLACK_TOKEN)
github_context = json.loads(raw_context)

event = github_context["event"]
pull_request = event["pull_request"]
pr_title = pull_request["title"]
pr_link = _strip_patch_suffix(pull_request["patch_url"])
pr_author_url = event["sender"]["html_url"]
# The last path segment of the profile URL is used as the display name.
pr_author_name = pr_author_url.rsplit("/", 1)[-1]
pr_created_at = _format_timestamp(pull_request["created_at"])
pr_updated_at = _format_timestamp(pull_request["updated_at"])

blocks = [
    {
        "type": "section",
        "text": {
            "type": "mrkdwn",
            "text": "📣 New spaCy Universe Project Alert ✨",
        },
    },
    {
        "type": "section",
        "fields": [
            {
                "type": "mrkdwn",
                "text": f"*Pull Request:*\n<{pr_link}|{pr_title}>",
            },
            {
                "type": "mrkdwn",
                "text": f"*Author:*\n<{pr_author_url}|{pr_author_name}>",
            },
            {
                "type": "mrkdwn",
                "text": f"*Created at:*\n {pr_created_at}",
            },
            {
                "type": "mrkdwn",
                "text": f"*Last Updated:*\n {pr_updated_at}",
            },
        ],
    },
]


client.chat_postMessage(
    channel=CHANNEL,
    # Plain-text fallback shown where blocks cannot be rendered.
    text="spaCy universe project PR alert",
    blocks=blocks,
)
|
30
.github/workflows/spacy_universe_alert.yml
vendored
Normal file
30
.github/workflows/spacy_universe_alert.yml
vendored
Normal file
|
@ -0,0 +1,30 @@
|
|||
name: spaCy universe project alert
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
paths:
|
||||
- "website/meta/universe.json"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Dump GitHub context
|
||||
env:
|
||||
GITHUB_CONTEXT: ${{ toJson(github) }}
|
||||
PR_NUMBER: ${{github.event.number}}
|
||||
run: |
|
||||
echo "$GITHUB_CONTEXT"
|
||||
|
||||
- uses: actions/checkout@v1
|
||||
- uses: actions/setup-python@v1
|
||||
- name: Install Bernadette app dependency and send an alert
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
GITHUB_CONTEXT: ${{ toJson(github) }}
|
||||
CHANNEL: "#alerts-universe"
|
||||
run: |
|
||||
pip install slack-sdk==3.17.2 aiohttp==3.8.1
|
||||
echo "$CHANNEL"
|
||||
python .github/spacy_universe_alert.py "$GITHUB_CONTEXT"
|
|
@ -1,4 +1,4 @@
|
|||
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
|
||||
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh
|
||||
include LICENSE
|
||||
include README.md
|
||||
include pyproject.toml
|
||||
|
|
|
@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
|
|||
model packaging, deployment and workflow management. spaCy is commercial
|
||||
open-source software, released under the MIT license.
|
||||
|
||||
💫 **Version 3.2 out now!**
|
||||
💫 **Version 3.4.0 out now!**
|
||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
[Azure Pipelines build status](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
# build version constraints for use with wheelwright + multibuild
|
||||
numpy==1.15.0; python_version<='3.7'
|
||||
numpy==1.17.3; python_version=='3.8'
|
||||
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
|
||||
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
|
||||
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
|
||||
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
|
||||
numpy==1.19.3; python_version=='3.9'
|
||||
numpy==1.21.3; python_version=='3.10'
|
||||
numpy; python_version>='3.11'
|
||||
|
|
|
@ -455,6 +455,10 @@ Regression tests are tests that refer to bugs reported in specific issues. They
|
|||
|
||||
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
|
||||
|
||||
### Testing Cython Code
|
||||
|
||||
If you're developing Cython code (`.pyx` files), those extensions will need to be built before the test runner can test that code - otherwise it's going to run the tests with stale code from the last time the extension was built. You can build the extensions locally with `python setup.py build_ext -i`.
|
||||
|
||||
### Constructing objects and state
|
||||
|
||||
Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation.
|
||||
|
|
|
@ -5,8 +5,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.14,<8.1.0",
|
||||
"blis>=0.4.0,<0.8.0",
|
||||
"thinc>=8.1.0,<8.2.0",
|
||||
"pathy",
|
||||
"numpy>=1.15.0",
|
||||
]
|
||||
|
|
|
@ -3,8 +3,7 @@ spacy-legacy>=3.0.9,<3.1.0
|
|||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.14,<8.1.0
|
||||
blis>=0.4.0,<0.8.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
ml_datasets>=0.2.0,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.9.1,<1.1.0
|
||||
|
@ -16,7 +15,7 @@ pathy>=0.3.5
|
|||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
|
||||
jinja2
|
||||
langcodes>=3.2.0,<4.0.0
|
||||
# Official Python utilities
|
||||
|
@ -31,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
|
|||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.8.0,<3.10.0
|
||||
hypothesis>=3.27.0,<7.0.0
|
||||
mypy==0.910
|
||||
mypy>=0.910,<0.970; platform_machine!='aarch64'
|
||||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
types-mock>=0.1.1
|
||||
types-requests
|
||||
|
|
13
setup.cfg
13
setup.cfg
|
@ -38,7 +38,7 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.0.14,<8.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.9,<3.1.0
|
||||
|
@ -46,8 +46,7 @@ install_requires =
|
|||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.14,<8.1.0
|
||||
blis>=0.4.0,<0.8.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
wasabi>=0.9.1,<1.1.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
|
@ -57,7 +56,7 @@ install_requires =
|
|||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
|
||||
jinja2
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
|
@ -104,8 +103,12 @@ cuda114 =
|
|||
cupy-cuda114>=5.0.0b4,<11.0.0
|
||||
cuda115 =
|
||||
cupy-cuda115>=5.0.0b4,<11.0.0
|
||||
cuda116 =
|
||||
cupy-cuda116>=5.0.0b4,<11.0.0
|
||||
cuda117 =
|
||||
cupy-cuda117>=5.0.0b4,<11.0.0
|
||||
apple =
|
||||
thinc-apple-ops>=0.0.4,<1.0.0
|
||||
thinc-apple-ops>=0.1.0.dev0,<1.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
ja =
|
||||
sudachipy>=0.5.2,!=0.6.1
|
||||
|
|
8
setup.py
8
setup.py
|
@ -126,6 +126,8 @@ class build_ext_options:
|
|||
|
||||
class build_ext_subclass(build_ext, build_ext_options):
    """build_ext command that applies build_ext_options before building.

    When no parallel job count has been set on the command, the
    SPACY_NUM_BUILD_JOBS environment variable (if present) supplies it.
    """

    def build_extensions(self):
        requested_jobs = os.environ.get("SPACY_NUM_BUILD_JOBS")
        # An already-set `parallel` value takes precedence over the env var.
        if self.parallel is None and requested_jobs is not None:
            self.parallel = int(requested_jobs)
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)
|
||||
|
||||
|
@ -206,7 +208,11 @@ def setup_package():
|
|||
for name in MOD_NAMES:
|
||||
mod_path = name.replace(".", "/") + ".pyx"
|
||||
ext = Extension(
|
||||
name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"]
|
||||
name,
|
||||
[mod_path],
|
||||
language="c++",
|
||||
include_dirs=include_dirs,
|
||||
extra_compile_args=["-std=c++11"],
|
||||
)
|
||||
ext_modules.append(ext)
|
||||
print("Cythonizing sources")
|
||||
|
|
|
@ -32,6 +32,7 @@ def load(
|
|||
*,
|
||||
vocab: Union[Vocab, bool] = True,
|
||||
disable: Iterable[str] = util.SimpleFrozenList(),
|
||||
enable: Iterable[str] = util.SimpleFrozenList(),
|
||||
exclude: Iterable[str] = util.SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
|
||||
) -> Language:
|
||||
|
@ -42,6 +43,8 @@ def load(
|
|||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
|
@ -49,7 +52,12 @@ def load(
|
|||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
return util.load_model(
|
||||
name, vocab=vocab, disable=disable, exclude=exclude, config=config
|
||||
name,
|
||||
vocab=vocab,
|
||||
disable=disable,
|
||||
enable=enable,
|
||||
exclude=exclude,
|
||||
config=config,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.3.1"
|
||||
__version__ = "3.4.0"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -12,7 +12,7 @@ from click.parser import split_arg_string
|
|||
from typer.main import get_command
|
||||
from contextlib import contextmanager
|
||||
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||
from thinc.util import has_cupy, gpu_is_available
|
||||
from thinc.util import gpu_is_available
|
||||
from configparser import InterpolationError
|
||||
import os
|
||||
|
||||
|
@ -462,6 +462,23 @@ def git_sparse_checkout(repo, subpath, dest, branch):
|
|||
shutil.move(str(source_path), str(dest))
|
||||
|
||||
|
||||
def git_repo_branch_exists(repo: str, branch: str) -> bool:
    """Use 'git ls-remote' to check whether a branch exists on a repository.

    repo (str): URL to get repo.
    branch (str): Branch on repo to check.
    RETURNS (bool): True if repo:branch exists.
    """
    # Called only for its side effect; presumably it reports an error when
    # git cannot be run -- TODO confirm against get_git_version.
    get_git_version()
    # `--exit-code` is deliberately not passed to `git ls-remote`:
    # `run_command` handles the `returncode` for us, so we rely on stdout
    # being '' when the requested branch doesn't exist.
    listing = run_command(f"git ls-remote {repo} {branch}", capture=True)
    return listing.stdout != ""
|
||||
|
||||
|
||||
def get_git_version(
|
||||
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
|
||||
) -> Tuple[int, int]:
|
||||
|
@ -554,5 +571,5 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
|
|||
require_gpu(use_gpu)
|
||||
else:
|
||||
local_msg.info("Using CPU")
|
||||
if has_cupy and gpu_is_available():
|
||||
if gpu_is_available():
|
||||
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
|
||||
|
|
|
@ -10,7 +10,7 @@ import math
|
|||
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli
|
||||
from ..training import Example
|
||||
from ..training import Example, remove_bilu_prefix
|
||||
from ..training.initialize import get_sourced_components
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..pipeline._parser_internals import nonproj
|
||||
|
@ -361,7 +361,7 @@ def debug_data(
|
|||
if label != "-"
|
||||
]
|
||||
labels_with_counts = _format_labels(labels_with_counts, counts=True)
|
||||
msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
|
||||
msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
|
||||
missing_labels = model_labels - labels
|
||||
if missing_labels:
|
||||
msg.warn(
|
||||
|
@ -758,9 +758,9 @@ def _compile_gold(
|
|||
# "Illegal" whitespace entity
|
||||
data["ws_ents"] += 1
|
||||
if label.startswith(("B-", "U-")):
|
||||
combined_label = label.split("-")[1]
|
||||
combined_label = remove_bilu_prefix(label)
|
||||
data["ner"][combined_label] += 1
|
||||
if sent_starts[i] == True and label.startswith(("I-", "L-")):
|
||||
if sent_starts[i] and label.startswith(("I-", "L-")):
|
||||
data["boundary_cross_ents"] += 1
|
||||
elif label == "-":
|
||||
data["ner"]["-"] += 1
|
||||
|
@ -908,7 +908,7 @@ def _get_examples_without_label(
|
|||
for eg in data:
|
||||
if component == "ner":
|
||||
labels = [
|
||||
label.split("-")[1]
|
||||
remove_bilu_prefix(label)
|
||||
for label in eg.get_aligned_ner()
|
||||
if label not in ("O", "-", None)
|
||||
]
|
||||
|
|
|
@ -10,6 +10,7 @@ from jinja2 import Template
|
|||
from .. import util
|
||||
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
||||
from ..schemas import RecommendationSchema
|
||||
from ..util import SimpleFrozenList
|
||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||
from ._util import string_to_list, import_code
|
||||
|
||||
|
@ -24,16 +25,30 @@ class Optimizations(str, Enum):
|
|||
accuracy = "accuracy"
|
||||
|
||||
|
||||
class InitValues:
    """
    Shared default values for config initialization. Keeping them in one class lets init_config_cli() (the CLI
    entry point) and init_config() (the Python API) use the same defaults.
    """

    # Language code used when none is given.
    lang = "en"
    # Pipeline components included by default; the CLI joins these with commas.
    pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
    # Optimization target used when none is given.
    optimize = Optimizations.efficiency
    # Boolean switches, all off by default.
    gpu = False
    pretraining = False
    force_overwrite = False
|
||||
|
||||
|
||||
@init_cli.command("config")
|
||||
def init_config_cli(
|
||||
# fmt: off
|
||||
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
|
||||
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -133,11 +148,11 @@ def fill_config(
|
|||
|
||||
def init_config(
|
||||
*,
|
||||
lang: str,
|
||||
pipeline: List[str],
|
||||
optimize: str,
|
||||
gpu: bool,
|
||||
pretraining: bool = False,
|
||||
lang: str = InitValues.lang,
|
||||
pipeline: List[str] = InitValues.pipeline,
|
||||
optimize: str = InitValues.optimize,
|
||||
gpu: bool = InitValues.gpu,
|
||||
pretraining: bool = InitValues.pretraining,
|
||||
silent: bool = True,
|
||||
) -> Config:
|
||||
msg = Printer(no_print=silent)
|
||||
|
|
|
@ -7,11 +7,11 @@ import re
|
|||
from ... import about
|
||||
from ...util import ensure_path
|
||||
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
|
||||
from .._util import git_checkout, get_git_version
|
||||
from .._util import git_checkout, get_git_version, git_repo_branch_exists
|
||||
|
||||
DEFAULT_REPO = about.__projects__
|
||||
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
|
||||
DEFAULT_BRANCH = "master"
|
||||
DEFAULT_BRANCHES = ["main", "master"]
|
||||
|
||||
|
||||
@project_cli.command("clone")
|
||||
|
@ -20,7 +20,7 @@ def project_clone_cli(
|
|||
name: str = Arg(..., help="The name of the template to clone"),
|
||||
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
|
||||
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
|
||||
branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"),
|
||||
branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
|
||||
# fmt: on
|
||||
):
|
||||
|
@ -33,9 +33,25 @@ def project_clone_cli(
|
|||
"""
|
||||
if dest is None:
|
||||
dest = Path.cwd() / Path(name).parts[-1]
|
||||
if repo == DEFAULT_REPO and branch is None:
|
||||
branch = DEFAULT_PROJECTS_BRANCH
|
||||
|
||||
if branch is None:
|
||||
# If it's a user repo, we want to default to other branch
|
||||
branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH
|
||||
for default_branch in DEFAULT_BRANCHES:
|
||||
if git_repo_branch_exists(repo, default_branch):
|
||||
branch = default_branch
|
||||
break
|
||||
if branch is None:
|
||||
default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
|
||||
msg.fail(
|
||||
"No branch provided and attempted default "
|
||||
f"branches {default_branches_msg} do not exist.",
|
||||
exits=1,
|
||||
)
|
||||
else:
|
||||
if not git_repo_branch_exists(repo, branch):
|
||||
msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
|
||||
assert isinstance(branch, str)
|
||||
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
|
||||
|
||||
|
||||
|
@ -61,9 +77,9 @@ def project_clone(
|
|||
try:
|
||||
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}'"
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
|
||||
msg.fail(err, exits=1)
|
||||
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
|
||||
msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
|
||||
if not (project_dir / PROJECT_FILE).exists():
|
||||
msg.warn(f"No {PROJECT_FILE} found in directory")
|
||||
else:
|
||||
|
|
|
@ -64,8 +64,11 @@ class SpanRenderer:
|
|||
# Set up how the text and labels will be rendered
|
||||
self.direction = DEFAULT_DIR
|
||||
self.lang = DEFAULT_LANG
|
||||
# These values are in px
|
||||
self.top_offset = options.get("top_offset", 40)
|
||||
self.top_offset_step = options.get("top_offset_step", 17)
|
||||
# This is how far under the top offset the span labels appear
|
||||
self.span_label_offset = options.get("span_label_offset", 20)
|
||||
self.offset_step = options.get("top_offset_step", 17)
|
||||
|
||||
# Set up which templates will be used
|
||||
template = options.get("template")
|
||||
|
@ -127,26 +130,56 @@ class SpanRenderer:
|
|||
title (str / None): Document title set in Doc.user_data['title'].
|
||||
"""
|
||||
per_token_info = []
|
||||
# we must sort so that we can correctly describe when spans need to "stack"
|
||||
# which is determined by their start token, then span length (longer spans on top),
|
||||
# then break any remaining ties with the span label
|
||||
spans = sorted(
|
||||
spans,
|
||||
key=lambda s: (
|
||||
s["start_token"],
|
||||
-(s["end_token"] - s["start_token"]),
|
||||
s["label"],
|
||||
),
|
||||
)
|
||||
for s in spans:
|
||||
# this is the vertical 'slot' that the span will be rendered in
|
||||
# vertical_position = span_label_offset + (offset_step * (slot - 1))
|
||||
s["render_slot"] = 0
|
||||
for idx, token in enumerate(tokens):
|
||||
# Identify if a token belongs to a Span (and which) and if it's a
|
||||
# start token of said Span. We'll use this for the final HTML render
|
||||
token_markup: Dict[str, Any] = {}
|
||||
token_markup["text"] = token
|
||||
concurrent_spans = 0
|
||||
entities = []
|
||||
for span in spans:
|
||||
ent = {}
|
||||
if span["start_token"] <= idx < span["end_token"]:
|
||||
concurrent_spans += 1
|
||||
span_start = idx == span["start_token"]
|
||||
ent["label"] = span["label"]
|
||||
ent["is_start"] = True if idx == span["start_token"] else False
|
||||
ent["is_start"] = span_start
|
||||
if span_start:
|
||||
# When the span starts, we need to know how many other
|
||||
# spans are on the 'span stack' and will be rendered.
|
||||
# This value becomes the vertical render slot for this entire span
|
||||
span["render_slot"] = concurrent_spans
|
||||
ent["render_slot"] = span["render_slot"]
|
||||
kb_id = span.get("kb_id", "")
|
||||
kb_url = span.get("kb_url", "#")
|
||||
ent["kb_link"] = (
|
||||
TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
|
||||
)
|
||||
entities.append(ent)
|
||||
else:
|
||||
# We don't specifically need to do this since we loop
|
||||
# over tokens and spans sorted by their start_token,
|
||||
# so we'll never use a span again after the last token it appears in,
|
||||
# but if we were to use these spans again we'd want to make sure
|
||||
# this value was reset correctly.
|
||||
span["render_slot"] = 0
|
||||
token_markup["entities"] = entities
|
||||
per_token_info.append(token_markup)
|
||||
|
||||
markup = self._render_markup(per_token_info)
|
||||
markup = TPL_SPANS.format(content=markup, dir=self.direction)
|
||||
if title:
|
||||
|
@ -157,12 +190,24 @@ class SpanRenderer:
|
|||
"""Render the markup from per-token information"""
|
||||
markup = ""
|
||||
for token in per_token_info:
|
||||
entities = sorted(token["entities"], key=lambda d: d["label"])
|
||||
if entities:
|
||||
entities = sorted(token["entities"], key=lambda d: d["render_slot"])
|
||||
# Whitespace tokens disrupt the vertical space (no line height) so that the
|
||||
# span indicators get misaligned. We don't render them as individual
|
||||
# tokens anyway, so we'll just not display a span indicator either.
|
||||
is_whitespace = token["text"].strip() == ""
|
||||
if entities and not is_whitespace:
|
||||
slices = self._get_span_slices(token["entities"])
|
||||
starts = self._get_span_starts(token["entities"])
|
||||
total_height = (
|
||||
self.top_offset
|
||||
+ self.span_label_offset
|
||||
+ (self.offset_step * (len(entities) - 1))
|
||||
)
|
||||
markup += self.span_template.format(
|
||||
text=token["text"], span_slices=slices, span_starts=starts
|
||||
text=token["text"],
|
||||
span_slices=slices,
|
||||
span_starts=starts,
|
||||
total_height=total_height,
|
||||
)
|
||||
else:
|
||||
markup += escape_html(token["text"] + " ")
|
||||
|
@ -171,10 +216,18 @@ class SpanRenderer:
|
|||
def _get_span_slices(self, entities: List[Dict]) -> str:
|
||||
"""Get the rendered markup of all Span slices"""
|
||||
span_slices = []
|
||||
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
|
||||
for entity in entities:
|
||||
# rather than iterate over multiples of offset_step, we use entity['render_slot']
|
||||
# to determine the vertical position, since that tells where
|
||||
# the span starts vertically so we can extend it horizontally,
|
||||
# past other spans that might have already ended
|
||||
color = self.colors.get(entity["label"].upper(), self.default_color)
|
||||
top_offset = self.top_offset + (
|
||||
self.offset_step * (entity["render_slot"] - 1)
|
||||
)
|
||||
span_slice = self.span_slice_template.format(
|
||||
bg=color, top_offset=self.top_offset + step
|
||||
bg=color,
|
||||
top_offset=top_offset,
|
||||
)
|
||||
span_slices.append(span_slice)
|
||||
return "".join(span_slices)
|
||||
|
@ -182,12 +235,15 @@ class SpanRenderer:
|
|||
def _get_span_starts(self, entities: List[Dict]) -> str:
|
||||
"""Get the rendered markup of all Span start tokens"""
|
||||
span_starts = []
|
||||
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
|
||||
for entity in entities:
|
||||
color = self.colors.get(entity["label"].upper(), self.default_color)
|
||||
top_offset = self.top_offset + (
|
||||
self.offset_step * (entity["render_slot"] - 1)
|
||||
)
|
||||
span_start = (
|
||||
self.span_start_template.format(
|
||||
bg=color,
|
||||
top_offset=self.top_offset + step,
|
||||
top_offset=top_offset,
|
||||
label=entity["label"],
|
||||
kb_link=entity["kb_link"],
|
||||
)
|
||||
|
|
|
@ -67,7 +67,7 @@ TPL_SPANS = """
|
|||
"""
|
||||
|
||||
TPL_SPAN = """
|
||||
<span style="font-weight: bold; display: inline-block; position: relative;">
|
||||
<span style="font-weight: bold; display: inline-block; position: relative; height: {total_height}px;">
|
||||
{text}
|
||||
{span_slices}
|
||||
{span_starts}
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import warnings
|
||||
from .compat import Literal
|
||||
|
||||
|
||||
class ErrorsWithCodes(type):
|
||||
|
@ -26,7 +27,10 @@ def setup_default_warnings():
|
|||
filter_warning("once", error_msg="[W114]")
|
||||
|
||||
|
||||
def filter_warning(action: str, error_msg: str):
|
||||
def filter_warning(
|
||||
action: Literal["default", "error", "ignore", "always", "module", "once"],
|
||||
error_msg: str,
|
||||
):
|
||||
"""Customize how spaCy should handle a certain warning.
|
||||
|
||||
error_msg (str): e.g. "W006", or a full error message
|
||||
|
@ -205,6 +209,9 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"Only the last span group will be loaded under "
|
||||
"Doc.spans['{group_name}']. Skipping span group with values: "
|
||||
"{group_values}")
|
||||
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
|
||||
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
|
||||
"is a Cython extension type.")
|
||||
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -928,6 +935,10 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1040 = ("Doc.from_json requires all tokens to have the same attributes. "
|
||||
"Some tokens do not contain annotation for: {partial_attrs}")
|
||||
E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}")
|
||||
E1042 = ("Function was called with `{arg1}`={arg1_values} and "
|
||||
"`{arg2}`={arg2_values} but these arguments are conflicting.")
|
||||
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
|
||||
"{value}.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
22
spacy/kb.pyx
22
spacy/kb.pyx
|
@ -93,14 +93,14 @@ cdef class KnowledgeBase:
|
|||
self.vocab = vocab
|
||||
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
|
||||
|
||||
def initialize_entities(self, int64_t nr_entities):
|
||||
def _initialize_entities(self, int64_t nr_entities):
|
||||
self._entry_index = PreshMap(nr_entities + 1)
|
||||
self._entries = entry_vec(nr_entities + 1)
|
||||
|
||||
def initialize_vectors(self, int64_t nr_entities):
|
||||
def _initialize_vectors(self, int64_t nr_entities):
|
||||
self._vectors_table = float_matrix(nr_entities + 1)
|
||||
|
||||
def initialize_aliases(self, int64_t nr_aliases):
|
||||
def _initialize_aliases(self, int64_t nr_aliases):
|
||||
self._alias_index = PreshMap(nr_aliases + 1)
|
||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||
|
||||
|
@ -155,8 +155,8 @@ cdef class KnowledgeBase:
|
|||
raise ValueError(Errors.E140)
|
||||
|
||||
nr_entities = len(set(entity_list))
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
|
||||
i = 0
|
||||
cdef KBEntryC entry
|
||||
|
@ -388,9 +388,9 @@ cdef class KnowledgeBase:
|
|||
nr_entities = header[0]
|
||||
nr_aliases = header[1]
|
||||
entity_vector_length = header[2]
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
self.initialize_aliases(nr_aliases)
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
self._initialize_aliases(nr_aliases)
|
||||
self.entity_vector_length = entity_vector_length
|
||||
|
||||
def deserialize_vectors(b):
|
||||
|
@ -512,8 +512,8 @@ cdef class KnowledgeBase:
|
|||
cdef int64_t entity_vector_length
|
||||
reader.read_header(&nr_entities, &entity_vector_length)
|
||||
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
self.entity_vector_length = entity_vector_length
|
||||
|
||||
# STEP 1: load entity vectors
|
||||
|
@ -552,7 +552,7 @@ cdef class KnowledgeBase:
|
|||
# STEP 3: load aliases
|
||||
cdef int64_t nr_aliases
|
||||
reader.read_alias_length(&nr_aliases)
|
||||
self.initialize_aliases(nr_aliases)
|
||||
self._initialize_aliases(nr_aliases)
|
||||
|
||||
cdef int64_t nr_candidates
|
||||
cdef vector[int64_t] entry_indices
|
||||
|
|
|
@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
|
|||
|
||||
stop_words = STOP_WORDS
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Bulgarian(Language):
|
||||
|
|
|
@ -258,6 +258,10 @@ ALPHA = group_chars(
|
|||
ALPHA_LOWER = group_chars(_lower + _uncased)
|
||||
ALPHA_UPPER = group_chars(_upper + _uncased)
|
||||
|
||||
_combining_diacritics = r"\u0300-\u036f"
|
||||
|
||||
COMBINING_DIACRITICS = _combining_diacritics
|
||||
|
||||
_units = (
|
||||
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
||||
"kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
|
||||
|
|
|
@ -35,7 +35,7 @@ for pron in ["i"]:
|
|||
|
||||
_exc[orth + "m"] = [
|
||||
{ORTH: orth, NORM: pron},
|
||||
{ORTH: "m", "tenspect": 1, "number": 1},
|
||||
{ORTH: "m"},
|
||||
]
|
||||
|
||||
_exc[orth + "'ma"] = [
|
||||
|
@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
|
|||
|
||||
# W-words, relative pronouns, prepositions etc.
|
||||
|
||||
for word in [
|
||||
"who",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"why",
|
||||
"how",
|
||||
"there",
|
||||
"that",
|
||||
"this",
|
||||
"these",
|
||||
"those",
|
||||
for word, morph in [
|
||||
("who", None),
|
||||
("what", None),
|
||||
("when", None),
|
||||
("where", None),
|
||||
("why", None),
|
||||
("how", None),
|
||||
("there", None),
|
||||
("that", "Number=Sing|Person=3"),
|
||||
("this", "Number=Sing|Person=3"),
|
||||
("these", "Number=Plur|Person=3"),
|
||||
("those", "Number=Plur|Person=3"),
|
||||
]:
|
||||
for orth in [word, word.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"},
|
||||
]
|
||||
if morph != "Number=Plur|Person=3":
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"},
|
||||
]
|
||||
|
||||
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
|
||||
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
|
||||
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
|
@ -182,25 +183,26 @@ for word in [
|
|||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'re", NORM: "are"},
|
||||
]
|
||||
if morph != "Number=Sing|Person=3":
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'re", NORM: "are"},
|
||||
]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "re", NORM: "are"},
|
||||
]
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "re", NORM: "are"},
|
||||
]
|
||||
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'ve"},
|
||||
]
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'ve"},
|
||||
]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
|
||||
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
|
||||
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||
|
||||
|
||||
|
@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
|
|||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# Some languages, e.g. those written with the Cyrillic alphabet, permit the use of diacritics
|
||||
# to mark stressed syllables in words where stress is distinctive. Such languages
|
||||
# should use the COMBINING_DIACRITICS... suffix and infix regex lists in
|
||||
# place of the standard ones.
|
||||
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [
|
||||
r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
|
||||
]
|
||||
|
||||
COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
|
||||
r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
|
||||
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
|
||||
),
|
||||
r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
|
||||
r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
|
||||
a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
|
||||
),
|
||||
r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
|
||||
]
|
||||
|
|
|
@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import RussianLemmatizer
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
|
@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
|
|||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Russian(Language):
|
||||
|
|
|
@ -6,6 +6,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import UkrainianLemmatizer
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
|
@ -13,6 +15,8 @@ class UkrainianDefaults(BaseDefaults):
|
|||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Ukrainian(Language):
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Iterator, Optional, Any, Dict, Callable, Iterable
|
||||
from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Collection
|
||||
from typing import Union, Tuple, List, Set, Pattern, Sequence
|
||||
from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
|
||||
|
||||
|
@ -1694,6 +1694,7 @@ class Language:
|
|||
*,
|
||||
vocab: Union[Vocab, bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
meta: Dict[str, Any] = SimpleFrozenDict(),
|
||||
auto_fill: bool = True,
|
||||
|
@ -1708,6 +1709,8 @@ class Language:
|
|||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
Disabled pipes will be loaded but they won't be run unless you
|
||||
explicitly enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude.
|
||||
Excluded components won't be loaded.
|
||||
meta (Dict[str, Any]): Meta overrides for nlp.meta.
|
||||
|
@ -1861,8 +1864,15 @@ class Language:
|
|||
# Restore the original vocab after sourcing if necessary
|
||||
if vocab_b is not None:
|
||||
nlp.vocab.from_bytes(vocab_b)
|
||||
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
||||
|
||||
# Resolve disabled/enabled settings.
|
||||
disabled_pipes = cls._resolve_component_status(
|
||||
[*config["nlp"]["disabled"], *disable],
|
||||
[*config["nlp"].get("enabled", []), *enable],
|
||||
config["nlp"]["pipeline"],
|
||||
)
|
||||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||||
|
||||
nlp.batch_size = config["nlp"]["batch_size"]
|
||||
nlp.config = filled if auto_fill else config
|
||||
if after_pipeline_creation is not None:
|
||||
|
@ -2014,6 +2024,42 @@ class Language:
|
|||
serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
|
||||
util.to_disk(path, serializers, exclude)
|
||||
|
||||
@staticmethod
|
||||
def _resolve_component_status(
|
||||
disable: Iterable[str], enable: Iterable[str], pipe_names: Collection[str]
|
||||
) -> Tuple[str, ...]:
|
||||
"""Derives whether (1) `disable` and `enable` values are consistent and (2)
|
||||
resolves those to a single set of disabled components. Raises an error in
|
||||
case of inconsistency.
|
||||
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
enable (Iterable[str]): Names of pipeline components to enable.
|
||||
pipe_names (Collection[str]): Names of all pipeline components.
|
||||
|
||||
RETURNS (Tuple[str, ...]): Names of components to disable, resolved from the
|
||||
specified `disable` and `enable` values.
|
||||
"""
|
||||
|
||||
if disable is not None and isinstance(disable, str):
|
||||
disable = [disable]
|
||||
to_disable = disable
|
||||
|
||||
if enable:
|
||||
to_disable = [
|
||||
pipe_name for pipe_name in pipe_names if pipe_name not in enable
|
||||
]
|
||||
if disable and disable != to_disable:
|
||||
raise ValueError(
|
||||
Errors.E1042.format(
|
||||
arg1="enable",
|
||||
arg2="disable",
|
||||
arg1_values=enable,
|
||||
arg2_values=disable,
|
||||
)
|
||||
)
|
||||
|
||||
return tuple(to_disable)
|
||||
|
||||
def from_disk(
|
||||
self,
|
||||
path: Union[str, Path],
|
||||
|
|
|
@ -85,7 +85,7 @@ class Table(OrderedDict):
|
|||
value: The value to set.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
OrderedDict.__setitem__(self, key, value)
|
||||
OrderedDict.__setitem__(self, key, value) # type:ignore[assignment]
|
||||
self.bloom.add(key)
|
||||
|
||||
def set(self, key: Union[str, int], value: Any) -> None:
|
||||
|
@ -104,7 +104,7 @@ class Table(OrderedDict):
|
|||
RETURNS: The value.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
return OrderedDict.__getitem__(self, key)
|
||||
return OrderedDict.__getitem__(self, key) # type:ignore[index]
|
||||
|
||||
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
|
||||
"""Get the value for a given key. String keys will be hashed.
|
||||
|
@ -114,7 +114,7 @@ class Table(OrderedDict):
|
|||
RETURNS: The value.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
return OrderedDict.get(self, key, default)
|
||||
return OrderedDict.get(self, key, default) # type:ignore[arg-type]
|
||||
|
||||
def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override]
|
||||
"""Check whether a key is in the table. String keys will be hashed.
|
||||
|
|
|
@ -86,10 +86,14 @@ cdef class Matcher:
|
|||
is a dictionary mapping attribute IDs to values, and optionally a
|
||||
quantifier operator under the key "op". The available quantifiers are:
|
||||
|
||||
'!': Negate the pattern, by requiring it to match exactly 0 times.
|
||||
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
|
||||
'+': Require the pattern to match 1 or more times.
|
||||
'*': Allow the pattern to match zero or more times.
|
||||
'!': Negate the pattern, by requiring it to match exactly 0 times.
|
||||
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
|
||||
'+': Require the pattern to match 1 or more times.
|
||||
'*': Allow the pattern to match zero or more times.
|
||||
'{n}': Require the pattern to match exactly _n_ times.
|
||||
'{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
|
||||
'{n,}': Require the pattern to match at least _n_ times.
|
||||
'{,m}': Require the pattern to match at most _m_ times.
|
||||
|
||||
The + and * operators return all possible matches (not just the greedy
|
||||
ones). However, the "greedy" argument can filter the final matches
|
||||
|
@ -1004,8 +1008,29 @@ def _get_operators(spec):
|
|||
return (ONE,)
|
||||
elif spec["OP"] in lookup:
|
||||
return lookup[spec["OP"]]
|
||||
# Min/max quantifier: {n}, {n,m}, {n,} or {,m}
|
||||
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
|
||||
# {n} --> {n,n} exactly n ONE,(n)
|
||||
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
|
||||
# {,m} --> {0,m} min of zero, max of m ZERO_ONE,(m)
|
||||
# {n,} --> {n,∞} min of n, max of inf ONE,(n),ZERO_PLUS
|
||||
|
||||
min_max = spec["OP"][1:-1]
|
||||
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
|
||||
n, m = min_max.split(",")
|
||||
|
||||
# 1. Either n or m is a blank string and the other is numeric --> isdecimal
|
||||
#2. Both are numeric and n <= m
|
||||
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
|
||||
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
||||
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||
|
||||
# if n is empty string, zero would be used
|
||||
head = tuple(ONE for __ in range(int(n or 0)))
|
||||
tail = tuple(ZERO_ONE for __ in range(int(m) - int(n or 0))) if m else (ZERO_PLUS,)
|
||||
return head + tail
|
||||
else:
|
||||
keys = ", ".join(lookup.keys())
|
||||
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
||||
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||
|
||||
|
||||
|
|
|
@ -22,9 +22,11 @@ def forward(model, X, is_train):
|
|||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
W = model.get_param("W")
|
||||
Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
|
||||
# Preallocate array for layer output, including padding.
|
||||
Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
|
||||
model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
|
||||
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
|
||||
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
|
||||
Yf[0] = model.get_param("pad")
|
||||
|
||||
def backward(dY_ids):
|
||||
# This backprop is particularly tricky, because we get back a different
|
||||
|
|
|
@ -1,9 +1,14 @@
|
|||
from functools import partial
|
||||
from typing import Type, Callable, TYPE_CHECKING
|
||||
from typing import Type, Callable, Dict, TYPE_CHECKING, List, Optional, Set
|
||||
import functools
|
||||
import inspect
|
||||
import types
|
||||
import warnings
|
||||
|
||||
from thinc.layers import with_nvtx_range
|
||||
from thinc.model import Model, wrap_model_recursive
|
||||
from thinc.util import use_nvtx_range
|
||||
|
||||
from ..errors import Warnings
|
||||
from ..util import registry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
@ -11,29 +16,106 @@ if TYPE_CHECKING:
|
|||
from ..language import Language # noqa: F401
|
||||
|
||||
|
||||
@registry.callbacks("spacy.models_with_nvtx_range.v1")
|
||||
def create_models_with_nvtx_range(
|
||||
forward_color: int = -1, backprop_color: int = -1
|
||||
) -> Callable[["Language"], "Language"]:
|
||||
def models_with_nvtx_range(nlp):
|
||||
pipes = [
|
||||
pipe
|
||||
for _, pipe in nlp.components
|
||||
if hasattr(pipe, "is_trainable") and pipe.is_trainable
|
||||
]
|
||||
DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
|
||||
"pipe",
|
||||
"predict",
|
||||
"set_annotations",
|
||||
"update",
|
||||
"rehearse",
|
||||
"get_loss",
|
||||
"initialize",
|
||||
"begin_update",
|
||||
"finish_update",
|
||||
"update",
|
||||
]
|
||||
|
||||
# We need to process all models jointly to avoid wrapping callbacks twice.
|
||||
models = Model(
|
||||
"wrap_with_nvtx_range",
|
||||
forward=lambda model, X, is_train: ...,
|
||||
layers=[pipe.model for pipe in pipes],
|
||||
)
|
||||
|
||||
for node in models.walk():
|
||||
def models_with_nvtx_range(nlp, forward_color: int, backprop_color: int):
|
||||
pipes = [
|
||||
pipe
|
||||
for _, pipe in nlp.components
|
||||
if hasattr(pipe, "is_trainable") and pipe.is_trainable
|
||||
]
|
||||
|
||||
seen_models: Set[int] = set()
|
||||
for pipe in pipes:
|
||||
for node in pipe.model.walk():
|
||||
if id(node) in seen_models:
|
||||
continue
|
||||
seen_models.add(id(node))
|
||||
with_nvtx_range(
|
||||
node, forward_color=forward_color, backprop_color=backprop_color
|
||||
)
|
||||
|
||||
return nlp
|
||||
|
||||
|
||||
@registry.callbacks("spacy.models_with_nvtx_range.v1")
|
||||
def create_models_with_nvtx_range(
|
||||
forward_color: int = -1, backprop_color: int = -1
|
||||
) -> Callable[["Language"], "Language"]:
|
||||
return functools.partial(
|
||||
models_with_nvtx_range,
|
||||
forward_color=forward_color,
|
||||
backprop_color=backprop_color,
|
||||
)
|
||||
|
||||
|
||||
def nvtx_range_wrapper_for_pipe_method(self, func, *args, **kwargs):
|
||||
if isinstance(func, functools.partial):
|
||||
return func(*args, **kwargs)
|
||||
else:
|
||||
with use_nvtx_range(f"{self.name} {func.__name__}"):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
def pipes_with_nvtx_range(
|
||||
nlp, additional_pipe_functions: Optional[Dict[str, List[str]]]
|
||||
):
|
||||
for _, pipe in nlp.components:
|
||||
if additional_pipe_functions:
|
||||
extra_funcs = additional_pipe_functions.get(pipe.name, [])
|
||||
else:
|
||||
extra_funcs = []
|
||||
|
||||
for name in DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS + extra_funcs:
|
||||
func = getattr(pipe, name, None)
|
||||
if func is None:
|
||||
if name in extra_funcs:
|
||||
warnings.warn(Warnings.W121.format(method=name, pipe=pipe.name))
|
||||
continue
|
||||
|
||||
wrapped_func = functools.partial(
|
||||
types.MethodType(nvtx_range_wrapper_for_pipe_method, pipe), func
|
||||
)
|
||||
|
||||
# Try to preserve the original function signature.
|
||||
try:
|
||||
wrapped_func.__signature__ = inspect.signature(func) # type: ignore
|
||||
except:
|
||||
pass
|
||||
|
||||
try:
|
||||
setattr(
|
||||
pipe,
|
||||
name,
|
||||
wrapped_func,
|
||||
)
|
||||
except AttributeError:
|
||||
warnings.warn(Warnings.W122.format(method=name, pipe=pipe.name))
|
||||
|
||||
return nlp
|
||||
|
||||
|
||||
@registry.callbacks("spacy.models_and_pipes_with_nvtx_range.v1")
|
||||
def create_models_and_pipes_with_nvtx_range(
|
||||
forward_color: int = -1,
|
||||
backprop_color: int = -1,
|
||||
additional_pipe_functions: Optional[Dict[str, List[str]]] = None,
|
||||
) -> Callable[["Language"], "Language"]:
|
||||
def inner(nlp):
|
||||
nlp = models_with_nvtx_range(nlp, forward_color, backprop_color)
|
||||
nlp = pipes_with_nvtx_range(nlp, additional_pipe_functions)
|
||||
return nlp
|
||||
|
||||
return models_with_nvtx_range
|
||||
return inner
|
||||
|
|
|
@ -23,7 +23,7 @@ def build_nel_encoder(
|
|||
((tok2vec >> list2ragged()) & build_span_maker())
|
||||
>> extract_spans()
|
||||
>> reduce_mean()
|
||||
>> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type]
|
||||
>> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore
|
||||
>> output_layer
|
||||
)
|
||||
model.set_ref("output_layer", output_layer)
|
||||
|
|
|
@ -72,7 +72,7 @@ def build_tb_parser_model(
|
|||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec = chain(
|
||||
tok2vec,
|
||||
cast(Model[List["Floats2d"], Floats2d], list2array()),
|
||||
list2array(),
|
||||
Linear(hidden_width, t2v_width),
|
||||
)
|
||||
tok2vec.set_dim("nO", hidden_width)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Optional, List, cast
|
||||
from functools import partial
|
||||
from typing import Optional, List
|
||||
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||
|
@ -59,7 +59,8 @@ def build_simple_cnn_text_classifier(
|
|||
resizable_layer=resizable_layer,
|
||||
)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
model.set_dim("nO", nO) # type: ignore # TODO: remove type ignore once Thinc has been updated
|
||||
if nO is not None:
|
||||
model.set_dim("nO", cast(int, nO))
|
||||
model.attrs["multi_label"] = not exclusive_classes
|
||||
return model
|
||||
|
||||
|
@ -85,7 +86,7 @@ def build_bow_text_classifier(
|
|||
if not no_output_layer:
|
||||
fill_defaults["b"] = NEG_VALUE
|
||||
output_layer = softmax_activation() if exclusive_classes else Logistic()
|
||||
resizable_layer = resizable( # type: ignore[var-annotated]
|
||||
resizable_layer: Model[Floats2d, Floats2d] = resizable(
|
||||
sparse_linear,
|
||||
resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults),
|
||||
)
|
||||
|
@ -93,7 +94,8 @@ def build_bow_text_classifier(
|
|||
model = with_cpu(model, model.ops)
|
||||
if output_layer:
|
||||
model = model >> with_cpu(output_layer, output_layer.ops)
|
||||
model.set_dim("nO", nO) # type: ignore[arg-type]
|
||||
if nO is not None:
|
||||
model.set_dim("nO", cast(int, nO))
|
||||
model.set_ref("output_layer", sparse_linear)
|
||||
model.attrs["multi_label"] = not exclusive_classes
|
||||
model.attrs["resize_output"] = partial(
|
||||
|
@ -129,8 +131,8 @@ def build_text_classifier_v2(
|
|||
output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
|
||||
model = (linear_model | cnn_model) >> output_layer
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
if model.has_dim("nO") is not False:
|
||||
model.set_dim("nO", nO) # type: ignore[arg-type]
|
||||
if model.has_dim("nO") is not False and nO is not None:
|
||||
model.set_dim("nO", cast(int, nO))
|
||||
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
||||
model.set_ref("attention_layer", attention_layer)
|
||||
model.set_ref("maxout_layer", maxout_layer)
|
||||
|
@ -164,7 +166,7 @@ def build_text_classifier_lowdata(
|
|||
>> list2ragged()
|
||||
>> ParametricAttention(width)
|
||||
>> reduce_sum()
|
||||
>> residual(Relu(width, width)) ** 2 # type: ignore[arg-type]
|
||||
>> residual(Relu(width, width)) ** 2
|
||||
>> Linear(nO, width)
|
||||
)
|
||||
if dropout:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Optional, List, Union, cast
|
||||
from thinc.types import Floats2d, Ints2d, Ragged
|
||||
from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
|
||||
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
||||
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
|
||||
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
|
||||
|
@ -159,7 +159,7 @@ def MultiHashEmbed(
|
|||
embeddings = [make_hash_embed(i) for i in range(len(attrs))]
|
||||
concat_size = width * (len(embeddings) + include_static_vectors)
|
||||
max_out: Model[Ragged, Ragged] = with_array(
|
||||
Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True) # type: ignore
|
||||
Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)
|
||||
)
|
||||
if include_static_vectors:
|
||||
feature_extractor: Model[List[Doc], Ragged] = chain(
|
||||
|
@ -173,7 +173,7 @@ def MultiHashEmbed(
|
|||
StaticVectors(width, dropout=0.0),
|
||||
),
|
||||
max_out,
|
||||
cast(Model[Ragged, List[Floats2d]], ragged2list()),
|
||||
ragged2list(),
|
||||
)
|
||||
else:
|
||||
model = chain(
|
||||
|
@ -181,7 +181,7 @@ def MultiHashEmbed(
|
|||
cast(Model[List[Ints2d], Ragged], list2ragged()),
|
||||
with_array(concatenate(*embeddings)),
|
||||
max_out,
|
||||
cast(Model[Ragged, List[Floats2d]], ragged2list()),
|
||||
ragged2list(),
|
||||
)
|
||||
return model
|
||||
|
||||
|
@ -232,12 +232,12 @@ def CharacterEmbed(
|
|||
feature_extractor: Model[List[Doc], Ragged] = chain(
|
||||
FeatureExtractor([feature]),
|
||||
cast(Model[List[Ints2d], Ragged], list2ragged()),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), # type: ignore
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), # type: ignore[misc]
|
||||
)
|
||||
max_out: Model[Ragged, Ragged]
|
||||
if include_static_vectors:
|
||||
max_out = with_array(
|
||||
Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) # type: ignore
|
||||
Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
|
||||
)
|
||||
model = chain(
|
||||
concatenate(
|
||||
|
@ -246,11 +246,11 @@ def CharacterEmbed(
|
|||
StaticVectors(width, dropout=0.0),
|
||||
),
|
||||
max_out,
|
||||
cast(Model[Ragged, List[Floats2d]], ragged2list()),
|
||||
ragged2list(),
|
||||
)
|
||||
else:
|
||||
max_out = with_array(
|
||||
Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) # type: ignore
|
||||
Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
|
||||
)
|
||||
model = chain(
|
||||
concatenate(
|
||||
|
@ -258,7 +258,7 @@ def CharacterEmbed(
|
|||
feature_extractor,
|
||||
),
|
||||
max_out,
|
||||
cast(Model[Ragged, List[Floats2d]], ragged2list()),
|
||||
ragged2list(),
|
||||
)
|
||||
return model
|
||||
|
||||
|
@ -289,10 +289,10 @@ def MaxoutWindowEncoder(
|
|||
normalize=True,
|
||||
),
|
||||
)
|
||||
model = clone(residual(cnn), depth) # type: ignore[arg-type]
|
||||
model = clone(residual(cnn), depth)
|
||||
model.set_dim("nO", width)
|
||||
receptive_field = window_size * depth
|
||||
return with_array(model, pad=receptive_field) # type: ignore[arg-type]
|
||||
return with_array(model, pad=receptive_field)
|
||||
|
||||
|
||||
@registry.architectures("spacy.MishWindowEncoder.v2")
|
||||
|
@ -313,9 +313,9 @@ def MishWindowEncoder(
|
|||
expand_window(window_size=window_size),
|
||||
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
|
||||
)
|
||||
model = clone(residual(cnn), depth) # type: ignore[arg-type]
|
||||
model = clone(residual(cnn), depth)
|
||||
model.set_dim("nO", width)
|
||||
return with_array(model) # type: ignore[arg-type]
|
||||
return with_array(model)
|
||||
|
||||
|
||||
@registry.architectures("spacy.TorchBiLSTMEncoder.v1")
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from libc.string cimport memset, memcpy
|
||||
from thinc.backends.cblas cimport CBlas
|
||||
from ..typedefs cimport weight_t, hash_t
|
||||
from ..pipeline._parser_internals._state cimport StateC
|
||||
|
||||
|
@ -38,7 +39,7 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
|
|||
|
||||
cdef void free_activations(const ActivationsC* A) nogil
|
||||
|
||||
cdef void predict_states(ActivationsC* A, StateC** states,
|
||||
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
|
||||
|
|
|
@ -4,11 +4,11 @@ from libc.math cimport exp
|
|||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free, realloc
|
||||
from thinc.backends.linalg cimport Vec, VecVec
|
||||
cimport blis.cy
|
||||
from thinc.backends.cblas cimport saxpy, sgemm
|
||||
|
||||
import numpy
|
||||
import numpy.random
|
||||
from thinc.api import Model, CupyOps, NumpyOps
|
||||
from thinc.api import Model, CupyOps, NumpyOps, get_ops
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
|
@ -91,7 +91,7 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
|||
A._curr_size = n.states
|
||||
|
||||
|
||||
cdef void predict_states(ActivationsC* A, StateC** states,
|
||||
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil:
|
||||
cdef double one = 1.0
|
||||
resize_activations(A, n)
|
||||
|
@ -99,7 +99,7 @@ cdef void predict_states(ActivationsC* A, StateC** states,
|
|||
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
|
||||
sum_state_features(A.unmaxed,
|
||||
sum_state_features(cblas, A.unmaxed,
|
||||
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
|
||||
|
@ -113,12 +113,10 @@ cdef void predict_states(ActivationsC* A, StateC** states,
|
|||
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
|
||||
else:
|
||||
# Compute hidden-to-output
|
||||
blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
|
||||
n.states, n.classes, n.hiddens, one,
|
||||
<float*>A.hiddens, n.hiddens, 1,
|
||||
<float*>W.hidden_weights, n.hiddens, 1,
|
||||
one,
|
||||
<float*>A.scores, n.classes, 1)
|
||||
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
|
||||
1.0, <const float *>A.hiddens, n.hiddens,
|
||||
<const float *>W.hidden_weights, n.hiddens,
|
||||
0.0, A.scores, n.classes)
|
||||
# Add bias
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.scores[i*n.classes],
|
||||
|
@ -135,7 +133,7 @@ cdef void predict_states(ActivationsC* A, StateC** states,
|
|||
A.scores[i*n.classes+j] = min_
|
||||
|
||||
|
||||
cdef void sum_state_features(float* output,
|
||||
cdef void sum_state_features(CBlas cblas, float* output,
|
||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||
cdef int idx, b, f, i
|
||||
cdef const float* feature
|
||||
|
@ -150,9 +148,7 @@ cdef void sum_state_features(float* output,
|
|||
else:
|
||||
idx = token_ids[f] * id_stride + f*O
|
||||
feature = &cached[idx]
|
||||
blis.cy.axpyv(blis.cy.NO_CONJUGATE, O, one,
|
||||
<float*>feature, 1,
|
||||
&output[b*O], 1)
|
||||
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
|
||||
token_ids += F
|
||||
|
||||
|
||||
|
@ -443,9 +439,15 @@ cdef class precompute_hiddens:
|
|||
# - Output from backward on GPU
|
||||
bp_hiddens = self._bp_hiddens
|
||||
|
||||
cdef CBlas cblas
|
||||
if isinstance(self.ops, CupyOps):
|
||||
cblas = NUMPY_OPS.cblas()
|
||||
else:
|
||||
cblas = self.ops.cblas()
|
||||
|
||||
feat_weights = self.get_feat_weights()
|
||||
cdef int[:, ::1] ids = token_ids
|
||||
sum_state_features(<float*>state_vector.data,
|
||||
sum_state_features(cblas, <float*>state_vector.data,
|
||||
feat_weights, &ids[0,0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||
state_vector += self.bias
|
||||
|
|
|
@ -40,17 +40,15 @@ def forward(
|
|||
if not token_count:
|
||||
return _handle_empty(model.ops, model.get_dim("nO"))
|
||||
key_attr: int = model.attrs["key_attr"]
|
||||
keys: Ints1d = model.ops.flatten(
|
||||
cast(Sequence, [doc.to_array(key_attr) for doc in docs])
|
||||
)
|
||||
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
|
||||
vocab: Vocab = docs[0].vocab
|
||||
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
||||
if vocab.vectors.mode == Mode.default:
|
||||
V = cast(Floats2d, model.ops.asarray(vocab.vectors.data))
|
||||
V = model.ops.asarray(vocab.vectors.data)
|
||||
rows = vocab.vectors.find(keys=keys)
|
||||
V = model.ops.as_contig(V[rows])
|
||||
elif vocab.vectors.mode == Mode.floret:
|
||||
V = cast(Floats2d, vocab.vectors.get_batch(keys))
|
||||
V = vocab.vectors.get_batch(keys)
|
||||
V = model.ops.as_contig(V)
|
||||
else:
|
||||
raise RuntimeError(Errors.E896)
|
||||
|
@ -62,9 +60,7 @@ def forward(
|
|||
# Convert negative indices to 0-vectors
|
||||
# TODO: more options for UNK tokens
|
||||
vectors_data[rows < 0] = 0
|
||||
output = Ragged(
|
||||
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
|
||||
)
|
||||
output = Ragged(vectors_data, model.ops.asarray1i([len(doc) for doc in docs]))
|
||||
mask = None
|
||||
if is_train:
|
||||
mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
|
||||
|
@ -77,7 +73,9 @@ def forward(
|
|||
model.inc_grad(
|
||||
"W",
|
||||
model.ops.gemm(
|
||||
cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True
|
||||
cast(Floats2d, d_output.data),
|
||||
cast(Floats2d, model.ops.as_contig(V)),
|
||||
trans1=True,
|
||||
),
|
||||
)
|
||||
return []
|
||||
|
|
|
@ -10,6 +10,7 @@ from ...strings cimport hash_string
|
|||
from ...structs cimport TokenC
|
||||
from ...tokens.doc cimport Doc, set_children_from_heads
|
||||
from ...tokens.token cimport MISSING_DEP
|
||||
from ...training import split_bilu_label
|
||||
from ...training.example cimport Example
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC, ArcC
|
||||
|
@ -687,7 +688,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
return self.c[name_or_id]
|
||||
name = name_or_id
|
||||
if '-' in name:
|
||||
move_str, label_str = name.split('-', 1)
|
||||
move_str, label_str = split_bilu_label(name)
|
||||
label = self.strings[label_str]
|
||||
else:
|
||||
move_str = name
|
||||
|
|
|
@ -13,6 +13,7 @@ from ...typedefs cimport weight_t, attr_t
|
|||
from ...lexeme cimport Lexeme
|
||||
from ...attrs cimport IS_SPACE
|
||||
from ...structs cimport TokenC, SpanC
|
||||
from ...training import split_bilu_label
|
||||
from ...training.example cimport Example
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
@ -182,7 +183,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if name == '-' or name == '' or name is None:
|
||||
return Transition(clas=0, move=MISSING, label=0, score=0)
|
||||
elif '-' in name:
|
||||
move_str, label_str = name.split('-', 1)
|
||||
move_str, label_str = split_bilu_label(name)
|
||||
# Deprecated, hacky way to denote 'not this entity'
|
||||
if label_str.startswith('!'):
|
||||
raise ValueError(Errors.E869.format(label=name))
|
||||
|
|
11
spacy/pipeline/_parser_internals/nonproj.hh
Normal file
11
spacy/pipeline/_parser_internals/nonproj.hh
Normal file
|
@ -0,0 +1,11 @@
|
|||
#ifndef NONPROJ_HH
|
||||
#define NONPROJ_HH
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
void raise_domain_error(std::string const &msg) {
|
||||
throw std::domain_error(msg);
|
||||
}
|
||||
|
||||
#endif // NONPROJ_HH
|
|
@ -0,0 +1,4 @@
|
|||
from libcpp.string cimport string
|
||||
|
||||
cdef extern from "nonproj.hh":
|
||||
cdef void raise_domain_error(const string& msg) nogil except +
|
|
@ -4,10 +4,13 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
|
|||
scheme.
|
||||
"""
|
||||
from copy import copy
|
||||
from cython.operator cimport preincrement as incr, dereference as deref
|
||||
from libc.limits cimport INT_MAX
|
||||
from libc.stdlib cimport abs
|
||||
from libcpp cimport bool
|
||||
from libcpp.string cimport string, to_string
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.unordered_set cimport unordered_set
|
||||
|
||||
from ...tokens.doc cimport Doc, set_children_from_heads
|
||||
|
||||
|
@ -49,7 +52,7 @@ def is_nonproj_arc(tokenid, heads):
|
|||
return _is_nonproj_arc(tokenid, c_heads)
|
||||
|
||||
|
||||
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
|
||||
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil except *:
|
||||
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
||||
# if there is a token k, h < k < d such that h is not
|
||||
# an ancestor of k. Same for h -> d, h > d
|
||||
|
@ -65,25 +68,49 @@ cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
|
|||
else:
|
||||
start, end = (tokenid+1, head)
|
||||
for k in range(start, end):
|
||||
if _has_head_as_ancestor(k, head, heads):
|
||||
continue
|
||||
else: # head not in ancestors: d -> h is non-projective
|
||||
if not _has_head_as_ancestor(k, head, heads):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil:
|
||||
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil except *:
|
||||
ancestor = tokenid
|
||||
cnt = 0
|
||||
while cnt < heads.size():
|
||||
cdef unordered_set[int] seen_tokens
|
||||
seen_tokens.insert(ancestor)
|
||||
while True:
|
||||
# Reached the head or a disconnected node
|
||||
if heads[ancestor] == head or heads[ancestor] < 0:
|
||||
return True
|
||||
# Reached the root
|
||||
if heads[ancestor] == ancestor:
|
||||
return False
|
||||
ancestor = heads[ancestor]
|
||||
cnt += 1
|
||||
result = seen_tokens.insert(ancestor)
|
||||
# Found cycle
|
||||
if not result.second:
|
||||
raise_domain_error(heads_to_string(heads))
|
||||
|
||||
return False
|
||||
|
||||
|
||||
cdef string heads_to_string(const vector[int]& heads) nogil:
|
||||
cdef vector[int].const_iterator citer
|
||||
cdef string cycle_str
|
||||
|
||||
cycle_str.append("Found cycle in dependency graph: [")
|
||||
|
||||
# FIXME: Rewrite using ostringstream when available in Cython.
|
||||
citer = heads.const_begin()
|
||||
while citer != heads.const_end():
|
||||
if citer != heads.const_begin():
|
||||
cycle_str.append(", ")
|
||||
cycle_str.append(to_string(deref(citer)))
|
||||
incr(citer)
|
||||
cycle_str.append("]")
|
||||
|
||||
return cycle_str
|
||||
|
||||
|
||||
def is_nonproj_tree(heads):
|
||||
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||
# a tree is non-projective if at least one arc is non-projective
|
||||
|
@ -176,11 +203,12 @@ def get_smallest_nonproj_arc_slow(heads):
|
|||
return _get_smallest_nonproj_arc(c_heads)
|
||||
|
||||
|
||||
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil:
|
||||
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil except -2:
|
||||
# return the smallest non-proj arc or None
|
||||
# where size is defined as the distance between dep and head
|
||||
# and ties are broken left to right
|
||||
cdef int smallest_size = INT_MAX
|
||||
# -1 means it's already projective.
|
||||
cdef int smallest_np_arc = -1
|
||||
cdef int size
|
||||
cdef int tokenid
|
||||
|
|
|
@ -12,6 +12,7 @@ from ..language import Language
|
|||
from ._parser_internals import nonproj
|
||||
from ._parser_internals.nonproj import DELIMITER
|
||||
from ..scorer import Scorer
|
||||
from ..training import remove_bilu_prefix
|
||||
from ..util import registry
|
||||
|
||||
|
||||
|
@ -314,7 +315,7 @@ cdef class DependencyParser(Parser):
|
|||
# Get the labels from the model by looking at the available moves
|
||||
for move in self.move_names:
|
||||
if "-" in move:
|
||||
label = move.split("-")[1]
|
||||
label = remove_bilu_prefix(move)
|
||||
if DELIMITER in label:
|
||||
label = label.split(DELIMITER)[1]
|
||||
labels.add(label)
|
||||
|
|
|
@ -138,7 +138,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
truths.append(eg_truths)
|
||||
|
||||
d_scores, loss = loss_func(scores, truths) # type: ignore
|
||||
d_scores, loss = loss_func(scores, truths)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
|
||||
|
|
|
@ -56,6 +56,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"overwrite": True,
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
"use_gold_ents": True,
|
||||
"threshold": None,
|
||||
},
|
||||
default_score_weights={
|
||||
"nel_micro_f": 1.0,
|
||||
|
@ -77,6 +78,7 @@ def make_entity_linker(
|
|||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
use_gold_ents: bool,
|
||||
threshold: Optional[float] = None,
|
||||
):
|
||||
"""Construct an EntityLinker component.
|
||||
|
||||
|
@ -91,6 +93,10 @@ def make_entity_linker(
|
|||
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
|
||||
prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
"""
|
||||
|
||||
if not model.attrs.get("include_span_maker", False):
|
||||
|
@ -121,6 +127,7 @@ def make_entity_linker(
|
|||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
use_gold_ents=use_gold_ents,
|
||||
threshold=threshold,
|
||||
)
|
||||
|
||||
|
||||
|
@ -156,6 +163,7 @@ class EntityLinker(TrainablePipe):
|
|||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
use_gold_ents: bool,
|
||||
threshold: Optional[float] = None,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
|
@ -174,9 +182,20 @@ class EntityLinker(TrainablePipe):
|
|||
Scorer.score_links.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
|
||||
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
|
||||
if threshold is not None and not (0 <= threshold <= 1):
|
||||
raise ValueError(
|
||||
Errors.E1043.format(
|
||||
range_start=0,
|
||||
range_end=1,
|
||||
value=threshold,
|
||||
)
|
||||
)
|
||||
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
|
@ -192,6 +211,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.scorer = scorer
|
||||
self.use_gold_ents = use_gold_ents
|
||||
self.threshold = threshold
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
|
@ -355,7 +375,7 @@ class EntityLinker(TrainablePipe):
|
|||
keep_ents.append(eidx)
|
||||
|
||||
eidx += 1
|
||||
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
|
||||
entity_encodings = self.model.ops.asarray2f(entity_encodings, dtype="float32")
|
||||
selected_encodings = sentence_encodings[keep_ents]
|
||||
|
||||
# if there are no matches, short circuit
|
||||
|
@ -368,13 +388,12 @@ class EntityLinker(TrainablePipe):
|
|||
method="get_loss", msg="gold entities do not match up"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
# TODO: fix typing issue here
|
||||
gradients = self.distance.get_grad(selected_encodings, entity_encodings) # type: ignore
|
||||
gradients = self.distance.get_grad(selected_encodings, entity_encodings)
|
||||
# to match the input size, we need to give a zero gradient for items not in the kb
|
||||
out = self.model.ops.alloc2f(*sentence_encodings.shape)
|
||||
out[keep_ents] = gradients
|
||||
|
||||
loss = self.distance.get_loss(selected_encodings, entity_encodings) # type: ignore
|
||||
loss = self.distance.get_loss(selected_encodings, entity_encodings)
|
||||
loss = loss / len(entity_encodings)
|
||||
return float(loss), out
|
||||
|
||||
|
@ -391,18 +410,21 @@ class EntityLinker(TrainablePipe):
|
|||
self.validate_kb()
|
||||
entity_count = 0
|
||||
final_kb_ids: List[str] = []
|
||||
xp = self.model.ops.xp
|
||||
if not docs:
|
||||
return final_kb_ids
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
for i, doc in enumerate(docs):
|
||||
if len(doc) == 0:
|
||||
continue
|
||||
sentences = [s for s in doc.sents]
|
||||
if len(doc) > 0:
|
||||
# Looping through each entity (TODO: rewrite)
|
||||
for ent in doc.ents:
|
||||
sent = ent.sent
|
||||
sent_index = sentences.index(sent)
|
||||
assert sent_index >= 0
|
||||
# Looping through each entity (TODO: rewrite)
|
||||
for ent in doc.ents:
|
||||
sent_index = sentences.index(ent.sent)
|
||||
assert sent_index >= 0
|
||||
|
||||
if self.incl_context:
|
||||
# get n_neighbour sentences, clipped to the length of the document
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
||||
|
@ -410,55 +432,53 @@ class EntityLinker(TrainablePipe):
|
|||
end_token = sentences[end_sentence].end
|
||||
sent_doc = doc[start_token:end_token].as_doc()
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
xp = self.model.ops.xp
|
||||
if self.incl_context:
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
sentence_norm = xp.linalg.norm(sentence_encoding_t)
|
||||
entity_count += 1
|
||||
if ent.label_ in self.labels_discard:
|
||||
# ignoring this entity - setting to NIL
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
sentence_norm = xp.linalg.norm(sentence_encoding_t)
|
||||
entity_count += 1
|
||||
if ent.label_ in self.labels_discard:
|
||||
# ignoring this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
else:
|
||||
candidates = list(self.get_candidates(self.kb, ent))
|
||||
if not candidates:
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
elif len(candidates) == 1 and self.threshold is None:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
else:
|
||||
candidates = list(self.get_candidates(self.kb, ent))
|
||||
if not candidates:
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
elif len(candidates) == 1:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
# TODO: thresholding
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
||||
if not self.incl_prior:
|
||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||
scores = prior_probs
|
||||
# add in similarity from the context
|
||||
if self.incl_context:
|
||||
entity_encodings = xp.asarray(
|
||||
[c.entity_vector for c in candidates]
|
||||
)
|
||||
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
|
||||
if len(entity_encodings) != len(prior_probs):
|
||||
raise RuntimeError(
|
||||
Errors.E147.format(
|
||||
method="predict",
|
||||
msg="vectors not of equal length",
|
||||
)
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
||||
if not self.incl_prior:
|
||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||
scores = prior_probs
|
||||
# add in similarity from the context
|
||||
if self.incl_context:
|
||||
entity_encodings = xp.asarray(
|
||||
[c.entity_vector for c in candidates]
|
||||
)
|
||||
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
|
||||
if len(entity_encodings) != len(prior_probs):
|
||||
raise RuntimeError(
|
||||
Errors.E147.format(
|
||||
method="predict",
|
||||
msg="vectors not of equal length",
|
||||
)
|
||||
# cosine similarity
|
||||
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
|
||||
sentence_norm * entity_norm
|
||||
)
|
||||
if sims.shape != prior_probs.shape:
|
||||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs * sims)
|
||||
# TODO: thresholding
|
||||
best_index = scores.argmax().item()
|
||||
best_candidate = candidates[best_index]
|
||||
final_kb_ids.append(best_candidate.entity_)
|
||||
# cosine similarity
|
||||
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
|
||||
sentence_norm * entity_norm
|
||||
)
|
||||
if sims.shape != prior_probs.shape:
|
||||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs * sims)
|
||||
final_kb_ids.append(
|
||||
candidates[scores.argmax().item()].entity_
|
||||
if self.threshold is None or scores.max() >= self.threshold
|
||||
else EntityLinker.NIL
|
||||
)
|
||||
if not (len(final_kb_ids) == entity_count):
|
||||
err = Errors.E147.format(
|
||||
method="predict", msg="result variables not of equal length"
|
||||
|
|
|
@ -159,10 +159,8 @@ class EntityRuler(Pipe):
|
|||
self._require_patterns()
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="\\[W036")
|
||||
matches = cast(
|
||||
List[Tuple[int, int, int]],
|
||||
list(self.matcher(doc)) + list(self.phrase_matcher(doc)),
|
||||
)
|
||||
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
||||
|
||||
final_matches = set(
|
||||
[(m_id, start, end) for m_id, start, end in matches if start != end]
|
||||
)
|
||||
|
|
|
@ -7,7 +7,7 @@ from pathlib import Path
|
|||
from itertools import islice
|
||||
import srsly
|
||||
import random
|
||||
from thinc.api import CosineDistance, Model, Optimizer, Config
|
||||
from thinc.api import CosineDistance, Model, Optimizer
|
||||
from thinc.api import set_dropout_rate
|
||||
import warnings
|
||||
|
||||
|
@ -20,7 +20,7 @@ from ...language import Language
|
|||
from ...vocab import Vocab
|
||||
from ...training import Example, validate_examples, validate_get_examples
|
||||
from ...errors import Errors, Warnings
|
||||
from ...util import SimpleFrozenList, registry
|
||||
from ...util import SimpleFrozenList
|
||||
from ... import util
|
||||
from ...scorer import Scorer
|
||||
|
||||
|
@ -70,7 +70,6 @@ class EntityLinker_v1(TrainablePipe):
|
|||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_links.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
|
@ -213,15 +212,14 @@ class EntityLinker_v1(TrainablePipe):
|
|||
if kb_id:
|
||||
entity_encoding = self.kb.get_vector(kb_id)
|
||||
entity_encodings.append(entity_encoding)
|
||||
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
|
||||
entity_encodings = self.model.ops.asarray2f(entity_encodings)
|
||||
if sentence_encodings.shape != entity_encodings.shape:
|
||||
err = Errors.E147.format(
|
||||
method="get_loss", msg="gold entities do not match up"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
# TODO: fix typing issue here
|
||||
gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore
|
||||
loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore
|
||||
gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
|
||||
loss = self.distance.get_loss(sentence_encodings, entity_encodings)
|
||||
loss = loss / len(entity_encodings)
|
||||
return float(loss), gradients
|
||||
|
||||
|
@ -273,7 +271,6 @@ class EntityLinker_v1(TrainablePipe):
|
|||
final_kb_ids.append(self.NIL)
|
||||
elif len(candidates) == 1:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
# TODO: thresholding
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
|
@ -302,7 +299,6 @@ class EntityLinker_v1(TrainablePipe):
|
|||
if sims.shape != prior_probs.shape:
|
||||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs * sims)
|
||||
# TODO: thresholding
|
||||
best_index = scores.argmax().item()
|
||||
best_candidate = candidates[best_index]
|
||||
final_kb_ids.append(best_candidate.entity_)
|
||||
|
|
|
@ -6,10 +6,10 @@ from thinc.api import Model, Config
|
|||
from ._parser_internals.transition_system import TransitionSystem
|
||||
from .transition_parser cimport Parser
|
||||
from ._parser_internals.ner cimport BiluoPushDown
|
||||
|
||||
from ..language import Language
|
||||
from ..scorer import get_ner_prf, PRFScore
|
||||
from ..util import registry
|
||||
from ..training import remove_bilu_prefix
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
@ -242,7 +242,7 @@ cdef class EntityRecognizer(Parser):
|
|||
def labels(self):
|
||||
# Get the labels from the model by looking at the available moves, e.g.
|
||||
# B-PERSON, I-PERSON, L-PERSON, U-PERSON
|
||||
labels = set(move.split("-")[1] for move in self.move_names
|
||||
labels = set(remove_bilu_prefix(move) for move in self.move_names
|
||||
if move[0] in ("B", "I", "L", "U"))
|
||||
return tuple(sorted(labels))
|
||||
|
||||
|
|
|
@ -75,7 +75,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
|
|||
if spans:
|
||||
assert spans[-1].ndim == 2, spans[-1].shape
|
||||
lengths.append(length)
|
||||
lengths_array = cast(Ints1d, ops.asarray(lengths, dtype="i"))
|
||||
lengths_array = ops.asarray1i(lengths)
|
||||
if len(spans) > 0:
|
||||
output = Ragged(ops.xp.vstack(spans), lengths_array)
|
||||
else:
|
||||
|
|
|
@ -192,7 +192,7 @@ class TextCategorizer(TrainablePipe):
|
|||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
tensors = [doc.tensor for doc in docs]
|
||||
xp = get_array_module(tensors)
|
||||
xp = self.model.ops.xp
|
||||
scores = xp.zeros((len(list(docs)), len(self.labels)))
|
||||
return scores
|
||||
scores = self.model.predict(docs)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.backends.cblas cimport CBlas
|
||||
|
||||
from ..vocab cimport Vocab
|
||||
from .trainable_pipe cimport TrainablePipe
|
||||
|
@ -12,7 +13,7 @@ cdef class Parser(TrainablePipe):
|
|||
cdef readonly TransitionSystem moves
|
||||
cdef public object _multitasks
|
||||
|
||||
cdef void _parseC(self, StateC** states,
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
|
|
|
@ -9,7 +9,7 @@ from libc.stdlib cimport calloc, free
|
|||
import random
|
||||
|
||||
import srsly
|
||||
from thinc.api import set_dropout_rate, CupyOps
|
||||
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
|
||||
from thinc.extra.search cimport Beam
|
||||
import numpy.random
|
||||
import numpy
|
||||
|
@ -30,6 +30,9 @@ from ..errors import Errors, Warnings
|
|||
from .. import util
|
||||
|
||||
|
||||
NUMPY_OPS = NumpyOps()
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
"""
|
||||
Base class of the DependencyParser and EntityRecognizer.
|
||||
|
@ -259,6 +262,12 @@ cdef class Parser(TrainablePipe):
|
|||
def greedy_parse(self, docs, drop=0.):
|
||||
cdef vector[StateC*] states
|
||||
cdef StateClass state
|
||||
ops = self.model.ops
|
||||
cdef CBlas cblas
|
||||
if isinstance(ops, CupyOps):
|
||||
cblas = NUMPY_OPS.cblas()
|
||||
else:
|
||||
cblas = ops.cblas()
|
||||
self._ensure_labels_are_added(docs)
|
||||
set_dropout_rate(self.model, drop)
|
||||
batch = self.moves.init_batch(docs)
|
||||
|
@ -269,8 +278,7 @@ cdef class Parser(TrainablePipe):
|
|||
states.push_back(state.c)
|
||||
sizes = get_c_sizes(model, states.size())
|
||||
with nogil:
|
||||
self._parseC(&states[0],
|
||||
weights, sizes)
|
||||
self._parseC(cblas, &states[0], weights, sizes)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return batch
|
||||
|
@ -297,14 +305,13 @@ cdef class Parser(TrainablePipe):
|
|||
del model
|
||||
return list(batch)
|
||||
|
||||
cdef void _parseC(self, StateC** states,
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil:
|
||||
cdef int i, j
|
||||
cdef vector[StateC*] unfinished
|
||||
cdef ActivationsC activations = alloc_activations(sizes)
|
||||
while sizes.states >= 1:
|
||||
predict_states(&activations,
|
||||
states, &weights, sizes)
|
||||
predict_states(cblas, &activations, states, &weights, sizes)
|
||||
# Validate actions, argmax, take action.
|
||||
self.c_transition_batch(states,
|
||||
activations.scores, sizes.classes, sizes.states)
|
||||
|
|
|
@ -3,12 +3,13 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
|
|||
from .compat import Literal
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
|
||||
from pydantic.main import ModelMetaclass
|
||||
from thinc.api import Optimizer, ConfigValidationError, Model
|
||||
from thinc.config import Promise
|
||||
from collections import defaultdict
|
||||
import inspect
|
||||
import re
|
||||
|
||||
from .attrs import NAMES
|
||||
from .lookups import Lookups
|
||||
|
@ -104,7 +105,7 @@ def get_arg_model(
|
|||
sig_args[param.name] = (annotation, default)
|
||||
is_strict = strict and not has_variable
|
||||
sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment]
|
||||
return create_model(name, **sig_args) # type: ignore[arg-type, return-value]
|
||||
return create_model(name, **sig_args) # type: ignore[call-overload, arg-type, return-value]
|
||||
|
||||
|
||||
def validate_init_settings(
|
||||
|
@ -198,13 +199,18 @@ class TokenPatternNumber(BaseModel):
|
|||
return v
|
||||
|
||||
|
||||
class TokenPatternOperator(str, Enum):
|
||||
class TokenPatternOperatorSimple(str, Enum):
|
||||
plus: StrictStr = StrictStr("+")
|
||||
start: StrictStr = StrictStr("*")
|
||||
star: StrictStr = StrictStr("*")
|
||||
question: StrictStr = StrictStr("?")
|
||||
exclamation: StrictStr = StrictStr("!")
|
||||
|
||||
|
||||
class TokenPatternOperatorMinMax(ConstrainedStr):
|
||||
regex = re.compile("^({\d+}|{\d+,\d*}|{\d*,\d+})$")
|
||||
|
||||
|
||||
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
|
||||
StringValue = Union[TokenPatternString, StrictStr]
|
||||
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
|
||||
UnderscoreValue = Union[
|
||||
|
|
|
@ -26,4 +26,4 @@ cdef class StringStore:
|
|||
cdef public PreshMap _map
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
|
||||
|
|
|
@ -14,6 +14,13 @@ from .symbols import NAMES as SYMBOLS_BY_INT
|
|||
from .errors import Errors
|
||||
from . import util
|
||||
|
||||
# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
|
||||
cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
|
||||
try:
|
||||
out_hash[0] = key
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def get_string_id(key):
|
||||
"""Get a string ID, handling the reserved symbols correctly. If the key is
|
||||
|
@ -22,15 +29,27 @@ def get_string_id(key):
|
|||
This function optimises for convenience over performance, so shouldn't be
|
||||
used in tight loops.
|
||||
"""
|
||||
if not isinstance(key, str):
|
||||
return key
|
||||
elif key in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[key]
|
||||
elif not key:
|
||||
return 0
|
||||
cdef hash_t str_hash
|
||||
if isinstance(key, str):
|
||||
if len(key) == 0:
|
||||
return 0
|
||||
|
||||
symbol = SYMBOLS_BY_STR.get(key, None)
|
||||
if symbol is not None:
|
||||
return symbol
|
||||
else:
|
||||
chars = key.encode("utf8")
|
||||
return hash_utf8(chars, len(chars))
|
||||
elif _try_coerce_to_hash(key, &str_hash):
|
||||
# Coerce the integral key to the expected primitive hash type.
|
||||
# This ensures that custom/overloaded "primitive" data types
|
||||
# such as those implemented by numpy are not inadvertently used
|
||||
# downstream (as these are internally implemented as custom PyObjects
|
||||
# whose comparison operators can incur a significant overhead).
|
||||
return str_hash
|
||||
else:
|
||||
chars = key.encode("utf8")
|
||||
return hash_utf8(chars, len(chars))
|
||||
# TODO: Raise an error instead
|
||||
return key
|
||||
|
||||
|
||||
cpdef hash_t hash_string(str string) except 0:
|
||||
|
@ -110,28 +129,36 @@ cdef class StringStore:
|
|||
string_or_id (bytes, str or uint64): The value to encode.
|
||||
Returns (str / uint64): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, str) and len(string_or_id) == 0:
|
||||
return 0
|
||||
elif string_or_id == 0:
|
||||
return ""
|
||||
elif string_or_id in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string_or_id]
|
||||
cdef hash_t key
|
||||
cdef hash_t str_hash
|
||||
cdef Utf8Str* utf8str = NULL
|
||||
|
||||
if isinstance(string_or_id, str):
|
||||
key = hash_string(string_or_id)
|
||||
return key
|
||||
elif isinstance(string_or_id, bytes):
|
||||
key = hash_utf8(string_or_id, len(string_or_id))
|
||||
return key
|
||||
elif string_or_id < len(SYMBOLS_BY_INT):
|
||||
return SYMBOLS_BY_INT[string_or_id]
|
||||
else:
|
||||
key = string_or_id
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
if utf8str is NULL:
|
||||
raise KeyError(Errors.E018.format(hash_value=string_or_id))
|
||||
if len(string_or_id) == 0:
|
||||
return 0
|
||||
|
||||
# Return early if the string is found in the symbols LUT.
|
||||
symbol = SYMBOLS_BY_STR.get(string_or_id, None)
|
||||
if symbol is not None:
|
||||
return symbol
|
||||
else:
|
||||
return decode_Utf8Str(utf8str)
|
||||
return hash_string(string_or_id)
|
||||
elif isinstance(string_or_id, bytes):
|
||||
return hash_utf8(string_or_id, len(string_or_id))
|
||||
elif _try_coerce_to_hash(string_or_id, &str_hash):
|
||||
if str_hash == 0:
|
||||
return ""
|
||||
elif str_hash < len(SYMBOLS_BY_INT):
|
||||
return SYMBOLS_BY_INT[str_hash]
|
||||
else:
|
||||
utf8str = <Utf8Str*>self._map.get(str_hash)
|
||||
else:
|
||||
# TODO: Raise an error instead
|
||||
utf8str = <Utf8Str*>self._map.get(string_or_id)
|
||||
|
||||
if utf8str is NULL:
|
||||
raise KeyError(Errors.E018.format(hash_value=string_or_id))
|
||||
else:
|
||||
return decode_Utf8Str(utf8str)
|
||||
|
||||
def as_int(self, key):
|
||||
"""If key is an int, return it; otherwise, get the int value."""
|
||||
|
@ -153,19 +180,22 @@ cdef class StringStore:
|
|||
string (str): The string to add.
|
||||
RETURNS (uint64): The string's hash value.
|
||||
"""
|
||||
cdef hash_t str_hash
|
||||
if isinstance(string, str):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
key = hash_string(string)
|
||||
self.intern_unicode(string)
|
||||
|
||||
string = string.encode("utf8")
|
||||
str_hash = hash_utf8(string, len(string))
|
||||
self._intern_utf8(string, len(string), &str_hash)
|
||||
elif isinstance(string, bytes):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
key = hash_utf8(string, len(string))
|
||||
self._intern_utf8(string, len(string))
|
||||
str_hash = hash_utf8(string, len(string))
|
||||
self._intern_utf8(string, len(string), &str_hash)
|
||||
else:
|
||||
raise TypeError(Errors.E017.format(value_type=type(string)))
|
||||
return key
|
||||
return str_hash
|
||||
|
||||
def __len__(self):
|
||||
"""The number of strings in the store.
|
||||
|
@ -174,30 +204,29 @@ cdef class StringStore:
|
|||
"""
|
||||
return self.keys.size()
|
||||
|
||||
def __contains__(self, string not None):
|
||||
"""Check whether a string is in the store.
|
||||
def __contains__(self, string_or_id not None):
|
||||
"""Check whether a string or ID is in the store.
|
||||
|
||||
string (str): The string to check.
|
||||
string_or_id (str or int): The string to check.
|
||||
RETURNS (bool): Whether the store contains the string.
|
||||
"""
|
||||
cdef hash_t key
|
||||
if isinstance(string, int) or isinstance(string, long):
|
||||
if string == 0:
|
||||
cdef hash_t str_hash
|
||||
if isinstance(string_or_id, str):
|
||||
if len(string_or_id) == 0:
|
||||
return True
|
||||
key = string
|
||||
elif len(string) == 0:
|
||||
return True
|
||||
elif string in SYMBOLS_BY_STR:
|
||||
return True
|
||||
elif isinstance(string, str):
|
||||
key = hash_string(string)
|
||||
elif string_or_id in SYMBOLS_BY_STR:
|
||||
return True
|
||||
str_hash = hash_string(string_or_id)
|
||||
elif _try_coerce_to_hash(string_or_id, &str_hash):
|
||||
pass
|
||||
else:
|
||||
string = string.encode("utf8")
|
||||
key = hash_utf8(string, len(string))
|
||||
if key < len(SYMBOLS_BY_INT):
|
||||
# TODO: Raise an error instead
|
||||
return self._map.get(string_or_id) is not NULL
|
||||
|
||||
if str_hash < len(SYMBOLS_BY_INT):
|
||||
return True
|
||||
else:
|
||||
return self._map.get(key) is not NULL
|
||||
return self._map.get(str_hash) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate over the strings in the store, in order.
|
||||
|
@ -272,13 +301,13 @@ cdef class StringStore:
|
|||
cdef const Utf8Str* intern_unicode(self, str py_string):
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
cdef bytes byte_string = py_string.encode("utf8")
|
||||
return self._intern_utf8(byte_string, len(byte_string))
|
||||
return self._intern_utf8(byte_string, len(byte_string), NULL)
|
||||
|
||||
@cython.final
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
|
||||
# TODO: This function's API/behaviour is an unholy mess...
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
cdef hash_t key = hash_utf8(utf8_string, length)
|
||||
cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
|
||||
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
|
||||
if value is not NULL:
|
||||
return value
|
||||
|
|
|
@ -1,5 +1,11 @@
|
|||
import pytest
|
||||
from spacy.util import get_lang_class
|
||||
from hypothesis import settings
|
||||
|
||||
# Functionally disable deadline settings for tests
|
||||
# to prevent spurious test failures in CI builds.
|
||||
settings.register_profile("no_deadlines", deadline=2 * 60 * 1000) # in ms
|
||||
settings.load_profile("no_deadlines")
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
|
|
8
spacy/tests/lang/bg/test_tokenizer.py
Normal file
8
spacy/tests/lang/bg/test_tokenizer.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
import pytest
|
||||
|
||||
|
||||
def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer):
|
||||
text = "Ня̀маше яйца̀. Ня̀маше яйца̀."
|
||||
tokens = bg_tokenizer(text)
|
||||
assert tokens[1].text == "яйца̀"
|
||||
assert tokens[2].text == "."
|
|
@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
|
|||
tok = en_tokenizer(word)[1]
|
||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||
assert tok.is_stop
|
||||
|
||||
|
||||
@pytest.mark.issue(10699)
|
||||
@pytest.mark.parametrize("text", ["theses", "thisre"])
|
||||
def test_issue10699(en_tokenizer, text):
|
||||
"""Test that 'theses' and 'thisre' are excluded from the contractions
|
||||
generated by the English tokenizer exceptions."""
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from string import punctuation
|
||||
import pytest
|
||||
|
||||
|
||||
|
@ -122,3 +123,36 @@ def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
|
|||
text = "(Раз, два, три, проверка)."
|
||||
tokens = ru_tokenizer(text)
|
||||
assert tokens[len(tokens) - 1].text == "."
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text",
|
||||
[
|
||||
"рекоменду́я подда́ть жару́. Самого́ Баргамота",
|
||||
"РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́. САМОГО́ БАРГАМОТА",
|
||||
"рекоменду̍я подда̍ть жару̍.Самого̍ Баргамота",
|
||||
"рекоменду̍я подда̍ть жару̍.'Самого̍ Баргамота",
|
||||
"рекоменду̍я подда̍ть жару̍,самого̍ Баргамота",
|
||||
"рекоменду̍я подда̍ть жару̍:самого̍ Баргамота",
|
||||
"рекоменду̍я подда̍ть жару̍. самого̍ Баргамота",
|
||||
"рекоменду̍я подда̍ть жару̍, самого̍ Баргамота",
|
||||
"рекоменду̍я подда̍ть жару̍: самого̍ Баргамота",
|
||||
"рекоменду̍я подда̍ть жару̍-самого̍ Баргамота",
|
||||
],
|
||||
)
|
||||
def test_ru_tokenizer_handles_final_diacritics(ru_tokenizer, text):
|
||||
tokens = ru_tokenizer(text)
|
||||
assert tokens[2].text in ("жару́", "ЖАРУ́", "жару̍")
|
||||
assert tokens[3].text in punctuation
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text",
|
||||
[
|
||||
"РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́.САМОГО́ БАРГАМОТА",
|
||||
"рекоменду̍я подда̍ть жару́.самого́ Баргамота",
|
||||
],
|
||||
)
|
||||
def test_ru_tokenizer_handles_final_diacritic_and_period(ru_tokenizer, text):
|
||||
tokens = ru_tokenizer(text)
|
||||
assert tokens[2].text.lower() == "жару́.самого́"
|
||||
|
|
|
@ -140,3 +140,10 @@ def test_uk_tokenizer_splits_bracket_period(uk_tokenizer):
|
|||
text = "(Раз, два, три, проверка)."
|
||||
tokens = uk_tokenizer(text)
|
||||
assert tokens[len(tokens) - 1].text == "."
|
||||
|
||||
|
||||
def test_uk_tokenizer_handles_final_diacritics(uk_tokenizer):
|
||||
text = "Хлібі́в не було́. Хлібі́в не було́."
|
||||
tokens = uk_tokenizer(text)
|
||||
assert tokens[2].text == "було́"
|
||||
assert tokens[3].text == "."
|
||||
|
|
|
@ -476,6 +476,17 @@ def test_matcher_extension_set_membership(en_vocab):
|
|||
assert len(matches) == 0
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="IN predicate must handle sequence values in extensions")
|
||||
def test_matcher_extension_in_set_predicate(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
Token.set_extension("ext", default=[])
|
||||
pattern = [{"_": {"ext": {"IN": ["A", "C"]}}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0]._.ext = ["A", "B"]
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
|
||||
def test_matcher_basic_check(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
# Potential mistake: pass in pattern instead of list of patterns
|
||||
|
@ -669,3 +680,38 @@ def test_matcher_ent_iob_key(en_vocab):
|
|||
assert matches[0] == "Maria"
|
||||
assert matches[1] == "Maria Esperanza"
|
||||
assert matches[2] == "Esperanza"
|
||||
|
||||
|
||||
def test_matcher_min_max_operator(en_vocab):
|
||||
# Exactly n matches {n}
|
||||
doc = Doc(
|
||||
en_vocab,
|
||||
words=["foo", "bar", "foo", "foo", "bar", "foo", "foo", "foo", "bar", "bar"],
|
||||
)
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"ORTH": "foo", "OP": "{3}"}]
|
||||
matcher.add("TEST", [pattern])
|
||||
|
||||
matches1 = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||
assert len(matches1) == 1
|
||||
|
||||
# At least n matches {n,}
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"ORTH": "foo", "OP": "{2,}"}]
|
||||
matcher.add("TEST", [pattern])
|
||||
matches2 = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||
assert len(matches2) == 4
|
||||
|
||||
# At most m matches {,m}
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"ORTH": "foo", "OP": "{,2}"}]
|
||||
matcher.add("TEST", [pattern])
|
||||
matches3 = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||
assert len(matches3) == 9
|
||||
|
||||
# At least n matches and most m matches {n,m}
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"ORTH": "foo", "OP": "{2,3}"}]
|
||||
matcher.add("TEST", [pattern])
|
||||
matches4 = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||
assert len(matches4) == 4
|
||||
|
|
|
@ -699,6 +699,10 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
|||
("aaaa", "a a a a a?", [0, 1, 2, 3]),
|
||||
("aaab", "a+ a b", [0, 0, 1, 2]),
|
||||
("aaab", "a+ a+ b", [0, 0, 1, 2]),
|
||||
("aaab", "a{2,} b", [0, 0, 0, 1]),
|
||||
("aaab", "a{,3} b", [0, 0, 0, 1]),
|
||||
("aaab", "a{2} b", [0, 0, 1]),
|
||||
("aaab", "a{2,3} b", [0, 0, 0, 1]),
|
||||
]
|
||||
for string, pattern_str, result in cases:
|
||||
matcher = Matcher(en_vocab)
|
||||
|
@ -711,6 +715,8 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
|||
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||
elif part.endswith("?"):
|
||||
pattern.append({"ORTH": part[0], "OP": "?"})
|
||||
elif part.endswith("}"):
|
||||
pattern.append({"ORTH": part[0], "OP": part[1:]})
|
||||
else:
|
||||
pattern.append({"ORTH": part})
|
||||
matcher.add("PATTERN", [pattern], greedy="LONGEST")
|
||||
|
@ -722,7 +728,7 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
|||
assert expected == result, (string, pattern_str, s, e, n_matches)
|
||||
|
||||
|
||||
def test_matcher_with_alignments_nongreedy(en_vocab):
|
||||
def test_matcher_with_alignments_non_greedy(en_vocab):
|
||||
cases = [
|
||||
(0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
|
||||
(1, "baab", "b a* b", [[0, 1, 1, 2]]),
|
||||
|
@ -752,6 +758,10 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
|
|||
(15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
|
||||
(16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
|
||||
(17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
|
||||
(18, "aaab", "a{2,} b", [[0, 0, 1], [0, 0, 0, 1]]),
|
||||
(19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
|
||||
(20, "aaab", "a{2} b", [[0, 0, 1]]),
|
||||
(21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
|
||||
]
|
||||
for case_id, string, pattern_str, results in cases:
|
||||
matcher = Matcher(en_vocab)
|
||||
|
@ -764,6 +774,8 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
|
|||
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||
elif part.endswith("?"):
|
||||
pattern.append({"ORTH": part[0], "OP": "?"})
|
||||
elif part.endswith("}"):
|
||||
pattern.append({"ORTH": part[0], "OP": part[1:]})
|
||||
else:
|
||||
pattern.append({"ORTH": part})
|
||||
|
||||
|
|
|
@ -14,6 +14,14 @@ TEST_PATTERNS = [
|
|||
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
||||
([{"ENT_IOB": "foo"}], 1, 1),
|
||||
([1, 2, 3], 3, 1),
|
||||
([{"TEXT": "foo", "OP": "{,}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{,4}4"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{a,3}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{a}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{,a}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{1,2,3}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{1, 3}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{-2}"}], 1, 1),
|
||||
# Bad patterns flagged outside of Matcher
|
||||
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
|
||||
# Bad patterns not flagged with minimal checks
|
||||
|
@ -38,6 +46,7 @@ TEST_PATTERNS = [
|
|||
([{"SENT_START": True}], 0, 0),
|
||||
([{"ENT_ID": "STRING"}], 0, 0),
|
||||
([{"ENT_KB_ID": "STRING"}], 0, 0),
|
||||
([{"TEXT": "ha", "OP": "{3}"}], 0, 0),
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ from spacy.lang.it import Italian
|
|||
from spacy.language import Language
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
||||
from spacy.training import Example, iob_to_biluo
|
||||
from spacy.training import Example, iob_to_biluo, split_bilu_label
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy.vocab import Vocab
|
||||
import logging
|
||||
|
@ -110,6 +110,9 @@ def test_issue2385():
|
|||
# maintain support for iob2 format
|
||||
tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
|
||||
assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
|
||||
# ensure it works with hyphens in the name
|
||||
tags4 = ("B-MULTI-PERSON", "I-MULTI-PERSON", "B-MULTI-PERSON")
|
||||
assert iob_to_biluo(tags4) == ["B-MULTI-PERSON", "L-MULTI-PERSON", "U-MULTI-PERSON"]
|
||||
|
||||
|
||||
@pytest.mark.issue(2800)
|
||||
|
@ -154,6 +157,24 @@ def test_issue3209():
|
|||
assert ner2.move_names == move_names
|
||||
|
||||
|
||||
def test_labels_from_BILUO():
|
||||
"""Test that labels are inferred correctly when there's a - in label."""
|
||||
nlp = English()
|
||||
ner = nlp.add_pipe("ner")
|
||||
ner.add_label("LARGE-ANIMAL")
|
||||
nlp.initialize()
|
||||
move_names = [
|
||||
"O",
|
||||
"B-LARGE-ANIMAL",
|
||||
"I-LARGE-ANIMAL",
|
||||
"L-LARGE-ANIMAL",
|
||||
"U-LARGE-ANIMAL",
|
||||
]
|
||||
labels = {"LARGE-ANIMAL"}
|
||||
assert ner.move_names == move_names
|
||||
assert set(ner.labels) == labels
|
||||
|
||||
|
||||
@pytest.mark.issue(4267)
|
||||
def test_issue4267():
|
||||
"""Test that running an entity_ruler after ner gives consistent results"""
|
||||
|
@ -298,7 +319,7 @@ def test_oracle_moves_missing_B(en_vocab):
|
|||
elif tag == "O":
|
||||
moves.add_action(move_types.index("O"), "")
|
||||
else:
|
||||
action, label = tag.split("-")
|
||||
action, label = split_bilu_label(tag)
|
||||
moves.add_action(move_types.index("B"), label)
|
||||
moves.add_action(move_types.index("I"), label)
|
||||
moves.add_action(move_types.index("L"), label)
|
||||
|
@ -324,7 +345,7 @@ def test_oracle_moves_whitespace(en_vocab):
|
|||
elif tag == "O":
|
||||
moves.add_action(move_types.index("O"), "")
|
||||
else:
|
||||
action, label = tag.split("-")
|
||||
action, label = split_bilu_label(tag)
|
||||
moves.add_action(move_types.index(action), label)
|
||||
moves.get_oracle_sequence(example)
|
||||
|
||||
|
|
|
@ -49,7 +49,9 @@ def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree
|
|||
assert contains_cycle(multirooted_tree) is None
|
||||
|
||||
|
||||
def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree):
|
||||
def test_parser_is_nonproj_arc(
|
||||
cyclic_tree, nonproj_tree, partial_tree, multirooted_tree
|
||||
):
|
||||
assert is_nonproj_arc(0, nonproj_tree) is False
|
||||
assert is_nonproj_arc(1, nonproj_tree) is False
|
||||
assert is_nonproj_arc(2, nonproj_tree) is False
|
||||
|
@ -62,15 +64,23 @@ def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree):
|
|||
assert is_nonproj_arc(7, partial_tree) is False
|
||||
assert is_nonproj_arc(17, multirooted_tree) is False
|
||||
assert is_nonproj_arc(16, multirooted_tree) is True
|
||||
with pytest.raises(
|
||||
ValueError, match=r"Found cycle in dependency graph: \[1, 2, 2, 4, 5, 3, 2\]"
|
||||
):
|
||||
is_nonproj_arc(6, cyclic_tree)
|
||||
|
||||
|
||||
def test_parser_is_nonproj_tree(
|
||||
proj_tree, nonproj_tree, partial_tree, multirooted_tree
|
||||
proj_tree, cyclic_tree, nonproj_tree, partial_tree, multirooted_tree
|
||||
):
|
||||
assert is_nonproj_tree(proj_tree) is False
|
||||
assert is_nonproj_tree(nonproj_tree) is True
|
||||
assert is_nonproj_tree(partial_tree) is False
|
||||
assert is_nonproj_tree(multirooted_tree) is True
|
||||
with pytest.raises(
|
||||
ValueError, match=r"Found cycle in dependency graph: \[1, 2, 2, 4, 5, 3, 2\]"
|
||||
):
|
||||
is_nonproj_tree(cyclic_tree)
|
||||
|
||||
|
||||
def test_parser_pseudoprojectivity(en_vocab):
|
||||
|
@ -84,8 +94,10 @@ def test_parser_pseudoprojectivity(en_vocab):
|
|||
tree = [1, 2, 2]
|
||||
nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
|
||||
nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
|
||||
cyclic_tree = [1, 2, 2, 4, 5, 3, 2]
|
||||
labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
|
||||
labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
|
||||
cyclic_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "punct"]
|
||||
# fmt: on
|
||||
assert nonproj.decompose("X||Y") == ("X", "Y")
|
||||
assert nonproj.decompose("X") == ("X", "")
|
||||
|
@ -97,6 +109,8 @@ def test_parser_pseudoprojectivity(en_vocab):
|
|||
assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10
|
||||
# fmt: off
|
||||
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
|
||||
with pytest.raises(ValueError, match=r'Found cycle in dependency graph: \[1, 2, 2, 4, 5, 3, 2\]'):
|
||||
nonproj.projectivize(cyclic_tree, cyclic_labels)
|
||||
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
|
||||
assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
|
||||
"nsubj", "acl||dobj", "punct"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Callable, Iterable
|
||||
from typing import Callable, Iterable, Dict, Any
|
||||
|
||||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
|
@ -207,7 +207,7 @@ def test_no_entities():
|
|||
nlp.add_pipe("sentencizer", first=True)
|
||||
|
||||
# this will run the pipeline on the examples and shouldn't crash
|
||||
results = nlp.evaluate(train_examples)
|
||||
nlp.evaluate(train_examples)
|
||||
|
||||
|
||||
def test_partial_links():
|
||||
|
@ -1063,7 +1063,7 @@ def test_no_gold_ents(patterns):
|
|||
"entity_linker", config={"use_gold_ents": False}, last=True
|
||||
)
|
||||
entity_linker.set_kb(create_kb)
|
||||
assert entity_linker.use_gold_ents == False
|
||||
assert entity_linker.use_gold_ents is False
|
||||
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
for i in range(2):
|
||||
|
@ -1074,7 +1074,7 @@ def test_no_gold_ents(patterns):
|
|||
nlp.add_pipe("sentencizer", first=True)
|
||||
|
||||
# this will run the pipeline on the examples and shouldn't crash
|
||||
results = nlp.evaluate(train_examples)
|
||||
nlp.evaluate(train_examples)
|
||||
|
||||
|
||||
@pytest.mark.issue(9575)
|
||||
|
@ -1114,4 +1114,61 @@ def test_tokenization_mismatch():
|
|||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
|
||||
nlp.add_pipe("sentencizer", first=True)
|
||||
results = nlp.evaluate(train_examples)
|
||||
nlp.evaluate(train_examples)
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.parametrize(
|
||||
"meet_threshold,config",
|
||||
[
|
||||
(False, {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
|
||||
(True, {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
|
||||
],
|
||||
)
|
||||
# fmt: on
|
||||
def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
|
||||
"""Tests abstention threshold.
|
||||
meet_threshold (bool): Whether to configure NEL setup so that confidence threshold is met.
|
||||
config (Dict[str, Any]): NEL architecture config.
|
||||
"""
|
||||
nlp = English()
|
||||
nlp.add_pipe("sentencizer")
|
||||
text = "Mahler's Symphony No. 8 was beautiful."
|
||||
entities = [(0, 6, "PERSON")]
|
||||
links = {(0, 6): {"Q7304": 1.0}}
|
||||
sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
|
||||
entity_id = "Q7304"
|
||||
doc = nlp(text)
|
||||
train_examples = [
|
||||
Example.from_dict(
|
||||
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
|
||||
)
|
||||
]
|
||||
|
||||
def create_kb(vocab):
|
||||
# create artificial KB
|
||||
mykb = KnowledgeBase(vocab, entity_vector_length=3)
|
||||
mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
|
||||
mykb.add_alias(
|
||||
alias="Mahler",
|
||||
entities=[entity_id],
|
||||
probabilities=[1 if meet_threshold else 0.01],
|
||||
)
|
||||
return mykb
|
||||
|
||||
# Create the Entity Linker component and add it to the pipeline
|
||||
entity_linker = nlp.add_pipe(
|
||||
"entity_linker",
|
||||
last=True,
|
||||
config={"threshold": 0.99, "model": config},
|
||||
)
|
||||
entity_linker.set_kb(create_kb) # type: ignore
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
# Add a custom rule-based component to mimick NER
|
||||
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
|
||||
ruler.add_patterns([{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}]) # type: ignore
|
||||
doc = nlp(text)
|
||||
|
||||
assert len(doc.ents) == 1
|
||||
assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
|
||||
|
|
|
@ -491,7 +491,6 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
|
|||
ruler.remove_by_id("nepattern")
|
||||
|
||||
|
||||
|
||||
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
|
||||
def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
|
||||
ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
|
||||
|
|
|
@ -4,13 +4,14 @@ import numpy
|
|||
import pytest
|
||||
from thinc.api import get_current_ops
|
||||
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.en.syntax_iterators import noun_chunks
|
||||
from spacy.language import Language
|
||||
from spacy.pipeline import TrainablePipe
|
||||
from spacy.tokens import Doc
|
||||
from spacy.training import Example
|
||||
from spacy.util import SimpleFrozenList, get_arg_names
|
||||
from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
|
@ -602,3 +603,52 @@ def test_update_with_annotates():
|
|||
assert results[component] == "".join(eg.predicted.text for eg in examples)
|
||||
for component in components - set(components_to_annotate):
|
||||
assert results[component] == ""
|
||||
|
||||
|
||||
def test_load_disable_enable() -> None:
|
||||
"""
|
||||
Tests spacy.load() with dis-/enabling components.
|
||||
"""
|
||||
|
||||
base_nlp = English()
|
||||
for pipe in ("sentencizer", "tagger", "parser"):
|
||||
base_nlp.add_pipe(pipe)
|
||||
|
||||
with make_tempdir() as tmp_dir:
|
||||
base_nlp.to_disk(tmp_dir)
|
||||
to_disable = ["parser", "tagger"]
|
||||
to_enable = ["tagger", "parser"]
|
||||
|
||||
# Setting only `disable`.
|
||||
nlp = spacy.load(tmp_dir, disable=to_disable)
|
||||
assert all([comp_name in nlp.disabled for comp_name in to_disable])
|
||||
|
||||
# Setting only `enable`.
|
||||
nlp = spacy.load(tmp_dir, enable=to_enable)
|
||||
assert all(
|
||||
[
|
||||
(comp_name in nlp.disabled) is (comp_name not in to_enable)
|
||||
for comp_name in nlp.component_names
|
||||
]
|
||||
)
|
||||
|
||||
# Testing consistent enable/disable combination.
|
||||
nlp = spacy.load(
|
||||
tmp_dir,
|
||||
enable=to_enable,
|
||||
disable=[
|
||||
comp_name
|
||||
for comp_name in nlp.component_names
|
||||
if comp_name not in to_enable
|
||||
],
|
||||
)
|
||||
assert all(
|
||||
[
|
||||
(comp_name in nlp.disabled) is (comp_name not in to_enable)
|
||||
for comp_name in nlp.component_names
|
||||
]
|
||||
)
|
||||
|
||||
# Inconsistent enable/disable combination.
|
||||
with pytest.raises(ValueError):
|
||||
spacy.load(tmp_dir, enable=to_enable, disable=["parser"])
|
||||
|
|
|
@ -589,6 +589,7 @@ def test_string_to_list_intify(value):
|
|||
assert string_to_list(value, intify=True) == [1, 2, 3]
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Temporarily skip for dev version")
|
||||
def test_download_compatibility():
|
||||
spec = SpecifierSet("==" + about.__version__)
|
||||
spec.prereleases = False
|
||||
|
@ -599,6 +600,7 @@ def test_download_compatibility():
|
|||
assert get_minor_version(about.__version__) == get_minor_version(version)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Temporarily skip for dev version")
|
||||
def test_validate_compatibility_table():
|
||||
spec = SpecifierSet("==" + about.__version__)
|
||||
spec.prereleases = False
|
||||
|
|
|
@ -671,13 +671,38 @@ def test_gold_ner_missing_tags(en_tokenizer):
|
|||
|
||||
def test_projectivize(en_tokenizer):
|
||||
doc = en_tokenizer("He pretty quickly walks away")
|
||||
heads = [3, 2, 3, 0, 2]
|
||||
heads = [3, 2, 3, 3, 2]
|
||||
deps = ["dep"] * len(heads)
|
||||
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
|
||||
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
|
||||
nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
|
||||
assert proj_heads == [3, 2, 3, 0, 3]
|
||||
assert nonproj_heads == [3, 2, 3, 0, 2]
|
||||
assert proj_heads == [3, 2, 3, 3, 3]
|
||||
assert nonproj_heads == [3, 2, 3, 3, 2]
|
||||
|
||||
# Test single token documents
|
||||
doc = en_tokenizer("Conrail")
|
||||
heads = [0]
|
||||
deps = ["dep"]
|
||||
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
|
||||
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
|
||||
assert proj_heads == heads
|
||||
assert proj_labels == deps
|
||||
|
||||
# Test documents with no alignments
|
||||
doc_a = Doc(
|
||||
doc.vocab, words=["Double-Jointed"], spaces=[False], deps=["ROOT"], heads=[0]
|
||||
)
|
||||
doc_b = Doc(
|
||||
doc.vocab,
|
||||
words=["Double", "-", "Jointed"],
|
||||
spaces=[True, True, True],
|
||||
deps=["amod", "punct", "ROOT"],
|
||||
heads=[2, 2, 2],
|
||||
)
|
||||
example = Example(doc_a, doc_b)
|
||||
proj_heads, proj_deps = example.get_aligned_parse(projectivize=True)
|
||||
assert proj_heads == [None]
|
||||
assert proj_deps == [None]
|
||||
|
||||
|
||||
def test_iob_to_biluo():
|
||||
|
|
|
@ -5,6 +5,7 @@ import srsly
|
|||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.util import make_tempdir # noqa: F401
|
||||
from spacy.training import split_bilu_label
|
||||
from thinc.api import get_current_ops
|
||||
|
||||
|
||||
|
@ -40,7 +41,7 @@ def apply_transition_sequence(parser, doc, sequence):
|
|||
desired state."""
|
||||
for action_name in sequence:
|
||||
if "-" in action_name:
|
||||
move, label = action_name.split("-")
|
||||
move, label = split_bilu_label(action_name)
|
||||
parser.add_label(label)
|
||||
with parser.step_through(doc) as stepwise:
|
||||
for transition in sequence:
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import pytest
|
||||
import numpy
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from ..util import get_cosine, add_vecs_to_vocab
|
||||
|
||||
|
@ -71,19 +72,17 @@ def test_vectors_similarity_DD(vocab, vectors):
|
|||
def test_vectors_similarity_TD(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = Doc(vocab, words=[word1, word2])
|
||||
with pytest.warns(UserWarning):
|
||||
assert isinstance(doc.similarity(doc[0]), float)
|
||||
assert isinstance(doc[0].similarity(doc), float)
|
||||
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
|
||||
assert isinstance(doc.similarity(doc[0]), float)
|
||||
assert isinstance(doc[0].similarity(doc), float)
|
||||
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
|
||||
|
||||
|
||||
def test_vectors_similarity_TS(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = Doc(vocab, words=[word1, word2])
|
||||
with pytest.warns(UserWarning):
|
||||
assert isinstance(doc[:2].similarity(doc[0]), float)
|
||||
assert isinstance(doc[0].similarity(doc[-2]), float)
|
||||
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
|
||||
assert isinstance(doc[:2].similarity(doc[0]), float)
|
||||
assert isinstance(doc[0].similarity(doc[:2]), float)
|
||||
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
|
||||
|
||||
|
||||
def test_vectors_similarity_DS(vocab, vectors):
|
||||
|
@ -91,3 +90,21 @@ def test_vectors_similarity_DS(vocab, vectors):
|
|||
doc = Doc(vocab, words=[word1, word2])
|
||||
assert isinstance(doc.similarity(doc[:2]), float)
|
||||
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
|
||||
|
||||
|
||||
def test_vectors_similarity_no_vectors():
|
||||
vocab = Vocab()
|
||||
doc1 = Doc(vocab, words=["a", "b"])
|
||||
doc2 = Doc(vocab, words=["c", "d", "e"])
|
||||
with pytest.warns(UserWarning):
|
||||
doc1.similarity(doc2)
|
||||
with pytest.warns(UserWarning):
|
||||
doc1.similarity(doc2[1])
|
||||
with pytest.warns(UserWarning):
|
||||
doc1.similarity(doc2[:2])
|
||||
with pytest.warns(UserWarning):
|
||||
doc2.similarity(doc1)
|
||||
with pytest.warns(UserWarning):
|
||||
doc2[1].similarity(doc1)
|
||||
with pytest.warns(UserWarning):
|
||||
doc2[:2].similarity(doc1)
|
||||
|
|
|
@ -318,17 +318,15 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
|
|||
@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
|
||||
def test_vectors_span_span_similarity(vocab, text):
|
||||
doc = Doc(vocab, words=text)
|
||||
with pytest.warns(UserWarning):
|
||||
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
|
||||
assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0
|
||||
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
|
||||
assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
|
||||
def test_vectors_span_doc_similarity(vocab, text):
|
||||
doc = Doc(vocab, words=text)
|
||||
with pytest.warns(UserWarning):
|
||||
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
|
||||
assert -1.0 < doc[0:2].similarity(doc) < 1.0
|
||||
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
|
||||
assert -1.0 < doc[0:2].similarity(doc) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
|
@ -607,7 +607,8 @@ cdef class Doc:
|
|||
if self.vocab.vectors.n_keys == 0:
|
||||
warnings.warn(Warnings.W007.format(obj="Doc"))
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
warnings.warn(Warnings.W008.format(obj="Doc"))
|
||||
if not self.has_vector or not other.has_vector:
|
||||
warnings.warn(Warnings.W008.format(obj="Doc"))
|
||||
return 0.0
|
||||
vector = self.vector
|
||||
xp = get_array_module(vector)
|
||||
|
@ -627,7 +628,7 @@ cdef class Doc:
|
|||
if "has_vector" in self.user_hooks:
|
||||
return self.user_hooks["has_vector"](self)
|
||||
elif self.vocab.vectors.size:
|
||||
return True
|
||||
return any(token.has_vector for token in self)
|
||||
elif self.tensor.size:
|
||||
return True
|
||||
else:
|
||||
|
|
|
@ -354,7 +354,8 @@ cdef class Span:
|
|||
if self.vocab.vectors.n_keys == 0:
|
||||
warnings.warn(Warnings.W007.format(obj="Span"))
|
||||
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
||||
warnings.warn(Warnings.W008.format(obj="Span"))
|
||||
if not self.has_vector or not other.has_vector:
|
||||
warnings.warn(Warnings.W008.format(obj="Span"))
|
||||
return 0.0
|
||||
vector = self.vector
|
||||
xp = get_array_module(vector)
|
||||
|
|
|
@ -206,7 +206,8 @@ cdef class Token:
|
|||
if self.vocab.vectors.n_keys == 0:
|
||||
warnings.warn(Warnings.W007.format(obj="Token"))
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
warnings.warn(Warnings.W008.format(obj="Token"))
|
||||
if not self.has_vector or not other.has_vector:
|
||||
warnings.warn(Warnings.W008.format(obj="Token"))
|
||||
return 0.0
|
||||
vector = self.vector
|
||||
xp = get_array_module(vector)
|
||||
|
|
|
@ -5,6 +5,7 @@ from .augment import dont_augment, orth_variants_augmenter # noqa: F401
|
|||
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
|
||||
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401
|
||||
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
|
||||
from .iob_utils import split_bilu_label, remove_bilu_prefix # noqa: F401
|
||||
from .gold_io import docs_to_json, read_json_file # noqa: F401
|
||||
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
|
||||
from .loggers import console_logger # noqa: F401
|
||||
|
|
|
@ -1,33 +1,39 @@
|
|||
from typing import List
|
||||
from ..errors import Errors
|
||||
import numpy
|
||||
from libc.stdint cimport int32_t
|
||||
|
||||
|
||||
cdef class AlignmentArray:
|
||||
"""AlignmentArray is similar to Thinc's Ragged with two simplfications:
|
||||
indexing returns numpy arrays and this type can only be used for CPU arrays.
|
||||
However, these changes make AlginmentArray more efficient for indexing in a
|
||||
However, these changes make AlignmentArray more efficient for indexing in a
|
||||
tight loop."""
|
||||
|
||||
__slots__ = []
|
||||
|
||||
def __init__(self, alignment: List[List[int]]):
|
||||
self._lengths = None
|
||||
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i")
|
||||
|
||||
cdef int data_len = 0
|
||||
cdef int outer_len
|
||||
cdef int idx
|
||||
|
||||
self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32')
|
||||
cdef int32_t* starts_ends_ptr = <int32_t*>self._starts_ends.data
|
||||
|
||||
for idx, outer in enumerate(alignment):
|
||||
outer_len = len(outer)
|
||||
self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len
|
||||
starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len
|
||||
data_len += outer_len
|
||||
|
||||
self._data = numpy.empty(data_len, dtype="i")
|
||||
self._lengths = None
|
||||
self._data = numpy.empty(data_len, dtype="int32")
|
||||
|
||||
idx = 0
|
||||
cdef int32_t* data_ptr = <int32_t*>self._data.data
|
||||
|
||||
for outer in alignment:
|
||||
for inner in outer:
|
||||
self._data[idx] = inner
|
||||
data_ptr[idx] = inner
|
||||
idx += 1
|
||||
|
||||
def __getitem__(self, idx):
|
||||
|
|
|
@ -3,10 +3,10 @@ from typing import Optional
|
|||
import random
|
||||
import itertools
|
||||
from functools import partial
|
||||
from pydantic import BaseModel, StrictStr
|
||||
|
||||
from ..util import registry
|
||||
from .example import Example
|
||||
from .iob_utils import split_bilu_label
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..language import Language # noqa: F401
|
||||
|
@ -278,10 +278,8 @@ def make_whitespace_variant(
|
|||
ent_prev = doc_dict["entities"][position - 1]
|
||||
ent_next = doc_dict["entities"][position]
|
||||
if "-" in ent_prev and "-" in ent_next:
|
||||
ent_iob_prev = ent_prev.split("-")[0]
|
||||
ent_type_prev = ent_prev.split("-", 1)[1]
|
||||
ent_iob_next = ent_next.split("-")[0]
|
||||
ent_type_next = ent_next.split("-", 1)[1]
|
||||
ent_iob_prev, ent_type_prev = split_bilu_label(ent_prev)
|
||||
ent_iob_next, ent_type_next = split_bilu_label(ent_next)
|
||||
if (
|
||||
ent_iob_prev in ("B", "I")
|
||||
and ent_iob_next in ("I", "L")
|
||||
|
|
|
@ -9,11 +9,11 @@ from ..tokens.span import Span
|
|||
from ..attrs import IDS
|
||||
from .alignment import Alignment
|
||||
from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
|
||||
from .iob_utils import biluo_tags_to_spans
|
||||
from .iob_utils import biluo_tags_to_spans, remove_bilu_prefix
|
||||
from ..errors import Errors, Warnings
|
||||
from ..pipeline._parser_internals import nonproj
|
||||
from ..tokens.token cimport MISSING_DEP
|
||||
from ..util import logger, to_ternary_int
|
||||
from ..util import logger, to_ternary_int, all_equal
|
||||
|
||||
|
||||
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
||||
|
@ -151,50 +151,127 @@ cdef class Example:
|
|||
self._y_sig = y_sig
|
||||
return self._cached_alignment
|
||||
|
||||
|
||||
def _get_aligned_vectorized(self, align, gold_values):
    """Align a gold attribute array to the predicted tokens (fast path).

    Intended for Doc attributes/fields that are predominantly a single
    value per token, i.e., TAG, POS, MORPH.

    align: Indexable by predicted token index; yields the array of gold
        token indices aligned to that token.
    gold_values: Array of gold attribute values, indexed by gold token index.
    RETURNS (list): One entry per predicted token: the aligned gold value, or
        None when no gold token is aligned or the aligned gold tokens
        disagree on the value.
    """
    # Bucket 1: predicted tokens aligned to exactly one gold token.
    one_to_one_gold = []
    one_to_one_pred = []
    # Bucket 2: predicted tokens aligned to several gold tokens.
    one_to_many_gold = []
    one_to_many_pred = []

    for pred_i, token in enumerate(self.predicted):
        gold_indices = align[token.i]
        n_aligned = len(gold_indices)
        if n_aligned > 1:
            one_to_many_gold.append(gold_indices)
            one_to_many_pred.append(pred_i)
        elif n_aligned == 1:
            one_to_one_gold.append(gold_indices.item())
            one_to_one_pred.append(pred_i)
        # Tokens without any aligned gold token keep the default None.

    # The one-to-one bucket is copied over in a single indexed assignment.
    output = numpy.full(len(self.predicted), None)
    output[one_to_one_pred] = gold_values[one_to_one_gold].squeeze()

    # A one-to-many alignment collapses to a single value only if all the
    # aligned gold tokens agree; otherwise the result is None.
    collapsed = []
    for gold_indices in one_to_many_gold:
        candidates = gold_values[gold_indices]
        collapsed.append(candidates[0].item() if all_equal(candidates) else None)
    output[one_to_many_pred] = collapsed

    return output.tolist()
|
||||
|
||||
|
||||
def _get_aligned_non_vectorized(self, align, gold_values):
|
||||
# Slower path for fields that return multiple values (resulting
|
||||
# in ragged arrays that cannot be vectorized trivially).
|
||||
output = [None] * len(self.predicted)
|
||||
|
||||
for token in self.predicted:
|
||||
aligned_gold_i = align[token.i]
|
||||
values = gold_values[aligned_gold_i].ravel()
|
||||
if len(values) == 1:
|
||||
output[token.i] = values.item()
|
||||
elif all_equal(values):
|
||||
# If all aligned tokens have the same value, use it.
|
||||
output[token.i] = values[0].item()
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def get_aligned(self, field, as_string=False):
|
||||
"""Return an aligned array for a token attribute."""
|
||||
align = self.alignment.x2y
|
||||
gold_values = self.reference.to_array([field])
|
||||
|
||||
if len(gold_values.shape) == 1:
|
||||
output = self._get_aligned_vectorized(align, gold_values)
|
||||
else:
|
||||
output = self._get_aligned_non_vectorized(align, gold_values)
|
||||
|
||||
vocab = self.reference.vocab
|
||||
gold_values = self.reference.to_array([field])
|
||||
output = [None] * len(self.predicted)
|
||||
for token in self.predicted:
|
||||
values = gold_values[align[token.i]]
|
||||
values = values.ravel()
|
||||
if len(values) == 0:
|
||||
output[token.i] = None
|
||||
elif len(values) == 1:
|
||||
output[token.i] = values[0]
|
||||
elif len(set(list(values))) == 1:
|
||||
# If all aligned tokens have the same value, use it.
|
||||
output[token.i] = values[0]
|
||||
else:
|
||||
output[token.i] = None
|
||||
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
||||
output = [vocab.strings[o] if o is not None else o for o in output]
|
||||
|
||||
return output
|
||||
|
||||
def get_aligned_parse(self, projectivize=True):
|
||||
cand_to_gold = self.alignment.x2y
|
||||
gold_to_cand = self.alignment.y2x
|
||||
aligned_heads = [None] * self.x.length
|
||||
aligned_deps = [None] * self.x.length
|
||||
has_deps = [token.has_dep() for token in self.y]
|
||||
has_heads = [token.has_head() for token in self.y]
|
||||
heads = [token.head.i for token in self.y]
|
||||
deps = [token.dep_ for token in self.y]
|
||||
|
||||
if projectivize:
|
||||
proj_heads, proj_deps = nonproj.projectivize(heads, deps)
|
||||
has_deps = [token.has_dep() for token in self.y]
|
||||
has_heads = [token.has_head() for token in self.y]
|
||||
|
||||
# ensure that missing data remains missing
|
||||
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
|
||||
deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
|
||||
for cand_i in range(self.x.length):
|
||||
if cand_to_gold.lengths[cand_i] == 1:
|
||||
gold_i = cand_to_gold[cand_i][0]
|
||||
if gold_to_cand.lengths[heads[gold_i]] == 1:
|
||||
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]][0])
|
||||
aligned_deps[cand_i] = deps[gold_i]
|
||||
return aligned_heads, aligned_deps
|
||||
|
||||
# Select all candidate tokens that are aligned to a single gold token.
|
||||
c2g_single_toks = numpy.where(cand_to_gold.lengths == 1)[0]
|
||||
|
||||
# Fetch all aligned gold token indices.
|
||||
if c2g_single_toks.shape == cand_to_gold.lengths.shape:
|
||||
# This is the most likely case.
|
||||
gold_i = cand_to_gold[:]
|
||||
else:
|
||||
gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0], otypes='i')(c2g_single_toks)
|
||||
|
||||
# Fetch indices of all gold heads for the aligned gold tokens.
|
||||
heads = numpy.asarray(heads, dtype='i')
|
||||
gold_head_i = heads[gold_i]
|
||||
|
||||
# Select all gold tokens that are heads of the previously selected
|
||||
# gold tokens (and are aligned to a single candidate token).
|
||||
g2c_len_heads = gold_to_cand.lengths[gold_head_i]
|
||||
g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
|
||||
g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0], otypes='i')(gold_head_i[g2c_len_heads]).squeeze()
|
||||
|
||||
# Update head/dep alignments with the above.
|
||||
aligned_heads = numpy.full((self.x.length), None)
|
||||
aligned_heads[c2g_single_toks[g2c_len_heads]] = g2c_i
|
||||
|
||||
deps = numpy.asarray(deps)
|
||||
aligned_deps = numpy.full((self.x.length), None)
|
||||
aligned_deps[c2g_single_toks] = deps[gold_i]
|
||||
|
||||
return aligned_heads.tolist(), aligned_deps.tolist()
|
||||
|
||||
def get_aligned_sent_starts(self):
|
||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||
|
@ -519,7 +596,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
|||
else:
|
||||
ent_iobs.append(iob_tag.split("-")[0])
|
||||
if iob_tag.startswith("I") or iob_tag.startswith("B"):
|
||||
ent_types.append(iob_tag.split("-", 1)[1])
|
||||
ent_types.append(remove_bilu_prefix(iob_tag))
|
||||
else:
|
||||
ent_types.append("")
|
||||
return ent_iobs, ent_types
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import List, Dict, Tuple, Iterable, Union, Iterator
|
||||
from typing import List, Dict, Tuple, Iterable, Union, Iterator, cast
|
||||
import warnings
|
||||
|
||||
from ..errors import Errors, Warnings
|
||||
|
@ -218,6 +218,14 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
|
|||
return entities
|
||||
|
||||
|
||||
def split_bilu_label(label: str) -> Tuple[str, str]:
    """Split a prefixed label such as "B-PER" at its first hyphen into the
    prefix and the remainder. Hyphens in the remainder are left untouched."""
    prefix_and_type = label.split("-", maxsplit=1)
    return cast(Tuple[str, str], prefix_and_type)
|
||||
|
||||
|
||||
def remove_bilu_prefix(label: str) -> str:
    """Return the part of a prefixed label (e.g. "B-PER") that follows its
    first hyphen. Raises IndexError if the label contains no hyphen."""
    pieces = label.split("-", maxsplit=1)
    return pieces[1]
|
||||
|
||||
|
||||
# Fallbacks to make backwards-compat easier
|
||||
offsets_from_biluo_tags = biluo_tags_to_offsets
|
||||
spans_from_biluo_tags = biluo_tags_to_spans
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from typing import List, Mapping, NoReturn, Union, Dict, Any, Set
|
||||
from typing import List, Mapping, NoReturn, Union, Dict, Any, Set, cast
|
||||
from typing import Optional, Iterable, Callable, Tuple, Type
|
||||
from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
|
||||
from typing import Iterator, Pattern, Generator, TYPE_CHECKING
|
||||
from types import ModuleType
|
||||
import os
|
||||
import importlib
|
||||
|
@ -12,7 +12,6 @@ from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
|
|||
from thinc.api import ConfigValidationError, Model
|
||||
import functools
|
||||
import itertools
|
||||
import numpy.random
|
||||
import numpy
|
||||
import srsly
|
||||
import catalogue
|
||||
|
@ -294,7 +293,7 @@ def find_matching_language(lang: str) -> Optional[str]:
|
|||
|
||||
# Find out which language modules we have
|
||||
possible_languages = []
|
||||
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore
|
||||
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore[attr-defined]
|
||||
code = modinfo.name
|
||||
if code == "xx":
|
||||
# Temporarily make 'xx' into a valid language code
|
||||
|
@ -391,7 +390,8 @@ def get_module_path(module: ModuleType) -> Path:
|
|||
"""
|
||||
if not hasattr(module, "__module__"):
|
||||
raise ValueError(Errors.E169.format(module=repr(module)))
|
||||
return Path(sys.modules[module.__module__].__file__).parent
|
||||
file_path = Path(cast(os.PathLike, sys.modules[module.__module__].__file__))
|
||||
return file_path.parent
|
||||
|
||||
|
||||
def load_model(
|
||||
|
@ -399,6 +399,7 @@ def load_model(
|
|||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
|
@ -408,11 +409,19 @@ def load_model(
|
|||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All others will be disabled.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config}
|
||||
kwargs = {
|
||||
"vocab": vocab,
|
||||
"disable": disable,
|
||||
"enable": enable,
|
||||
"exclude": exclude,
|
||||
"config": config,
|
||||
}
|
||||
if isinstance(name, str): # name or string path
|
||||
if name.startswith("blank:"): # shortcut for blank model
|
||||
return get_lang_class(name.replace("blank:", ""))()
|
||||
|
@ -432,6 +441,7 @@ def load_model_from_package(
|
|||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
|
@ -443,6 +453,8 @@ def load_model_from_package(
|
|||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
|
@ -450,7 +462,7 @@ def load_model_from_package(
|
|||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
cls = importlib.import_module(name)
|
||||
return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config) # type: ignore[attr-defined]
|
||||
return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined]
|
||||
|
||||
|
||||
def load_model_from_path(
|
||||
|
@ -459,6 +471,7 @@ def load_model_from_path(
|
|||
meta: Optional[Dict[str, Any]] = None,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
|
@ -472,6 +485,8 @@ def load_model_from_path(
|
|||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
|
@ -486,7 +501,12 @@ def load_model_from_path(
|
|||
overrides = dict_to_dot(config)
|
||||
config = load_config(config_path, overrides=overrides)
|
||||
nlp = load_model_from_config(
|
||||
config, vocab=vocab, disable=disable, exclude=exclude, meta=meta
|
||||
config,
|
||||
vocab=vocab,
|
||||
disable=disable,
|
||||
enable=enable,
|
||||
exclude=exclude,
|
||||
meta=meta,
|
||||
)
|
||||
return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
|
||||
|
||||
|
@ -497,6 +517,7 @@ def load_model_from_config(
|
|||
meta: Dict[str, Any] = SimpleFrozenDict(),
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
auto_fill: bool = False,
|
||||
validate: bool = True,
|
||||
|
@ -511,6 +532,8 @@ def load_model_from_config(
|
|||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
auto_fill (bool): Whether to auto-fill config with missing defaults.
|
||||
|
@ -529,6 +552,7 @@ def load_model_from_config(
|
|||
config,
|
||||
vocab=vocab,
|
||||
disable=disable,
|
||||
enable=enable,
|
||||
exclude=exclude,
|
||||
auto_fill=auto_fill,
|
||||
validate=validate,
|
||||
|
@ -593,6 +617,7 @@ def load_model_from_init_py(
|
|||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
|
@ -604,6 +629,8 @@ def load_model_from_init_py(
|
|||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
|
@ -621,6 +648,7 @@ def load_model_from_init_py(
|
|||
vocab=vocab,
|
||||
meta=meta,
|
||||
disable=disable,
|
||||
enable=enable,
|
||||
exclude=exclude,
|
||||
config=config,
|
||||
)
|
||||
|
@ -878,7 +906,7 @@ def get_package_path(name: str) -> Path:
|
|||
# Here we're importing the module just to find it. This is worryingly
|
||||
# indirect, but it's otherwise very difficult to find the package.
|
||||
pkg = importlib.import_module(name)
|
||||
return Path(pkg.__file__).parent
|
||||
return Path(cast(Union[str, os.PathLike], pkg.__file__)).parent
|
||||
|
||||
|
||||
def replace_model_node(model: Model, target: Model, replacement: Model) -> None:
|
||||
|
@ -1684,7 +1712,14 @@ def packages_distributions() -> Dict[str, List[str]]:
|
|||
it's not available in the builtin importlib.metadata.
|
||||
"""
|
||||
pkg_to_dist = defaultdict(list)
|
||||
for dist in importlib_metadata.distributions(): # type: ignore[attr-defined]
|
||||
for dist in importlib_metadata.distributions():
|
||||
for pkg in (dist.read_text("top_level.txt") or "").split():
|
||||
pkg_to_dist[pkg].append(dist.metadata["Name"])
|
||||
return dict(pkg_to_dist)
|
||||
|
||||
|
||||
def all_equal(iterable):
    """Return True if all the elements are equal to each other
    (or if the input is an empty sequence), False otherwise."""
    # groupby yields one group per run of consecutive equal elements, so the
    # elements are all equal exactly when there is at most one group.
    runs = itertools.groupby(iterable)
    if next(runs, None) is None:
        return True
    return next(runs, None) is None
|
||||
|
|
|
@ -336,10 +336,10 @@ cdef class Vectors:
|
|||
xp = get_array_module(self.data)
|
||||
if key is not None:
|
||||
key = get_string_id(key)
|
||||
return self.key2row.get(key, -1)
|
||||
return self.key2row.get(int(key), -1)
|
||||
elif keys is not None:
|
||||
keys = [get_string_id(key) for key in keys]
|
||||
rows = [self.key2row.get(key, -1.) for key in keys]
|
||||
rows = [self.key2row.get(int(key), -1) for key in keys]
|
||||
return xp.asarray(rows, dtype="i")
|
||||
else:
|
||||
row2key = {row: key for key, row in self.key2row.items()}
|
||||
|
|
|
@ -47,22 +47,24 @@ architectures and their arguments and hyperparameters.
|
|||
> "model": DEFAULT_NEL_MODEL,
|
||||
> "entity_vector_length": 64,
|
||||
> "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
|
||||
> "threshold": None,
|
||||
> }
|
||||
> nlp.add_pipe("entity_linker", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| Setting | Description |
|
||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||
|
@ -95,20 +97,21 @@ custom knowledge base, you should either call
|
|||
[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
|
||||
[`initialize`](/api/entitylinker#initialize) call.
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
|
||||
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
|
|
@ -59,15 +59,20 @@ matched:
|
|||
> [
|
||||
> {"POS": "ADJ", "OP": "*"},
|
||||
> {"POS": "NOUN", "OP": "+"}
|
||||
> {"POS": "PROPN", "OP": "{2}"}
|
||||
> ]
|
||||
> ```
|
||||
|
||||
| OP | Description |
|
||||
| --- | ---------------------------------------------------------------- |
|
||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||
| `+` | Require the pattern to match 1 or more times. |
|
||||
| `*` | Allow the pattern to match 0 or more times. |
|
||||
| OP | Description |
|
||||
|---------|------------------------------------------------------------------------|
|
||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||
| `+` | Require the pattern to match 1 or more times. |
|
||||
| `*` | Allow the pattern to match 0 or more times. |
|
||||
| `{n}` | Require the pattern to match exactly _n_ times. |
|
||||
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
|
||||
| `{n,}` | Require the pattern to match at least _n_ times. |
|
||||
| `{,m}` | Require the pattern to match at most _m_ times. |
|
||||
|
||||
Token patterns can also map to a **dictionary of properties** instead of a
|
||||
single value to indicate whether the expected value is a member of a list or how
|
||||
|
|
|
@ -51,6 +51,7 @@ specified separately using the new `exclude` keyword argument.
|
|||
| _keyword-only_ | |
|
||||
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `enable` | Names of pipeline components to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
|
||||
|
|
|
@ -130,8 +130,8 @@ grateful to use the work of Chainer's [CuPy](https://cupy.chainer.org) module,
|
|||
which provides a numpy-compatible interface for GPU arrays.
|
||||
|
||||
spaCy can be installed for a CUDA-compatible GPU by specifying `spacy[cuda]`,
|
||||
`spacy[cuda102]`, `spacy[cuda112]`, `spacy[cuda113]`, etc. If you know your
|
||||
CUDA version, using the more explicit specifier allows CuPy to be installed via
|
||||
`spacy[cuda102]`, `spacy[cuda112]`, `spacy[cuda113]`, etc. If you know your CUDA
|
||||
version, using the more explicit specifier allows CuPy to be installed via
|
||||
wheel, saving some compilation time. The specifiers should install
|
||||
[`cupy`](https://cupy.chainer.org).
|
||||
|
||||
|
@ -195,29 +195,73 @@ How to install compilers and related build tools:
|
|||
[Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
|
||||
that matches the version that was used to compile your Python interpreter.
|
||||
|
||||
#### Using build constraints when compiling from source
|
||||
|
||||
If you install spaCy from source or with `pip` for platforms where there are no
|
||||
binary wheels on PyPI, you may need to use build constraints if any package in
|
||||
your environment requires an older version of `numpy`.
|
||||
|
||||
If `numpy` gets downgraded from the most recent release at any point after
|
||||
you've compiled `spacy`, you might see an error that looks like this:
|
||||
|
||||
```none
|
||||
numpy.ndarray size changed, may indicate binary incompatibility.
|
||||
```
|
||||
|
||||
To fix this, create a new virtual environment and install `spacy` and all of its
|
||||
dependencies using build constraints.
|
||||
[Build constraints](https://pip.pypa.io/en/stable/user_guide/#constraints-files)
|
||||
specify an older version of `numpy` that is only used while compiling `spacy`,
|
||||
and then your runtime environment can use any newer version of `numpy` and still
|
||||
be compatible. In addition, use `--no-cache-dir` to ignore any previously cached
|
||||
wheels so that all relevant packages are recompiled from scratch:
|
||||
|
||||
```shell
|
||||
PIP_CONSTRAINT=https://raw.githubusercontent.com/explosion/spacy/master/build-constraints.txt \
|
||||
pip install spacy --no-cache-dir
|
||||
```
|
||||
|
||||
Our build constraints currently specify the oldest supported `numpy` available
|
||||
on PyPI for `x86_64` and `aarch64`. Depending on your platform and environment,
|
||||
you may want to customize the specific versions of `numpy`. For other platforms,
|
||||
you can have a look at SciPy's
|
||||
[`oldest-supported-numpy`](https://github.com/scipy/oldest-supported-numpy/blob/main/setup.cfg)
|
||||
package to see what the oldest recommended versions of `numpy` are.
|
||||
|
||||
(_Warning_: don't use `pip install -c constraints.txt` instead of
|
||||
`PIP_CONSTRAINT`, since this isn't applied to the isolated build environments.)
|
||||
|
||||
#### Additional options for developers {#source-developers}
|
||||
|
||||
Some additional options may be useful for spaCy developers who are editing the
|
||||
source code and recompiling frequently.
|
||||
|
||||
- Install in editable mode. Changes to `.py` files will be reflected as soon as
|
||||
the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require
|
||||
the `pip install` or `python setup.py build_ext` command below to be run
|
||||
again. Before installing in editable mode, be sure you have removed any
|
||||
previous installs with `pip uninstall spacy`, which you may need to run
|
||||
multiple times to remove all traces of earlier installs.
|
||||
- Install in editable mode. Changes to `.py` files will be reflected as soon
|
||||
as the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will
|
||||
require the `pip install` command below to be run again. Before installing in
|
||||
editable mode, be sure you have removed any previous installs with
|
||||
`pip uninstall spacy`, which you may need to run multiple times to remove all
|
||||
traces of earlier installs.
|
||||
|
||||
```bash
|
||||
$ pip install -r requirements.txt
|
||||
$ pip install --no-build-isolation --editable .
|
||||
```
|
||||
|
||||
- Build in parallel using `N` CPUs to speed up compilation and then install in
|
||||
editable mode:
|
||||
- Build in parallel. Starting in v3.4.0, you can specify the number of
|
||||
build jobs with the environment variable `SPACY_NUM_BUILD_JOBS`:
|
||||
|
||||
```bash
|
||||
$ pip install -r requirements.txt
|
||||
$ python setup.py build_ext --inplace -j N
|
||||
$ SPACY_NUM_BUILD_JOBS=4 pip install --no-build-isolation --editable .
|
||||
```
|
||||
|
||||
- For editable mode and parallel builds with `python setup.py` instead of `pip`
|
||||
(no longer recommended):
|
||||
|
||||
```bash
|
||||
$ pip install -r requirements.txt
|
||||
$ python setup.py build_ext --inplace -j 4
|
||||
$ python setup.py develop
|
||||
```
|
||||
|
||||
|
|
|
@ -362,6 +362,18 @@ nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
|
|||
nlp.enable_pipe("tagger")
|
||||
```
|
||||
|
||||
In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is
|
||||
set, all components except for those in `enable` are disabled.
|
||||
|
||||
```python
|
||||
# Load the complete pipeline, but disable all components except for tok2vec and tagger
|
||||
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"])
|
||||
# Has the same effect, as NER is already not part of the enabled set of components
|
||||
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"], disable=["ner"])
|
||||
# Will raise an error, as the sets of enabled and disabled components are conflicting
|
||||
nlp = spacy.load("en_core_web_sm", enable=["ner"], disable=["ner"])
|
||||
```
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0">
|
||||
|
||||
As of v3.0, the `disable` keyword argument specifies components to load but
|
||||
|
|
|
@ -374,12 +374,16 @@ punctuation marks, or specify optional tokens. Note that there are no nested or
|
|||
scoped quantifiers – instead, you can build those behaviors with `on_match`
|
||||
callbacks.
|
||||
|
||||
| OP | Description |
|
||||
| --- | ---------------------------------------------------------------- |
|
||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||
| `+` | Require the pattern to match 1 or more times. |
|
||||
| `*` | Allow the pattern to match zero or more times. |
|
||||
| OP | Description |
|
||||
|---------|------------------------------------------------------------------------|
|
||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||
| `+` | Require the pattern to match 1 or more times. |
|
||||
| `*` | Allow the pattern to match zero or more times. |
|
||||
| `{n}` | Require the pattern to match exactly _n_ times. |
|
||||
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
|
||||
| `{n,}` | Require the pattern to match at least _n_ times. |
|
||||
| `{,m}` | Require the pattern to match at most _m_ times. |
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
143
website/docs/usage/v3-4.md
Normal file
143
website/docs/usage/v3-4.md
Normal file
|
@ -0,0 +1,143 @@
|
|||
---
|
||||
title: What's New in v3.4
|
||||
teaser: New features and how to upgrade
|
||||
menu:
|
||||
- ['New Features', 'features']
|
||||
- ['Upgrading Notes', 'upgrading']
|
||||
---
|
||||
|
||||
## New features {#features hidden="true"}
|
||||
|
||||
spaCy v3.4 brings typing and speed improvements along with new vectors for
|
||||
English CNN pipelines and new trained pipelines for Croatian. This release also
|
||||
includes prebuilt Linux aarch64 wheels for all spaCy dependencies distributed by
|
||||
Explosion.
|
||||
|
||||
### Typing improvements {#typing}
|
||||
|
||||
spaCy v3.4 supports pydantic v1.9 and mypy 0.950+ through extensive updates to
|
||||
types in Thinc v8.1.
|
||||
|
||||
### Speed improvements {#speed}
|
||||
|
||||
- For the parser, use C `saxpy`/`sgemm` provided by the `Ops` implementation in
|
||||
order to use Accelerate through `thinc-apple-ops`.
|
||||
- Improved speed of vector lookups.
|
||||
- Improved speed for `Example.get_aligned_parse` and `Example.get_aligned`.
|
||||
|
||||
## Additional features and improvements
|
||||
|
||||
- Min/max `{n,m}` operator for `Matcher` patterns.
|
||||
- Language updates:
|
||||
- Improve tokenization for Cyrillic combining diacritics.
|
||||
- Improve English tokenizer exceptions for contractions with
|
||||
this/that/these/those.
|
||||
- Updated `spacy project clone` to try both `main` and `master` branches by
|
||||
default.
|
||||
- Added confidence threshold for named entity linker.
|
||||
- Improved handling of Typer optional default values for `init_config_cli`.
|
||||
- Added cycle detection in parser projectivization methods.
|
||||
- Added counts for NER labels in `debug data`.
|
||||
- Support for adding NVTX ranges to `TrainablePipe` components.
|
||||
- Support env variable `SPACY_NUM_BUILD_JOBS` to specify the number of build
|
||||
jobs to run in parallel with `pip`.
|
||||
|
||||
## Trained pipelines {#pipelines}
|
||||
|
||||
### New trained pipelines {#new-pipelines}
|
||||
|
||||
v3.4 introduces new CPU/CNN pipelines for Croatian, which use the trainable
|
||||
lemmatizer and [floret vectors](https://github.com/explosion/floret). Due to the
|
||||
use of [Bloom embeddings](https://explosion.ai/blog/bloom-embeddings) and
|
||||
subwords, the pipelines have compact vectors with no out-of-vocabulary words.
|
||||
|
||||
| Package | UPOS | Parser LAS | NER F |
|
||||
| ----------------------------------------------- | ---: | ---------: | ----: |
|
||||
| [`hr_core_news_sm`](/models/hr#hr_core_news_sm) | 96.6 | 77.5 | 76.1 |
|
||||
| [`hr_core_news_md`](/models/hr#hr_core_news_md) | 97.3 | 80.1 | 81.8 |
|
||||
| [`hr_core_news_lg`](/models/hr#hr_core_news_lg) | 97.5 | 80.4 | 83.0 |
|
||||
|
||||
### Pipeline updates {#pipeline-updates}
|
||||
|
||||
All CNN pipelines have been extended with whitespace augmentation.
|
||||
|
||||
The English CNN pipelines have new word vectors:
|
||||
|
||||
| Package | Model Version | TAG | Parser LAS | NER F |
|
||||
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
|
||||
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
|
||||
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0 | 97.2 | 90.3 | 85.5 |
|
||||
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0 | 97.4 | 90.1 | 85.3 |
|
||||
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
|
||||
|
||||
## Notes about upgrading from v3.3 {#upgrading}
|
||||
|
||||
### Doc.has_vector
|
||||
|
||||
`Doc.has_vector` now matches `Token.has_vector` and `Span.has_vector`: it
|
||||
returns `True` if at least one token in the doc has a vector rather than
|
||||
checking only whether the vocab contains vectors.
|
||||
|
||||
### Using trained pipelines with floret vectors
|
||||
|
||||
If you're using a trained pipeline for Croatian, Finnish, Korean or Swedish with
|
||||
new texts and working with `Doc` objects, you shouldn't notice any difference
|
||||
between floret vectors and default vectors.
|
||||
|
||||
If you use vectors for similarity comparisons, there are a few differences,
|
||||
mainly because a floret pipeline doesn't include any kind of frequency-based
|
||||
word list similar to the list of in-vocabulary vector keys with default vectors.
|
||||
|
||||
- If your workflow iterates over the vector keys, you should use an external
|
||||
word list instead:
|
||||
|
||||
```diff
|
||||
- lexemes = [nlp.vocab[orth] for orth in nlp.vocab.vectors]
|
||||
+ lexemes = [nlp.vocab[word] for word in external_word_list]
|
||||
```
|
||||
|
||||
- `Vectors.most_similar` is not supported because there's no fixed list of
|
||||
vectors to compare your vectors to.
|
||||
|
||||
### Pipeline package version compatibility {#version-compat}
|
||||
|
||||
> #### Using legacy implementations
|
||||
>
|
||||
> In spaCy v3, you'll still be able to load and reference legacy implementations
|
||||
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
|
||||
> components or architectures change and newer versions are available in the
|
||||
> core library.
|
||||
|
||||
When you're loading a pipeline package trained with an earlier version of spaCy
|
||||
v3, you will see a warning telling you that the pipeline may be incompatible.
|
||||
This doesn't necessarily have to be true, but we recommend running your
|
||||
pipelines against your test suite or evaluation data to make sure there are no
|
||||
unexpected results.
|
||||
|
||||
If you're using one of the [trained pipelines](/models) we provide, you should
|
||||
run [`spacy download`](/api/cli#download) to update to the latest version. To
|
||||
see an overview of all installed packages and their compatibility, you can run
|
||||
[`spacy validate`](/api/cli#validate).
|
||||
|
||||
If you've trained your own custom pipeline and you've confirmed that it's still
|
||||
working as expected, you can update the spaCy version requirements in the
|
||||
[`meta.json`](/api/data-formats#meta):
|
||||
|
||||
```diff
|
||||
- "spacy_version": ">=3.3.0,<3.4.0",
|
||||
+ "spacy_version": ">=3.3.0,<3.5.0",
|
||||
```
|
||||
|
||||
### Updating v3.3 configs
|
||||
|
||||
To update a config from spaCy v3.3 with the new v3.4 settings, run
|
||||
[`init fill-config`](/api/cli#init-fill-config):
|
||||
|
||||
```cli
|
||||
$ python -m spacy init fill-config config-v3.3.cfg config-v3.4.cfg
|
||||
```
|
||||
|
||||
In many cases ([`spacy train`](/api/cli#train),
|
||||
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
|
||||
automatically, but you'll need to fill in the new settings to run
|
||||
[`debug config`](/api/cli#debug-config) and [`debug data`](/api/cli#debug-data).
|
|
@ -162,7 +162,12 @@
|
|||
{
|
||||
"code": "hr",
|
||||
"name": "Croatian",
|
||||
"has_examples": true
|
||||
"has_examples": true,
|
||||
"models": [
|
||||
"hr_core_news_sm",
|
||||
"hr_core_news_md",
|
||||
"hr_core_news_lg"
|
||||
]
|
||||
},
|
||||
{
|
||||
"code": "hsb",
|
||||
|
|
|
@ -12,7 +12,9 @@
|
|||
{ "text": "New in v3.0", "url": "/usage/v3" },
|
||||
{ "text": "New in v3.1", "url": "/usage/v3-1" },
|
||||
{ "text": "New in v3.2", "url": "/usage/v3-2" },
|
||||
{ "text": "New in v3.3", "url": "/usage/v3-3" }
|
||||
{ "text": "New in v3.2", "url": "/usage/v3-2" },
|
||||
{ "text": "New in v3.3", "url": "/usage/v3-3" },
|
||||
{ "text": "New in v3.4", "url": "/usage/v3-4" }
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -2983,6 +2983,7 @@
|
|||
"from pysbd.utils import PySBDFactory",
|
||||
"",
|
||||
"nlp = spacy.blank('en')",
|
||||
"# Caution: works with spaCy<=2.x.x",
|
||||
"nlp.add_pipe(PySBDFactory(nlp))",
|
||||
"",
|
||||
"doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')",
|
||||
|
|
|
@ -120,8 +120,8 @@ const AlertSpace = ({ nightly, legacy }) => {
|
|||
}
|
||||
|
||||
const navAlert = (
|
||||
<Link to="/usage/v3-3" hidden>
|
||||
<strong>💥 Out now:</strong> spaCy v3.3
|
||||
<Link to="/usage/v3-4" hidden>
|
||||
<strong>💥 Out now:</strong> spaCy v3.4
|
||||
</Link>
|
||||
)
|
||||
|
||||
|
|
|
@ -24,6 +24,8 @@ const CUDA = {
|
|||
'11.3': 'cuda113',
|
||||
'11.4': 'cuda114',
|
||||
'11.5': 'cuda115',
|
||||
'11.6': 'cuda116',
|
||||
'11.7': 'cuda117',
|
||||
}
|
||||
const LANG_EXTRAS = ['ja'] // only for languages with models
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user