Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-04 12:20:20 +03:00)

Compare commits (44 commits, SHA1):

c83dfa23dc, a65379dede, 4af02ac9e4, 67c6ef2b2a, c4af89f956, 9a60424827,
d4883e79c1, e965f9d40a, d4acae856a, 0f87720411, c8009c2734, d4d4d69cb4,
337ebda793, 5c975565dc, f55b876326, ebcc7d830f, 694c318f4f, 308b1706a7,
3420506954, f71de10405, 5caccbd19e, 6a4a00c447, 749631ad28, 034ac0acf4,
02e18926c3, f94168a41e, 0080454140, 6db938959d, 99a3f26d7f, c62566ffce,
066718b1dc, 81e71a61f8, 6aa3fede76, 71396273a5, e51fff5432, c78eb28dfa,
e3f1d4a7d0, 81515b4690, 8b9355d758, ad026dc5fd, 1db18732e0, a834b03216,
55e5f8ede3, bb97e7bf8a
.github/azure-steps.yml (vendored): 87 changed lines

@@ -1,9 +1,6 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'

 steps:
   - task: UsePythonVersion@0
@@ -16,52 +13,76 @@ steps:
     displayName: 'Set variables'

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"

   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"

   - task: DeleteFiles@1
     inputs:
       contents: "spacy"
     displayName: "Delete source directory"

+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"

   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      python -m pip install dist/$SDIST
     displayName: "Install from sdist"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -W error -c "import spacy"
+    displayName: "Test import"
+
+  - script: |
+      python -m spacy download es_core_news_sm
+      python -c "import spacy; nlp=spacy.load('es_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l es ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      # will have errors due to sparse data, check for summary in output
+      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+    displayName: 'Test debug data CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+    displayName: 'Test train CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'es_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m pip install -U -r requirements.txt
     displayName: "Install test requirements"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+      python -m pytest --pyargs spacy -W error
     displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      python -m spacy download en_core_web_sm
-      python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
 recursive-include licenses *
+recursive-exclude spacy *.cpp
@@ -22,13 +22,13 @@ jobs:
   # defined in .flake8 and overwrites the selected codes.
   - job: "Validate"
     pool:
-      vmImage: "ubuntu-18.04"
+      vmImage: "ubuntu-latest"
     steps:
       - task: UsePythonVersion@0
         inputs:
           versionSpec: "3.7"
       - script: |
-          pip install flake8==3.5.0
+          pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
         displayName: "flake8"

@@ -38,41 +38,50 @@ jobs:
       matrix:
         # We're only running one platform per Python version to speed up builds
         Python36Linux:
-          imageName: "ubuntu-18.04"
+          imageName: "ubuntu-20.04"
           python.version: "3.6"
         # Python36Windows:
-        #   imageName: "vs2017-win2016"
+        #   imageName: "windows-latest"
         #   python.version: "3.6"
         # Python36Mac:
-        #   imageName: "macos-10.14"
+        #   imageName: "macos-latest"
         #   python.version: "3.6"
         # Python37Linux:
-        #   imageName: "ubuntu-18.04"
+        #   imageName: "ubuntu-20.04"
         #   python.version: "3.7"
         Python37Windows:
-          imageName: "vs2017-win2016"
+          imageName: "windows-latest"
           python.version: "3.7"
         # Python37Mac:
-        #   imageName: "macos-10.14"
+        #   imageName: "macos-latest"
         #   python.version: "3.7"
         # Python38Linux:
-        #   imageName: "ubuntu-18.04"
+        #   imageName: "ubuntu-latest"
         #   python.version: "3.8"
         # Python38Windows:
-        #   imageName: "vs2017-win2016"
+        #   imageName: "windows-latest"
         #   python.version: "3.8"
         Python38Mac:
-          imageName: "macos-10.14"
+          imageName: "macos-latest"
           python.version: "3.8"
         Python39Linux:
-          imageName: "ubuntu-18.04"
+          imageName: "ubuntu-latest"
           python.version: "3.9"
-        Python39Windows:
-          imageName: "vs2017-win2016"
-          python.version: "3.9"
-        Python39Mac:
-          imageName: "macos-10.14"
-          python.version: "3.9"
+        # Python39Windows:
+        #   imageName: "windows-latest"
+        #   python.version: "3.9"
+        # Python39Mac:
+        #   imageName: "macos-latest"
+        #   python.version: "3.9"
+        Python310Linux:
+          imageName: "ubuntu-latest"
+          python.version: "3.10"
+        Python310Windows:
+          imageName: "windows-latest"
+          python.version: "3.10"
+        Python310Mac:
+          imageName: "macos-latest"
+          python.version: "3.10"
       maxParallel: 4
     pool:
       vmImage: $(imageName)

@@ -80,20 +89,3 @@ jobs:
       - template: .github/azure-steps.yml
         parameters:
           python_version: '$(python.version)'
-          architecture: 'x64'
-
-  # - job: "TestGPU"
-  #   dependsOn: "Validate"
-  #   strategy:
-  #     matrix:
-  #       Python38LinuxX64_GPU:
-  #         python.version: '3.8'
-  #   pool:
-  #     name: "LinuxX64_GPU"
-  #   steps:
-  #     - template: .github/azure-steps.yml
-  #       parameters:
-  #         python_version: '$(python.version)'
-  #         architecture: 'x64'
-  #         gpu: true
-  #         num_build_jobs: 24
@@ -1,5 +1,8 @@
 # build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7'
-numpy==1.17.3; python_version=='3.8'
+numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
+numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
+numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
+numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
-numpy; python_version>='3.10'
+numpy==1.21.3; python_version=='3.10'
+numpy; python_version>='3.11'
@@ -11,6 +11,7 @@ srsly>=2.4.1,<3.0.0
 catalogue>=2.0.4,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@@ -26,5 +27,6 @@ cython>=0.25,<3.0
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.5.0,<3.6.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
+mypy==0.910
@@ -49,9 +49,10 @@ install_requires =
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
     catalogue>=2.0.4,<2.1.0
+    # Third-party dependencies
     typer>=0.3.0,<0.4.0
     pathy>=0.3.5
-    # Third-party dependencies
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.6"
+__version__ = "3.0.9"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -355,7 +355,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             output_file.write(input_file.read())
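In newer smart-open releases the `ignore_ext` flag was dropped; `compression="disable"` is the replacement that turns off extension-based (de)compression. A minimal usage sketch under that assumption (the URL is only illustrative):

    import smart_open

    # Read raw bytes without smart_open transparently decompressing .gz/.bz2
    # sources based on their file extension (smart-open >= 5.x keyword).
    with smart_open.open("https://example.com/vectors.bin.gz", mode="rb", compression="disable") as f:
        payload = f.read()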
@@ -115,7 +115,8 @@ def convert(
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(Path(input_path), converter):
-        input_data = input_loc.open("r", encoding="utf-8").read()
+        with input_loc.open("r", encoding="utf-8") as infile:
+            input_data = infile.read()
         # Use converter function to convert data
         func = CONVERTERS[converter]
         docs = func(
@@ -18,7 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
-    create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
 train = null
@@ -418,7 +418,7 @@ compound = 1.001

 [initialize]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
+vectors = ${paths.vectors}
 {% else -%}
 vectors = "{{ word_vectors }}"
 {% endif -%}
@@ -3,6 +3,7 @@ from pathlib import Path
 import sys
 import requests
 from wasabi import msg, Printer
+import warnings

 from ._util import app
 from .. import about
@@ -45,7 +46,7 @@ def validate() -> None:
                 version = msg.text(data["version"], color="green", no_print=True)
             else:
                 version = msg.text(data["version"], color="red", no_print=True)
-                comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
+                comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
             rows.append((data["name"], data["spacy"], version, comp))
         msg.table(rows, header=header)
     else:
@@ -78,6 +79,8 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
     msg.good("Loaded compatibility table")
     compat = r.json()["spacy"]
     all_models = set()
-    installed_models = get_installed_models()
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message="\\[W09[45]")
+        installed_models = get_installed_models()
     for spacy_v, models in dict(compat).items():
         all_models.update(models.keys())
@@ -92,6 +95,8 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
             spacy_version = about.__version__
         else:
             model_path = get_package_path(package)
-            model_meta = get_model_meta(model_path)
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="\\[W09[45]")
+                model_meta = get_model_meta(model_path)
             spacy_version = model_meta.get("spacy_version", "n/a")
         is_compat = is_compatible_version(about.__version__, spacy_version)
@@ -320,6 +320,11 @@ class Errors:
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
@@ -518,6 +523,11 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

     # New errors added in v3.x
+    E867 = ("The 'textcat' component requires at least two labels because it "
+            "uses mutually exclusive classes where exactly one label is True "
+            "for each doc. For binary classification tasks, you can use two "
+            "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
+            "can use the 'textcat_multilabel' component with one label.")
     E870 = ("Could not serialize the DocBin because it is too large. Consider "
             "splitting up your documents into several doc bins and serializing "
             "each separately. spacy.Corpus.v1 will search recursively for all "
@@ -1,16 +1,11 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language


 class AzerbaijaniDefaults(Language.Defaults):
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
-    token_match = TOKEN_MATCH
-    syntax_iterators = SYNTAX_ITERATORS


 class Azerbaijani(Language):
@@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer):
         forms.extend(oov_forms)
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
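The switch from `set` to `dict.fromkeys` still removes duplicate lemma candidates but keeps their first-seen order, so the returned list is deterministic across runs. A plain-Python illustration (not spaCy-specific):

    forms = ["πόλη", "πόλις", "πόλη", "πόλεως"]

    # set() drops duplicates but does not guarantee any particular order
    print(list(set(forms)))

    # dict.fromkeys() drops duplicates and keeps insertion order (Python 3.7+)
    print(list(dict.fromkeys(forms)))  # ['πόλη', 'πόλις', 'πόλεως']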
@@ -12,7 +12,6 @@ PUNCT_RULES = {"«": '"', "»": '"'}


 class RussianLemmatizer(Lemmatizer):
-    _morph = None

     def __init__(
         self,
@@ -31,8 +30,8 @@ class RussianLemmatizer(Lemmatizer):
                 "The Russian lemmatizer mode 'pymorphy2' requires the "
                 "pymorphy2 library. Install it with: pip install pymorphy2"
             ) from None
-        if RussianLemmatizer._morph is None:
-            RussianLemmatizer._morph = MorphAnalyzer()
+        if getattr(self, "_morph", None) is None:
+            self._morph = MorphAnalyzer()
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)

     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
@@ -7,8 +7,6 @@ from ...vocab import Vocab


 class UkrainianLemmatizer(RussianLemmatizer):
-    _morph = None
-
     def __init__(
         self,
         vocab: Vocab,
@@ -27,6 +25,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 "pymorphy2 library and dictionaries. Install them with: "
                 "pip install pymorphy2 pymorphy2-dicts-uk"
             ) from None
-        if UkrainianLemmatizer._morph is None:
-            UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
+        if getattr(self, "_morph", None) is None:
+            self._morph = MorphAnalyzer(lang="uk")
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
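Both lemmatizers now create the pymorphy2 analyzer lazily per instance instead of caching it on the class. Because UkrainianLemmatizer inherits from RussianLemmatizer, a class-level `_morph` could be shared or overwritten across the two languages; the `getattr` guard keeps one analyzer per object. A hypothetical, self-contained sketch of the pattern (class and factory names are illustrative, not spaCy API):

    class BaseLemmatizer:
        def __init__(self, analyzer_factory):
            # Create the analyzer once per instance; storing it on the class
            # would let subclasses clobber each other's analyzer.
            if getattr(self, "_morph", None) is None:
                self._morph = analyzer_factory()

    class RuLemmatizer(BaseLemmatizer):
        def __init__(self):
            super().__init__(lambda: "ru-analyzer")

    class UkLemmatizer(BaseLemmatizer):
        def __init__(self):
            super().__init__(lambda: "uk-analyzer")

    assert RuLemmatizer()._morph != UkLemmatizer()._morph  # no shared class state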
@@ -50,6 +50,8 @@ cdef class PhraseMatcher:
         if isinstance(attr, (int, long)):
             self.attr = attr
         else:
+            if attr is None:
+                attr = "ORTH"
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"
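With the added guard, `attr=None` now falls back to the default `ORTH` token attribute instead of failing on `None.upper()`. A minimal sketch, assuming an empty `Vocab` is enough for construction:

    from spacy.matcher import PhraseMatcher
    from spacy.vocab import Vocab

    # attr=None is now treated the same as the default ("ORTH")
    matcher = PhraseMatcher(Vocab(), attr=None)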
@@ -6,6 +6,7 @@ from thinc.api import Model, Maxout, Linear
 from ...util import registry
 from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab
+from ...tokens import Span


 @registry.architectures("spacy.EntityLinker.v1")
@@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:


 @registry.misc("spacy.CandidateGenerator.v1")
-def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates
@@ -3,7 +3,7 @@ from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Mode
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance

-from ...util import registry
+from ...util import registry, OOV_RANK
 from ...errors import Errors
 from ...attrs import ID

@@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
+    target[ids == OOV_RANK] = 0
     d_target, loss = distance(prediction, target)
     return loss, d_target
@@ -9,7 +9,7 @@ import warnings

 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
-from ..tokens import Doc
+from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -67,7 +67,7 @@ def make_entity_linker(
     incl_prior: bool,
     incl_context: bool,
     entity_vector_length: int,
-    get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
 ):
     """Construct an EntityLinker component.

@@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe):
         incl_prior: bool,
         incl_context: bool,
         entity_vector_length: int,
-        get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     ) -> None:
         """Initialize an entity linker.

@@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe):
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
         entity_vector_length (int): Size of encoding vectors in the KB.
-        get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.

         DOCS: https://spacy.io/api/entitylinker#init
@@ -481,7 +481,8 @@ class EntityLinker(TrainablePipe):

         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with p.open("rb") as infile:
+                    self.model.from_bytes(infile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
@@ -3,6 +3,7 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable,
 from collections import defaultdict
 from pathlib import Path
 import srsly
+import warnings

 from .pipe import Pipe
 from ..training import Example
@@ -102,17 +103,12 @@ class EntityRuler(Pipe):
         self.overwrite = overwrite_ents
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
+        self._validate = validate
         self.matcher = Matcher(nlp.vocab, validate=validate)
-        if phrase_matcher_attr is not None:
-            if phrase_matcher_attr.upper() == "TEXT":
-                phrase_matcher_attr = "ORTH"
-            self.phrase_matcher_attr = phrase_matcher_attr
-            self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-            )
-        else:
-            self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
+        self.phrase_matcher_attr = phrase_matcher_attr
+        self.phrase_matcher = PhraseMatcher(
+            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+        )
         self.ent_id_sep = ent_id_sep
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
@@ -146,6 +142,8 @@ class EntityRuler(Pipe):

     def match(self, doc: Doc):
         self._require_patterns()
-        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="\\[W036")
+            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
@@ -281,7 +279,7 @@ class EntityRuler(Pipe):
                     current_index = i
                     break
             subsequent_pipes = [
-                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+                pipe for pipe in self.nlp.pipe_names[current_index :]
             ]
         except ValueError:
             subsequent_pipes = []
@@ -317,20 +315,22 @@ class EntityRuler(Pipe):
             pattern = entry["pattern"]
             if isinstance(pattern, Doc):
                 self.phrase_patterns[label].append(pattern)
+                self.phrase_matcher.add(label, [pattern])
             elif isinstance(pattern, list):
                 self.token_patterns[label].append(pattern)
+                self.matcher.add(label, [pattern])
             else:
                 raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, patterns)

     def clear(self) -> None:
         """Reset all patterns."""
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
+        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
+        self.phrase_matcher = PhraseMatcher(
+            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
+        )

     def _require_patterns(self) -> None:
         """Raise a warning if this component has no patterns defined."""
@@ -381,7 +381,6 @@ class EntityRuler(Pipe):
             self.add_patterns(cfg.get("patterns", cfg))
             self.overwrite = cfg.get("overwrite", False)
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            if self.phrase_matcher_attr is not None:
             self.phrase_matcher = PhraseMatcher(
                 self.nlp.vocab, attr=self.phrase_matcher_attr
             )
@@ -435,7 +434,6 @@ class EntityRuler(Pipe):
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

-            if self.phrase_matcher_attr is not None:
             self.phrase_matcher = PhraseMatcher(
                 self.nlp.vocab, attr=self.phrase_matcher_attr
             )
@@ -332,6 +332,8 @@ class TextCategorizer(TrainablePipe):
         else:
             for label in labels:
                 self.add_label(label)
+        if len(self.labels) < 2:
+            raise ValueError(Errors.E867)
         if positive_label is not None:
             if positive_label not in self.labels:
                 err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
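The new E867 check means a `textcat` component refuses to initialize with fewer than two labels, since it predicts exactly one mutually exclusive class per doc. A hedged sketch of the two options the error message describes (label names are illustrative):

    import spacy

    # Option 1: exclusive binary classification with two labels on "textcat"
    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # Option 2: a single label that is independently true or false
    nlp_multi = spacy.blank("en")
    textcat_multi = nlp_multi.add_pipe("textcat_multilabel")
    textcat_multi.add_label("POSITIVE")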
@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):

         DOCS: https://spacy.io/api/tok2vec#predict
         """
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            width = self.model.get_dim("nO")
+            return [self.model.ops.alloc((0, width)) for doc in docs]
         tokvecs = self.model.predict(docs)
         batch_id = Tok2VecListener.get_batch_id(docs)
         for listener in self.listeners:
@@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):

         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with open(p, "rb") as mfile:
+                    self.model.from_bytes(mfile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
@@ -4,7 +4,7 @@ from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
 from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError
+from thinc.api import Optimizer, ConfigValidationError, Model
 from thinc.config import Promise
 from collections import defaultdict
 import inspect
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
     from .language import Language  # noqa: F401
     from .training import Example  # noqa: F401
+    from .vocab import Vocab  # noqa: F401


 # fmt: off
@@ -353,7 +354,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
+    objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
     # fmt: on

     class Config:
@@ -3,7 +3,13 @@ from spacy.util import get_lang_class


 def pytest_addoption(parser):
-    parser.addoption("--slow", action="store_true", help="include slow tests")
+    try:
+        parser.addoption("--slow", action="store_true", help="include slow tests")
+        parser.addoption("--issue", action="store", help="test specific issues")
+    # Options are already added, e.g. if conftest is copied in a build pipeline
+    # and runs twice
+    except ValueError:
+        pass


 def pytest_runtest_setup(item):
@@ -1,4 +1,5 @@
 import pytest
+import numpy
 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH

@@ -100,14 +101,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
@@ -4,6 +4,7 @@ import pytest
 import numpy
 import logging
 import mock
+import warnings

 from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span, Token
@@ -316,9 +317,9 @@ def test_doc_from_array_sent_starts(en_vocab):
     # no warning using default attrs
     attrs = doc._get_array_attrs()
     arr = doc.to_array(attrs)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         new_doc.from_array(attrs, arr)
-    assert len(record) == 0
     # only SENT_START uses SENT_START
     attrs = [SENT_START]
     arr = doc.to_array(attrs)
@@ -351,13 +352,21 @@ def test_doc_from_array_morph(en_vocab):

 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
     en_docs[0][2]._.is_ambiguous = True  # docs
@@ -371,8 +380,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):

     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -384,11 +393,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)

     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -397,11 +407,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)

     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -414,6 +425,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)

+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
+

 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]
@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_ru_doc_lemmatization(ru_lemmatizer):
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]
@@ -4,12 +4,13 @@ from spacy.util import get_lang_class

 # fmt: off
 # Only include languages with no external dependencies
-# excluded: ja, ru, th, uk, vi, zh
-LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
-             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
-             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
-             "yo"]
+# excluded: ja, ko, th, vi, zh
+LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
+             "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
+             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
+             "tr", "tt", "uk", "ur", "xx", "yo"]
 # fmt: on
@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
@@ -481,6 +481,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     assert len(matches) == 1


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_valid_callback(en_vocab):
     """Test that on_match can only be None or callable."""
     matcher = Matcher(en_vocab)
@@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
     assert texts == ["zero", "one", "two"]


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_remove():
     nlp = English()
     matcher = Matcher(nlp.vocab)
@@ -1,4 +1,5 @@
 import pytest
+import warnings
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
@@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab):
     matcher.add("TEST1", [doc1])
     with pytest.warns(UserWarning):
         matcher.add("TEST2", [doc2])
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST3", [doc3])
-    assert not record.list
     matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST4", [doc2])
-    assert not record.list


 def test_attr_validation(en_vocab):
@@ -252,12 +252,12 @@ def test_ruler_before_ner():
     # 1 : Entity Ruler - should set "this" to B and everything else to empty
     patterns = [{"label": "THING", "pattern": "This"}]
     ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)

     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
     nlp.initialize()
+    ruler.add_patterns(patterns)
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]
@@ -324,6 +324,7 @@ def test_append_alias(nlp):
     assert len(mykb.get_alias_candidates("douglas")) == 3


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_append_invalid_alias(nlp):
     """Test that append an alias will throw an error if prior probs are exceeding 1"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -342,6 +343,7 @@ def test_append_invalid_alias(nlp):
     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""
     vector_length = 1
@@ -89,6 +89,20 @@ def test_entity_ruler_init_clear(nlp, patterns):
     assert len(ruler.labels) == 0


+def test_entity_ruler_clear(nlp, patterns):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world")
+    assert len(doc.ents) == 1
+    ruler.clear()
+    assert len(ruler.labels) == 0
+    with pytest.warns(UserWarning):
+        doc = nlp("hello world")
+        assert len(doc.ents) == 0
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
@@ -334,24 +334,31 @@ def test_language_factories_invalid():


 @pytest.mark.parametrize(
-    "weights,expected",
+    "weights,override,expected",
     [
-        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
-        ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
         (
             [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
+            {},
             {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
         ),
         (
-            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
-            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
+            [{"a": 100, "b": 300}, {"c": 50, "d": 50}],
+            {},
+            {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
         ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
+        ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
     ],
 )
-def test_language_factories_combine_score_weights(weights, expected):
-    result = combine_score_weights(weights)
+def test_language_factories_combine_score_weights(weights, override, expected):
+    result = combine_score_weights(weights, override)
     assert sum(result.values()) in (0.99, 1.0, 0.0)
     assert result == expected

@@ -377,17 +384,17 @@ def test_language_factories_scores():
     # Test with custom defaults
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = 0.0
-    config["training"]["score_weights"]["b3"] = 1.0
+    config["training"]["score_weights"]["b3"] = 1.3
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
+    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
     assert score_weights == expected
     # Test with null values
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = None
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
+    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
     assert score_weights == expected
@@ -108,6 +108,12 @@ def test_label_types(name):
     textcat.add_label("answer")
     with pytest.raises(ValueError):
         textcat.add_label(9)
+    # textcat requires at least two labels
+    if name == "textcat":
+        with pytest.raises(ValueError):
+            nlp.initialize()
+    else:
+        nlp.initialize()


 @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
@@ -11,7 +11,7 @@ from spacy.lang.en import English
 from thinc.api import Config, get_current_ops
 from numpy.testing import assert_array_equal

-from ..util import get_batch, make_tempdir
+from ..util import get_batch, make_tempdir, add_vecs_to_vocab


 def test_empty_doc():

@@ -134,9 +134,25 @@ TRAIN_DATA = [
 ]


-def test_tok2vec_listener():
+@pytest.mark.parametrize("with_vectors", (False, True))
+def test_tok2vec_listener(with_vectors):
     orig_config = Config().from_str(cfg_string)
+    orig_config["components"]["tok2vec"]["model"]["embed"][
+        "include_static_vectors"
+    ] = with_vectors
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+
+    if with_vectors:
+        ops = get_current_ops()
+        vectors = [
+            ("apple", ops.asarray([1, 2, 3])),
+            ("orange", ops.asarray([-1, -2, -3])),
+            ("and", ops.asarray([-1, -1, -1])),
+            ("juice", ops.asarray([5, 5, 10])),
+            ("pie", ops.asarray([7, 6.3, 8.9])),
+        ]
+        add_vecs_to_vocab(nlp.vocab, vectors)
+
     assert nlp.pipe_names == ["tok2vec", "tagger"]
     tagger = nlp.get_pipe("tagger")
     tok2vec = nlp.get_pipe("tok2vec")

@@ -163,6 +179,9 @@ def test_tok2vec_listener():
     ops = get_current_ops()
     assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))

+    # test with empty doc
+    doc = nlp("")
+
     # TODO: should this warn or error?
     nlp.select_pipes(disable="tok2vec")
     assert nlp.pipe_names == ["tagger"]
@@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
     # All results should be the same because of the fixed seed
     assert len(results) == 3
     ops = get_current_ops()
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)


 def test_issue5838():
34
spacy/tests/regression/test_issue8216.py
Normal file
@@ -0,0 +1,34 @@
+import pytest
+
+from spacy import registry
+from spacy.language import Language
+from spacy.pipeline import EntityRuler
+
+
+@pytest.fixture
+def nlp():
+    return Language()
+
+
+@pytest.fixture
+@registry.misc("entity_ruler_patterns")
+def patterns():
+    return [
+        {"label": "HELLO", "pattern": "hello world"},
+        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
+        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
+        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
+        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
+        {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
+    ]
+
+
+def test_entity_ruler_fix8216(nlp, patterns):
+    """Test that patterns don't get added excessively."""
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
+    ruler.add_patterns(patterns)
+    pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+    assert pattern_count > 0
+    ruler.add_patterns([])
+    after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+    assert after_count == pattern_count
@@ -3,6 +3,7 @@ from typing import Callable
 from spacy import util
 from spacy.util import ensure_path, registry, load_model_from_config
 from spacy.kb import KnowledgeBase
+from spacy.vocab import Vocab
 from thinc.api import Config

 from ..util import make_tempdir

@@ -111,7 +112,7 @@ def test_serialize_subclassed_kb():
     @registry.misc("spacy.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[["Vocab"], KnowledgeBase]:
+    ) -> Callable[[Vocab], KnowledgeBase]:
         def custom_kb_factory(vocab):
             kb = SubKnowledgeBase(
                 vocab=vocab,
@@ -10,6 +10,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list
+from spacy.cli._util import upload_file, download_file
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os

@@ -474,3 +475,18 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content
@@ -20,7 +20,7 @@ def get_textcat_bow_kwargs():


 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):

@@ -62,7 +62,7 @@ def get_tok2vec_kwargs():
     }


-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())

@@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 @pytest.mark.parametrize("file_name", ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
     loc = ensure_path(__file__).parent / file_name
-    text = loc.open("r", encoding="utf8").read()
+    with loc.open("r", encoding="utf8") as infile:
+        text = infile.read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
@@ -182,6 +182,27 @@ def test_Example_from_dict_with_entities(annots):
     assert example.reference[5].ent_type_ == "LOC"


+def test_Example_from_dict_with_empty_entities():
+    annots = {
+        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+        "entities": [],
+    }
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    example = Example.from_dict(predicted, annots)
+    # entities as empty list sets everything to O
+    assert example.reference.has_annotation("ENT_IOB")
+    assert len(list(example.reference.ents)) == 0
+    assert all(token.ent_iob_ == "O" for token in example.reference)
+    # various unset/missing entities leaves entities unset
+    annots["entities"] = None
+    example = Example.from_dict(predicted, annots)
+    assert not example.reference.has_annotation("ENT_IOB")
+    annots.pop("entities", None)
+    example = Example.from_dict(predicted, annots)
+    assert not example.reference.has_annotation("ENT_IOB")
+
+
 @pytest.mark.parametrize(
     "annots",
     [
@@ -1,6 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
-from thinc.api import Config
+from thinc.api import Config, fix_random_seed
 from spacy import Language
 from spacy.util import load_model_from_config, registry, resolve_dot_names
 from spacy.schemas import ConfigSchemaTraining

@@ -64,8 +64,8 @@ def test_readers():
 @pytest.mark.parametrize(
     "reader,additional_config",
     [
-        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
-        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
+        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
         ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
     ],
 )

@@ -82,17 +82,18 @@ def test_cat_readers(reader, additional_config):

     [nlp]
     lang = "en"
-    pipeline = ["tok2vec", "textcat"]
+    pipeline = ["tok2vec", "textcat_multilabel"]

     [components]

     [components.tok2vec]
     factory = "tok2vec"

-    [components.textcat]
-    factory = "textcat"
+    [components.textcat_multilabel]
+    factory = "textcat_multilabel"
     """
     config = Config().from_str(nlp_config_string)
+    fix_random_seed(config["training"]["seed"])
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
@@ -347,6 +347,7 @@ cdef class Doc:
             for annot in annotations:
                 if annot:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                     for i in range(len(words)):
                         if attrs.ndim == 1:
                             attrs[i] = annot[i]

@@ -1141,6 +1142,10 @@ cdef class Doc:
             else:
                 warnings.warn(Warnings.W102.format(key=key, value=value))
         for key in doc.spans:
+            # if a spans key is in any doc, include it in the merged doc
+            # even if it is empty
+            if key not in concat_spans:
+                concat_spans[key] = []
             for span in doc.spans[key]:
                 concat_spans[key].append((
                     span.start_char + char_offset,

@@ -1150,7 +1155,7 @@ cdef class Doc:
                     span.text, # included as a check
                 ))
         char_offset += len(doc.text)
-        if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
+        if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
             char_offset += 1

     arrays = [doc.to_array(attrs) for doc in docs]
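The comment added in the second hunk above describes user-visible behavior of `Doc.from_docs`: a spans key present on any input doc now appears in the merged doc even when its group is empty. A minimal sketch of that behavior (the group name and texts are illustrative, assuming the spaCy v3.1+ semantics this change introduces):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1 = nlp("Hello world")
doc1.spans["clauses"] = [doc1[0:2]]
doc2 = nlp("Another doc")
doc2.spans["clauses"] = []  # empty group on this doc

merged = Doc.from_docs([doc1, doc2])
assert "clauses" in merged.spans  # key survives even though doc2's group is empty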
@@ -297,7 +297,7 @@ cdef class Span:
         for ancestor in ancestors:
             ancestor_i = ancestor.i - self.c.start
             if ancestor_i in range(length):
-                array[i, head_col] = ancestor_i - i
+                array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

         # if there is no appropriate ancestor, define a new artificial root
         value = array[i, head_col]

@@ -305,7 +305,7 @@ cdef class Span:
             new_root = old_to_new_root.get(ancestor_i, None)
             if new_root is not None:
                 # take the same artificial root as a previous token from the same sentence
                 array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
             else:
                 # set this token as the new artificial root
                 array[i, head_col] = 0
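These casts (and the similar one in the next hunk) write negative head offsets into attribute arrays that spaCy stores with a uint64 dtype. Going through numpy.int32 first makes the wrap-around explicit rather than relying on how a given numpy version handles negative values for unsigned dtypes. A small illustration (the values are illustrative, not taken from the diff):

import numpy

head_offset = -2  # e.g. a head two tokens to the left of the current token
wrapped = numpy.int32(head_offset).astype(numpy.uint64)
# the original signed value can be recovered by casting back down
assert int(wrapped.astype(numpy.int32)) == head_offset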
@@ -329,26 +329,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T

@@ -416,7 +417,7 @@ def _fix_legacy_dict_data(example_dict):
     token_dict = example_dict.get("token_annotation", {})
     doc_dict = example_dict.get("doc_annotation", {})
     for key, value in example_dict.items():
-        if value:
+        if value is not None:
             if key in ("token_annotation", "doc_annotation"):
                 pass
             elif key == "ids":
@@ -274,3 +274,5 @@ def ensure_shape(vectors_loc):
     # store all the results in a list in memory
     lines2 = open_file(vectors_loc)
     yield from lines2
+    lines2.close()
+    lines.close()
@@ -1370,32 +1370,14 @@ def combine_score_weights(
     should be preserved.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
+    # We divide each weight by the total weight sum.
     # We first need to extract all None/null values for score weights that
     # shouldn't be shown in the table *or* be weighted
-    result = {}
-    all_weights = []
-    for w_dict in weights:
-        filtered_weights = {}
-        for key, value in w_dict.items():
-            value = overrides.get(key, value)
-            if value is None:
-                result[key] = None
-            else:
-                filtered_weights[key] = value
-        all_weights.append(filtered_weights)
-    for w_dict in all_weights:
-        # We need to account for weights that don't sum to 1.0 and normalize
-        # the score weights accordingly, then divide score by the number of
-        # components.
-        total = sum(w_dict.values())
-        for key, value in w_dict.items():
-            if total == 0:
-                weight = 0.0
-            else:
-                weight = round(value / total / len(all_weights), 2)
-            prev_weight = result.get(key, 0.0)
-            prev_weight = 0.0 if prev_weight is None else prev_weight
-            result[key] = prev_weight + weight
+    result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
+    weight_sum = sum([v if v else 0.0 for v in result.values()])
+    for key, value in result.items():
+        if value and weight_sum > 0:
+            result[key] = round(value / weight_sum, 2)
     return result

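The rewritten helper now simply merges the per-component weight dicts (with overrides applied) and divides every weight by the overall sum, instead of normalizing per component. A rough sketch of that arithmetic, checked against one of the parametrized cases above (the helper name is illustrative; the real function is spacy.util.combine_score_weights):

def combine(weights, overrides=None):
    overrides = overrides or {}
    # later dicts win for duplicate keys, overrides win over both
    result = {k: overrides.get(k, v) for w in weights for k, v in w.items()}
    total = sum(v if v else 0.0 for v in result.values())
    return {
        k: (round(v / total, 2) if v and total > 0 else v)
        for k, v in result.items()
    }

# total = 100 + 300 + 50 + 50 = 500, so each weight becomes its share of 500
assert combine([{"a": 100, "b": 300}, {"c": 50, "d": 50}]) == {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1}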
@@ -10,11 +10,12 @@ api_trainable: true
 ---

 The text categorizer predicts **categories over a whole document**. and comes in
-two flavours: `textcat` and `textcat_multilabel`. When you need to predict
+two flavors: `textcat` and `textcat_multilabel`. When you need to predict
 exactly one true label per document, use the `textcat` which has mutually
 exclusive labels. If you want to perform multi-label classification and predict
-zero, one or more labels per document, use the `textcat_multilabel` component
-instead.
+zero, one or more true labels per document, use the `textcat_multilabel`
+component instead. For a binary classification task, you can use `textcat` with
+**two** labels or `textcat_multilabel` with **one** label.

 Both components are documented on this page.
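The distinction the revised passage draws maps directly onto pipeline setup. A minimal sketch (the labels are illustrative, not taken from the docs page):

import spacy

# exactly one true label per document -> mutually exclusive classes
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# zero, one or more true labels per document
nlp_multi = spacy.blank("en")
textcat_multi = nlp_multi.add_pipe("textcat_multilabel")
textcat_multi.add_label("URGENT")
textcat_multi.add_label("BILLING")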
@@ -189,7 +190,7 @@ This method was previously called `begin_training`.
 | _keyword-only_ | |
 | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
 | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |
+| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |

 ## TextCategorizer.predict {#predict tag="method"}
@@ -262,7 +262,12 @@
     },
     {
         "code": "mk",
-        "name": "Macedonian"
+        "name": "Macedonian",
+        "models": [
+            "mk_core_news_sm",
+            "mk_core_news_md",
+            "mk_core_news_lg"
+        ]
     },
     {
         "code": "ml",