Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-04 20:30:24 +03:00
Compare commits
35 Commits
Commits (SHA1):
0fc87f64bd, 834cc20278, f37be33c61, 5a04e05f95, de96c6888e, 647d1e188e, d2464d7bc9, 6e8ab15445, 427de63f0a, 386a3e69da, b449d355d5, e73755e49f, 41afbb2f89, 571ef56fa9, 1a5352e423, e3ef798e03, 8cfc4c7325, 3ac7230abd, 0de7892033, 21204f17c7, a8b883fead, cca1e21ad6, 346a25f587, 9a566e7d2b, b50fe5ec68, 259ad994e2, 03bee62568, b2f34b1507, 19b16f047f, b6fa6ef94d, 9de43ab0a8, bee99548e0, 99425de369, b31993e03c, f606e1d044
.github/azure-steps.yml (vendored, 117 lines deleted)
@@ -1,117 +0,0 @@
parameters:
  python_version: ''
  architecture: ''
  prefix: ''
  gpu: false
  num_build_jobs: 1

steps:
  - task: UsePythonVersion@0
    inputs:
      versionSpec: ${{ parameters.python_version }}
      architecture: ${{ parameters.architecture }}

  - bash: |
      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
    displayName: 'Set variables'

  - script: |
      ${{ parameters.prefix }} python -m pip install -U pip setuptools
      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
    displayName: "Install dependencies"

  - script: |
      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
    displayName: "Compile and build sdist"

  - script: python -m mypy spacy
    displayName: 'Run mypy'
    condition: ne(variables['python_version'], '3.10')

  - task: DeleteFiles@1
    inputs:
      contents: "spacy"
    displayName: "Delete source directory"

  - script: |
      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
    displayName: "Uninstall all packages"

  - bash: |
      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
      ${{ parameters.prefix }} python -m pip install dist/$SDIST
    displayName: "Install from sdist"

  - script: |
      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
    displayName: "Install test requirements"

  - script: |
      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
    displayName: "Install GPU requirements"
    condition: eq(${{ parameters.gpu }}, true)

  - script: |
      ${{ parameters.prefix }} python -m pytest --pyargs spacy
    displayName: "Run CPU tests"
    condition: eq(${{ parameters.gpu }}, false)

  - script: |
      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
    displayName: "Run GPU tests"
    condition: eq(${{ parameters.gpu }}, true)

  - script: |
      python -m spacy download ca_core_news_sm
      python -m spacy download ca_core_news_md
      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
    displayName: 'Test download CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
    displayName: 'Test convert CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m spacy init config -p ner -l ca ner.cfg
      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
    displayName: 'Test debug config CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      # will have errors due to sparse data, check for summary in output
      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
    displayName: 'Test debug data CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
    displayName: 'Test assemble CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
    displayName: 'Test assemble CLI vectors warning'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
    displayName: 'Test website/meta/universe.json'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      ${{ parameters.prefix }} python -m pip install thinc-apple-ops
      ${{ parameters.prefix }} python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))
.github/workflows/tests.yml (vendored, new file, 170 lines)
@@ -0,0 +1,170 @@
name: tests

on:
  push:
    branches-ignore:
      - "spacy.io"
      - "nightly.spacy.io"
      - "v2.spacy.io"
    paths-ignore:
      - "*.md"
      - "*.mdx"
      - "website/**"
      - ".github/workflows/**"
  pull_request:
    types: [opened, synchronize, reopened, edited]
    paths-ignore:
      - "*.md"
      - "*.mdx"
      - "website/**"

jobs:
  validate:
    name: Validate
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
        uses: actions/checkout@v3

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
          architecture: x64

      - name: black
        run: |
          python -m pip install black -c requirements.txt
          python -m black spacy --check
      - name: flake8
        run: |
          python -m pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
  tests:
    name: Test
    needs: Validate
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python_version: ["3.10"]
        include:
          - os: ubuntu-20.04
            python_version: "3.6"
          - os: windows-latest
            python_version: "3.7"
          - os: macos-latest
            python_version: "3.8"
          - os: ubuntu-latest
            python_version: "3.9"

    runs-on: ${{ matrix.os }}

    steps:
      - name: Check out repo
        uses: actions/checkout@v3

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          architecture: x64

      - name: Install dependencies
        run: |
          python -m pip install -U build pip setuptools
          python -m pip install -U -r requirements.txt

      - name: Build sdist
        run: |
          python -m build --sdist

      - name: Run mypy
        run: |
          # Install older numpy for mypy (bug with newer numpy+mypy not fixed
          # until mypy 0.981)
          python -m pip install "numpy<1.22"
          python -m mypy spacy
        if: matrix.python_version != '3.6'

      - name: Delete source directory and .egg-info
        run: |
          rm -rf spacy *.egg-info
        shell: bash

      - name: Uninstall all packages
        run: |
          python -m pip freeze
          python -m pip freeze --exclude pywin32 > installed.txt
          python -m pip uninstall -y -r installed.txt

      - name: Install from sdist
        run: |
          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
          SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
        shell: bash

      - name: Test import
        run: python -W error -c "import spacy"

      - name: "Test download CLI"
        run: |
          python -m spacy download ca_core_news_sm
          python -m spacy download ca_core_news_md
          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
        if: matrix.python_version == '3.9'

      - name: "Test no warnings on load (#11713)"
        run: |
          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
        if: matrix.python_version == '3.9'

      - name: "Test convert CLI"
        run: |
          python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
        if: matrix.python_version == '3.9'

      - name: "Test debug config CLI"
        run: |
          python -m spacy init config -p ner -l ca ner.cfg
          python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
        if: matrix.python_version == '3.9'

      - name: "Test debug data CLI"
        run: |
          # will have errors due to sparse data, check for summary in output
          python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
        if: matrix.python_version == '3.9'

      - name: "Test train CLI"
        run: |
          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
        if: matrix.python_version == '3.9'

      - name: "Test assemble CLI"
        run: |
          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
        if: matrix.python_version == '3.9'

      - name: "Test assemble CLI vectors warning"
        run: |
          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
        if: matrix.python_version == '3.9'

      - name: "Install test requirements"
        run: |
          python -m pip install -U -r requirements.txt

      - name: "Run CPU tests"
        run: |
          python -m pytest --pyargs spacy -W error
        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.10')"

      - name: "Run CPU tests with thinc-apple-ops"
        run: |
          python -m pip install 'spacy[apple]'
          python -m pytest --pyargs spacy
        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.10'
@@ -5,7 +5,7 @@ repos:
      - id: black
        language_version: python3.7
  - repo: https://gitlab.com/pycqa/flake8
    rev: 3.9.2
    rev: 5.0.4
    hooks:
      - id: flake8
        args:
@@ -1,109 +0,0 @@
trigger:
  batch: true
  branches:
    include:
      - "*"
    exclude:
      - "spacy.io"
      - "nightly.spacy.io"
      - "v2.spacy.io"
  paths:
    exclude:
      - "website/*"
      - "*.md"
pr:
  paths:
    exclude:
      - "*.md"
      - "website/docs/*"
      - "website/src/*"

jobs:
  # Perform basic checks for most important errors (syntax etc.) Uses the config
  # defined in .flake8 and overwrites the selected codes.
  - job: "Validate"
    pool:
      vmImage: "ubuntu-latest"
    steps:
      - task: UsePythonVersion@0
        inputs:
          versionSpec: "3.7"
      - script: |
          pip install flake8==3.9.2
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
        displayName: "flake8"

  - job: "Test"
    dependsOn: "Validate"
    strategy:
      matrix:
        # We're only running one platform per Python version to speed up builds
        Python36Linux:
          imageName: "ubuntu-latest"
          python.version: "3.6"
        # Python36Windows:
        #   imageName: "windows-latest"
        #   python.version: "3.6"
        # Python36Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.6"
        # Python37Linux:
        #   imageName: "ubuntu-latest"
        #   python.version: "3.7"
        Python37Windows:
          imageName: "windows-latest"
          python.version: "3.7"
        # Python37Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.7"
        # Python38Linux:
        #   imageName: "ubuntu-latest"
        #   python.version: "3.8"
        # Python38Windows:
        #   imageName: "windows-latest"
        #   python.version: "3.8"
        Python38Mac:
          imageName: "macos-latest"
          python.version: "3.8"
        Python39Linux:
          imageName: "ubuntu-latest"
          python.version: "3.9"
        # Python39Windows:
        #   imageName: "windows-latest"
        #   python.version: "3.9"
        # Python39Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.9"
        Python310Linux:
          imageName: "ubuntu-latest"
          python.version: "3.10"
        Python310Windows:
          imageName: "windows-latest"
          python.version: "3.10"
        Python310Mac:
          imageName: "macos-latest"
          python.version: "3.10"
      maxParallel: 4
    pool:
      vmImage: $(imageName)
    steps:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'
          architecture: 'x64'

  # - job: "TestGPU"
  #   dependsOn: "Validate"
  #   strategy:
  #     matrix:
  #       Python38LinuxX64_GPU:
  #         python.version: '3.8'
  #   pool:
  #     name: "LinuxX64_GPU"
  #   steps:
  #     - template: .github/azure-steps.yml
  #       parameters:
  #         python_version: '$(python.version)'
  #         architecture: 'x64'
  #         gpu: true
  #         num_build_jobs: 24
@@ -1,6 +1,8 @@
# build version constraints for use with wheelwright + multibuild
numpy==1.15.0; python_version<='3.7'
numpy==1.17.3; python_version=='3.8'
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
@@ -12,6 +12,7 @@ srsly>=2.4.1,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0

@@ -22,7 +23,9 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
# Require and pin typing_extensions for all python versions as a workaround
# for pydantic incompatibility with typing_extensions>=4.6.0
typing_extensions>=3.7.4.1,<4.6.0
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0
@@ -51,9 +51,10 @@ install_requires =
    wasabi>=0.8.1,<1.1.0
    srsly>=2.4.1,<3.0.0
    catalogue>=2.0.6,<2.1.0
    # Third-party dependencies
    typer>=0.3.0,<0.5.0
    pathy>=0.3.5
    # Third-party dependencies
    smart-open>=5.2.1,<7.0.0
    tqdm>=4.38.0,<5.0.0
    numpy>=1.15.0
    requests>=2.13.0,<3.0.0

@@ -62,7 +63,9 @@ install_requires =
    # Official Python utilities
    setuptools
    packaging>=20.0
    typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
    # Require and pin typing_extensions for all python versions as a workaround
    # for pydantic incompatibility with typing_extensions>=4.6.0
    typing_extensions>=3.7.4.1,<4.6.0
    langcodes>=3.2.0,<4.0.0

[options.entry_points]
setup.py
@@ -125,6 +125,8 @@ class build_ext_options:


class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        if not self.parallel:
            self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS", 1))
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)
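The change above lets the build pick up a parallel compile job count from the environment instead of always using one job. A minimal sketch of the lookup it adds, using the same variable name and default as the diff:

    import os

    # SPACY_NUM_BUILD_JOBS overrides the number of parallel build jobs; unset means 1
    num_build_jobs = int(os.environ.get("SPACY_NUM_BUILD_JOBS", 1))
    print(num_build_jobs)

The new tests.yml workflow exercises this in its "Install from sdist" step via SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST.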
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.2.2"
__version__ = "3.2.6"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
@@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
    if dest.exists() and not force:
        return None
    src = str(src)
    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
    with smart_open.open(src, mode="rb", compression="disable") as input_file:
        with dest.open(mode="wb") as output_file:
            output_file.write(input_file.read())
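The keyword swap tracks the smart-open API: ignore_ext was removed in newer releases, and transparent decompression is now controlled by the compression argument. A minimal sketch of the new call form, with a purely illustrative URL:

    import smart_open

    # compression="disable" keeps the payload as raw bytes instead of
    # transparently decompressing .gz/.bz2 files based on their extension
    with smart_open.open("https://example.com/archive.tar.gz", mode="rb", compression="disable") as f:
        payload = f.read()
    print(len(payload))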
@@ -50,7 +50,7 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
    )
    pip_args = pip_args + ("--no-deps",)
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
    dl_tpl = "{m}-{v}/{m}-{v}{s}"
    if direct:
        components = model.split("-")
        model_name = "".join(components[:-1])
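With the #egg=... fragment dropped, the template now produces only the path of the artifact to download. A quick sketch of how it expands; the model name, version and wheel suffix here are illustrative stand-ins, not values taken from the diff:

    dl_tpl = "{m}-{v}/{m}-{v}{s}"
    filename = dl_tpl.format(m="ca_core_news_sm", v="3.2.0", s="-py3-none-any.whl")
    # e.g. ca_core_news_sm-3.2.0/ca_core_news_sm-3.2.0-py3-none-any.whl,
    # which is joined onto the model release download URL and handed to pip
    print(filename)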
@@ -1,7 +1,7 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths]
train = null
@@ -322,6 +322,11 @@ class Errors(metaclass=ErrorsWithCodes):
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
    E079 = ("Error computing states in beam: number of predicted beams "
            "({pbeams}) does not equal number of gold beams ({gbeams}).")
    E080 = ("Duplicate state found in beam: {key}.")
    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")
@@ -1,4 +1,4 @@
from typing import Tuple, Callable
from typing import List, Tuple, Callable
from thinc.api import Model, to_numpy
from thinc.types import Ragged, Ints1d

@@ -52,14 +52,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
    indices will be [5, 6, 7, 8, 8, 9].
    """
    spans, lengths = _ensure_cpu(spans, lengths)
    indices = []
    indices: List[int] = []
    offset = 0
    for i, length in enumerate(lengths):
        spans_i = spans[i].dataXd + offset
        for j in range(spans_i.shape[0]):
            indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))  # type: ignore[call-overload, index]
            indices.extend(range(spans_i[j, 0], spans_i[j, 1]))  # type: ignore
        offset += length
    return ops.flatten(indices, dtype="i", ndim_if_empty=1)
    return ops.asarray1i(indices)


def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
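The rewritten loop collects plain Python ints and converts once at the end with ops.asarray1i. A tiny standalone illustration of the flattening it performs, reusing the example from the docstring above (spans (5, 9) and (8, 10) in one doc):

    indices = []
    for start, end in [(5, 9), (8, 10)]:
        # each span contributes its token positions start..end-1
        indices.extend(range(start, end))
    assert indices == [5, 6, 7, 8, 8, 9]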
@@ -268,7 +268,10 @@ class SpanCategorizer(TrainablePipe):
        DOCS: https://spacy.io/api/spancategorizer#predict
        """
        indices = self.suggester(docs, ops=self.model.ops)
        scores = self.model.predict((docs, indices))  # type: ignore
        if indices.lengths.sum() == 0:
            scores = self.model.ops.alloc2f(0, 0)
        else:
            scores = self.model.predict((docs, indices))  # type: ignore
        return indices, scores

    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
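The new guard skips the model call entirely when the suggester produced no spans for the whole batch. A tiny illustration of the condition it checks, built with thinc's NumpyOps (the spans and lengths here are illustrative, not from the diff):

    from thinc.api import NumpyOps
    from thinc.types import Ragged

    ops = NumpyOps()
    # a batch of two docs with zero suggested spans each
    indices = Ragged(ops.xp.zeros((0, 0), dtype="i"), ops.asarray1i([0, 0]))
    assert indices.lengths.sum() == 0
    scores = ops.alloc2f(0, 0)  # empty score matrix instead of a model call
    print(scores.shape)  # (0, 0)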
@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):

        DOCS: https://spacy.io/api/tok2vec#predict
        """
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            width = self.model.get_dim("nO")
            return [self.model.ops.alloc((0, width)) for doc in docs]
        tokvecs = self.model.predict(docs)
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners:
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

    # head before start
    arr = doc.to_array(["HEAD"])
    arr[0] = -1
    arr[0] = numpy.int32(-1).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

    # head after end
    arr = doc.to_array(["HEAD"])
    arr[0] = 5
    arr[0] = numpy.int32(5).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)
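The test now builds its out-of-bounds head values the way Doc.to_array represents them: the array dtype is uint64, so a negative relative head goes through an int32 cast rather than being assigned directly. A small standalone illustration of why the cast matters (assumes a reasonably recent NumPy, where assigning a negative Python int to an unsigned array is rejected):

    import numpy

    arr = numpy.zeros((1,), dtype=numpy.uint64)
    # arr[0] = -1 would be refused for a uint64 array; casting via int32 stores
    # the two's-complement bit pattern instead
    arr[0] = numpy.int32(-1).astype(numpy.uint64)
    # reading it back through int32 recovers the intended -1
    assert int(arr[0].astype(numpy.int32)) == -1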
@@ -2,6 +2,7 @@ import weakref

import numpy
import pytest
import warnings
from thinc.api import NumpyOps, get_current_ops

from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS

@@ -528,9 +529,9 @@ def test_doc_from_array_sent_starts(en_vocab):
    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    with pytest.warns(None) as record:
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        new_doc.from_array(attrs, arr)
    assert len(record) == 0
    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
@@ -2,6 +2,9 @@ import pytest
from spacy.tokens import Doc


pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")


def test_ru_doc_lemmatization(ru_lemmatizer):
    words = ["мама", "мыла", "раму"]
    pos = ["NOUN", "VERB", "NOUN"]
@@ -1,6 +1,10 @@
import pytest
from spacy.tokens import Doc


pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")


def test_uk_lemmatizer(uk_lemmatizer):
    """Check that the default uk lemmatizer runs."""
    doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
@@ -1,4 +1,5 @@
import pytest
import warnings
import srsly
from mock import Mock

@@ -314,13 +315,13 @@ def test_phrase_matcher_validation(en_vocab):
    matcher.add("TEST1", [doc1])
    with pytest.warns(UserWarning):
        matcher.add("TEST2", [doc2])
    with pytest.warns(None) as record:
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        matcher.add("TEST3", [doc3])
    assert not record.list
    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
    with pytest.warns(None) as record:
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        matcher.add("TEST4", [doc2])
    assert not record.list


def test_attr_validation(en_vocab):
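Several tests in these files previously used pytest.warns(None) to assert that a call emits no warning; pytest has deprecated that pattern, so the diff switches to turning warnings into errors around the checked call. A minimal sketch of the replacement pattern with a stand-in function (not from the diff):

    import warnings

    def should_not_warn():
        return 42  # stand-in for calls like matcher.add(...) above

    with warnings.catch_warnings():
        warnings.simplefilter("error")  # any warning now raises and fails the test
        should_not_warn()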
@@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():


def test_zero_suggestions():
    # Test with a suggester that returns 0 suggestions
    # Test with a suggester that can return 0 suggestions

    @registry.misc("test_zero_suggester")
    def make_zero_suggester():
        def zero_suggester(docs, *, ops=None):
    @registry.misc("test_mixed_zero_suggester")
    def make_mixed_zero_suggester():
        def mixed_zero_suggester(docs, *, ops=None):
            if ops is None:
                ops = get_current_ops()
            return Ragged(
                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
            )
            spans = []
            lengths = []
            for doc in docs:
                if len(doc) > 0 and len(doc) % 2 == 0:
                    spans.append((0, 1))
                    lengths.append(1)
                else:
                    lengths.append(0)
            spans = ops.asarray2i(spans)
            lengths_array = ops.asarray1i(lengths)
            if len(spans) > 0:
                output = Ragged(ops.xp.vstack(spans), lengths_array)
            else:
                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
            return output

        return zero_suggester
        return mixed_zero_suggester

    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe(
        "spancat",
        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
        config={
            "suggester": {"@misc": "test_mixed_zero_suggester"},
            "spans_key": SPAN_KEY,
        },
    )
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

@@ -397,3 +412,13 @@ def test_zero_suggestions():
    assert set(spancat.labels) == {"LOC", "PERSON"}

    nlp.update(train_examples, sgd=optimizer)
    # empty doc
    nlp("")
    # single doc with zero suggestions
    nlp("one")
    # single doc with one suggestion
    nlp("two two")
    # batch with mixed zero/one suggestions
    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
    # batch with no suggestions
    list(nlp.pipe(["", "one", "three three three"]))
@@ -11,7 +11,7 @@ from spacy.lang.en import English
from thinc.api import Config, get_current_ops
from numpy.testing import assert_array_equal

from ..util import get_batch, make_tempdir
from ..util import get_batch, make_tempdir, add_vecs_to_vocab


def test_empty_doc():

@@ -140,9 +140,25 @@ TRAIN_DATA = [
]


def test_tok2vec_listener():
@pytest.mark.parametrize("with_vectors", (False, True))
def test_tok2vec_listener(with_vectors):
    orig_config = Config().from_str(cfg_string)
    orig_config["components"]["tok2vec"]["model"]["embed"][
        "include_static_vectors"
    ] = with_vectors
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)

    if with_vectors:
        ops = get_current_ops()
        vectors = [
            ("apple", ops.asarray([1, 2, 3])),
            ("orange", ops.asarray([-1, -2, -3])),
            ("and", ops.asarray([-1, -1, -1])),
            ("juice", ops.asarray([5, 5, 10])),
            ("pie", ops.asarray([7, 6.3, 8.9])),
        ]
        add_vecs_to_vocab(nlp.vocab, vectors)

    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")

@@ -169,6 +185,9 @@ def test_tok2vec_listener():
    ops = get_current_ops()
    assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))

    # test with empty doc
    doc = nlp("")

    # TODO: should this warn or error?
    nlp.select_pipes(disable="tok2vec")
    assert nlp.pipe_names == ["tagger"]
@@ -12,6 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli._util import upload_file, download_file
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version

@@ -706,17 +707,42 @@ def test_permitted_package_names():
    assert _is_permitted_package_name("-package") == False
    assert _is_permitted_package_name("package-") == False


def test_debug_data_compile_gold():
    nlp = English()
    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
    ref = Doc(
        nlp.vocab,
        words=["Token", ".", "New York City"],
        sent_starts=[True, False, True],
        ents=["O", "O", "B-ENT"],
    )
    eg = Example(pred, ref)
    data = _compile_gold([eg], ["ner"], nlp, True)
    assert data["boundary_cross_ents"] == 0

    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
    ref = Doc(
        nlp.vocab,
        words=["Token", ".", "New York City"],
        sent_starts=[True, False, True],
        ents=["O", "B-ENT", "I-ENT"],
    )
    eg = Example(pred, ref)
    data = _compile_gold([eg], ["ner"], nlp, True)
    assert data["boundary_cross_ents"] == 1
    assert data["boundary_cross_ents"] == 1


def test_upload_download_local_file():
    with make_tempdir() as d1, make_tempdir() as d2:
        filename = "f.txt"
        content = "content"
        local_file = d1 / filename
        remote_file = d2 / filename
        with local_file.open(mode="w") as file_:
            file_.write(content)
        upload_file(local_file, remote_file)
        local_file.unlink()
        download_file(remote_file, local_file)
        with local_file.open(mode="r") as file_:
            assert file_.read() == content
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():


def get_textcat_cnn_kwargs():
    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


def get_all_params(model):

@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
    }


def test_tok2vec():
def make_test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())
@@ -7,7 +7,7 @@ from ..util import get_cosine, add_vecs_to_vocab

@pytest.fixture
def vectors():
    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -5])]


@pytest.fixture()

@@ -71,19 +71,17 @@ def test_vectors_similarity_DD(vocab, vectors):
def test_vectors_similarity_TD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    with pytest.warns(UserWarning):
        assert isinstance(doc.similarity(doc[0]), float)
        assert isinstance(doc[0].similarity(doc), float)
        assert doc.similarity(doc[0]) == doc[0].similarity(doc)
    assert isinstance(doc.similarity(doc[0]), float)
    assert isinstance(doc[0].similarity(doc), float)
    assert doc.similarity(doc[0]) == doc[0].similarity(doc)


def test_vectors_similarity_TS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    with pytest.warns(UserWarning):
        assert isinstance(doc[:2].similarity(doc[0]), float)
        assert isinstance(doc[0].similarity(doc[-2]), float)
        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
    assert isinstance(doc[:2].similarity(doc[0]), float)
    assert isinstance(doc[0].similarity(doc[-2]), float)
    assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])


def test_vectors_similarity_DS(vocab, vectors):
@@ -356,6 +356,7 @@ cdef class Doc:
        for annot in annotations:
            if annot:
                if annot is heads or annot is sent_starts or annot is ent_iobs:
                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                for i in range(len(words)):
                    if attrs.ndim == 1:
                        attrs[i] = annot[i]
@@ -305,7 +305,7 @@ cdef class Span:
            for ancestor in ancestors:
                ancestor_i = ancestor.i - self.c.start
                if ancestor_i in range(length):
                    array[i, head_col] = ancestor_i - i
                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

            # if there is no appropriate ancestor, define a new artificial root
            value = array[i, head_col]

@@ -313,7 +313,7 @@ cdef class Span:
                new_root = old_to_new_root.get(ancestor_i, None)
                if new_root is not None:
                    # take the same artificial root as a previous token from the same sentence
                    array[i, head_col] = new_root - i
                    array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                else:
                    # set this token as the new artificial root
                    array[i, head_col] = 0
@@ -333,26 +333,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
        if key not in IDS:
            raise ValueError(Errors.E974.format(obj="token", key=key))
        elif key in ["ORTH", "SPACY"]:
            pass
            continue
        elif key == "HEAD":
            attrs.append(key)
            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
        elif key == "DEP":
            attrs.append(key)
            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
        elif key == "SENT_START":
            attrs.append(key)
            values.append([to_ternary_int(v) for v in value])
            row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
            attrs.append(key)
            values.append([vocab.morphology.add(v) for v in value])
            row = [vocab.morphology.add(v) for v in value]
        else:
            attrs.append(key)
            if not all(isinstance(v, str) for v in value):
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
            values.append([vocab.strings.add(v) for v in value])
    array = numpy.asarray(values, dtype="uint64")
            row = [vocab.strings.add(v) for v in value]
        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
    array = numpy.array(values, dtype=numpy.uint64)
    return attrs, array.T
@@ -335,3 +335,5 @@ def ensure_shape(vectors_loc):
        # store all the results in a list in memory
        lines2 = open_file(vectors_loc)
        yield from lines2
        lines2.close()
        lines.close()