Merge pull request #11965 from adrianeboyd/backport/v3.0.9
Backport bug fixes to v3.0.x
This commit is contained in:
commit a65379dede

.github/azure-steps.yml (vendored): 87 changed lines
@@ -1,9 +1,6 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'

 steps:
   - task: UsePythonVersion@0
@@ -16,52 +13,76 @@ steps:
     displayName: 'Set variables'

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"

   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"

   - task: DeleteFiles@1
     inputs:
       contents: "spacy"
     displayName: "Delete source directory"

+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"

   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      python -m pip install dist/$SDIST
     displayName: "Install from sdist"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -W error -c "import spacy"
+    displayName: "Test import"
+
+  - script: |
+      python -m spacy download es_core_news_sm
+      python -c "import spacy; nlp=spacy.load('es_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l es ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      # will have errors due to sparse data, check for summary in output
+      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+    displayName: 'Test debug data CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+    displayName: 'Test train CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'es_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m pip install -U -r requirements.txt
     displayName: "Install test requirements"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+      python -m pytest --pyargs spacy -W error
     displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      python -m spacy download en_core_web_sm
-      python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
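
Note: the build step now uses the PEP 517 "build" frontend (python -m build --sdist) instead of calling setup.py directly, and the new "Test import" step treats any warning raised while importing spacy as a failure. Below is a minimal sketch of the same two checks driven from Python; it assumes the build package and spaCy are installed in the current environment and is illustrative only, not part of the CI template.

import subprocess
import sys

# Build a source distribution into ./dist with the PEP 517 "build" frontend,
# mirroring the "python -m build --sdist" step above.
subprocess.run([sys.executable, "-m", "build", "--sdist"], check=True)

# Fail if importing spacy emits any warning, mirroring the "Test import" step.
subprocess.run([sys.executable, "-W", "error", "-c", "import spacy"], check=True)
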
@@ -28,7 +28,7 @@ jobs:
         inputs:
           versionSpec: "3.7"
       - script: |
-          pip install flake8==3.5.0
+          pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
         displayName: "flake8"

@@ -38,7 +38,7 @@ jobs:
       matrix:
         # We're only running one platform per Python version to speed up builds
         Python36Linux:
-          imageName: "ubuntu-latest"
+          imageName: "ubuntu-20.04"
           python.version: "3.6"
         # Python36Windows:
         #   imageName: "windows-latest"
@@ -47,7 +47,7 @@ jobs:
         #   imageName: "macos-latest"
         #   python.version: "3.6"
         # Python37Linux:
-        #   imageName: "ubuntu-latest"
+        #   imageName: "ubuntu-20.04"
         #   python.version: "3.7"
         Python37Windows:
           imageName: "windows-latest"
@@ -89,20 +89,3 @@ jobs:
       - template: .github/azure-steps.yml
         parameters:
           python_version: '$(python.version)'
-          architecture: 'x64'
-
-# - job: "TestGPU"
-#   dependsOn: "Validate"
-#   strategy:
-#     matrix:
-#       Python38LinuxX64_GPU:
-#         python.version: '3.8'
-#   pool:
-#     name: "LinuxX64_GPU"
-#   steps:
-#     - template: .github/azure-steps.yml
-#       parameters:
-#         python_version: '$(python.version)'
-#         architecture: 'x64'
-#         gpu: true
-#         num_build_jobs: 24
@@ -11,6 +11,7 @@ srsly>=2.4.1,<3.0.0
 catalogue>=2.0.4,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@@ -26,6 +27,6 @@ cython>=0.25,<3.0
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.5.0,<3.6.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
 mypy==0.910
@@ -49,9 +49,10 @@ install_requires =
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
     catalogue>=2.0.4,<2.1.0
+    # Third-party dependencies
     typer>=0.3.0,<0.4.0
     pathy>=0.3.5
-    # Third-party dependencies
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.8"
+__version__ = "3.0.9"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -355,7 +355,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             output_file.write(input_file.read())
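
Note: newer smart_open releases replace the ignore_ext keyword with compression; passing compression="disable" keeps the downloaded bytes untouched (for example, a .gz archive is not transparently decompressed). A rough sketch of a version-tolerant call is below; open_raw is an illustrative helper, not part of spaCy, and the TypeError fallback for older smart_open versions is an assumption.

import smart_open

def open_raw(url: str):
    # Prefer the newer "compression" keyword; fall back to the older
    # "ignore_ext" keyword if the installed smart_open does not accept it.
    try:
        return smart_open.open(url, mode="rb", compression="disable")
    except TypeError:
        return smart_open.open(url, mode="rb", ignore_ext=True)
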
@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
 train = null
@@ -320,6 +320,11 @@ class Errors:
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
@@ -6,6 +6,7 @@ from thinc.api import Model, Maxout, Linear
 from ...util import registry
 from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab
+from ...tokens import Span


 @registry.architectures("spacy.EntityLinker.v1")
@@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:


 @registry.misc("spacy.CandidateGenerator.v1")
-def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates
@@ -9,7 +9,7 @@ import warnings

 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
-from ..tokens import Doc
+from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -67,7 +67,7 @@ def make_entity_linker(
     incl_prior: bool,
     incl_context: bool,
     entity_vector_length: int,
-    get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
 ):
     """Construct an EntityLinker component.

@@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe):
         incl_prior: bool,
         incl_context: bool,
         entity_vector_length: int,
-        get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     ) -> None:
         """Initialize an entity linker.

@@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe):
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
         entity_vector_length (int): Size of encoding vectors in the KB.
-        get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.

         DOCS: https://spacy.io/api/entitylinker#init
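
Note: with Span imported at module level, the candidate-generator annotations can reference the class directly instead of the string forward reference "Span". A small sketch of the resulting type is below; it assumes spaCy v3.0.x is installed, and CandidateGenerator/make_candidates are illustrative names, not spaCy API.

from typing import Callable, Iterable

from spacy.kb import KnowledgeBase, Candidate, get_candidates
from spacy.tokens import Span

# A candidate generator maps a (knowledge base, mention span) pair to the
# candidate entities for that mention.
CandidateGenerator = Callable[[KnowledgeBase, Span], Iterable[Candidate]]

make_candidates: CandidateGenerator = get_candidates
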
@@ -4,7 +4,7 @@ from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
 from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError
+from thinc.api import Optimizer, ConfigValidationError, Model
 from thinc.config import Promise
 from collections import defaultdict
 import inspect
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
     from .language import Language  # noqa: F401
     from .training import Example  # noqa: F401
+    from .vocab import Vocab  # noqa: F401


 # fmt: off
@@ -353,7 +354,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
+    objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
     # fmt: on

     class Config:
@@ -1,4 +1,5 @@
 import pytest
+import numpy
 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH

@@ -100,14 +101,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
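
Note: the HEAD column of Doc.to_array is unsigned (uint64), and writing a raw negative Python int into an unsigned array warns or raises on recent NumPy releases. Casting through int32 first makes the two's-complement wraparound explicit, which is what this test and the Doc/Span hunks further down rely on. A small standalone sketch, with the printed value assuming standard NumPy casting rules:

import numpy

arr = numpy.zeros(3, dtype=numpy.uint64)
# arr[0] = -1 would warn or raise, depending on the NumPy version, because a
# negative Python int does not fit an unsigned dtype. The explicit cast wraps:
arr[0] = numpy.int32(-1).astype(numpy.uint64)
print(arr[0])  # 18446744073709551615, i.e. 2**64 - 1
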
@@ -4,6 +4,7 @@ import pytest
 import numpy
 import logging
 import mock
+import warnings

 from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span, Token
@@ -316,9 +317,9 @@ def test_doc_from_array_sent_starts(en_vocab):
     # no warning using default attrs
     attrs = doc._get_array_attrs()
     arr = doc.to_array(attrs)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         new_doc.from_array(attrs, arr)
-    assert len(record) == 0
     # only SENT_START uses SENT_START
     attrs = [SENT_START]
     arr = doc.to_array(attrs)
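
Note: pytest.warns(None) is deprecated in pytest 7, so the "no warning is raised" assertions here and in the matcher tests below are rewritten with the standard library: turning warnings into errors inside warnings.catch_warnings() makes any warning fail the test. A generic sketch of the idiom; call_without_warnings is an illustrative helper, not part of the test suite:

import warnings

def call_without_warnings(fn, *args, **kwargs):
    # Any warning emitted by fn() is raised as an exception inside this
    # context, so the surrounding test fails instead of passing silently.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        return fn(*args, **kwargs)
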
@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_ru_doc_lemmatization(ru_lemmatizer):
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]
@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
@@ -1,4 +1,5 @@
 import pytest
+import warnings
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
@@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab):
     matcher.add("TEST1", [doc1])
     with pytest.warns(UserWarning):
         matcher.add("TEST2", [doc2])
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST3", [doc3])
-    assert not record.list
     matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST4", [doc2])
-    assert not record.list


 def test_attr_validation(en_vocab):
@@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
     # All results should be the same because of the fixed seed
     assert len(results) == 3
     ops = get_current_ops()
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)


 def test_issue5838():
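
Note: assert_almost_equal(..., decimal=5) only requires agreement to about 1.5e-5, which tolerates small numerical differences across backends in the fixed-seed textcat outputs. A tiny sketch with made-up numbers:

from numpy.testing import assert_almost_equal

# Passes: |0.123459 - 0.123456| = 3e-6, which is below the decimal=5
# tolerance of 1.5 * 10**-5.
assert_almost_equal(0.123456, 0.123459, decimal=5)
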
@@ -3,6 +3,7 @@ from typing import Callable
 from spacy import util
 from spacy.util import ensure_path, registry, load_model_from_config
 from spacy.kb import KnowledgeBase
+from spacy.vocab import Vocab
 from thinc.api import Config

 from ..util import make_tempdir
@@ -111,7 +112,7 @@ def test_serialize_subclassed_kb():
     @registry.misc("spacy.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[["Vocab"], KnowledgeBase]:
+    ) -> Callable[[Vocab], KnowledgeBase]:
         def custom_kb_factory(vocab):
             kb = SubKnowledgeBase(
                 vocab=vocab,
@@ -10,6 +10,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list
+from spacy.cli._util import upload_file, download_file
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os
@@ -474,3 +475,18 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content
@@ -20,7 +20,7 @@ def get_textcat_bow_kwargs():


 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):
@@ -62,7 +62,7 @@ def get_tok2vec_kwargs():
     }


-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())

@@ -347,6 +347,7 @@ cdef class Doc:
         for annot in annotations:
             if annot:
                 if annot is heads or annot is sent_starts or annot is ent_iobs:
+                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                     for i in range(len(words)):
                         if attrs.ndim == 1:
                             attrs[i] = annot[i]
@@ -297,7 +297,7 @@ cdef class Span:
             for ancestor in ancestors:
                 ancestor_i = ancestor.i - self.c.start
                 if ancestor_i in range(length):
-                    array[i, head_col] = ancestor_i - i
+                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

            # if there is no appropriate ancestor, define a new artificial root
            value = array[i, head_col]
@@ -305,7 +305,7 @@ cdef class Span:
                 new_root = old_to_new_root.get(ancestor_i, None)
                 if new_root is not None:
                     # take the same artificial root as a previous token from the same sentence
-                    array[i, head_col] = new_root - i
+                    array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                 else:
                     # set this token as the new artificial root
                     array[i, head_col] = 0
@@ -329,26 +329,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T

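
Note: HEAD values are stored as offsets relative to each token, so they can be negative (a head to the left), and DEP can hold the negative MISSING_DEP sentinel; the refactor funnels every branch through a per-key row so the int32-to-uint64 cast for negative entries happens in exactly one place. A rough sketch of the relative-head encoding with made-up head indices:

import numpy

heads = [1, 1, 1]  # hypothetical: tokens 0 and 2 both attach to token 1
rel = [h - i for i, h in enumerate(heads)]  # [1, 0, -1]

# Negative offsets cannot be written into a uint64 array directly, so they
# are wrapped via an explicit int32 -> uint64 cast first, as in the hunk above.
encoded = [
    numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v
    for v in rel
]
array = numpy.array([encoded], dtype=numpy.uint64)
print(array)  # [[1 0 18446744073709551615]]
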
@@ -274,3 +274,5 @@ def ensure_shape(vectors_loc):
         # store all the results in a list in memory
         lines2 = open_file(vectors_loc)
         yield from lines2
+        lines2.close()
+        lines.close()
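
Note: ensure_shape is a generator that re-reads the vectors file through a second handle; closing both handles once the generator is exhausted presumably avoids unclosed-file ResourceWarnings now that the test suite runs with -W error. A minimal sketch of the pattern, assuming the caller always consumes the generator to the end; read_lines is an illustrative helper, not spaCy code:

def read_lines(path):
    # Close the handle once the generator is fully consumed. A consumer that
    # stops iterating early would still leave the file open, the same caveat
    # that applies to the change above.
    f = open(path, encoding="utf8")
    yield from f
    f.close()

# usage: list(read_lines("vectors.txt")) reads every line, then closes the file
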