Merge pull request #11965 from adrianeboyd/backport/v3.0.9

Backport bug fixes to v3.0.x
Commit a65379dede, authored by Adriane Boyd on 2022-12-15 08:04:20 +01:00 and committed by GitHub.
24 changed files with 130 additions and 87 deletions

View File

@@ -1,9 +1,6 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'
 
 steps:
   - task: UsePythonVersion@0
@@ -16,52 +13,76 @@ steps:
     displayName: 'Set variables'
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"
 
   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"
 
   - task: DeleteFiles@1
     inputs:
       contents: "spacy"
     displayName: "Delete source directory"
 
+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"
 
   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      python -m pip install dist/$SDIST
     displayName: "Install from sdist"
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -W error -c "import spacy"
+    displayName: "Test import"
+
+  - script: |
+      python -m spacy download es_core_news_sm
+      python -c "import spacy; nlp=spacy.load('es_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l es ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      # will have errors due to sparse data, check for summary in output
+      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+    displayName: 'Test debug data CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+    displayName: 'Test train CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'es_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m pip install -U -r requirements.txt
     displayName: "Install test requirements"
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+      python -m pytest --pyargs spacy -W error
     displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      python -m spacy download en_core_web_sm
+      python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
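Note: the rebuilt steps drop the prefix/GPU parameters and exercise the sdist end to end — build with the PEP 517 `build` frontend, uninstall everything, reinstall from the sdist, then smoke-test the install before the CLI checks. A minimal local equivalent of the new "Test import" step (the package name is the only assumption):

```python
import subprocess
import sys

# "-W error" escalates any warning emitted at import time to an exception,
# so a DeprecationWarning in the freshly installed sdist fails the check.
subprocess.run(
    [sys.executable, "-W", "error", "-c", "import spacy"],
    check=True,  # raise CalledProcessError on a non-zero exit code
)
```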

View File

@@ -28,7 +28,7 @@ jobs:
       inputs:
         versionSpec: "3.7"
     - script: |
-        pip install flake8==3.5.0
+        pip install flake8==5.0.4
         python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
       displayName: "flake8"
@@ -38,7 +38,7 @@ jobs:
     matrix:
       # We're only running one platform per Python version to speed up builds
       Python36Linux:
-        imageName: "ubuntu-latest"
+        imageName: "ubuntu-20.04"
        python.version: "3.6"
       # Python36Windows:
       #   imageName: "windows-latest"
@@ -47,7 +47,7 @@ jobs:
      #   imageName: "macos-latest"
      #   python.version: "3.6"
      # Python37Linux:
-     #   imageName: "ubuntu-latest"
+     #   imageName: "ubuntu-20.04"
      #   python.version: "3.7"
      Python37Windows:
        imageName: "windows-latest"
@@ -89,20 +89,3 @@ jobs:
   - template: .github/azure-steps.yml
     parameters:
       python_version: '$(python.version)'
-      architecture: 'x64'
-
-#  - job: "TestGPU"
-#    dependsOn: "Validate"
-#    strategy:
-#      matrix:
-#        Python38LinuxX64_GPU:
-#          python.version: '3.8'
-#    pool:
-#      name: "LinuxX64_GPU"
-#    steps:
-#      - template: .github/azure-steps.yml
-#        parameters:
-#          python_version: '$(python.version)'
-#          architecture: 'x64'
-#          gpu: true
-#          num_build_jobs: 24

View File

@@ -11,6 +11,7 @@ srsly>=2.4.1,<3.0.0
 catalogue>=2.0.4,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@@ -26,6 +27,6 @@ cython>=0.25,<3.0
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.5.0,<3.6.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
 mypy==0.910

View File

@@ -49,9 +49,10 @@ install_requires =
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
     catalogue>=2.0.4,<2.1.0
+    # Third-party dependencies
     typer>=0.3.0,<0.4.0
     pathy>=0.3.5
-    # Third-party dependencies
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.8"
+__version__ = "3.0.9"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -355,7 +355,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             output_file.write(input_file.read())
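Note: `ignore_ext=True` was removed in newer smart_open releases; the `compression` keyword is the replacement, and `"disable"` switches off transparent decompression so a `.gz` model archive is copied byte for byte instead of being gunzipped mid-download. A minimal sketch, with a placeholder URL:

```python
import smart_open

# Stream the source verbatim: without compression="disable", smart_open
# would transparently decompress a file whose name ends in .gz.
with smart_open.open(
    "https://example.com/model.tar.gz", mode="rb", compression="disable"
) as input_file:
    payload = input_file.read()
```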

View File

@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
 train = null
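Note: the added `and transformer_data` guard lets the template fall back to the CPU pipeline when no transformer recommendation exists for the requested language; without it, `transformer_data[optimize]` on the next line would fail on an empty mapping. A minimal Jinja2 reproduction of the guard, with illustrative values:

```python
from jinja2 import Template

tpl = Template(
    '{%- set use_transformer = hardware != "cpu" and transformer_data -%}'
    "{{ 'transformer' if use_transformer else 'cpu' }}"
)
# GPU requested, but no transformer recommendations for this language:
print(tpl.render(hardware="gpu", transformer_data={}))  # -> cpu
```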

View File

@@ -320,6 +320,11 @@ class Errors:
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")

View File

@@ -6,6 +6,7 @@ from thinc.api import Model, Maxout, Linear
 from ...util import registry
 from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab
+from ...tokens import Span
 
 
 @registry.architectures("spacy.EntityLinker.v1")
@@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
 
 @registry.misc("spacy.CandidateGenerator.v1")
-def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates
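Note: the quoted forward reference `"Span"` is replaced with a real import here (and in the pipeline component below), presumably because these annotations are resolved at runtime when the registered functions are validated against the config schema, and a string annotation only resolves if the name is importable in the defining module. A rough illustration of the failure mode, with illustrative names:

```python
from typing import Callable, get_type_hints

def create_candidates() -> Callable[["Span"], int]:  # "Span" never imported
    ...

try:
    get_type_hints(create_candidates)  # evaluates the forward reference
except NameError as err:
    print(err)  # name 'Span' is not defined
```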

View File

@@ -9,7 +9,7 @@ import warnings
 
 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
-from ..tokens import Doc
+from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -67,7 +67,7 @@ def make_entity_linker(
     incl_prior: bool,
     incl_context: bool,
     entity_vector_length: int,
-    get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
 ):
     """Construct an EntityLinker component.
@@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe):
         incl_prior: bool,
         incl_context: bool,
         entity_vector_length: int,
-        get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     ) -> None:
         """Initialize an entity linker.
@@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe):
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
         entity_vector_length (int): Size of encoding vectors in the KB.
-        get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.
 
         DOCS: https://spacy.io/api/entitylinker#init

View File

@@ -4,7 +4,7 @@ from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
 from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError
+from thinc.api import Optimizer, ConfigValidationError, Model
 from thinc.config import Promise
 from collections import defaultdict
 import inspect
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
     from .language import Language  # noqa: F401
     from .training import Example  # noqa: F401
+    from .vocab import Vocab  # noqa: F401
 
 
 # fmt: off
@@ -353,7 +354,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
+    objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
     # fmt: on
 
     class Config:
View File

@@ -1,4 +1,5 @@
 import pytest
+import numpy
 
 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
@@ -100,14 +101,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
 
     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
 
     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
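Note: HEAD values live in a uint64 array as relative offsets, and recent NumPy releases refuse to assign a bare negative Python int to an unsigned array, so the fixed tests wrap the value through int32 and let it overflow deliberately. The difference in isolation:

```python
import numpy

arr = numpy.zeros(3, dtype=numpy.uint64)
# arr[0] = -1  # OverflowError on recent NumPy releases
arr[0] = numpy.int32(-1).astype(numpy.uint64)
print(arr[0])  # 18446744073709551615, i.e. -1 reinterpreted as uint64
```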

View File

@@ -4,6 +4,7 @@ import pytest
 import numpy
 import logging
 import mock
+import warnings
 
 from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span, Token
@@ -316,9 +317,9 @@ def test_doc_from_array_sent_starts(en_vocab):
     # no warning using default attrs
     attrs = doc._get_array_attrs()
     arr = doc.to_array(attrs)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         new_doc.from_array(attrs, arr)
-    assert len(record) == 0
     # only SENT_START uses SENT_START
     attrs = [SENT_START]
     arr = doc.to_array(attrs)
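Note: `pytest.warns(None)` ("assert that nothing warns") was deprecated in pytest 7 and later removed, so the tests switch to the stdlib idiom of escalating warnings to errors inside the block. The same pattern in isolation, with a hypothetical function under test:

```python
import warnings

def should_not_warn():  # hypothetical stand-in for the code under test
    return 42

with warnings.catch_warnings():
    warnings.simplefilter("error")  # any warning now raises and fails the test
    should_not_warn()
```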

View File

@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc
 
+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_ru_doc_lemmatization(ru_lemmatizer):
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]

View File

@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc
 
+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])

View File

@@ -1,4 +1,5 @@
 import pytest
+import warnings
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
@@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab):
     matcher.add("TEST1", [doc1])
     with pytest.warns(UserWarning):
         matcher.add("TEST2", [doc2])
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST3", [doc3])
-        assert not record.list
     matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST4", [doc2])
-        assert not record.list
 
 
 def test_attr_validation(en_vocab):

View File

@@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
     # All results should be the same because of the fixed seed
     assert len(results) == 3
     ops = get_current_ops()
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
 
 
 def test_issue5838():
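Note: `decimal=5` relaxes `assert_almost_equal` from its default `decimal=7` (absolute difference below 1.5e-7) to a tolerance of 1.5e-5, which absorbs float32 nondeterminism across backends while still catching real divergence:

```python
from numpy.testing import assert_almost_equal

# Passes: |a - b| = 3.4e-06 < 1.5 * 10**-5
assert_almost_equal(0.1234567, 0.1234601, decimal=5)
```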

View File

@@ -3,6 +3,7 @@ from typing import Callable
 from spacy import util
 from spacy.util import ensure_path, registry, load_model_from_config
 from spacy.kb import KnowledgeBase
+from spacy.vocab import Vocab
 from thinc.api import Config
 
 from ..util import make_tempdir
@@ -111,7 +112,7 @@ def test_serialize_subclassed_kb():
     @registry.misc("spacy.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[["Vocab"], KnowledgeBase]:
+    ) -> Callable[[Vocab], KnowledgeBase]:
         def custom_kb_factory(vocab):
             kb = SubKnowledgeBase(
                 vocab=vocab,

View File

@@ -10,6 +10,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list
+from spacy.cli._util import upload_file, download_file
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os
@@ -474,3 +475,18 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content

View File

@@ -20,7 +20,7 @@ def get_textcat_bow_kwargs():
 
 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
 
 
 def get_all_params(model):
@@ -62,7 +62,7 @@ def get_tok2vec_kwargs():
     }
 
 
-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())

View File

@@ -347,6 +347,7 @@ cdef class Doc:
             for annot in annotations:
                 if annot:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]

View File

@@ -297,7 +297,7 @@ cdef class Span:
                 for ancestor in ancestors:
                     ancestor_i = ancestor.i - self.c.start
                     if ancestor_i in range(length):
-                        array[i, head_col] = ancestor_i - i
+                        array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
 
                 # if there is no appropriate ancestor, define a new artificial root
                 value = array[i, head_col]
@@ -305,7 +305,7 @@ cdef class Span:
                     new_root = old_to_new_root.get(ancestor_i, None)
                     if new_root is not None:
                         # take the same artificial root as a previous token from the same sentence
-                        array[i, head_col] = new_root - i
+                        array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                     else:
                         # set this token as the new artificial root
                         array[i, head_col] = 0

View File

@@ -329,26 +329,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T

View File

@@ -274,3 +274,5 @@ def ensure_shape(vectors_loc):
         # store all the results in a list in memory
         lines2 = open_file(vectors_loc)
         yield from lines2
+        lines2.close()
+        lines.close()
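Note: the two `close()` calls only execute when the generator is fully exhausted. A try/finally variant, sketched below as the more defensive general pattern (not what the backport does), also releases the handle when a consumer stops iterating early:

```python
def iter_lines(path):
    f = open(path, encoding="utf8")
    try:
        yield from f
    finally:
        # Runs when the generator is exhausted, explicitly closed, or
        # garbage-collected, so the handle is released either way.
        f.close()
```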