Merge pull request #11966 from adrianeboyd/backport/v3.1.7

Backport bug fixes to v3.1.x
commit a898c7e9eb by Adriane Boyd, 2022-12-14 20:44:36 +01:00 (committed by GitHub)
23 changed files with 140 additions and 101 deletions

View File

@@ -1,9 +1,6 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'

 steps:
   - task: UsePythonVersion@0
@@ -16,16 +13,16 @@ steps:
     displayName: 'Set variables'

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"

   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"

-  - script: python -m mypy spacy
+  - script: |
+      python -m mypy spacy
     displayName: 'Run mypy'
     condition: ne(variables['python_version'], '3.10')
@@ -34,35 +31,24 @@ steps:
       contents: "spacy"
     displayName: "Delete source directory"

+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"

   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      python -m pip install dist/$SDIST
     displayName: "Install from sdist"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+      python -W error -c "import spacy"
+    displayName: "Test import"

   - script: |
       python -m spacy download ca_core_news_sm
@@ -105,13 +91,21 @@ steps:
     displayName: 'Test assemble CLI vectors warning'
     condition: eq(variables['python_version'], '3.8')

+  - script: |
+      python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+
+  - script: |
+      python -m pip install 'spacy[apple]'
+      python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
+
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json
     displayName: 'Test website/meta/universe.json'
     condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install thinc-apple-ops
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))
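
Note: the template now builds an sdist with python -m build and smoke-tests the installed package by importing it with warnings escalated to errors, before the pytest runs later in the pipeline. A rough Python equivalent of that import check, as a sketch (the real CI step runs the same command directly from a shell script, and it assumes spaCy is already installed from the sdist):

    import subprocess
    import sys

    # Escalate all warnings to errors while importing spacy, mirroring the
    # "Test import" step above.
    subprocess.run([sys.executable, "-W", "error", "-c", "import spacy"], check=True)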

View File

@@ -5,7 +5,7 @@ repos:
       - id: black
         language_version: python3.7
   - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 5.0.4
     hooks:
       - id: flake8
         args:

View File

@@ -29,7 +29,7 @@ jobs:
         inputs:
           versionSpec: "3.7"
       - script: |
-          pip install flake8==3.9.2
+          pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
         displayName: "flake8"
@@ -39,7 +39,7 @@ jobs:
       matrix:
         # We're only running one platform per Python version to speed up builds
         Python36Linux:
-          imageName: "ubuntu-latest"
+          imageName: "ubuntu-20.04"
           python.version: "3.6"
         # Python36Windows:
         #   imageName: "windows-latest"
@@ -48,7 +48,7 @@ jobs:
        #   imageName: "macos-latest"
        #   python.version: "3.6"
        # Python37Linux:
-       #   imageName: "ubuntu-latest"
+       #   imageName: "ubuntu-20.04"
        #   python.version: "3.7"
        Python37Windows:
          imageName: "windows-latest"
@@ -90,20 +90,3 @@ jobs:
       - template: .github/azure-steps.yml
         parameters:
           python_version: '$(python.version)'
-          architecture: 'x64'
-
-  # - job: "TestGPU"
-  #   dependsOn: "Validate"
-  #   strategy:
-  #     matrix:
-  #       Python38LinuxX64_GPU:
-  #         python.version: '3.8'
-  #   pool:
-  #     name: "LinuxX64_GPU"
-  #   steps:
-  #     - template: .github/azure-steps.yml
-  #       parameters:
-  #         python_version: '$(python.version)'
-  #         architecture: 'x64'
-  #         gpu: true
-  #         num_build_jobs: 24

View File

@@ -10,8 +10,8 @@ wasabi>=0.8.1,<1.1.0
 srsly>=2.4.1,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.5.0
-click<8.1.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@@ -28,7 +28,7 @@ cython>=0.25,<3.0
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
 mypy==0.910
 types-dataclasses>=0.1.3; python_version < "3.7"

View File

@@ -50,10 +50,10 @@ install_requires =
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
     catalogue>=2.0.6,<2.1.0
-    typer>=0.3.0,<0.5.0
-    click<8.1.0
-    pathy>=0.3.5
     # Third-party dependencies
+    typer>=0.3.0,<0.5.0
+    pathy>=0.3.5
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.1.6"
+__version__ = "3.1.7"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             output_file.write(input_file.read())
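
The keyword change tracks the smart_open API: newer smart_open releases dropped the ignore_ext argument, and compression="disable" is the equivalent way to read the source as raw bytes without transparent decompression, which matches the new smart-open>=5.2.1 pin added to requirements.txt and setup.cfg above. A minimal, self-contained sketch of the call (the file path is a placeholder, not taken from the diff):

    import smart_open

    # Write a small local file, then read it back as raw bytes with
    # extension-based decompression disabled.
    with open("example.txt", "w", encoding="utf8") as f:
        f.write("hello")
    with smart_open.open("example.txt", mode="rb", compression="disable") as f:
        data = f.read()
    assert data == b"hello"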

View File

@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
 train = null

View File

@@ -316,6 +316,11 @@ class Errors(metaclass=ErrorsWithCodes):
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")

View File

@@ -229,6 +229,9 @@ class SpanCategorizer(TrainablePipe):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
+            scores = self.model.ops.alloc2f(0, 0)
+        else:
+            scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores
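
This guard avoids calling the span categorization model when the suggester produced no candidate spans for the entire batch, which would otherwise fail on zero-length input; an empty score matrix is returned instead. A small standalone sketch of what that fallback value looks like, using Thinc's current ops (illustrative only, not spaCy internals):

    from thinc.api import get_current_ops

    ops = get_current_ops()
    scores = ops.alloc2f(0, 0)  # empty float matrix, shape (0, 0)
    assert scores.shape == (0, 0)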

View File

@@ -1,4 +1,6 @@
 import pytest
+import numpy

 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
@@ -100,14 +102,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
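
The test now writes the out-of-bounds head as an explicit int32-to-uint64 cast because HEAD values live in an unsigned 64-bit array, and recent numpy versions reject (or deprecate) assigning a bare negative Python int to a uint64 array. A short sketch of the wraparound, for illustration; the same encoding is applied in the doc.pyx, span.pyx, and example.pyx changes further down:

    import numpy

    # -1 reinterpreted as uint64 becomes 2**64 - 1, matching how negative
    # relative head offsets are stored in the uint64 attribute array.
    value = numpy.int32(-1).astype(numpy.uint64)
    print(int(value))  # 18446744073709551615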

View File

@@ -2,6 +2,7 @@ import weakref
 import pytest
 import numpy
+import warnings

 from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span, Token
@@ -311,9 +312,9 @@ def test_doc_from_array_sent_starts(en_vocab):
     # no warning using default attrs
     attrs = doc._get_array_attrs()
     arr = doc.to_array(attrs)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         new_doc.from_array(attrs, arr)
-    assert len(record) == 0
     # only SENT_START uses SENT_START
     attrs = [SENT_START]
     arr = doc.to_array(attrs)
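
pytest.warns(None) is deprecated in pytest 7, so this test (and the phrase matcher test below) switches to the standard-library idiom for asserting that no warning is raised: inside warnings.catch_warnings() with simplefilter("error"), any warning becomes an exception and fails the test. A generic sketch of the pattern, with a harmless stand-in for the call under test:

    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        value = int("42")  # stand-in for the call that must not emit a warning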

View File

@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc

+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+

 def test_ru_doc_lemmatization(ru_lemmatizer):
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]

View File

@@ -1,6 +1,10 @@
+import pytest
+
 from spacy.tokens import Doc

+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+

 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
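
The module-level pytestmark applies the filterwarnings mark to every test in the file, silencing DeprecationWarning (presumably emitted by the pymorphy2-based ru/uk lemmatizer backend) now that the CI run treats warnings as errors. For comparison, the same filter can also be scoped to a single test; a sketch, not part of the diff:

    import pytest

    @pytest.mark.filterwarnings("ignore::DeprecationWarning")
    def test_only_this_case():
        ...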

View File

@@ -1,4 +1,5 @@
 import pytest
+import warnings
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
@@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab):
     matcher.add("TEST1", [doc1])
     with pytest.warns(UserWarning):
         matcher.add("TEST2", [doc2])
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST3", [doc3])
-        assert not record.list
     matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST4", [doc2])
-        assert not record.list


 def test_attr_validation(en_vocab):

View File

@@ -369,24 +369,39 @@ def test_overfitting_IO_overlapping():
 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
-    @registry.misc("test_zero_suggester")
-    def make_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+    # Test with a suggester that can return 0 suggestions
+    @registry.misc("test_mixed_zero_suggester")
+    def make_mixed_zero_suggester():
+        def mixed_zero_suggester(docs, *, ops=None):
             if ops is None:
                 ops = get_current_ops()
-            return Ragged(
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
-            )
+            spans = []
+            lengths = []
+            for doc in docs:
+                if len(doc) > 0 and len(doc) % 2 == 0:
+                    spans.append((0, 1))
+                    lengths.append(1)
+                else:
+                    lengths.append(0)
+            spans = ops.asarray2i(spans)
+            lengths_array = ops.asarray1i(lengths)
+            if len(spans) > 0:
+                output = Ragged(ops.xp.vstack(spans), lengths_array)
+            else:
+                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+            return output

-        return zero_suggester
+        return mixed_zero_suggester

     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
         "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
+            "suggester": {"@misc": "test_mixed_zero_suggester"},
+            "spans_key": SPAN_KEY,
+        },
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -394,3 +409,13 @@ def test_zero_suggestions():
     assert set(spancat.labels) == {"LOC", "PERSON"}
     nlp.update(train_examples, sgd=optimizer)
+
+    # empty doc
+    nlp("")
+    # single doc with zero suggestions
+    nlp("one")
+    # single doc with one suggestion
+    nlp("two two")
+    # batch with mixed zero/one suggestions
+    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+    # batch with no suggestions
+    list(nlp.pipe(["", "one", "three three three"]))
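
The reworked suggester returns one (0, 1) span for docs with an even, non-zero token count and no spans otherwise, so a single batch can mix docs with and without suggestions; the calls added at the end exercise empty docs, zero-suggestion docs, and mixed batches end to end. A standalone sketch of how such a mixed Ragged is assembled with Thinc, mirroring the helper above (illustrative only):

    from thinc.api import get_current_ops
    from thinc.types import Ragged

    ops = get_current_ops()
    spans = ops.asarray2i([(0, 1)])  # one suggested span for the first doc
    lengths = ops.asarray1i([1, 0])  # second doc gets zero suggestions
    ragged = Ragged(spans, lengths)
    assert ragged.lengths.sum() == 1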

View File

@@ -11,6 +11,7 @@ from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import is_subpath_of
 from spacy.cli._util import string_to_list
+from spacy.cli._util import upload_file, download_file
 from spacy import about
 from spacy.util import get_minor_version
 from spacy.cli.validate import get_model_pkgs
@@ -574,3 +575,18 @@ def test_get_third_party_dependencies():
 )
 def test_is_subpath_of(parent, child, expected):
     assert is_subpath_of(parent, child) == expected
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content

View File

@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
     }


-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())

View File

@@ -7,7 +7,7 @@ from ..util import get_cosine, add_vecs_to_vocab
 @pytest.fixture
 def vectors():
-    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
+    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -5])]


 @pytest.fixture()
@@ -44,7 +44,6 @@ def test_vectors_similarity_TT(vocab, vectors):
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
-    with pytest.warns(UserWarning):
-        assert doc.similarity(doc[0]) == doc[0].similarity(doc)
+    assert doc.similarity(doc[0]) == doc[0].similarity(doc)
@@ -57,5 +56,4 @@ def test_vectors_similarity_DS(vocab, vectors):
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
-    with pytest.warns(UserWarning):
-        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+    assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])

View File

@@ -356,6 +356,7 @@ cdef class Doc:
             for annot in annotations:
                 if annot:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]

View File

@@ -307,7 +307,7 @@ cdef class Span:
             for ancestor in ancestors:
                 ancestor_i = ancestor.i - self.c.start
                 if ancestor_i in range(length):
-                    array[i, head_col] = ancestor_i - i
+                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

         # if there is no appropriate ancestor, define a new artificial root
         value = array[i, head_col]
@@ -315,7 +315,7 @@ cdef class Span:
             new_root = old_to_new_root.get(ancestor_i, None)
             if new_root is not None:
                 # take the same artificial root as a previous token from the same sentence
-                array[i, head_col] = new_root - i
+                array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
             else:
                 # set this token as the new artificial root
                 array[i, head_col] = 0

View File

@@ -333,26 +333,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T

View File

@@ -290,3 +290,5 @@ def ensure_shape(vectors_loc):
         # store all the results in a list in memory
         lines2 = open_file(vectors_loc)
         yield from lines2
+        lines2.close()
+        lines.close()