diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 80c88b0b8..7e3f94df6 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -1,9 +1,6 @@ parameters: python_version: '' - architecture: '' - prefix: '' - gpu: false - num_build_jobs: 1 + architecture: 'x64' steps: - task: UsePythonVersion@0 @@ -16,16 +13,16 @@ steps: displayName: 'Set variables' - script: | - ${{ parameters.prefix }} python -m pip install -U pip setuptools - ${{ parameters.prefix }} python -m pip install -U -r requirements.txt + python -m pip install -U build pip setuptools + python -m pip install -U -r requirements.txt displayName: "Install dependencies" - script: | - ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }} - ${{ parameters.prefix }} python setup.py sdist --formats=gztar - displayName: "Compile and build sdist" + python -m build --sdist + displayName: "Build sdist" - - script: python -m mypy spacy + - script: | + python -m mypy spacy displayName: 'Run mypy' condition: ne(variables['python_version'], '3.10') @@ -34,35 +31,24 @@ steps: contents: "spacy" displayName: "Delete source directory" + - task: DeleteFiles@1 + inputs: + contents: "*.egg-info" + displayName: "Delete egg-info directory" + - script: | - ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt - ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt + python -m pip freeze > installed.txt + python -m pip uninstall -y -r installed.txt displayName: "Uninstall all packages" - bash: | - ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - ${{ parameters.prefix }} python -m pip install dist/$SDIST + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + python -m pip install dist/$SDIST displayName: "Install from sdist" - script: | - ${{ parameters.prefix }} python -m pip install -U -r requirements.txt - displayName: "Install test requirements" - - - script: | - ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0 - ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html - displayName: "Install GPU requirements" - condition: eq(${{ parameters.gpu }}, true) - - - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy - displayName: "Run CPU tests" - condition: eq(${{ parameters.gpu }}, false) - - - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu - displayName: "Run GPU tests" - condition: eq(${{ parameters.gpu }}, true) + python -W error -c "import spacy" + displayName: "Test import" - script: | python -m spacy download ca_core_news_sm @@ -105,13 +91,21 @@ steps: displayName: 'Test assemble CLI vectors warning' condition: eq(variables['python_version'], '3.8') + - script: | + python -m pip install -U -r requirements.txt + displayName: "Install test requirements" + + - script: | + python -m pytest --pyargs spacy -W error + displayName: "Run CPU tests" + + - script: | + python -m pip install 'spacy[apple]' + python -m pytest --pyargs spacy + displayName: "Run CPU tests with thinc-apple-ops" + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) + - script: | python .github/validate_universe_json.py website/meta/universe.json displayName: 'Test website/meta/universe.json' condition: eq(variables['python_version'], '3.8') - - - script: | - ${{ parameters.prefix }} python -m pip install thinc-apple-ops - ${{ parameters.prefix }} python -m pytest --pyargs spacy - displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9')) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7a12fd24..2612c9a81 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: black language_version: python3.7 - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.2 + rev: 5.0.4 hooks: - id: flake8 args: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 71a793911..57f5c8727 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -29,7 +29,7 @@ jobs: inputs: versionSpec: "3.7" - script: | - pip install flake8==3.9.2 + pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: "flake8" @@ -39,7 +39,7 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-latest" + imageName: "ubuntu-20.04" python.version: "3.6" # Python36Windows: # imageName: "windows-latest" @@ -48,7 +48,7 @@ jobs: # imageName: "macos-latest" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-latest" + # imageName: "ubuntu-20.04" # python.version: "3.7" Python37Windows: imageName: "windows-latest" @@ -90,20 +90,3 @@ jobs: - template: .github/azure-steps.yml parameters: python_version: '$(python.version)' - architecture: 'x64' - -# - job: "TestGPU" -# dependsOn: "Validate" -# strategy: -# matrix: -# Python38LinuxX64_GPU: -# python.version: '3.8' -# pool: -# name: "LinuxX64_GPU" -# steps: -# - template: .github/azure-steps.yml -# parameters: -# python_version: '$(python.version)' -# architecture: 'x64' -# gpu: true -# num_build_jobs: 24 diff --git a/requirements.txt b/requirements.txt index d76135c71..d36f38d15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,8 +10,8 @@ wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.5.0 -click<8.1.0 pathy>=0.3.5 +smart-open>=5.2.1,<7.0.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 @@ -28,7 +28,7 @@ cython>=0.25,<3.0 pytest>=5.2.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 -flake8>=3.8.0,<3.10.0 +flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 mypy==0.910 types-dataclasses>=0.1.3; python_version < "3.7" diff --git a/setup.cfg b/setup.cfg index b146577c4..8517b77b2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,10 +50,10 @@ install_requires = wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 catalogue>=2.0.6,<2.1.0 - typer>=0.3.0,<0.5.0 - click<8.1.0 - pathy>=0.3.5 # Third-party dependencies + typer>=0.3.0,<0.5.0 + pathy>=0.3.5 + smart-open>=5.2.1,<7.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/spacy/about.py b/spacy/about.py index 37b9f2556..ef41b112b 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.1.6" +__version__ = "3.1.7" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index fb680d888..1b9740d0b 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) if dest.exists() and not force: return None src = str(src) - with smart_open.open(src, mode="rb", ignore_ext=True) as input_file: + with smart_open.open(src, mode="rb", compression="disable") as input_file: with dest.open(mode="wb") as output_file: output_file.write(input_file.read()) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index b78806fec..7304cab5b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -1,7 +1,7 @@ {# This is a template for training configs used for the quickstart widget in the docs and the init config command. It encodes various best practices and can help generate the best possible configuration, given a user's requirements. #} -{%- set use_transformer = hardware != "cpu" -%} +{%- set use_transformer = hardware != "cpu" and transformer_data -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} [paths] train = null diff --git a/spacy/errors.py b/spacy/errors.py index 2da52e3b8..a29d62d75 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -316,6 +316,11 @@ class Errors(metaclass=ErrorsWithCodes): "clear the existing vectors and resize the table.") E074 = ("Error interpreting compiled match pattern: patterns are expected " "to end with the attribute {attr}. Got: {bad_attr}.") + E079 = ("Error computing states in beam: number of predicted beams " + "({pbeams}) does not equal number of gold beams ({gbeams}).") + E080 = ("Duplicate state found in beam: {key}.") + E081 = ("Error getting gradient in beam: number of histories ({n_hist}) " + "does not equal number of losses ({losses}).") E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " "match.") diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 7c5d3386e..b4e08502b 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -229,7 +229,10 @@ class SpanCategorizer(TrainablePipe): DOCS: https://spacy.io/api/spancategorizer#predict """ indices = self.suggester(docs, ops=self.model.ops) - scores = self.model.predict((docs, indices)) # type: ignore + if indices.lengths.sum() == 0: + scores = self.model.ops.alloc2f(0, 0) + else: + scores = self.model.predict((docs, indices)) # type: ignore return indices, scores def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index ef54c581c..94f4cb140 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,4 +1,6 @@ import pytest +import numpy + from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH @@ -100,14 +102,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab): # head before start arr = doc.to_array(["HEAD"]) - arr[0] = -1 + arr[0] = numpy.int32(-1).astype(numpy.uint64) doc_from_array = Doc(en_vocab, words=words) with pytest.raises(ValueError): doc_from_array.from_array(["HEAD"], arr) # head after end arr = doc.to_array(["HEAD"]) - arr[0] = 5 + arr[0] = numpy.int32(5).astype(numpy.uint64) doc_from_array = Doc(en_vocab, words=words) with pytest.raises(ValueError): doc_from_array.from_array(["HEAD"], arr) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 57df87642..316131848 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,6 +2,7 @@ import weakref import pytest import numpy +import warnings from spacy.lang.xx import MultiLanguage from spacy.tokens import Doc, Span, Token @@ -311,9 +312,9 @@ def test_doc_from_array_sent_starts(en_vocab): # no warning using default attrs attrs = doc._get_array_attrs() arr = doc.to_array(attrs) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") new_doc.from_array(attrs, arr) - assert len(record) == 0 # only SENT_START uses SENT_START attrs = [SENT_START] arr = doc.to_array(attrs) diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 3810323bf..9ca7f441b 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -2,6 +2,9 @@ import pytest from spacy.tokens import Doc +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + + def test_ru_doc_lemmatization(ru_lemmatizer): words = ["мама", "мыла", "раму"] pos = ["NOUN", "VERB", "NOUN"] diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py index 4a787b2a6..57dd4198a 100644 --- a/spacy/tests/lang/uk/test_lemmatizer.py +++ b/spacy/tests/lang/uk/test_lemmatizer.py @@ -1,6 +1,10 @@ +import pytest from spacy.tokens import Doc +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + + def test_uk_lemmatizer(uk_lemmatizer): """Check that the default uk lemmatizer runs.""" doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"]) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 478949601..ffcd206d5 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,4 +1,5 @@ import pytest +import warnings import srsly from mock import Mock from spacy.matcher import PhraseMatcher @@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab): matcher.add("TEST1", [doc1]) with pytest.warns(UserWarning): matcher.add("TEST2", [doc2]) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") matcher.add("TEST3", [doc3]) - assert not record.list matcher = PhraseMatcher(en_vocab, attr="POS", validate=True) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") matcher.add("TEST4", [doc2]) - assert not record.list def test_attr_validation(en_vocab): diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 2f7e952d3..53d997400 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -369,24 +369,39 @@ def test_overfitting_IO_overlapping(): def test_zero_suggestions(): - # Test with a suggester that returns 0 suggestions + # Test with a suggester that can return 0 suggestions - @registry.misc("test_zero_suggester") - def make_zero_suggester(): - def zero_suggester(docs, *, ops=None): + @registry.misc("test_mixed_zero_suggester") + def make_mixed_zero_suggester(): + def mixed_zero_suggester(docs, *, ops=None): if ops is None: ops = get_current_ops() - return Ragged( - ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i") - ) + spans = [] + lengths = [] + for doc in docs: + if len(doc) > 0 and len(doc) % 2 == 0: + spans.append((0, 1)) + lengths.append(1) + else: + lengths.append(0) + spans = ops.asarray2i(spans) + lengths_array = ops.asarray1i(lengths) + if len(spans) > 0: + output = Ragged(ops.xp.vstack(spans), lengths_array) + else: + output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array) + return output - return zero_suggester + return mixed_zero_suggester fix_random_seed(0) nlp = English() spancat = nlp.add_pipe( "spancat", - config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY}, + config={ + "suggester": {"@misc": "test_mixed_zero_suggester"}, + "spans_key": SPAN_KEY, + }, ) train_examples = make_examples(nlp) optimizer = nlp.initialize(get_examples=lambda: train_examples) @@ -394,3 +409,13 @@ def test_zero_suggestions(): assert set(spancat.labels) == {"LOC", "PERSON"} nlp.update(train_examples, sgd=optimizer) + # empty doc + nlp("") + # single doc with zero suggestions + nlp("one") + # single doc with one suggestion + nlp("two two") + # batch with mixed zero/one suggestions + list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"])) + # batch with no suggestions + list(nlp.pipe(["", "one", "three three three"])) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 72bbe04e5..894a41b73 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -11,6 +11,7 @@ from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import is_subpath_of from spacy.cli._util import string_to_list +from spacy.cli._util import upload_file, download_file from spacy import about from spacy.util import get_minor_version from spacy.cli.validate import get_model_pkgs @@ -574,3 +575,18 @@ def test_get_third_party_dependencies(): ) def test_is_subpath_of(parent, child, expected): assert is_subpath_of(parent, child) == expected + + +def test_upload_download_local_file(): + with make_tempdir() as d1, make_tempdir() as d2: + filename = "f.txt" + content = "content" + local_file = d1 / filename + remote_file = d2 / filename + with local_file.open(mode="w") as file_: + file_.write(content) + upload_file(local_file, remote_file) + local_file.unlink() + download_file(remote_file, local_file) + with local_file.open(mode="r") as file_: + assert file_.read() == content diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 2306cabb7..d91ed1201 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -23,7 +23,7 @@ def get_textcat_bow_kwargs(): def get_textcat_cnn_kwargs(): - return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} + return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13} def get_all_params(model): @@ -65,7 +65,7 @@ def get_tok2vec_kwargs(): } -def test_tok2vec(): +def make_test_tok2vec(): return build_Tok2Vec_model(**get_tok2vec_kwargs()) diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index b5f7303b5..91b2f5b05 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -7,7 +7,7 @@ from ..util import get_cosine, add_vecs_to_vocab @pytest.fixture def vectors(): - return [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])] + return [("apple", [1, 2, 3]), ("orange", [-1, -2, -5])] @pytest.fixture() @@ -44,8 +44,7 @@ def test_vectors_similarity_TT(vocab, vectors): def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) - with pytest.warns(UserWarning): - assert doc.similarity(doc[0]) == doc[0].similarity(doc) + assert doc.similarity(doc[0]) == doc[0].similarity(doc) def test_vectors_similarity_DS(vocab, vectors): @@ -57,5 +56,4 @@ def test_vectors_similarity_DS(vocab, vectors): def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) - with pytest.warns(UserWarning): - assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) + assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1ee845934..111d35974 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -356,6 +356,7 @@ cdef class Doc: for annot in annotations: if annot: if annot is heads or annot is sent_starts or annot is ent_iobs: + annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) for i in range(len(words)): if attrs.ndim == 1: attrs[i] = annot[i] diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index c9c807d7d..f2a41fa3c 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -307,7 +307,7 @@ cdef class Span: for ancestor in ancestors: ancestor_i = ancestor.i - self.c.start if ancestor_i in range(length): - array[i, head_col] = ancestor_i - i + array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64) # if there is no appropriate ancestor, define a new artificial root value = array[i, head_col] @@ -315,7 +315,7 @@ cdef class Span: new_root = old_to_new_root.get(ancestor_i, None) if new_root is not None: # take the same artificial root as a previous token from the same sentence - array[i, head_col] = new_root - i + array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64) else: # set this token as the new artificial root array[i, head_col] = 0 diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 732203e7b..7e5ff32dd 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -333,26 +333,27 @@ def _annot2array(vocab, tok_annot, doc_annot): if key not in IDS: raise ValueError(Errors.E974.format(obj="token", key=key)) elif key in ["ORTH", "SPACY"]: - pass + continue elif key == "HEAD": attrs.append(key) - values.append([h-i if h is not None else 0 for i, h in enumerate(value)]) + row = [h-i if h is not None else 0 for i, h in enumerate(value)] elif key == "DEP": attrs.append(key) - values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]) + row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value] elif key == "SENT_START": attrs.append(key) - values.append([to_ternary_int(v) for v in value]) + row = [to_ternary_int(v) for v in value] elif key == "MORPH": attrs.append(key) - values.append([vocab.morphology.add(v) for v in value]) + row = [vocab.morphology.add(v) for v in value] else: attrs.append(key) if not all(isinstance(v, str) for v in value): types = set([type(v) for v in value]) raise TypeError(Errors.E969.format(field=key, types=types)) from None - values.append([vocab.strings.add(v) for v in value]) - array = numpy.asarray(values, dtype="uint64") + row = [vocab.strings.add(v) for v in value] + values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row]) + array = numpy.array(values, dtype=numpy.uint64) return attrs, array.T diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 96abcc7cd..159df4ab9 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -290,3 +290,5 @@ def ensure_shape(vectors_loc): # store all the results in a list in memory lines2 = open_file(vectors_loc) yield from lines2 + lines2.close() + lines.close()