Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-04 12:20:20 +03:00)

Compare commits (44 commits, SHA1):

c83dfa23dc, a65379dede, 4af02ac9e4, 67c6ef2b2a, c4af89f956, 9a60424827,
d4883e79c1, e965f9d40a, d4acae856a, 0f87720411, c8009c2734, d4d4d69cb4,
337ebda793, 5c975565dc, f55b876326, ebcc7d830f, 694c318f4f, 308b1706a7,
3420506954, f71de10405, 5caccbd19e, 6a4a00c447, 749631ad28, 034ac0acf4,
02e18926c3, f94168a41e, 0080454140, 6db938959d, 99a3f26d7f, c62566ffce,
066718b1dc, 81e71a61f8, 6aa3fede76, 71396273a5, e51fff5432, c78eb28dfa,
e3f1d4a7d0, 81515b4690, 8b9355d758, ad026dc5fd, 1db18732e0, a834b03216,
55e5f8ede3, bb97e7bf8a
.github/azure-steps.yml (vendored): 87 changed lines

@@ -1,9 +1,6 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'

 steps:
   - task: UsePythonVersion@0
@@ -16,52 +13,76 @@ steps:
     displayName: 'Set variables'

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"

   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"

   - task: DeleteFiles@1
     inputs:
       contents: "spacy"
     displayName: "Delete source directory"

+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"

   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      python -m pip install dist/$SDIST
     displayName: "Install from sdist"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -W error -c "import spacy"
+    displayName: "Test import"
+
+  - script: |
+      python -m spacy download es_core_news_sm
+      python -c "import spacy; nlp=spacy.load('es_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l es ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      # will have errors due to sparse data, check for summary in output
+      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+    displayName: 'Test debug data CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+    displayName: 'Test train CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'es_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m pip install -U -r requirements.txt
     displayName: "Install test requirements"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+      python -m pytest --pyargs spacy -W error
     displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      python -m spacy download en_core_web_sm
-      python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
 recursive-include licenses *
+recursive-exclude spacy *.cpp
@@ -22,13 +22,13 @@ jobs:
   # defined in .flake8 and overwrites the selected codes.
   - job: "Validate"
     pool:
-      vmImage: "ubuntu-18.04"
+      vmImage: "ubuntu-latest"
     steps:
       - task: UsePythonVersion@0
         inputs:
           versionSpec: "3.7"
       - script: |
-          pip install flake8==3.5.0
+          pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
         displayName: "flake8"

@@ -38,41 +38,50 @@ jobs:
       matrix:
         # We're only running one platform per Python version to speed up builds
         Python36Linux:
-          imageName: "ubuntu-18.04"
+          imageName: "ubuntu-20.04"
           python.version: "3.6"
         # Python36Windows:
-        #   imageName: "vs2017-win2016"
+        #   imageName: "windows-latest"
         #   python.version: "3.6"
         # Python36Mac:
-        #   imageName: "macos-10.14"
+        #   imageName: "macos-latest"
         #   python.version: "3.6"
         # Python37Linux:
-        #   imageName: "ubuntu-18.04"
+        #   imageName: "ubuntu-20.04"
         #   python.version: "3.7"
         Python37Windows:
-          imageName: "vs2017-win2016"
+          imageName: "windows-latest"
           python.version: "3.7"
         # Python37Mac:
-        #   imageName: "macos-10.14"
+        #   imageName: "macos-latest"
         #   python.version: "3.7"
         # Python38Linux:
-        #   imageName: "ubuntu-18.04"
+        #   imageName: "ubuntu-latest"
         #   python.version: "3.8"
         # Python38Windows:
-        #   imageName: "vs2017-win2016"
+        #   imageName: "windows-latest"
         #   python.version: "3.8"
         Python38Mac:
-          imageName: "macos-10.14"
+          imageName: "macos-latest"
           python.version: "3.8"
         Python39Linux:
-          imageName: "ubuntu-18.04"
+          imageName: "ubuntu-latest"
           python.version: "3.9"
-        Python39Windows:
-          imageName: "vs2017-win2016"
-          python.version: "3.9"
-        Python39Mac:
-          imageName: "macos-10.14"
-          python.version: "3.9"
+        # Python39Windows:
+        #   imageName: "windows-latest"
+        #   python.version: "3.9"
+        # Python39Mac:
+        #   imageName: "macos-latest"
+        #   python.version: "3.9"
+        Python310Linux:
+          imageName: "ubuntu-latest"
+          python.version: "3.10"
+        Python310Windows:
+          imageName: "windows-latest"
+          python.version: "3.10"
+        Python310Mac:
+          imageName: "macos-latest"
+          python.version: "3.10"
       maxParallel: 4
     pool:
       vmImage: $(imageName)

@@ -80,20 +89,3 @@ jobs:
       - template: .github/azure-steps.yml
         parameters:
           python_version: '$(python.version)'
-          architecture: 'x64'
-
-  # - job: "TestGPU"
-  #   dependsOn: "Validate"
-  #   strategy:
-  #     matrix:
-  #       Python38LinuxX64_GPU:
-  #         python.version: '3.8'
-  #   pool:
-  #     name: "LinuxX64_GPU"
-  #   steps:
-  #     - template: .github/azure-steps.yml
-  #       parameters:
-  #         python_version: '$(python.version)'
-  #         architecture: 'x64'
-  #         gpu: true
-  #         num_build_jobs: 24
@@ -1,5 +1,8 @@
 # build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7'
-numpy==1.17.3; python_version=='3.8'
+numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
+numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
+numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
+numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
-numpy; python_version>='3.10'
+numpy==1.21.3; python_version=='3.10'
+numpy; python_version>='3.11'
@@ -11,6 +11,7 @@ srsly>=2.4.1,<3.0.0
 catalogue>=2.0.4,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@@ -26,5 +27,6 @@ cython>=0.25,<3.0
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.5.0,<3.6.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
+mypy==0.910
@@ -49,9 +49,10 @@ install_requires =
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
     catalogue>=2.0.4,<2.1.0
+    # Third-party dependencies
     typer>=0.3.0,<0.4.0
     pathy>=0.3.5
-    # Third-party dependencies
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.6"
+__version__ = "3.0.9"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -355,7 +355,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             output_file.write(input_file.read())
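In newer smart-open releases the `ignore_ext` flag was dropped; `compression="disable"` is the replacement that turns off extension-based (de)compression. A minimal usage sketch under that assumption (the URL is only illustrative):

    import smart_open

    # Read raw bytes without smart_open transparently decompressing .gz/.bz2
    # sources based on their file extension (smart-open >= 5.x keyword).
    with smart_open.open("https://example.com/vectors.bin.gz", mode="rb", compression="disable") as f:
        payload = f.read()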
@@ -115,7 +115,8 @@ def convert(
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(Path(input_path), converter):
-        input_data = input_loc.open("r", encoding="utf-8").read()
+        with input_loc.open("r", encoding="utf-8") as infile:
+            input_data = infile.read()
         # Use converter function to convert data
         func = CONVERTERS[converter]
         docs = func(
@@ -18,7 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
-    create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
 train = null
@@ -418,7 +418,7 @@ compound = 1.001

 [initialize]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
+vectors = ${paths.vectors}
 {% else -%}
 vectors = "{{ word_vectors }}"
 {% endif -%}
@@ -3,6 +3,7 @@ from pathlib import Path
 import sys
 import requests
 from wasabi import msg, Printer
+import warnings

 from ._util import app
 from .. import about
@@ -45,7 +46,7 @@ def validate() -> None:
                 version = msg.text(data["version"], color="green", no_print=True)
             else:
                 version = msg.text(data["version"], color="red", no_print=True)
-                comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
+                comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
             rows.append((data["name"], data["spacy"], version, comp))
         msg.table(rows, header=header)
     else:
@@ -78,6 +79,8 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
     msg.good("Loaded compatibility table")
     compat = r.json()["spacy"]
     all_models = set()
-    installed_models = get_installed_models()
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message="\\[W09[45]")
+        installed_models = get_installed_models()
     for spacy_v, models in dict(compat).items():
         all_models.update(models.keys())
@@ -92,6 +95,8 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
             spacy_version = about.__version__
         else:
             model_path = get_package_path(package)
-            model_meta = get_model_meta(model_path)
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="\\[W09[45]")
+                model_meta = get_model_meta(model_path)
             spacy_version = model_meta.get("spacy_version", "n/a")
         is_compat = is_compatible_version(about.__version__, spacy_version)
@@ -320,6 +320,11 @@ class Errors:
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
@@ -518,6 +523,11 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

     # New errors added in v3.x
+    E867 = ("The 'textcat' component requires at least two labels because it "
+            "uses mutually exclusive classes where exactly one label is True "
+            "for each doc. For binary classification tasks, you can use two "
+            "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
+            "can use the 'textcat_multilabel' component with one label.")
     E870 = ("Could not serialize the DocBin because it is too large. Consider "
             "splitting up your documents into several doc bins and serializing "
             "each separately. spacy.Corpus.v1 will search recursively for all "
@@ -1,16 +1,11 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language


 class AzerbaijaniDefaults(Language.Defaults):
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
-    token_match = TOKEN_MATCH
-    syntax_iterators = SYNTAX_ITERATORS


 class Azerbaijani(Language):
@@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer):
         forms.extend(oov_forms)
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
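The switch from `set` to `dict.fromkeys` still removes duplicate lemma candidates but keeps their first-seen order, so the returned list is deterministic across runs. A plain-Python illustration (not spaCy-specific):

    forms = ["πόλη", "πόλις", "πόλη", "πόλεως"]

    # set() drops duplicates but does not guarantee any particular order
    print(list(set(forms)))

    # dict.fromkeys() drops duplicates and keeps insertion order (Python 3.7+)
    print(list(dict.fromkeys(forms)))  # ['πόλη', 'πόλις', 'πόλεως']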
@@ -12,7 +12,6 @@ PUNCT_RULES = {"«": '"', "»": '"'}


 class RussianLemmatizer(Lemmatizer):
-    _morph = None

     def __init__(
         self,
@@ -31,8 +30,8 @@ class RussianLemmatizer(Lemmatizer):
                 "The Russian lemmatizer mode 'pymorphy2' requires the "
                 "pymorphy2 library. Install it with: pip install pymorphy2"
             ) from None
-        if RussianLemmatizer._morph is None:
-            RussianLemmatizer._morph = MorphAnalyzer()
+        if getattr(self, "_morph", None) is None:
+            self._morph = MorphAnalyzer()
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)

     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
@@ -7,8 +7,6 @@ from ...vocab import Vocab


 class UkrainianLemmatizer(RussianLemmatizer):
-    _morph = None
-
     def __init__(
         self,
         vocab: Vocab,
@@ -27,6 +25,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 "pymorphy2 library and dictionaries. Install them with: "
                 "pip install pymorphy2 pymorphy2-dicts-uk"
             ) from None
-        if UkrainianLemmatizer._morph is None:
-            UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
+        if getattr(self, "_morph", None) is None:
+            self._morph = MorphAnalyzer(lang="uk")
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
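Both lemmatizers now create the pymorphy2 analyzer lazily per instance instead of caching it on the class. Because UkrainianLemmatizer inherits from RussianLemmatizer, a class-level `_morph` could be shared or overwritten across the two languages; the `getattr` guard keeps one analyzer per object. A hypothetical, self-contained sketch of the pattern (class and factory names are illustrative, not spaCy API):

    class BaseLemmatizer:
        def __init__(self, analyzer_factory):
            # Create the analyzer once per instance; storing it on the class
            # would let subclasses clobber each other's analyzer.
            if getattr(self, "_morph", None) is None:
                self._morph = analyzer_factory()

    class RuLemmatizer(BaseLemmatizer):
        def __init__(self):
            super().__init__(lambda: "ru-analyzer")

    class UkLemmatizer(BaseLemmatizer):
        def __init__(self):
            super().__init__(lambda: "uk-analyzer")

    assert RuLemmatizer()._morph != UkLemmatizer()._morph  # no shared class state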
@@ -50,6 +50,8 @@ cdef class PhraseMatcher:
         if isinstance(attr, (int, long)):
             self.attr = attr
         else:
+            if attr is None:
+                attr = "ORTH"
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"
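With the added guard, `attr=None` now falls back to the default `ORTH` token attribute instead of failing on `None.upper()`. A minimal sketch, assuming an empty `Vocab` is enough for construction:

    from spacy.matcher import PhraseMatcher
    from spacy.vocab import Vocab

    # attr=None is now treated the same as the default ("ORTH")
    matcher = PhraseMatcher(Vocab(), attr=None)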
@@ -6,6 +6,7 @@ from thinc.api import Model, Maxout, Linear
 from ...util import registry
 from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab
+from ...tokens import Span


 @registry.architectures("spacy.EntityLinker.v1")
@@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:


 @registry.misc("spacy.CandidateGenerator.v1")
-def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates
@@ -3,7 +3,7 @@ from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Mode
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance

-from ...util import registry
+from ...util import registry, OOV_RANK
 from ...errors import Errors
 from ...attrs import ID

@@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
+    target[ids == OOV_RANK] = 0
     d_target, loss = distance(prediction, target)
     return loss, d_target
@@ -9,7 +9,7 @@ import warnings

 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
-from ..tokens import Doc
+from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -67,7 +67,7 @@ def make_entity_linker(
     incl_prior: bool,
     incl_context: bool,
     entity_vector_length: int,
-    get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
 ):
     """Construct an EntityLinker component.

@@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe):
         incl_prior: bool,
         incl_context: bool,
         entity_vector_length: int,
-        get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
+        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     ) -> None:
         """Initialize an entity linker.

@@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe):
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
         entity_vector_length (int): Size of encoding vectors in the KB.
-        get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.

         DOCS: https://spacy.io/api/entitylinker#init
@@ -481,7 +481,8 @@ class EntityLinker(TrainablePipe):

         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with p.open("rb") as infile:
+                    self.model.from_bytes(infile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
@@ -3,6 +3,7 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable,
 from collections import defaultdict
 from pathlib import Path
 import srsly
+import warnings

 from .pipe import Pipe
 from ..training import Example
@@ -102,17 +103,12 @@ class EntityRuler(Pipe):
         self.overwrite = overwrite_ents
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
+        self._validate = validate
         self.matcher = Matcher(nlp.vocab, validate=validate)
-        if phrase_matcher_attr is not None:
-            if phrase_matcher_attr.upper() == "TEXT":
-                phrase_matcher_attr = "ORTH"
-            self.phrase_matcher_attr = phrase_matcher_attr
-            self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-            )
-        else:
-            self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
+        self.phrase_matcher_attr = phrase_matcher_attr
+        self.phrase_matcher = PhraseMatcher(
+            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+        )
         self.ent_id_sep = ent_id_sep
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
@@ -146,6 +142,8 @@ class EntityRuler(Pipe):

     def match(self, doc: Doc):
         self._require_patterns()
-        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="\\[W036")
+            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
@@ -281,7 +279,7 @@ class EntityRuler(Pipe):
                     current_index = i
                     break
             subsequent_pipes = [
-                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+                pipe for pipe in self.nlp.pipe_names[current_index :]
             ]
         except ValueError:
             subsequent_pipes = []
@@ -317,20 +315,22 @@ class EntityRuler(Pipe):
             pattern = entry["pattern"]
             if isinstance(pattern, Doc):
                 self.phrase_patterns[label].append(pattern)
+                self.phrase_matcher.add(label, [pattern])
             elif isinstance(pattern, list):
                 self.token_patterns[label].append(pattern)
+                self.matcher.add(label, [pattern])
             else:
                 raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, patterns)

     def clear(self) -> None:
         """Reset all patterns."""
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
+        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
+        self.phrase_matcher = PhraseMatcher(
+            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
+        )

     def _require_patterns(self) -> None:
         """Raise a warning if this component has no patterns defined."""
@@ -381,7 +381,6 @@ class EntityRuler(Pipe):
             self.add_patterns(cfg.get("patterns", cfg))
             self.overwrite = cfg.get("overwrite", False)
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            if self.phrase_matcher_attr is not None:
             self.phrase_matcher = PhraseMatcher(
                 self.nlp.vocab, attr=self.phrase_matcher_attr
             )
@@ -435,7 +434,6 @@ class EntityRuler(Pipe):
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

-            if self.phrase_matcher_attr is not None:
             self.phrase_matcher = PhraseMatcher(
                 self.nlp.vocab, attr=self.phrase_matcher_attr
             )
@@ -332,6 +332,8 @@ class TextCategorizer(TrainablePipe):
         else:
             for label in labels:
                 self.add_label(label)
+        if len(self.labels) < 2:
+            raise ValueError(Errors.E867)
         if positive_label is not None:
             if positive_label not in self.labels:
                 err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
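The new E867 check means a `textcat` component refuses to initialize with fewer than two labels, since it predicts exactly one mutually exclusive class per doc. A hedged sketch of the two options the error message describes (label names are illustrative):

    import spacy

    # Option 1: exclusive binary classification with two labels on "textcat"
    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # Option 2: a single label that is independently true or false
    nlp_multi = spacy.blank("en")
    textcat_multi = nlp_multi.add_pipe("textcat_multilabel")
    textcat_multi.add_label("POSITIVE")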
@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):

         DOCS: https://spacy.io/api/tok2vec#predict
         """
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            width = self.model.get_dim("nO")
+            return [self.model.ops.alloc((0, width)) for doc in docs]
         tokvecs = self.model.predict(docs)
         batch_id = Tok2VecListener.get_batch_id(docs)
         for listener in self.listeners:
@@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):

         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with open(p, "rb") as mfile:
+                    self.model.from_bytes(mfile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
@@ -4,7 +4,7 @@ from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
 from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError
+from thinc.api import Optimizer, ConfigValidationError, Model
 from thinc.config import Promise
 from collections import defaultdict
 import inspect
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
     from .language import Language  # noqa: F401
     from .training import Example  # noqa: F401
+    from .vocab import Vocab  # noqa: F401


 # fmt: off
@@ -353,7 +354,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
+    objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
     # fmt: on

     class Config:
@@ -3,7 +3,13 @@ from spacy.util import get_lang_class


 def pytest_addoption(parser):
-    parser.addoption("--slow", action="store_true", help="include slow tests")
+    try:
+        parser.addoption("--slow", action="store_true", help="include slow tests")
+        parser.addoption("--issue", action="store", help="test specific issues")
+    # Options are already added, e.g. if conftest is copied in a build pipeline
+    # and runs twice
+    except ValueError:
+        pass


 def pytest_runtest_setup(item):
@@ -1,4 +1,5 @@
 import pytest
+import numpy
 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH

@@ -100,14 +101,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
@@ -4,6 +4,7 @@ import pytest
 import numpy
 import logging
 import mock
+import warnings

 from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span, Token
@@ -316,9 +317,9 @@ def test_doc_from_array_sent_starts(en_vocab):
     # no warning using default attrs
     attrs = doc._get_array_attrs()
     arr = doc.to_array(attrs)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         new_doc.from_array(attrs, arr)
-    assert len(record) == 0
     # only SENT_START uses SENT_START
     attrs = [SENT_START]
     arr = doc.to_array(attrs)
@@ -351,13 +352,21 @@ def test_doc_from_array_morph(en_vocab):

 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
     en_docs[0][2]._.is_ambiguous = True  # docs
@@ -371,8 +380,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):

     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -384,11 +393,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)

     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -397,11 +407,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)

     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -414,6 +425,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)

+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
+

 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]
@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_ru_doc_lemmatization(ru_lemmatizer):
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]
@@ -4,12 +4,13 @@ from spacy.util import get_lang_class

 # fmt: off
 # Only include languages with no external dependencies
-# excluded: ja, ru, th, uk, vi, zh
-LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
-             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
-             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
-             "yo"]
+# excluded: ja, ko, th, vi, zh
+LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
+             "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
+             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
+             "tr", "tt", "uk", "ur", "xx", "yo"]
 # fmt: on
@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
@@ -481,6 +481,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     assert len(matches) == 1


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_valid_callback(en_vocab):
     """Test that on_match can only be None or callable."""
     matcher = Matcher(en_vocab)
@@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
     assert texts == ["zero", "one", "two"]


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_remove():
     nlp = English()
     matcher = Matcher(nlp.vocab)
@@ -1,4 +1,5 @@
 import pytest
+import warnings
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
@@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab):
     matcher.add("TEST1", [doc1])
     with pytest.warns(UserWarning):
         matcher.add("TEST2", [doc2])
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST3", [doc3])
-    assert not record.list
     matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST4", [doc2])
-    assert not record.list


 def test_attr_validation(en_vocab):
@@ -252,12 +252,12 @@ def test_ruler_before_ner():
     # 1 : Entity Ruler - should set "this" to B and everything else to empty
     patterns = [{"label": "THING", "pattern": "This"}]
     ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)

     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
     nlp.initialize()
+    ruler.add_patterns(patterns)
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]
@@ -324,6 +324,7 @@ def test_append_alias(nlp):
     assert len(mykb.get_alias_candidates("douglas")) == 3


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_append_invalid_alias(nlp):
     """Test that append an alias will throw an error if prior probs are exceeding 1"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -342,6 +343,7 @@ def test_append_invalid_alias(nlp):
     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""
     vector_length = 1
@@ -89,6 +89,20 @@ def test_entity_ruler_init_clear(nlp, patterns):
     assert len(ruler.labels) == 0


+def test_entity_ruler_clear(nlp, patterns):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world")
+    assert len(doc.ents) == 1
+    ruler.clear()
+    assert len(ruler.labels) == 0
+    with pytest.warns(UserWarning):
+        doc = nlp("hello world")
+        assert len(doc.ents) == 0
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
@@ -334,24 +334,31 @@ def test_language_factories_invalid():


 @pytest.mark.parametrize(
-    "weights,expected",
+    "weights,override,expected",
     [
-        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
-        ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
         (
             [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
+            {},
             {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
         ),
         (
-            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
-            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
+            [{"a": 100, "b": 300}, {"c": 50, "d": 50}],
+            {},
+            {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
         ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
+        ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
     ],
 )
-def test_language_factories_combine_score_weights(weights, expected):
-    result = combine_score_weights(weights)
+def test_language_factories_combine_score_weights(weights, override, expected):
+    result = combine_score_weights(weights, override)
     assert sum(result.values()) in (0.99, 1.0, 0.0)
     assert result == expected

@@ -377,17 +384,17 @@ def test_language_factories_scores():
     # Test with custom defaults
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = 0.0
-    config["training"]["score_weights"]["b3"] = 1.0
+    config["training"]["score_weights"]["b3"] = 1.3
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
+    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
     assert score_weights == expected
     # Test with null values
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = None
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
+    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
     assert score_weights == expected
@@ -108,6 +108,12 @@ def test_label_types(name):
     textcat.add_label("answer")
     with pytest.raises(ValueError):
         textcat.add_label(9)
+    # textcat requires at least two labels
+    if name == "textcat":
+        with pytest.raises(ValueError):
+            nlp.initialize()
+    else:
+        nlp.initialize()


 @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
@@ -11,7 +11,7 @@ from spacy.lang.en import English
 from thinc.api import Config, get_current_ops
 from numpy.testing import assert_array_equal

-from ..util import get_batch, make_tempdir
+from ..util import get_batch, make_tempdir, add_vecs_to_vocab


 def test_empty_doc():

@@ -134,9 +134,25 @@ TRAIN_DATA = [
 ]


-def test_tok2vec_listener():
+@pytest.mark.parametrize("with_vectors", (False, True))
+def test_tok2vec_listener(with_vectors):
     orig_config = Config().from_str(cfg_string)
+    orig_config["components"]["tok2vec"]["model"]["embed"][
+        "include_static_vectors"
+    ] = with_vectors
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+
+    if with_vectors:
+        ops = get_current_ops()
+        vectors = [
+            ("apple", ops.asarray([1, 2, 3])),
+            ("orange", ops.asarray([-1, -2, -3])),
+            ("and", ops.asarray([-1, -1, -1])),
+            ("juice", ops.asarray([5, 5, 10])),
+            ("pie", ops.asarray([7, 6.3, 8.9])),
+        ]
+        add_vecs_to_vocab(nlp.vocab, vectors)
+
     assert nlp.pipe_names == ["tok2vec", "tagger"]
     tagger = nlp.get_pipe("tagger")
     tok2vec = nlp.get_pipe("tok2vec")

@@ -163,6 +179,9 @@ def test_tok2vec_listener():
     ops = get_current_ops()
     assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))

+    # test with empty doc
+    doc = nlp("")
+
     # TODO: should this warn or error?
     nlp.select_pipes(disable="tok2vec")
     assert nlp.pipe_names == ["tagger"]
@@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
     # All results should be the same because of the fixed seed
     assert len(results) == 3
     ops = get_current_ops()
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)


 def test_issue5838():
34
spacy/tests/regression/test_issue8216.py
Normal file
@@ -0,0 +1,34 @@
+import pytest
+
+from spacy import registry
+from spacy.language import Language
+from spacy.pipeline import EntityRuler
+
+
+@pytest.fixture
+def nlp():
+    return Language()
+
+
+@pytest.fixture
+@registry.misc("entity_ruler_patterns")
+def patterns():
+    return [
+        {"label": "HELLO", "pattern": "hello world"},
+        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
+        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
+        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
+        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
+        {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
+    ]
+
+
+def test_entity_ruler_fix8216(nlp, patterns):
+    """Test that patterns don't get added excessively."""
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
+    ruler.add_patterns(patterns)
+    pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+    assert pattern_count > 0
+    ruler.add_patterns([])
+    after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+    assert after_count == pattern_count
@@ -3,6 +3,7 @@ from typing import Callable
 from spacy import util
 from spacy.util import ensure_path, registry, load_model_from_config
 from spacy.kb import KnowledgeBase
+from spacy.vocab import Vocab
 from thinc.api import Config

 from ..util import make_tempdir

@@ -111,7 +112,7 @@ def test_serialize_subclassed_kb():
     @registry.misc("spacy.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[["Vocab"], KnowledgeBase]:
+    ) -> Callable[[Vocab], KnowledgeBase]:
         def custom_kb_factory(vocab):
             kb = SubKnowledgeBase(
                 vocab=vocab,
@@ -10,6 +10,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list
+from spacy.cli._util import upload_file, download_file
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os

@@ -474,3 +475,18 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content
@@ -20,7 +20,7 @@ def get_textcat_bow_kwargs():


 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):

@@ -62,7 +62,7 @@ def get_tok2vec_kwargs():
     }


-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())

@@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 @pytest.mark.parametrize("file_name", ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
     loc = ensure_path(__file__).parent / file_name
-    text = loc.open("r", encoding="utf8").read()
+    with loc.open("r", encoding="utf8") as infile:
+        text = infile.read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
@@ -182,6 +182,27 @@ def test_Example_from_dict_with_entities(annots):
     assert example.reference[5].ent_type_ == "LOC"


+def test_Example_from_dict_with_empty_entities():
+    annots = {
+        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+        "entities": [],
+    }
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    example = Example.from_dict(predicted, annots)
+    # entities as empty list sets everything to O
+    assert example.reference.has_annotation("ENT_IOB")
+    assert len(list(example.reference.ents)) == 0
+    assert all(token.ent_iob_ == "O" for token in example.reference)
+    # various unset/missing entities leaves entities unset
+    annots["entities"] = None
+    example = Example.from_dict(predicted, annots)
+    assert not example.reference.has_annotation("ENT_IOB")
+    annots.pop("entities", None)
+    example = Example.from_dict(predicted, annots)
+    assert not example.reference.has_annotation("ENT_IOB")
+
+
 @pytest.mark.parametrize(
     "annots",
     [
@@ -1,6 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
-from thinc.api import Config
+from thinc.api import Config, fix_random_seed
 from spacy import Language
 from spacy.util import load_model_from_config, registry, resolve_dot_names
 from spacy.schemas import ConfigSchemaTraining

@@ -64,8 +64,8 @@ def test_readers():
 @pytest.mark.parametrize(
     "reader,additional_config",
     [
-        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
-        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
+        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
         ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
     ],
 )

@@ -82,17 +82,18 @@ def test_cat_readers(reader, additional_config):

     [nlp]
     lang = "en"
-    pipeline = ["tok2vec", "textcat"]
+    pipeline = ["tok2vec", "textcat_multilabel"]

     [components]

     [components.tok2vec]
     factory = "tok2vec"

-    [components.textcat]
-    factory = "textcat"
+    [components.textcat_multilabel]
+    factory = "textcat_multilabel"
     """
     config = Config().from_str(nlp_config_string)
+    fix_random_seed(config["training"]["seed"])
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
@@ -347,6 +347,7 @@ cdef class Doc:
             for annot in annotations:
                 if annot:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                     for i in range(len(words)):
                         if attrs.ndim == 1:
                             attrs[i] = annot[i]

@@ -1141,6 +1142,10 @@ cdef class Doc:
             else:
                 warnings.warn(Warnings.W102.format(key=key, value=value))
         for key in doc.spans:
+            # if a spans key is in any doc, include it in the merged doc
+            # even if it is empty
+            if key not in concat_spans:
+                concat_spans[key] = []
             for span in doc.spans[key]:
                 concat_spans[key].append((
                     span.start_char + char_offset,

@@ -1150,7 +1155,7 @@ cdef class Doc:
                     span.text, # included as a check
                 ))
         char_offset += len(doc.text)
-        if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
+        if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
             char_offset += 1

     arrays = [doc.to_array(attrs) for doc in docs]
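The comment added in the second hunk above describes user-visible behavior of `Doc.from_docs`: a spans key present on any input doc now appears in the merged doc even when its group is empty. A minimal sketch of that behavior (the group name and texts are illustrative, assuming the spaCy v3.1+ semantics this change introduces):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1 = nlp("Hello world")
doc1.spans["clauses"] = [doc1[0:2]]
doc2 = nlp("Another doc")
doc2.spans["clauses"] = []  # empty group on this doc

merged = Doc.from_docs([doc1, doc2])
assert "clauses" in merged.spans  # key survives even though doc2's group is empty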
@@ -297,7 +297,7 @@ cdef class Span:
         for ancestor in ancestors:
             ancestor_i = ancestor.i - self.c.start
             if ancestor_i in range(length):
-                array[i, head_col] = ancestor_i - i
+                array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

         # if there is no appropriate ancestor, define a new artificial root
         value = array[i, head_col]

@@ -305,7 +305,7 @@ cdef class Span:
             new_root = old_to_new_root.get(ancestor_i, None)
             if new_root is not None:
                 # take the same artificial root as a previous token from the same sentence
                 array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
             else:
                 # set this token as the new artificial root
                 array[i, head_col] = 0
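These casts (and the similar one in the next hunk) write negative head offsets into attribute arrays that spaCy stores with a uint64 dtype. Going through numpy.int32 first makes the wrap-around explicit rather than relying on how a given numpy version handles negative values for unsigned dtypes. A small illustration (the values are illustrative, not taken from the diff):

import numpy

head_offset = -2  # e.g. a head two tokens to the left of the current token
wrapped = numpy.int32(head_offset).astype(numpy.uint64)
# the original signed value can be recovered by casting back down
assert int(wrapped.astype(numpy.int32)) == head_offset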
@@ -329,26 +329,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T

@@ -416,7 +417,7 @@ def _fix_legacy_dict_data(example_dict):
     token_dict = example_dict.get("token_annotation", {})
     doc_dict = example_dict.get("doc_annotation", {})
     for key, value in example_dict.items():
-        if value:
+        if value is not None:
             if key in ("token_annotation", "doc_annotation"):
                 pass
             elif key == "ids":
@@ -274,3 +274,5 @@ def ensure_shape(vectors_loc):
     # store all the results in a list in memory
     lines2 = open_file(vectors_loc)
     yield from lines2
+    lines2.close()
+    lines.close()
@@ -1370,32 +1370,14 @@ def combine_score_weights(
     should be preserved.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
+    # We divide each weight by the total weight sum.
     # We first need to extract all None/null values for score weights that
     # shouldn't be shown in the table *or* be weighted
-    result = {}
-    all_weights = []
-    for w_dict in weights:
-        filtered_weights = {}
-        for key, value in w_dict.items():
-            value = overrides.get(key, value)
-            if value is None:
-                result[key] = None
-            else:
-                filtered_weights[key] = value
-        all_weights.append(filtered_weights)
-    for w_dict in all_weights:
-        # We need to account for weights that don't sum to 1.0 and normalize
-        # the score weights accordingly, then divide score by the number of
-        # components.
-        total = sum(w_dict.values())
-        for key, value in w_dict.items():
-            if total == 0:
-                weight = 0.0
-            else:
-                weight = round(value / total / len(all_weights), 2)
-            prev_weight = result.get(key, 0.0)
-            prev_weight = 0.0 if prev_weight is None else prev_weight
-            result[key] = prev_weight + weight
+    result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
+    weight_sum = sum([v if v else 0.0 for v in result.values()])
+    for key, value in result.items():
+        if value and weight_sum > 0:
+            result[key] = round(value / weight_sum, 2)
     return result

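The rewritten helper now simply merges the per-component weight dicts (with overrides applied) and divides every weight by the overall sum, instead of normalizing per component. A rough sketch of that arithmetic, checked against one of the parametrized cases above (the helper name is illustrative; the real function is spacy.util.combine_score_weights):

def combine(weights, overrides=None):
    overrides = overrides or {}
    # later dicts win for duplicate keys, overrides win over both
    result = {k: overrides.get(k, v) for w in weights for k, v in w.items()}
    total = sum(v if v else 0.0 for v in result.values())
    return {
        k: (round(v / total, 2) if v and total > 0 else v)
        for k, v in result.items()
    }

# total = 100 + 300 + 50 + 50 = 500, so each weight becomes its share of 500
assert combine([{"a": 100, "b": 300}, {"c": 50, "d": 50}]) == {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1}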
@@ -10,11 +10,12 @@ api_trainable: true
 ---

 The text categorizer predicts **categories over a whole document**. and comes in
-two flavours: `textcat` and `textcat_multilabel`. When you need to predict
+two flavors: `textcat` and `textcat_multilabel`. When you need to predict
 exactly one true label per document, use the `textcat` which has mutually
 exclusive labels. If you want to perform multi-label classification and predict
-zero, one or more labels per document, use the `textcat_multilabel` component
-instead.
+zero, one or more true labels per document, use the `textcat_multilabel`
+component instead. For a binary classification task, you can use `textcat` with
+**two** labels or `textcat_multilabel` with **one** label.

 Both components are documented on this page.
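The distinction the revised passage draws maps directly onto pipeline setup. A minimal sketch (the labels are illustrative, not taken from the docs page):

import spacy

# exactly one true label per document -> mutually exclusive classes
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# zero, one or more true labels per document
nlp_multi = spacy.blank("en")
textcat_multi = nlp_multi.add_pipe("textcat_multilabel")
textcat_multi.add_label("URGENT")
textcat_multi.add_label("BILLING")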
@@ -189,7 +190,7 @@ This method was previously called `begin_training`.
 | _keyword-only_ | |
 | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
 | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |
+| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |

 ## TextCategorizer.predict {#predict tag="method"}
@@ -262,7 +262,12 @@
     },
     {
         "code": "mk",
-        "name": "Macedonian"
+        "name": "Macedonian",
+        "models": [
+            "mk_core_news_sm",
+            "mk_core_news_md",
+            "mk_core_news_lg"
+        ]
     },
     {
         "code": "ml",