Merge remote-tracking branch 'upstream/v4' into feature/remove-stop-words

Adriane Boyd 2023-02-06 13:53:35 +01:00
commit 9a8864db84
60 changed files with 478 additions and 600 deletions

View File

@@ -62,6 +62,11 @@ steps:
# - script: |
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
# displayName: 'Test no warnings on load (#11713)'
# condition: eq(variables['python_version'], '3.8')
#
# - script: |
# python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
# displayName: 'Test skip re-download (#12188)'
# condition: eq(variables['python_version'], '3.8')
- script: |

View File

@@ -19,6 +19,7 @@ import os
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from ..errors import RENAMED_LANGUAGE_CODES
from .. import about
if TYPE_CHECKING:
@@ -134,6 +135,16 @@ def _parse_override(value: Any) -> Any:
return str(value)
def _handle_renamed_language_codes(lang: Optional[str]) -> None:
# Throw error for renamed language codes in v4
if lang in RENAMED_LANGUAGE_CODES:
msg.fail(
title="Renamed language code",
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
exits=1,
)
def load_project_config(
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]:
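Aside (not part of the diff): a minimal sketch of how the renamed-code check added above behaves. RENAMED_LANGUAGE_CODES (added to errors.py further down) maps the old codes to the new ones, and the CLI helper reports the mapping via msg.fail and exits.

    from spacy.errors import RENAMED_LANGUAGE_CODES

    lang = "xx"  # hypothetical user-supplied code
    if lang in RENAMED_LANGUAGE_CODES:
        # The CLI exits with an error; here we only print the message.
        print(
            f"Language code '{lang}' was replaced with "
            f"'{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4."
        )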

View File

@@ -7,7 +7,7 @@ import re
import sys
import itertools
-from ._util import app, Arg, Opt, walk_directory
+from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@@ -112,6 +112,10 @@ def convert(
input_path = Path(input_path)
if not msg:
msg = Printer(no_print=silent)
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
doc_files = []
for input_loc in walk_directory(input_path, converter):

View File

@@ -7,7 +7,8 @@ import typer
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
+from ..util import is_prerelease_version, get_installed_models
from ..util import get_package_version
@app.command(
@@ -63,6 +64,14 @@ def download(
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
# If we already have this version installed, skip downloading
installed = get_installed_models()
if model_name in installed:
installed_version = get_package_version(model_name)
if installed_version == version:
msg.warn(f"{model_name} v{version} already installed, skipping")
return
filename = get_model_filename(model_name, version, sdist)
download_model(filename, pip_args)
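Aside (not part of the diff): a rough sketch of the skip-re-download check added above, using a hypothetical model name and compatible version.

    from spacy.util import get_installed_models, get_package_version

    model_name = "ca_core_news_sm"  # hypothetical example
    version = "3.5.0"  # hypothetical value returned by get_version()
    if model_name in get_installed_models() and get_package_version(model_name) == version:
        # Mirrors the warning emitted above before the early return
        print(f"{model_name} v{version} already installed, skipping")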

View File

@@ -8,11 +8,11 @@ import re
from jinja2 import Template
from .. import util
-from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
+from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
+from ._util import string_to_list, import_code, _handle_renamed_language_codes
ROOT = Path(__file__).parent / "templates"
@@ -43,7 +43,7 @@ class InitValues:
def init_config_cli(
# fmt: off
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
-lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
+lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@@ -83,6 +83,7 @@ def init_fill_config_cli(
# fmt: off
base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
@@ -98,13 +99,20 @@ def init_fill_config_cli(
DOCS: https://spacy.io/api/cli#init-fill-config
"""
import_code(code_path)
-fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
+fill_config(
output_file,
base_path,
distillation=distillation,
pretraining=pretraining,
diff=diff,
)
def fill_config(
output_file: Path,
base_path: Path,
*,
distillation: bool = False,
pretraining: bool = False,
diff: bool = False,
silent: bool = False,
@@ -123,6 +131,9 @@ def fill_config(
# replaced with their actual config after loading, so we have to re-add them
sourced = util.get_sourced_components(config)
filled["components"].update(sourced)
if distillation:
distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
filled = distillation_config.merge(filled)
if pretraining:
validate_config_for_pretrain(filled, msg)
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
@@ -158,6 +169,10 @@ def init_config(
msg = Printer(no_print=silent)
with TEMPLATE_PATH.open("r") as f:
template = Template(f.read())
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
# Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
defaults = RECOMMENDATIONS["__default__"]

View File

@@ -9,7 +9,7 @@ from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from ._util import import_code, setup_gpu, _handle_renamed_language_codes
@init_cli.command("vectors")
@@ -31,6 +31,10 @@ def init_vectors_cli(
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
msg.info(f"Creating blank nlp object for language '{lang}'") msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)() nlp = util.get_lang_class(lang)()
if jsonl_loc is not None: if jsonl_loc is not None:

View File

@@ -0,0 +1,34 @@
[paths]
raw_text = null
[distillation]
corpus = "corpora.distillation"
dropout = 0.1
max_epochs = 1
max_steps = 0
student_to_teacher = {}
[distillation.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 3000
discard_oversize = false
tolerance = 0.2
[distillation.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 1e-4
[corpora]
[corpora.distillation]
@readers = "spacy.PlainTextCorpus.v1"
path = ${paths.raw_text}
min_length = 0
max_length = 0
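Aside (not part of the diff): a minimal sketch of pulling this [distillation] block into a filled config via the reworked fill_config shown earlier; the file paths are hypothetical.

    from pathlib import Path
    from spacy.cli.init_config import fill_config

    # distillation=True merges default_config_distillation.cfg into the filled config
    fill_config(
        Path("config_filled.cfg"),  # hypothetical output path
        Path("base_config.cfg"),    # hypothetical base config
        distillation=True,
    )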

View File

@@ -962,6 +962,7 @@ class Errors(metaclass=ErrorsWithCodes):
"reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
# fmt: on

View File

@@ -6,7 +6,7 @@ class IcelandicDefaults(BaseDefaults):
class Icelandic(Language):
-lang = "is"
+lang = "isl"
Defaults = IcelandicDefaults

View File

@@ -3,10 +3,10 @@ from ...language import Language
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
-This module allows models to specify their language ID as 'xx'.
+This module allows models to specify their language ID as 'mul'.
"""
-lang = "xx"
+lang = "mul"
__all__ = ["MultiLanguage"]

View File

@@ -48,6 +48,9 @@ PipeCallable = Callable[[Doc], Doc]
# This is the base config will all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# This is the base config for the [distillation] block and currently not included
# in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
# This is the base config for the [pretraining] block and currently not included
# in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
@@ -1059,7 +1062,7 @@ class Language:
return losses
validate_distillation_examples(examples, "Language.distill")
-examples = _copy_examples(examples)
+examples = _copy_examples(examples, copy_x=True, copy_y=True)
if sgd is None:
if self._optimizer is None:
@@ -1245,17 +1248,12 @@ class Language:
component_cfg[name].setdefault("drop", drop)
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
for name, proc in self.pipeline:
-# ignore statements are used here because mypy ignores hasattr
-if name not in exclude and hasattr(proc, "update"):
-    proc.update(examples, sgd=None, losses=losses, **component_cfg[name])  # type: ignore
-if sgd not in (None, False):
-    if (
-        name not in exclude
-        and isinstance(proc, ty.TrainableComponent)
-        and proc.is_trainable
-        and proc.model not in (True, False, None)
-    ):
-        proc.finish_update(sgd)
+if (
+    name not in exclude
+    and isinstance(proc, ty.TrainableComponent)
+    and proc.is_trainable
+):
+    proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
if name in annotates:
for doc, eg in zip(
_pipe(
@@ -1268,6 +1266,17 @@ class Language:
examples,
):
eg.predicted = doc
# Only finish the update after all component updates are done. Some
# components may share weights (such as tok2vec) and we only want
# to apply weight updates after all gradients are accumulated.
for name, proc in self.pipeline:
if (
name not in exclude
and isinstance(proc, ty.TrainableComponent)
and proc.is_trainable
):
proc.finish_update(sgd)
return losses
def rehearse(
@@ -2328,13 +2337,18 @@ class DisabledPipes(list):
self[:] = []
-def _copy_examples(examples: Iterable[Example]) -> List[Example]:
+def _copy_examples(
examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
) -> List[Example]:
"""Make a copy of a batch of examples, copying the predicted Doc as well. """Make a copy of a batch of examples, copying the predicted Doc as well.
This is used in contexts where we need to take ownership of the examples This is used in contexts where we need to take ownership of the examples
so that they can be mutated, for instance during Language.evaluate and so that they can be mutated, for instance during Language.evaluate and
Language.update. Language.update.
""" """
return [Example(eg.x.copy(), eg.y) for eg in examples] return [
Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
for eg in examples
]
def _apply_pipes(
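Aside (not part of the diff): a small sketch of the copy semantics the reworked _copy_examples provides; with copy_x=True and copy_y=True (as Language.distill now passes), both docs are copied so the caller never mutates the originals.

    from spacy.lang.en import English
    from spacy.training import Example

    nlp = English()
    eg = Example(nlp.make_doc("a b c"), nlp.make_doc("a b c"))
    copied = Example(eg.x.copy(), eg.y.copy())  # equivalent of copy_x=True, copy_y=True
    assert copied.x is not eg.x and copied.y is not eg.y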

View File

@@ -13,7 +13,6 @@ from ..kb import KnowledgeBase, Candidate
from ..ml import empty_kb
from ..tokens import Doc, Span
from .pipe import deserialize_config
from .legacy.entity_linker import EntityLinker_v1
from .trainable_pipe import TrainablePipe
from ..language import Language
from ..vocab import Vocab
@@ -28,9 +27,6 @@ ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
KNOWLEDGE_BASE_IDS = "kb_ids"
# See #9050
BACKWARD_OVERWRITE = True
default_model_config = """ default_model_config = """
[model] [model]
@architectures = "spacy.EntityLinker.v2" @architectures = "spacy.EntityLinker.v2"
@ -61,7 +57,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"entity_vector_length": 64, "entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"overwrite": True, "overwrite": False,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True, "use_gold_ents": True,
"candidates_batch_size": 1, "candidates_batch_size": 1,
@ -120,6 +116,12 @@ def make_entity_linker(
""" """
if not model.attrs.get("include_span_maker", False): if not model.attrs.get("include_span_maker", False):
try:
from spacy_legacy.components.entity_linker import EntityLinker_v1
except:
raise ImportError(
"In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
)
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
@@ -186,7 +188,7 @@ class EntityLinker(TrainablePipe):
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
-overwrite: bool = BACKWARD_OVERWRITE,
+overwrite: bool = False,
scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool,
candidates_batch_size: int,
@@ -210,6 +212,7 @@ class EntityLinker(TrainablePipe):
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
overwrite (bool): Whether to overwrite existing non-empty annotations.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.

View File

@@ -1,3 +0,0 @@
from .entity_linker import EntityLinker_v1
__all__ = ["EntityLinker_v1"]

View File

@@ -1,422 +0,0 @@
# This file is present to provide a prior version of the EntityLinker component
# for backwards compatability. For details see #9669.
from typing import Optional, Iterable, Callable, Dict, Union, List, Any
from thinc.types import Floats2d
from pathlib import Path
from itertools import islice
import srsly
import random
from thinc.api import CosineDistance, Model, Optimizer
from thinc.api import set_dropout_rate
import warnings
from ...kb import KnowledgeBase, Candidate
from ...ml import empty_kb
from ...tokens import Doc, Span
from ..pipe import deserialize_config
from ..trainable_pipe import TrainablePipe
from ...language import Language
from ...vocab import Vocab
from ...training import Example, validate_examples, validate_get_examples
from ...errors import Errors, Warnings
from ...util import SimpleFrozenList
from ... import util
from ...scorer import Scorer
# See #9050
BACKWARD_OVERWRITE = True
def entity_linker_score(examples, **kwargs):
return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
class EntityLinker_v1(TrainablePipe):
"""Pipeline component for named entity linking.
DOCS: https://spacy.io/api/entitylinker
"""
NIL = "NIL" # string used to refer to a non-existing link
def __init__(
self,
vocab: Vocab,
model: Model,
name: str = "entity_linker",
*,
labels_discard: Iterable[str],
n_sents: int,
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
) -> None:
"""Initialize an entity linker.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
n_sents (int): The number of neighbouring sentences to take into account.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
self.vocab = vocab
self.model = model
self.name = name
self.labels_discard = list(labels_discard)
self.n_sents = n_sents
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
create it using this object's vocab."""
if not callable(kb_loader):
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
self.kb = kb_loader(self.vocab)
def validate_kb(self) -> None:
# Raise an error if the knowledge base is not initialized.
if self.kb is None:
raise ValueError(Errors.E1018.format(name=self.name))
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
Note that providing this argument, will overwrite all data accumulated in the current KB.
Use this only when loading a KB as-such from file.
DOCS: https://spacy.io/api/entitylinker#initialize
"""
validate_get_examples(get_examples, "EntityLinker_v1.initialize")
if kb_loader is not None:
self.set_kb(kb_loader)
self.validate_kb()
nO = self.kb.entity_vector_length
doc_sample = []
vector_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
vector_sample.append(self.model.ops.alloc1f(nO))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
)
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/entitylinker#update
"""
self.validate_kb()
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
if not examples:
return losses
validate_examples(examples, "EntityLinker_v1.update")
sentence_docs = []
for eg in examples:
sentences = [s for s in eg.reference.sents]
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.reference.ents:
# KB ID of the first token is the same as the whole span
kb_id = kb_ids[ent.start]
if kb_id:
try:
# find the sentence in the list of sentences.
sent_index = sentences.index(ent.sent)
except AttributeError:
# Catch the exception when ent.sent is None and provide a user-friendly warning
raise RuntimeError(Errors.E030) from None
# get n previous sentences, if there are any
start_sentence = max(0, sent_index - self.n_sents)
# get n posterior sentences, or as many < n as there are
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
# get token positions
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
# append that span as a doc to training
sent_doc = eg.predicted[start_token:end_token].as_doc()
sentence_docs.append(sent_doc)
set_dropout_rate(self.model, drop)
if not sentence_docs:
warnings.warn(Warnings.W093.format(name="Entity Linker"))
return losses
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
loss, d_scores = self.get_loss(
sentence_encodings=sentence_encodings, examples=examples
)
bp_context(d_scores)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker_v1.get_loss")
entity_encodings = []
for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.reference.ents:
kb_id = kb_ids[ent.start]
if kb_id:
entity_encoding = self.kb.get_vector(kb_id)
entity_encodings.append(entity_encoding)
entity_encodings = self.model.ops.asarray2f(entity_encodings)
if sentence_encodings.shape != entity_encodings.shape:
err = Errors.E147.format(
method="get_loss", msg="gold entities do not match up"
)
raise RuntimeError(err)
gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
loss = self.distance.get_loss(sentence_encodings, entity_encodings)
loss = loss / len(entity_encodings)
return float(loss), gradients
def predict(self, docs: Iterable[Doc]) -> List[str]:
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
no prediction.
docs (Iterable[Doc]): The documents to predict.
RETURNS (List[str]): The models prediction for each document.
DOCS: https://spacy.io/api/entitylinker#predict
"""
self.validate_kb()
entity_count = 0
final_kb_ids: List[str] = []
if not docs:
return final_kb_ids
if isinstance(docs, Doc):
docs = [docs]
for i, doc in enumerate(docs):
sentences = [s for s in doc.sents]
if len(doc) > 0:
# Looping through each entity (TODO: rewrite)
for ent in doc.ents:
sent = ent.sent
sent_index = sentences.index(sent)
assert sent_index >= 0
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
xp = self.model.ops.xp
if self.incl_context:
sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
entity_count += 1
if ent.label_ in self.labels_discard:
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
candidates = list(self.get_candidates(self.kb, ent))
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
elif len(candidates) == 1:
# shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
method="predict",
msg="vectors not of equal length",
)
)
# cosine similarity
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
best_index = scores.argmax().item()
best_candidate = candidates[best_index]
final_kb_ids.append(best_candidate.entity_)
if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"
)
raise RuntimeError(err)
return final_kb_ids
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
DOCS: https://spacy.io/api/entitylinker#set_annotations
"""
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
overwrite = self.cfg["overwrite"]
for doc in docs:
for ent in doc.ents:
kb_id = kb_ids[i]
i += 1
for token in ent:
if token.ent_kb_id == 0 or overwrite:
token.ent_kb_id_ = kb_id
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
DOCS: https://spacy.io/api/entitylinker#to_bytes
"""
self._validate_serialization_attrs()
serialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serialize["kb"] = self.kb.to_bytes
serialize["model"] = self.model.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the pipe from a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (TrainablePipe): The loaded object.
DOCS: https://spacy.io/api/entitylinker#from_bytes
"""
self._validate_serialization_attrs()
def load_model(b):
try:
self.model.from_bytes(b)
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
deserialize["kb"] = lambda b: self.kb.from_bytes(b)
deserialize["model"] = load_model
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Serialize the pipe to disk.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/entitylinker#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["kb"] = lambda p: self.kb.to_disk(p)
serialize["model"] = lambda p: self.model.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityLinker_v1":
"""Load the pipe from disk. Modifies the object in place and returns it.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (EntityLinker): The modified EntityLinker object.
DOCS: https://spacy.io/api/entitylinker#from_disk
"""
def load_model(p):
try:
with p.open("rb") as infile:
self.model.from_bytes(infile.read())
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize: Dict[str, Callable[[Any], Any]] = {}
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
deserialize["kb"] = lambda p: self.kb.from_disk(p)
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
return self
def rehearse(self, examples, *, sgd=None, losses=None, **config):
raise NotImplementedError
def add_label(self, label):
raise NotImplementedError

View File

@@ -21,10 +21,6 @@ from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
# See #9050
BACKWARD_OVERWRITE = True
BACKWARD_EXTEND = False
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@@ -102,8 +98,8 @@ class Morphologizer(Tagger):
model: Model,
name: str = "morphologizer",
*,
-overwrite: bool = BACKWARD_OVERWRITE,
-extend: bool = BACKWARD_EXTEND,
+overwrite: bool = False,
+extend: bool = False,
scorer: Optional[Callable] = morphologizer_score,
save_activations: bool = False,
):
@@ -113,6 +109,8 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
overwrite (bool): Whether to overwrite existing annotations.
extend (bool): Whether to extend existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".

View File

@@ -10,9 +10,6 @@ from ..language import Language
from ..scorer import Scorer
from .. import util
# see #9050
BACKWARD_OVERWRITE = False
@Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
@@ -52,13 +49,14 @@ class Sentencizer(Pipe):
name="sentencizer",
*,
punct_chars=None,
-overwrite=BACKWARD_OVERWRITE,
+overwrite=False,
scorer=senter_score,
):
"""Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object.
overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".

View File

@@ -18,8 +18,6 @@ from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """
[model]
@@ -83,7 +81,7 @@ class SentenceRecognizer(Tagger):
model,
name="senter",
*,
-overwrite=BACKWARD_OVERWRITE,
+overwrite=False,
scorer=senter_score,
save_activations: bool = False,
):
@@ -93,6 +91,7 @@ class SentenceRecognizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
save_activations (bool): save model activations in Doc when annotating.

View File

@@ -27,9 +27,6 @@ from .. import util
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@@ -99,7 +96,7 @@ class Tagger(TrainablePipe):
model,
name="tagger",
*,
-overwrite=BACKWARD_OVERWRITE,
+overwrite=False,
scorer=tagger_score,
neg_prefix="!",
save_activations: bool = False,
@@ -110,6 +107,7 @@ class Tagger(TrainablePipe):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
save_activations (bool): save model activations in Doc when annotating.

View File

@@ -422,6 +422,27 @@ class ConfigSchemaInit(BaseModel):
arbitrary_types_allowed = True
class ConfigSchemaDistillEmpty(BaseModel):
class Config:
extra = "forbid"
class ConfigSchemaDistill(BaseModel):
# fmt: off
batcher: Batcher = Field(..., title="Batcher for the training data")
corpus: StrictStr = Field(..., title="Path in the config to the distillation data")
dropout: StrictFloat = Field(..., title="Dropout rate")
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for")
max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for")
optimizer: Optimizer = Field(..., title="The optimizer to use")
student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchema(BaseModel):
training: ConfigSchemaTraining
nlp: ConfigSchemaNlp
@@ -429,6 +450,7 @@ class ConfigSchema(BaseModel):
components: Dict[str, Dict[str, Any]]
corpora: Dict[str, Reader]
initialize: ConfigSchemaInit
distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {} # type: ignore[assignment]
class Config:
extra = "allow"
@@ -440,6 +462,7 @@ CONFIG_SCHEMAS = {
"training": ConfigSchemaTraining,
"pretraining": ConfigSchemaPretrain,
"initialize": ConfigSchemaInit,
"distill": ConfigSchemaDistill,
}

View File

@@ -104,7 +104,7 @@ class Scorer:
def __init__(
self,
nlp: Optional["Language"] = None,
-default_lang: str = "xx",
+default_lang: str = "mul",
default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
**cfg,
) -> None:

View File

@@ -86,7 +86,7 @@ These are the main fixtures that are currently available:
| Fixture | Description |
| ----------------------------------- | ---------------------------------------------------------------------------- |
-| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. |
+| `tokenizer` | Basic, language-independent tokenizer. Identical to the `mul` language class. |
| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
| `en_vocab` | Creates an instance of the English `Vocab`. |

View File

@@ -83,7 +83,7 @@ def register_cython_tests(cython_mod_name: str, test_mod_name: str):
@pytest.fixture(scope="module")
def tokenizer():
-return get_lang_class("xx")().tokenizer
+return get_lang_class("mul")().tokenizer
@pytest.fixture(scope="session")
@@ -243,8 +243,8 @@ def id_tokenizer():
@pytest.fixture(scope="session")
-def is_tokenizer():
-return get_lang_class("is")().tokenizer
+def isl_tokenizer():
+return get_lang_class("isl")().tokenizer
@pytest.fixture(scope="session")
@@ -496,8 +496,8 @@ def vi_tokenizer():
@pytest.fixture(scope="session")
-def xx_tokenizer():
-return get_lang_class("xx")().tokenizer
+def mul_tokenizer():
+return get_lang_class("mul")().tokenizer
@pytest.fixture(scope="session")

View File

@@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops
from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
from spacy.attrs import SENT_START, TAG
from spacy.lang.en import English
-from spacy.lang.xx import MultiLanguage
+from spacy.lang.mul import MultiLanguage
from spacy.language import Language
from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, SpanGroup, Token

View File

@@ -175,6 +175,18 @@ def test_modify_span_group(doc):
assert group[0].label == doc.vocab.strings["TEST"]
def test_char_span_attributes(doc):
label = "LABEL"
kb_id = "KB_ID"
span_id = "SPAN_ID"
span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
assert span1.text == span2.text
assert span1.label_ == span2.label_ == label
assert span1.kb_id_ == span2.kb_id_ == kb_id
assert span1.id_ == span2.id_ == span_id
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
@@ -354,6 +366,14 @@ def test_spans_by_character(doc):
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
)
# Span.char_span + alignment mode "contract"
span2 = doc[0:2].char_span(
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
def test_span_to_array(doc):
span = doc[1:-2]

View File

@@ -1,7 +1,7 @@
import pytest
-def test_long_text(is_tokenizer):
+def test_long_text(isl_tokenizer):
# Excerpt: European Convention on Human Rights
text = """
hafa í huga, yfirlýsing þessi hefur það markmið tryggja
@@ -15,12 +15,12 @@ réttlætis og friðar í heiminum og best er tryggt, annars vegar með
virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
"""
-tokens = is_tokenizer(text)
+tokens = isl_tokenizer(text)
assert len(tokens) == 120
@pytest.mark.xfail
-def test_ordinal_number(is_tokenizer):
+def test_ordinal_number(isl_tokenizer):
text = "10. desember 1948"
-tokens = is_tokenizer(text)
+tokens = isl_tokenizer(text)
assert len(tokens) == 3

View File

@@ -1,6 +1,6 @@
import pytest
-IS_BASIC_TOKENIZATION_TESTS = [
+ISL_BASIC_TOKENIZATION_TESTS = [
(
"Enginn maður skal sæta pyndingum eða ómannlegri eða "
"vanvirðandi meðferð eða refsingu. ",
@@ -23,8 +23,8 @@ IS_BASIC_TOKENIZATION_TESTS = [
]
-@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
-def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
-tokens = is_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
+def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
+tokens = isl_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@@ -1,7 +1,7 @@
import pytest
-def test_long_text(xx_tokenizer):
+def test_long_text(mul_tokenizer):
# Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
text = """
ʹmmla lie Euroopp unioon oʹdinakai alggmeer. ʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.
@@ -20,5 +20,5 @@ vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuu
Sääʹmteʹǧǧ.
"""
-tokens = xx_tokenizer(text)
+tokens = mul_tokenizer(text)
assert len(tokens) == 179

View File

@@ -1,6 +1,6 @@
import pytest
-XX_BASIC_TOKENIZATION_TESTS = [
+MUL_BASIC_TOKENIZATION_TESTS = [
(
"Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
[
@@ -18,8 +18,8 @@ XX_BASIC_TOKENIZATION_TESTS = [
]
-@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
-def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
-tokens = xx_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
+def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
+tokens = mul_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@@ -7,10 +7,10 @@ from spacy.util import get_lang_class
# excluded: ja, ko, th, vi, zh
LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
"en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
-"hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
-"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+"hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
+"mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-"tr", "tt", "uk", "ur", "xx", "yo"]
+"tr", "tt", "uk", "ur", "yo"]
# fmt: on

View File

@@ -54,9 +54,11 @@ def test_annotates_on_update():
return AssertSents(name)
class AssertSents:
model = None
is_trainable = True
def __init__(self, name, **cfg):
self.name = name
pass
def __call__(self, doc):
if not doc.has_annotation("SENT_START"):
@@ -64,10 +66,16 @@ def test_annotates_on_update():
return doc
def update(self, examples, *, drop=0.0, sgd=None, losses=None):
losses.setdefault(self.name, 0.0)
for example in examples:
if not example.predicted.has_annotation("SENT_START"):
raise ValueError("No sents")
return {}
return losses
def finish_update(self, sgd=None):
pass
nlp = English()
nlp.add_pipe("sentencizer")

View File

@@ -12,7 +12,6 @@ from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker
from spacy.pipeline import EntityLinker, TrainablePipe
from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from spacy.tests.util import make_tempdir
@@ -997,6 +996,8 @@ def test_scorer_links():
)
# fmt: on
def test_legacy_architectures(name, config):
from spacy_legacy.components.entity_linker import EntityLinker_v1
# Ensure that the legacy architectures still work
vector_length = 3
nlp = English()

View File

@@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns):
def test_span_ruler_add_empty(patterns):
"""Test that patterns don't get added excessively."""
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"validate": True})
ruler.add_patterns(patterns)
pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
@@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns):
def test_span_ruler_init(patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
assert len(ruler) == len(patterns)
@@ -74,7 +74,7 @@ def test_span_ruler_init(patterns):
def test_span_ruler_no_patterns_warns():
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
assert len(ruler) == 0
assert len(ruler.labels) == 0
@@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns():
def test_span_ruler_init_patterns(patterns):
# initialize with patterns
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
assert len(ruler.labels) == 0
ruler.initialize(lambda: [], patterns=patterns)
@@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns):
def test_span_ruler_init_clear(patterns):
"""Test that initialization clears patterns."""
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
assert len(ruler.labels) == 4
@@ -119,7 +119,7 @@ def test_span_ruler_init_clear(patterns):
def test_span_ruler_clear(patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
assert len(ruler.labels) == 4
@@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns):
def test_span_ruler_existing(patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
ruler.add_patterns(patterns)
doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -148,7 +148,7 @@ def test_span_ruler_existing(patterns):
def test_span_ruler_existing_overwrite(patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
ruler.add_patterns(patterns)
doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns):
def test_span_ruler_serialize_bytes(patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
ruler_bytes = ruler.to_bytes()
-new_nlp = spacy.blank("xx")
+new_nlp = spacy.blank("mul")
new_ruler = new_nlp.add_pipe("span_ruler")
assert len(new_ruler) == 0
assert len(new_ruler.labels) == 0
@@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns):
def test_span_ruler_validate():
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
validated_ruler = nlp.add_pipe(
"span_ruler", name="validated_span_ruler", config={"validate": True}
@@ -203,14 +203,14 @@ def test_span_ruler_validate():
def test_span_ruler_properties(patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
ruler.add_patterns(patterns)
assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))
def test_span_ruler_overlapping_spans(overlapping_patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(overlapping_patterns)
doc = ruler(nlp.make_doc("foo bar baz"))
@@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):
def test_span_ruler_scorer(overlapping_patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(overlapping_patterns)
text = "foo bar baz"
@@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process):
patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
@@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process):
def test_span_ruler_serialize_dir(patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
with make_tempdir() as d:
@@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns):
def test_span_ruler_remove_basic(person_org_patterns):
-nlp = spacy.blank("xx")
+nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler") ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_patterns) ruler.add_patterns(person_org_patterns)
doc = ruler(nlp.make_doc("Dina went to school")) doc = ruler(nlp.make_doc("Dina went to school"))
@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns):
def test_span_ruler_remove_nonexisting_pattern(person_org_patterns): def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler") ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_patterns) ruler.add_patterns(person_org_patterns)
assert len(ruler.patterns) == 3 assert len(ruler.patterns) == 3
@ -290,7 +290,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
def test_span_ruler_remove_several_patterns(person_org_patterns): def test_span_ruler_remove_several_patterns(person_org_patterns):
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler") ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_patterns) ruler.add_patterns(person_org_patterns)
doc = ruler(nlp.make_doc("Dina founded the company ACME.")) doc = ruler(nlp.make_doc("Dina founded the company ACME."))
@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):
def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns): def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler") ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_date_patterns) ruler.add_patterns(person_org_date_patterns)
doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th")) doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))
@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
def test_span_ruler_remove_all_patterns(person_org_date_patterns): def test_span_ruler_remove_all_patterns(person_org_date_patterns):
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler") ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_date_patterns) ruler.add_patterns(person_org_date_patterns)
assert len(ruler.patterns) == 4 assert len(ruler.patterns) == 4
@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):
def test_span_ruler_remove_and_add(): def test_span_ruler_remove_and_add():
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler") ruler = nlp.add_pipe("span_ruler")
patterns1 = [{"label": "DATE1", "pattern": "last time"}] patterns1 = [{"label": "DATE1", "pattern": "last time"}]
ruler.add_patterns(patterns1) ruler.add_patterns(patterns1)
@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add():
def test_span_ruler_spans_filter(overlapping_patterns): def test_span_ruler_spans_filter(overlapping_patterns):
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe( ruler = nlp.add_pipe(
"span_ruler", "span_ruler",
config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}}, config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},
@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):
def test_span_ruler_ents_default_filter(overlapping_patterns): def test_span_ruler_ents_default_filter(overlapping_patterns):
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True}) ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
ruler.add_patterns(overlapping_patterns) ruler.add_patterns(overlapping_patterns)
doc = ruler(nlp.make_doc("foo bar baz")) doc = ruler(nlp.make_doc("foo bar baz"))
@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):
def test_span_ruler_ents_overwrite_filter(overlapping_patterns): def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe( ruler = nlp.add_pipe(
"span_ruler", "span_ruler",
config={ config={
@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns):
return pass_through_filter return pass_through_filter
nlp = spacy.blank("xx") nlp = spacy.blank("mul")
ruler = nlp.add_pipe( ruler = nlp.add_pipe(
"span_ruler", "span_ruler",
config={ config={

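These tests only swap the blank-language code from `xx` to `mul`. As a quick standalone sanity check, a minimal span ruler setup on the v4 multi-language blank might look like this sketch (the label and pattern are placeholders, not taken from the tests):

```python
import spacy

# Minimal sketch: spaCy v4 uses "mul" where earlier versions used "xx" for the
# multi-language blank pipeline.
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns([{"label": "GREETING", "pattern": "hello world"}])  # placeholder pattern
doc = nlp("oh hello world bye bye")
print([(span.text, span.label_) for span in doc.spans["ruler"]])  # default spans key is "ruler"
```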
View File

@ -6,10 +6,11 @@ import spacy
from spacy.lang.de import German from spacy.lang.de import German
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
from spacy.language import Language from spacy.language import Language
from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
from spacy.util import load_config, load_config_from_str from spacy.util import load_config, load_config_from_str
from spacy.util import load_model_from_config, registry from spacy.util import load_model_from_config, registry
@ -66,6 +67,60 @@ factory = "tagger"
width = ${components.tok2vec.model.width} width = ${components.tok2vec.model.width}
""" """
distill_config_string = """
[paths]
train = null
dev = null
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
[training]
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 666
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}
[distill]
"""
pretrain_config_string = """ pretrain_config_string = """
[paths] [paths]
train = null train = null
@ -201,6 +256,14 @@ def test_create_nlp_from_config():
load_model_from_config(Config(bad_cfg), auto_fill=True) load_model_from_config(Config(bad_cfg), auto_fill=True)
def test_nlp_from_distillation_config():
"""Test that the default distillation config validates properly"""
config = Config().from_str(distill_config_string)
distill_config = load_config(DEFAULT_CONFIG_DISTILL_PATH)
filled = config.merge(distill_config)
registry.resolve(filled["distillation"], schema=ConfigSchemaDistill)
def test_create_nlp_from_pretraining_config(): def test_create_nlp_from_pretraining_config():
"""Test that the default pretraining config validates properly""" """Test that the default pretraining config validates properly"""
config = Config().from_str(pretrain_config_string) config = Config().from_str(pretrain_config_string)

View File

@ -1017,8 +1017,6 @@ def test_local_remote_storage_pull_missing():
def test_cli_find_threshold(capsys): def test_cli_find_threshold(capsys):
thresholds = numpy.linspace(0, 1, 10)
def make_examples(nlp: Language) -> List[Example]: def make_examples(nlp: Language) -> List[Example]:
docs: List[Example] = [] docs: List[Example] = []
@ -1082,8 +1080,6 @@ def test_cli_find_threshold(capsys):
scores_key="cats_macro_f", scores_key="cats_macro_f",
silent=True, silent=True,
) )
assert best_threshold != thresholds[0]
assert thresholds[0] < best_threshold < thresholds[9]
assert best_score == max(res.values()) assert best_score == max(res.values())
assert res[1.0] == 0.0 assert res[1.0] == 0.0
@ -1091,7 +1087,7 @@ def test_cli_find_threshold(capsys):
nlp, _ = init_nlp((("spancat", {}),)) nlp, _ = init_nlp((("spancat", {}),))
with make_tempdir() as nlp_dir: with make_tempdir() as nlp_dir:
nlp.to_disk(nlp_dir) nlp.to_disk(nlp_dir)
res = find_threshold( best_threshold, best_score, res = find_threshold(
model=nlp_dir, model=nlp_dir,
data_path=docs_dir / "docs.spacy", data_path=docs_dir / "docs.spacy",
pipe_name="spancat", pipe_name="spancat",
@ -1099,10 +1095,8 @@ def test_cli_find_threshold(capsys):
scores_key="spans_sc_f", scores_key="spans_sc_f",
silent=True, silent=True,
) )
assert res[0] != thresholds[0] assert best_score == max(res.values())
assert thresholds[0] < res[0] < thresholds[8] assert res[1.0] == 0.0
assert res[1] >= 0.6
assert res[2][1.0] == 0.0
# Having multiple textcat_multilabel components should work, since the name has to be specified. # Having multiple textcat_multilabel components should work, since the name has to be specified.
nlp, _ = init_nlp((("textcat_multilabel", {}),)) nlp, _ = init_nlp((("textcat_multilabel", {}),))

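The updated assertions above unpack a tuple rather than indexing into a single returned dict. A hedged sketch of calling `find_threshold` with the new return shape (the model and data paths are placeholders, and `threshold_key` is an assumed parameter name not shown in the hunk):

```python
from pathlib import Path
from spacy.cli.find_threshold import find_threshold

# Sketch only: the paths below stand in for a trained spancat pipeline and a
# DocBin of evaluation docs; threshold_key is an assumption.
best_threshold, best_score, scores = find_threshold(
    model=Path("my_pipeline"),
    data_path=Path("docs.spacy"),
    pipe_name="spancat",
    threshold_key="threshold",
    scores_key="spans_sc_f",
    silent=True,
)
print(best_threshold, best_score, scores[1.0])
```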
View File

@ -4,7 +4,7 @@ from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc from spacy.tokens import DocBin, Doc
from spacy.cli._util import app from spacy.cli._util import app
from .util import make_tempdir from .util import make_tempdir, normalize_whitespace
def test_convert_auto(): def test_convert_auto():
@ -38,8 +38,8 @@ def test_benchmark_accuracy_alias():
# Verify that the `evaluate` alias works correctly. # Verify that the `evaluate` alias works correctly.
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"]) result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"]) result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
assert result_benchmark.stdout == result_evaluate.stdout.replace( assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
"spacy evaluate", "spacy benchmark accuracy" result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
) )

View File

@ -10,8 +10,9 @@ from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.de import German from spacy.lang.de import German
from spacy.util import registry, ignore_error, raise_error, find_matching_language from spacy.util import registry, ignore_error, raise_error, find_matching_language
from spacy.util import load_model_from_config
import spacy import spacy
from thinc.api import CupyOps, NumpyOps, get_current_ops from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
from .util import add_vecs_to_vocab, assert_docs_equal from .util import add_vecs_to_vocab, assert_docs_equal
@ -25,6 +26,51 @@ try:
except ImportError: except ImportError:
pass pass
TAGGER_CFG_STRING = """
[nlp]
lang = "en"
pipeline = ["tok2vec","tagger"]
[components]
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
rows = [2000, 1000, 1000, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
"""
TAGGER_TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
TAGGER_TRAIN_DATA = [ TAGGER_TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}), ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
@ -91,6 +137,26 @@ def test_language_update(nlp):
example = Example.from_dict(doc, wrongkeyannots) example = Example.from_dict(doc, wrongkeyannots)
def test_language_update_updates():
config = Config().from_str(TAGGER_CFG_STRING)
nlp = load_model_from_config(config, auto_fill=True, validate=True)
train_examples = []
for t in TAGGER_TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.initialize(get_examples=lambda: train_examples)
docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
nlp.update(train_examples, sgd=optimizer)
docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
xp = get_array_module(docs_after_update[0].tensor)
assert xp.any(
xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
)
def test_language_evaluate(nlp): def test_language_evaluate(nlp):
text = "hello world" text = "hello world"
annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
@ -664,11 +730,12 @@ def test_spacy_blank():
("fra", "fr"), ("fra", "fr"),
("fre", "fr"), ("fre", "fr"),
("iw", "he"), ("iw", "he"),
("is", "isl"),
("mo", "ro"), ("mo", "ro"),
("mul", "xx"), ("mul", "mul"),
("no", "nb"), ("no", "nb"),
("pt-BR", "pt"), ("pt-BR", "pt"),
("xx", "xx"), ("xx", "mul"),
("zh-Hans", "zh"), ("zh-Hans", "zh"),
("zh-Hant", None), ("zh-Hant", None),
("zxx", None), ("zxx", None),
@ -689,11 +756,11 @@ def test_language_matching(lang, target):
("fra", "fr"), ("fra", "fr"),
("fre", "fr"), ("fre", "fr"),
("iw", "he"), ("iw", "he"),
("is", "isl"),
("mo", "ro"), ("mo", "ro"),
("mul", "xx"), ("xx", "mul"),
("no", "nb"), ("no", "nb"),
("pt-BR", "pt"), ("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"), ("zh-Hans", "zh"),
], ],
) )

View File

@ -36,6 +36,7 @@ LANGUAGES = [
"hu", "hu",
pytest.param("id", marks=pytest.mark.slow()), pytest.param("id", marks=pytest.mark.slow()),
pytest.param("it", marks=pytest.mark.slow()), pytest.param("it", marks=pytest.mark.slow()),
pytest.param("isl", marks=pytest.mark.slow()),
pytest.param("kn", marks=pytest.mark.slow()), pytest.param("kn", marks=pytest.mark.slow()),
pytest.param("lb", marks=pytest.mark.slow()), pytest.param("lb", marks=pytest.mark.slow()),
pytest.param("lt", marks=pytest.mark.slow()), pytest.param("lt", marks=pytest.mark.slow()),

View File

@ -1,6 +1,7 @@
import numpy import numpy
import tempfile import tempfile
import contextlib import contextlib
import re
import srsly import srsly
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
assert k1 == k2 assert k1 == k2
assert v1 == v2 assert v1 == v2
def normalize_whitespace(s):
return re.sub(r"\s+", " ", s)

View File

@ -108,6 +108,7 @@ class Doc:
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
alignment_mode: str = ..., alignment_mode: str = ...,
span_id: Union[int, str] = ...,
) -> Span: ... ) -> Span: ...
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
@property @property

View File

@ -528,9 +528,9 @@ cdef class Doc:
doc (Doc): The parent document. doc (Doc): The parent document.
start_idx (int): The index of the first character of the span. start_idx (int): The index of the first character of the span.
end_idx (int): The index of the first character after the span. end_idx (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for label (Union[int, str]): A label to attach to the Span, e.g. for
named entities. named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
named entity. named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
@ -539,6 +539,7 @@ cdef class Doc:
with token boundaries), "contract" (span of all tokens completely with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict". partially covered by the character span). Defaults to "strict".
span_id (Union[int, str]): An identifier to associate with the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/doc#char_span DOCS: https://spacy.io/api/doc#char_span

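A small usage sketch of the documented `span_id` and `alignment_mode` arguments (the text, indices and identifier are arbitrary examples):

```python
import spacy

# Sketch: Doc.char_span with the span_id and alignment_mode arguments described above.
nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn.")
span = doc.char_span(7, 15, label="GPE", span_id="city-1", alignment_mode="strict")
print(span.text, span.label_, span.id_)  # "New York" "GPE" "city-1"
```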
View File

@ -96,6 +96,9 @@ class Span:
label: Union[int, str] = ..., label: Union[int, str] = ...,
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
id: Union[int, str] = ...,
alignment_mode: str = ...,
span_id: Union[int, str] = ...,
) -> Span: ... ) -> Span: ...
@property @property
def conjuncts(self) -> Tuple[Token]: ... def conjuncts(self) -> Tuple[Token]: ...

View File

@ -382,7 +382,7 @@ cdef class Span:
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't) # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item() return result.item()
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy """Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document. `ndarray` of shape `(N, M)`, where `N` is the length of the document.
@ -656,22 +656,29 @@ cdef class Span:
else: else:
return self.doc[root] return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice `span.text[start : end]`. """Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span. start (int): The index of the first character of the span.
end (int): The index of the first character after the span. end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for label (Union[int, str]): A label to attach to the Span, e.g. for
named entities. named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
id (Union[int, str]): Unused.
alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
span_id (Union[int, str]): An identifier to associate with the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
""" """
cdef SpanC* span_c = self.span_c() cdef SpanC* span_c = self.span_c()
start_idx += span_c.start_char start_idx += span_c.start_char
end_idx += span_c.start_char end_idx += span_c.start_char
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
@property @property
def conjuncts(self): def conjuncts(self):

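A short sketch of the forwarded arguments at the span level; note that `start_idx` and `end_idx` are relative to the span's own text, since the implementation above offsets them by `span_c.start_char`:

```python
import spacy

# Sketch: Span.char_span now forwards alignment_mode and span_id to Doc.char_span;
# character offsets are relative to the span, not the whole doc.
nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn.")
window = doc[2:6]  # "New York in Autumn"
city = window.char_span(0, 8, label="GPE", alignment_mode="expand", span_id="city-1")
print(city.text, city.label_, city.id_)  # "New York" "GPE" "city-1"
```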
View File

@ -86,7 +86,7 @@ def conll_ner_to_docs(
if model: if model:
nlp = load_model(model) nlp = load_model(model)
else: else:
nlp = get_lang_class("xx")() nlp = get_lang_class("mul")()
for conll_doc in input_data.strip().split(doc_delimiter): for conll_doc in input_data.strip().split(doc_delimiter):
conll_doc = conll_doc.strip() conll_doc = conll_doc.strip()
if not conll_doc: if not conll_doc:
@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
"Segmenting sentences with sentencizer. (Use `-b model` for " "Segmenting sentences with sentencizer. (Use `-b model` for "
"improved parser-based sentence segmentation.)" "improved parser-based sentence segmentation.)"
) )
nlp = get_lang_class("xx")() nlp = get_lang_class("mul")()
sentencizer = nlp.create_pipe("sentencizer") sentencizer = nlp.create_pipe("sentencizer")
lines = doc.strip().split("\n") lines = doc.strip().split("\n")
words = [line.strip().split()[0] for line in lines] words = [line.strip().split()[0] for line in lines]

View File

@ -3,7 +3,7 @@ from ..gold_io import json_iterate, json_to_annotations
from ..example import annotations_to_doc from ..example import annotations_to_doc
from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ..example import _fix_legacy_dict_data, _parse_example_dict_data
from ...util import load_model from ...util import load_model
from ...lang.xx import MultiLanguage from ...lang.mul import MultiLanguage
def json_to_docs(input_data, model=None, **kwargs): def json_to_docs(input_data, model=None, **kwargs):

View File

@ -210,7 +210,7 @@ def train_while_improving(
subbatch, subbatch,
drop=dropout, drop=dropout,
losses=losses, losses=losses,
sgd=False, # type: ignore[arg-type] sgd=None,
exclude=exclude, exclude=exclude,
annotates=annotating_components, annotates=annotating_components,
) )

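The `sgd=False` workaround becomes a plain `sgd=None`, which in v4 appears to mean "compute gradients but take no optimizer step"; the loop then applies the optimizer once per trainable component. A self-contained sketch of that accumulate-then-apply pattern (the toy textcat data is a placeholder):

```python
import spacy
from spacy.training import Example

# Sketch of the semantics the hunk relies on: sgd=None accumulates gradients,
# and the optimizer is applied afterwards via each component's finish_update().
nlp = spacy.blank("en")
nlp.add_pipe("textcat")
train_data = [
    ("very good", {"cats": {"POS": 1.0, "NEG": 0.0}}),
    ("very bad", {"cats": {"POS": 0.0, "NEG": 1.0}}),
]
examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in train_data]
optimizer = nlp.initialize(get_examples=lambda: examples)

losses = {}
nlp.update(examples, losses=losses, sgd=None)   # backprop only, no weight update yet
for name, proc in nlp.pipeline:
    if hasattr(proc, "finish_update"):
        proc.finish_update(optimizer)           # apply the deferred optimizer step
print(losses)
```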
View File

@ -283,7 +283,7 @@ def find_matching_language(lang: str) -> Optional[str]:
import spacy.lang # noqa: F401 import spacy.lang # noqa: F401
if lang == "xx": if lang == "xx":
return "xx" return "mul"
# Find out which language modules we have # Find out which language modules we have
possible_languages = [] possible_languages = []
@ -301,11 +301,7 @@ def find_matching_language(lang: str) -> Optional[str]:
# is labeled that way is probably trying to be distinct from 'zh' and # is labeled that way is probably trying to be distinct from 'zh' and
# shouldn't automatically match. # shouldn't automatically match.
match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
if match == "mul": return match
# Convert 'mul' back to spaCy's 'xx'
return "xx"
else:
return match
def get_lang_class(lang: str) -> Type["Language"]: def get_lang_class(lang: str) -> Type["Language"]:

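With the special case removed, `mul` is returned directly instead of being converted back to `xx`. A quick illustration of the new behaviour:

```python
from spacy.util import find_matching_language

# Sketch: in v4 the multi-language code is reported as "mul"; "xx" is still
# accepted as input but no longer returned.
print(find_matching_language("xx"))   # "mul"
print(find_matching_language("mul"))  # "mul"
print(find_matching_language("en"))   # "en"
```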
View File

@ -1410,12 +1410,13 @@ $ python -m spacy project assets [project_dir]
> $ python -m spacy project assets [--sparse] > $ python -m spacy project assets [--sparse]
> ``` > ```
| Name | Description | | Name | Description |
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ | | `--extra`, `-e` <Tag variant="new">3.3.1</Tag> | Download assets marked as "extra". Default false. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
### project run {id="project-run",tag="command"} ### project run {id="project-run",tag="command"}

View File

@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | | `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
@ -209,15 +209,16 @@ alignment mode `"strict".
> assert span.text == "New York" > assert span.text == "New York"
> ``` > ```
| Name | Description | | Name | Description |
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ | | `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the first character after the span. ~~int~~ | | `end` | The index of the first character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | | `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Doc.set_ents {id="set_ents",tag="method",version="3"} ## Doc.set_ents {id="set_ents",tag="method",version="3"}

View File

@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | | `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |

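The `overwrite` default flips from `True` to `False` here, so existing entity annotations are kept by default. A minimal sketch of opting back into overwriting (component config only; KB setup is omitted):

```python
import spacy

# Sketch: restore the pre-v4 behaviour of overwriting existing doc.ents.
nlp = spacy.blank("en")
nlp.add_pipe("entity_linker", config={"overwrite": True})
```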
View File

@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
| Setting | Description | | Setting | Description |
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | | `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ |
| `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | | `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |

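The morphologizer's `overwrite` default changes the same way. A minimal sketch of the two config knobs involved (`overwrite` controls replacing existing values, `extend` controls keeping feature types that are already set):

```python
import spacy

# Sketch: set overwrite=True to reproduce the pre-v4 default of replacing
# existing morph features; extend=False is already the default.
nlp = spacy.blank("en")
nlp.add_pipe("morphologizer", config={"overwrite": True, "extend": False})
```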
View File

@ -30,7 +30,7 @@ Create a new `Scorer`.
| Name | Description | | Name | Description |
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ | | `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ |
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | | `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~ |
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ | | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | | `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |

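A brief sketch of constructing a standalone `Scorer` with the renamed default language (the pipeline components here are placeholders):

```python
from spacy.scorer import Scorer

# Sketch: without an nlp object, the Scorer builds a default pipeline on the
# "mul" blank in v4 (previously "xx").
scorer = Scorer(default_lang="mul", default_pipeline=("senter", "tagger"))
```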
View File

@ -186,14 +186,17 @@ the character indices don't map to a valid span.
> assert span.text == "New York" > assert span.text == "New York"
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ----------------------------------------------------------------------------------------- | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ | | `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the first character after the span. ~~int~~ | | `end` | The index of the first character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | | `id` | Unused. ~~Union[int, str]~~ |
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Span.similarity {id="similarity",tag="method",model="vectors"} ## Span.similarity {id="similarity",tag="method",model="vectors"}

View File

@ -21,8 +21,8 @@ menu:
## Package naming conventions {id="conventions"} ## Package naming conventions {id="conventions"}
In general, spaCy expects all pipeline packages to follow the naming convention In general, spaCy expects all pipeline packages to follow the naming convention
of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
into three components: three components:
1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
tagging, parsing, lemmatization and named entity recognition, or `dep` for tagging, parsing, lemmatization and named entity recognition, or `dep` for

View File

@ -74,23 +74,23 @@ your data.
> ```python > ```python
> # Standard import > # Standard import
> from spacy.lang.xx import MultiLanguage > from spacy.lang.mul import MultiLanguage
> nlp = MultiLanguage() > nlp = MultiLanguage()
> >
> # With lazy-loading > # With lazy-loading
> nlp = spacy.blank("xx") > nlp = spacy.blank("mul")
> ``` > ```
spaCy also supports pipelines trained on more than one language. This is spaCy also supports pipelines trained on more than one language. This is
especially useful for named entity recognition. The language ID used for especially useful for named entity recognition. The language ID used for
multi-language or language-neutral pipelines is `xx`. The language class, a multi-language or language-neutral pipelines is `mul`. The language class, a
generic subclass containing only the base language data, can be found in generic subclass containing only the base language data, can be found in
[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx). [`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul).
To train a pipeline using the neutral multi-language class, you can set To train a pipeline using the neutral multi-language class, you can set
`lang = "xx"` in your [training config](/usage/training#config). You can also `lang = "mul"` in your [training config](/usage/training#config). You can also
\import the `MultiLanguage` class directly, or call \import the `MultiLanguage` class directly, or call
[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading. [`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading.
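For the training-config route just mentioned, the same language setting can also be exercised programmatically; a minimal sketch, with the empty pipeline as a placeholder:

```python
from spacy.util import load_model_from_config
from thinc.api import Config

# Sketch: a pared-down config using the neutral multi-language class in v4.
cfg_str = """
[nlp]
lang = "mul"
pipeline = []
"""
nlp = load_model_from_config(Config().from_str(cfg_str), auto_fill=True)
print(nlp.lang)  # "mul"
```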
### Chinese language support {id="chinese",version="2.3"} ### Chinese language support {id="chinese",version="2.3"}

View File

@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a
`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0` `textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
as explained in the [docs](/api/textcategorizer#assigned-attributes). as explained in the [docs](/api/textcategorizer#assigned-attributes).
### Using the default knowledge base
As `KnowledgeBase` is now an abstract class, you should call the constructor of
the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
implementation:
```diff
- kb = KnowledgeBase()
+ kb = InMemoryLookupKB()
```
If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
instead.
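A rough sketch of using the default implementation directly (the vocab, entity, alias and vector values are placeholders):

```python
from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

# Sketch: instantiate InMemoryLookupKB where code previously constructed
# KnowledgeBase directly; the entries below are placeholders.
kb = InMemoryLookupKB(Vocab(), entity_vector_length=3)
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0.0, 3.0, 5.0])
kb.add_alias(alias="Douglas", entities=["Q1004791"], probabilities=[1.0])
print(kb.get_size_entities(), kb.get_size_aliases())
```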
### Updated scorers for tokenization and textcat {id="scores"} ### Updated scorers for tokenization and textcat {id="scores"}
We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported

View File

@ -165,7 +165,7 @@
"has_examples": true "has_examples": true
}, },
{ {
"code": "is", "code": "isl",
"name": "Icelandic" "name": "Icelandic"
}, },
{ {
@ -434,9 +434,9 @@
] ]
}, },
{ {
"code": "xx", "code": "mul",
"name": "Multi-language", "name": "Multi-language",
"models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"], "models": ["mul_ent_wiki_sm", "mul_sent_ud_sm"],
"example": "This is a sentence about Facebook." "example": "This is a sentence about Facebook."
}, },
{ {

View File

@ -103,7 +103,7 @@ const QuickstartInstall = ({ id, title, description, children }) => {
</QS> </QS>
<QS config="example" prompt="python"> <QS config="example" prompt="python">
print([ print([
{code === 'xx' {code === 'mul'
? '(ent.text, ent.label) for ent in doc.ents' ? '(ent.text, ent.label) for ent in doc.ents'
: '(w.text, w.pos_) for w in doc'} : '(w.text, w.pos_) for w in doc'}
]) ])