Merge pull request #12767 from adrianeboyd/backport/v3.5.4-1

Backports for v3.5.4
This commit is contained in:
Adriane Boyd 2023-06-28 10:49:15 +02:00 committed by GitHub
commit 7a2833bf22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 206 additions and 45 deletions

View File

@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.10.0
pathy>=0.10.0 pathy>=0.10.0
smart-open>=5.2.1,<7.0.0 smart-open>=5.2.1,<7.0.0
# Third party dependencies # Third party dependencies

View File

@@ -52,7 +52,7 @@ install_requires =
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
# Third-party dependencies # Third-party dependencies
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.10.0
pathy>=0.10.0 pathy>=0.10.0
smart-open>=5.2.1,<7.0.0 smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0

View File

@@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.5.3" __version__ = "3.5.4"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects" __projects__ = "https://github.com/explosion/projects"

View File

@@ -716,6 +716,11 @@ class Language:
) )
) )
pipe = source.get_pipe(source_name) pipe = source.get_pipe(source_name)
# There is no actual solution here. Either the component has the right
# name for the source pipeline or the component has the right name for
# the current pipeline. This prioritizes the current pipeline.
if hasattr(pipe, "name"):
pipe.name = name
# Make sure the source config is interpolated so we don't end up with # Make sure the source config is interpolated so we don't end up with
# orphaned variables in our final config # orphaned variables in our final config
source_config = source.config.interpolate() source_config = source.config.interpolate()
@@ -793,6 +798,7 @@ class Language:
pipe_index = self._get_pipe_index(before, after, first, last) pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name) self._pipe_meta[name] = self.get_factory_meta(factory_name)
self._components.insert(pipe_index, (name, pipe_component)) self._components.insert(pipe_index, (name, pipe_component))
self._link_components()
return pipe_component return pipe_component
def _get_pipe_index( def _get_pipe_index(
@@ -928,6 +934,7 @@ class Language:
if old_name in self._config["initialize"]["components"]: if old_name in self._config["initialize"]["components"]:
init_cfg = self._config["initialize"]["components"].pop(old_name) init_cfg = self._config["initialize"]["components"].pop(old_name)
self._config["initialize"]["components"][new_name] = init_cfg self._config["initialize"]["components"][new_name] = init_cfg
self._link_components()
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]: def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
"""Remove a component from the pipeline. """Remove a component from the pipeline.
@@ -951,6 +958,7 @@ class Language:
# Make sure the name is also removed from the set of disabled components # Make sure the name is also removed from the set of disabled components
if name in self.disabled: if name in self.disabled:
self._disabled.remove(name) self._disabled.remove(name)
self._link_components()
return removed return removed
def disable_pipe(self, name: str) -> None: def disable_pipe(self, name: str) -> None:
@@ -1673,8 +1681,16 @@ class Language:
# The problem is we need to do it during deserialization...And the # The problem is we need to do it during deserialization...And the
# components don't receive the pipeline then. So this does have to be # components don't receive the pipeline then. So this does have to be
# here :( # here :(
# First, fix up all the internal component names in case they have
# gotten out of sync due to sourcing components from different
# pipelines, since find_listeners uses proc2.name for the listener
# map.
for name, proc in self.pipeline:
if hasattr(proc, "name"):
proc.name = name
for i, (name1, proc1) in enumerate(self.pipeline): for i, (name1, proc1) in enumerate(self.pipeline):
if isinstance(proc1, ty.ListenedToComponent): if isinstance(proc1, ty.ListenedToComponent):
proc1.listener_map = {}
for name2, proc2 in self.pipeline[i + 1 :]: for name2, proc2 in self.pipeline[i + 1 :]:
proc1.find_listeners(proc2) proc1.find_listeners(proc2)
@@ -1808,6 +1824,7 @@ class Language:
raw_config=raw_config, raw_config=raw_config,
) )
else: else:
assert "source" in pipe_cfg
# We need the sourced components to reference the same # We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND** # vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform # we still want to load the source model vectors to perform
@@ -1827,6 +1844,10 @@ class Language:
source_name = pipe_cfg.get("component", pipe_name) source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False listeners_replaced = False
if "replace_listeners" in pipe_cfg: if "replace_listeners" in pipe_cfg:
# Make sure that the listened-to component has the
# state of the source pipeline listener map so that the
# replace_listeners method below works as intended.
source_nlps[model]._link_components()
for name, proc in source_nlps[model].pipeline: for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []): if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners( source_nlps[model].replace_listeners(
@@ -1838,6 +1859,8 @@ class Language:
nlp.add_pipe( nlp.add_pipe(
source_name, source=source_nlps[model], name=pipe_name source_name, source=source_nlps[model], name=pipe_name
) )
# At this point after nlp.add_pipe, the listener map
# corresponds to the new pipeline.
if model not in source_nlp_vectors_hashes: if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash( source_nlp_vectors_hashes[model] = hash(
source_nlps[model].vocab.vectors.to_bytes( source_nlps[model].vocab.vectors.to_bytes(
@@ -1892,27 +1915,6 @@ class Language:
raise ValueError( raise ValueError(
Errors.E942.format(name="pipeline_creation", value=type(nlp)) Errors.E942.format(name="pipeline_creation", value=type(nlp))
) )
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
if isinstance(proc, ty.ListenedToComponent):
# Remove listeners not in the pipeline
listener_names = proc.listening_components
unused_listener_names = [
ll for ll in listener_names if ll not in nlp.pipe_names
]
for listener_name in unused_listener_names:
for listener in proc.listener_map.get(listener_name, []):
proc.remove_listener(listener, listener_name)
for listener_name in proc.listening_components:
# e.g. tok2vec/transformer
# If it's a component sourced from another pipeline, we check if
# the tok2vec listeners should be replaced with standalone tok2vec
# models (e.g. so component can be frozen without its performance
# degrading when other components/tok2vec are updated)
paths = sourced.get(listener_name, {}).get("replace_listeners", [])
if paths:
nlp.replace_listeners(name, listener_name, paths)
return nlp return nlp
def replace_listeners( def replace_listeners(

View File

@@ -188,8 +188,7 @@ def test_tok2vec_listener(with_vectors):
for tag in t[1]["tags"]: for tag in t[1]["tags"]:
tagger.add_label(tag) tagger.add_label(tag)
# Check that the Tok2Vec component finds it listeners # Check that the Tok2Vec component finds its listeners
assert tok2vec.listeners == []
optimizer = nlp.initialize(lambda: train_examples) optimizer = nlp.initialize(lambda: train_examples)
assert tok2vec.listeners == [tagger_tok2vec] assert tok2vec.listeners == [tagger_tok2vec]
@@ -217,7 +216,6 @@ def test_tok2vec_listener_callback():
assert nlp.pipe_names == ["tok2vec", "tagger"] assert nlp.pipe_names == ["tok2vec", "tagger"]
tagger = nlp.get_pipe("tagger") tagger = nlp.get_pipe("tagger")
tok2vec = nlp.get_pipe("tok2vec") tok2vec = nlp.get_pipe("tok2vec")
nlp._link_components()
docs = [nlp.make_doc("A random sentence")] docs = [nlp.make_doc("A random sentence")]
tok2vec.model.initialize(X=docs) tok2vec.model.initialize(X=docs)
gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs] gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
@@ -426,29 +424,46 @@ def test_replace_listeners_from_config():
nlp.to_disk(dir_path) nlp.to_disk(dir_path)
base_model = str(dir_path) base_model = str(dir_path)
new_config = { new_config = {
"nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, "nlp": {
"lang": "en",
"pipeline": ["tok2vec", "tagger2", "ner3", "tagger4"],
},
"components": { "components": {
"tok2vec": {"source": base_model}, "tok2vec": {"source": base_model},
"tagger": { "tagger2": {
"source": base_model, "source": base_model,
"component": "tagger",
"replace_listeners": ["model.tok2vec"], "replace_listeners": ["model.tok2vec"],
}, },
"ner": {"source": base_model}, "ner3": {
"source": base_model,
"component": "ner",
},
"tagger4": {
"source": base_model,
"component": "tagger",
},
}, },
} }
new_nlp = util.load_model_from_config(new_config, auto_fill=True) new_nlp = util.load_model_from_config(new_config, auto_fill=True)
new_nlp.initialize(lambda: examples) new_nlp.initialize(lambda: examples)
tok2vec = new_nlp.get_pipe("tok2vec") tok2vec = new_nlp.get_pipe("tok2vec")
tagger = new_nlp.get_pipe("tagger") tagger = new_nlp.get_pipe("tagger2")
ner = new_nlp.get_pipe("ner") ner = new_nlp.get_pipe("ner3")
assert tok2vec.listening_components == ["ner"] assert "ner" not in new_nlp.pipe_names
assert "tagger" not in new_nlp.pipe_names
assert tok2vec.listening_components == ["ner3", "tagger4"]
assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk()) assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk()) assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"] t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2" assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg assert new_nlp.config["components"]["tagger2"]["model"]["tok2vec"] == t2v_cfg
assert ( assert (
new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"] new_nlp.config["components"]["ner3"]["model"]["tok2vec"]["@architectures"]
== "spacy.Tok2VecListener.v1"
)
assert (
new_nlp.config["components"]["tagger4"]["model"]["tok2vec"]["@architectures"]
== "spacy.Tok2VecListener.v1" == "spacy.Tok2VecListener.v1"
) )
@@ -540,3 +555,57 @@ def test_tok2vec_listeners_textcat():
assert cats1["imperative"] < 0.9 assert cats1["imperative"] < 0.9
assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
def test_tok2vec_listener_source_link_name():
"""The component's internal name and the tok2vec listener map correspond
to the most recently modified pipeline.
"""
orig_config = Config().from_str(cfg_string_multi)
nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
nlp2 = English()
nlp2.add_pipe("tok2vec", source=nlp1)
nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
# there is no way to have the component have the right name for both
# pipelines, right now the most recently modified pipeline is prioritized
assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
# there is no way to have the tok2vec have the right listener map for both
# pipelines, right now the most recently modified pipeline is prioritized
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
nlp2.add_pipe("ner", name="ner3", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
nlp2.remove_pipe("ner3")
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
nlp2.remove_pipe("tagger2")
assert nlp2.get_pipe("tok2vec").listening_components == []
# at this point the tok2vec component corresponds to nlp2
assert nlp1.get_pipe("tok2vec").listening_components == []
# modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
nlp1.add_pipe("sentencizer")
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
# modifying nlp2 syncs it back to nlp2
nlp2.add_pipe("sentencizer")
assert nlp1.get_pipe("tok2vec").listening_components == []
def test_tok2vec_listener_source_replace_listeners():
orig_config = Config().from_str(cfg_string_multi)
nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
nlp2 = English()
nlp2.add_pipe("tok2vec", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == []
nlp2.add_pipe("tagger", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == []
nlp2.add_pipe("ner", name="ner2", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]

View File

@@ -72,7 +72,7 @@ def entity_linker():
def create_kb(vocab): def create_kb(vocab):
kb = InMemoryLookupKB(vocab, entity_vector_length=1) kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) kb.add_entity("test", 0.0, zeros((1,), dtype="f"))
return kb return kb
entity_linker = nlp.add_pipe("entity_linker") entity_linker = nlp.add_pipe("entity_linker")

View File

@@ -10,6 +10,7 @@ from spacy.language import Language
from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
from spacy.training import Example
from spacy.util import load_config, load_config_from_str from spacy.util import load_config, load_config_from_str
from spacy.util import load_model_from_config, registry from spacy.util import load_model_from_config, registry
@@ -415,6 +416,55 @@ def test_config_overrides():
assert nlp.pipe_names == ["tok2vec", "tagger"] assert nlp.pipe_names == ["tok2vec", "tagger"]
@pytest.mark.filterwarnings("ignore:\\[W036")
def test_config_overrides_registered_functions():
nlp = spacy.blank("en")
nlp.add_pipe("attribute_ruler")
with make_tempdir() as d:
nlp.to_disk(d)
nlp_re1 = spacy.load(
d,
config={
"components": {
"attribute_ruler": {
"scorer": {"@scorers": "spacy.tagger_scorer.v1"}
}
}
},
)
assert (
nlp_re1.config["components"]["attribute_ruler"]["scorer"]["@scorers"]
== "spacy.tagger_scorer.v1"
)
@registry.misc("test_some_other_key")
def misc_some_other_key():
return "some_other_key"
nlp_re2 = spacy.load(
d,
config={
"components": {
"attribute_ruler": {
"scorer": {
"@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
"spans_key": {"@misc": "test_some_other_key"},
}
}
}
},
)
assert nlp_re2.config["components"]["attribute_ruler"]["scorer"][
"spans_key"
] == {"@misc": "test_some_other_key"}
# run dummy evaluation (will return None scores) in order to test that
# the spans_key value in the nested override is working as intended in
# the config
example = Example.from_dict(nlp_re2.make_doc("a b c"), {})
scores = nlp_re2.evaluate([example])
assert "spans_some_other_key_f" in scores
def test_config_interpolation(): def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False) config = Config().from_str(nlp_config_string, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}" assert config["corpora"]["train"]["path"] == "${paths.train}"

View File

@@ -237,6 +237,10 @@ def test_minor_version(a1, a2, b1, b2, is_match):
{"training.batch_size": 128, "training.optimizer.learn_rate": 0.01}, {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
{"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}}, {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
), ),
(
{"attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"},
{"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
),
], ],
) )
def test_dot_to_dict(dot_notation, expected): def test_dot_to_dict(dot_notation, expected):
@@ -245,6 +249,29 @@ def test_dot_to_dict(dot_notation, expected):
assert util.dict_to_dot(result) == dot_notation assert util.dict_to_dot(result) == dot_notation
@pytest.mark.parametrize(
"dot_notation,expected",
[
(
{"token.pos": True, "token._.xyz": True},
{"token": {"pos": True, "_": {"xyz": True}}},
),
(
{"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
{"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
),
(
{"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
{"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
),
],
)
def test_dot_to_dict_overrides(dot_notation, expected):
result = util.dot_to_dict(dot_notation)
assert result == expected
assert util.dict_to_dot(result, for_overrides=True) == dot_notation
def test_set_dot_to_object(): def test_set_dot_to_object():
config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}} config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}}
with pytest.raises(KeyError): with pytest.raises(KeyError):

View File

@@ -67,7 +67,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
with nlp.select_pipes(enable=resume_components): with nlp.select_pipes(enable=resume_components):
logger.info("Resuming training for: %s", resume_components) logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer) nlp.resume_training(sgd=optimizer)
# Make sure that listeners are defined before initializing further # Make sure that internal component names are synced and listeners are
# defined before initializing further
nlp._link_components() nlp._link_components()
with nlp.select_pipes(disable=[*frozen_components, *resume_components]): with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
if T["max_epochs"] == -1: if T["max_epochs"] == -1:

View File

@@ -511,7 +511,7 @@ def load_model_from_path(
if not meta: if not meta:
meta = get_model_meta(model_path) meta = get_model_meta(model_path)
config_path = model_path / "config.cfg" config_path = model_path / "config.cfg"
overrides = dict_to_dot(config) overrides = dict_to_dot(config, for_overrides=True)
config = load_config(config_path, overrides=overrides) config = load_config(config_path, overrides=overrides)
nlp = load_model_from_config( nlp = load_model_from_config(
config, config,
@@ -1479,14 +1479,19 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
return result return result
def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]: def dict_to_dot(obj: Dict[str, dict], *, for_overrides: bool = False) -> Dict[str, Any]:
"""Convert dot notation to a dict. For example: {"token": {"pos": True, """Convert dot notation to a dict. For example: {"token": {"pos": True,
"_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}. "_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.
values (Dict[str, dict]): The dict to convert. obj (Dict[str, dict]): The dict to convert.
for_overrides (bool): Whether to enable special handling for registered
functions in overrides.
RETURNS (Dict[str, Any]): The key/value pairs. RETURNS (Dict[str, Any]): The key/value pairs.
""" """
return {".".join(key): value for key, value in walk_dict(obj)} return {
".".join(key): value
for key, value in walk_dict(obj, for_overrides=for_overrides)
}
def dot_to_object(config: Config, section: str): def dot_to_object(config: Config, section: str):
@@ -1528,13 +1533,20 @@ def set_dot_to_object(config: Config, section: str, value: Any) -> None:
def walk_dict( def walk_dict(
node: Dict[str, Any], parent: List[str] = [] node: Dict[str, Any], parent: List[str] = [], *, for_overrides: bool = False
) -> Iterator[Tuple[List[str], Any]]: ) -> Iterator[Tuple[List[str], Any]]:
"""Walk a dict and yield the path and values of the leaves.""" """Walk a dict and yield the path and values of the leaves.
for_overrides (bool): Whether to treat registered functions that start with
@ as final values rather than dicts to traverse.
"""
for key, value in node.items(): for key, value in node.items():
key_parent = [*parent, key] key_parent = [*parent, key]
if isinstance(value, dict): if isinstance(value, dict) and (
yield from walk_dict(value, key_parent) not for_overrides
or not any(value_key.startswith("@") for value_key in value)
):
yield from walk_dict(value, key_parent, for_overrides=for_overrides)
else: else:
yield (key_parent, value) yield (key_parent, value)