Merge branch 'master' into feat/add-pipe-instance

svlandeg 2023-07-05 10:35:25 +02:00
commit 8a79a71190
23 changed files with 360 additions and 158 deletions

View File: spacy/cli/init_pipeline.py

@@ -32,6 +32,7 @@ def init_vectors_cli(
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
# fmt: on
):
"""Convert word vectors for use with spaCy. Will export an nlp object that
@@ -50,6 +51,7 @@ def init_vectors_cli(
prune=prune,
name=name,
mode=mode,
attr=attr,
)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)

View File: spacy/errors.py

@@ -216,7 +216,10 @@ class Warnings(metaclass=ErrorsWithCodes):
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
W125 = (
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
"key attribute for vectors, configure it through Vectors(attr=) or "
"'spacy init vectors --attr'")
W126 = (
"Pipe instance '{name}' is being added with a vocab "
"instance that will not match other components. This is "
"usually an error."

View File: spacy/language.py

@@ -742,6 +742,11 @@ class Language:
)
)
pipe = source.get_pipe(source_name)
# There is no actual solution here. Either the component has the right
# name for the source pipeline or the component has the right name for
# the current pipeline. This prioritizes the current pipeline.
if hasattr(pipe, "name"):
pipe.name = name
# Make sure the source config is interpolated so we don't end up with
# orphaned variables in our final config
source_config = source.config.interpolate()
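
A minimal sketch of the trade-off described in the comment above, assuming two blank English pipelines and a plain sentencizer as a stand-in component: the sourced instance is shared, so renaming it for the current pipeline is also visible from the source pipeline.

```python
from spacy.lang.en import English

nlp1 = English()
nlp1.add_pipe("sentencizer")

nlp2 = English()
# Source the component under a new name; add_pipe renames the shared instance.
nlp2.add_pipe("sentencizer", source=nlp1, name="sentencizer2")

# Both pipelines still look the component up under their own registered names,
# but its internal name now matches the most recently modified pipeline.
assert nlp1.get_pipe("sentencizer") is nlp2.get_pipe("sentencizer2")
assert nlp1.get_pipe("sentencizer").name == "sentencizer2"
```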
@@ -822,6 +827,7 @@ class Language:
self._pipe_meta[name] = self.get_factory_meta(factory_name)
pipe_index = self._get_pipe_index(before, after, first, last)
self._components.insert(pipe_index, (name, pipe_component))
self._link_components()
return pipe_component
def add_pipe_instance(
@@ -1006,6 +1012,7 @@ class Language:
if old_name in self._config["initialize"]["components"]:
init_cfg = self._config["initialize"]["components"].pop(old_name)
self._config["initialize"]["components"][new_name] = init_cfg
self._link_components()
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
"""Remove a component from the pipeline.
@@ -1029,6 +1036,7 @@ class Language:
# Make sure the name is also removed from the set of disabled components
if name in self.disabled:
self._disabled.remove(name)
self._link_components()
return removed
def disable_pipe(self, name: str) -> None:
@@ -1757,8 +1765,16 @@ class Language:
# The problem is we need to do it during deserialization...And the
# components don't receive the pipeline then. So this does have to be
# here :(
# First, fix up all the internal component names in case they have
# gotten out of sync due to sourcing components from different
# pipelines, since find_listeners uses proc2.name for the listener
# map.
for name, proc in self.pipeline:
if hasattr(proc, "name"):
proc.name = name
for i, (name1, proc1) in enumerate(self.pipeline):
if isinstance(proc1, ty.ListenedToComponent):
proc1.listener_map = {}
for name2, proc2 in self.pipeline[i + 1 :]:
proc1.find_listeners(proc2)
@@ -1913,6 +1929,7 @@ class Language:
raw_config=raw_config,
)
else:
assert "source" in pipe_cfg
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
@@ -1932,6 +1949,10 @@ class Language:
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
# Make sure that the listened-to component has the
# state of the source pipeline listener map so that the
# replace_listeners method below works as intended.
source_nlps[model]._link_components()
for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners(
@@ -1943,6 +1964,8 @@ class Language:
nlp.add_pipe(
source_name, source=source_nlps[model], name=pipe_name
)
# At this point, after nlp.add_pipe, the listener map
# corresponds to the new pipeline.
if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash(
source_nlps[model].vocab.vectors.to_bytes(
@@ -1997,27 +2020,6 @@ class Language:
raise ValueError(
Errors.E942.format(name="pipeline_creation", value=type(nlp))
)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
if isinstance(proc, ty.ListenedToComponent):
# Remove listeners not in the pipeline
listener_names = proc.listening_components
unused_listener_names = [
ll for ll in listener_names if ll not in nlp.pipe_names
]
for listener_name in unused_listener_names:
for listener in proc.listener_map.get(listener_name, []):
proc.remove_listener(listener, listener_name)
for listener_name in proc.listening_components:
# e.g. tok2vec/transformer
# If it's a component sourced from another pipeline, we check if
# the tok2vec listeners should be replaced with standalone tok2vec
# models (e.g. so component can be frozen without its performance
# degrading when other components/tok2vec are updated)
paths = sourced.get(listener_name, {}).get("replace_listeners", [])
if paths:
nlp.replace_listeners(name, listener_name, paths)
return nlp
def replace_listeners(

View File: spacy/ml/staticvectors.py

@@ -1,3 +1,4 @@
import warnings
from typing import Callable, List, Optional, Sequence, Tuple, cast
from thinc.api import Model, Ops, registry
@@ -5,7 +6,8 @@ from thinc.initializers import glorot_uniform_init
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
from thinc.util import partial
from ..errors import Errors
from ..attrs import ORTH
from ..errors import Errors, Warnings
from ..tokens import Doc
from ..vectors import Mode
from ..vocab import Vocab
@@ -24,6 +26,8 @@ def StaticVectors(
linear projection to control the dimensionality. If a dropout rate is
specified, the dropout is applied per dimension over the whole batch.
"""
if key_attr != "ORTH":
warnings.warn(Warnings.W125, DeprecationWarning)
return Model(
"static_vectors",
forward,
@@ -40,9 +44,9 @@ def forward(
token_count = sum(len(doc) for doc in docs)
if not token_count:
return _handle_empty(model.ops, model.get_dim("nO"))
key_attr: int = model.attrs["key_attr"]
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
vocab: Vocab = docs[0].vocab
key_attr: int = getattr(vocab.vectors, "attr", ORTH)
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
if vocab.vectors.mode == Mode.default:
V = model.ops.asarray(vocab.vectors.data)
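
A short sketch of the new lookup path with toy vectors: `forward` now takes the key attribute from the vocab's vectors (falling back to `ORTH`) instead of reading `model.attrs["key_attr"]`.

```python
import numpy
from spacy.attrs import ORTH
from spacy.lang.en import English
from spacy.vectors import Vectors

nlp = English()
nlp.vocab.vectors = Vectors(
    data=numpy.zeros((3, 4), dtype="f"), keys=["a", "b", "c"], attr="LOWER"
)
doc = nlp.make_doc("A b C")
# Mirrors the lookup in forward(): the vectors table decides the key attribute.
key_attr = getattr(nlp.vocab.vectors, "attr", ORTH)
keys = doc.to_array(key_attr)  # LOWER hashes, so "A" and "C" map to "a" and "c"
```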

View File: spacy/pipeline/span_finder.py

@@ -53,9 +53,9 @@ DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"]
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
},
default_score_weights={
f"span_finder_{DEFAULT_SPANS_KEY}_f": 1.0,
f"span_finder_{DEFAULT_SPANS_KEY}_p": 0.0,
f"span_finder_{DEFAULT_SPANS_KEY}_r": 0.0,
f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
},
)
def make_span_finder(
@@ -104,7 +104,7 @@ def make_span_finder_scorer():
def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs)
attr_prefix = "span_finder_"
attr_prefix = "spans_"
key = kwargs["spans_key"]
kwargs.setdefault("attr", f"{attr_prefix}{key}")
kwargs.setdefault(

View File: spacy/tests/pipeline/test_span_finder.py

@@ -230,10 +230,10 @@ def test_overfitting_IO():
# Test scoring
scores = nlp.evaluate(train_examples)
assert f"span_finder_{SPANS_KEY}_f" in scores
assert f"spans_{SPANS_KEY}_f" in scores
# It's not perfect 1.0 F1 because it's designed to overgenerate for now.
assert scores[f"span_finder_{SPANS_KEY}_p"] == 0.75
assert scores[f"span_finder_{SPANS_KEY}_r"] == 1.0
assert scores[f"spans_{SPANS_KEY}_p"] == 0.75
assert scores[f"spans_{SPANS_KEY}_r"] == 1.0
# also test that the spancat works for just a single entity in a sentence
doc = nlp("London")

View File: spacy/tests/pipeline/test_tok2vec.py

@@ -192,8 +192,7 @@ def test_tok2vec_listener(with_vectors):
for tag in t[1]["tags"]:
tagger.add_label(tag)
# Check that the Tok2Vec component finds it listeners
assert tok2vec.listeners == []
# Check that the Tok2Vec component finds its listeners
optimizer = nlp.initialize(lambda: train_examples)
assert tok2vec.listeners == [tagger_tok2vec]
@@ -221,7 +220,6 @@ def test_tok2vec_listener_callback():
assert nlp.pipe_names == ["tok2vec", "tagger"]
tagger = nlp.get_pipe("tagger")
tok2vec = nlp.get_pipe("tok2vec")
nlp._link_components()
docs = [nlp.make_doc("A random sentence")]
tok2vec.model.initialize(X=docs)
gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
@@ -430,29 +428,46 @@ def test_replace_listeners_from_config():
nlp.to_disk(dir_path)
base_model = str(dir_path)
new_config = {
"nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
"nlp": {
"lang": "en",
"pipeline": ["tok2vec", "tagger2", "ner3", "tagger4"],
},
"components": {
"tok2vec": {"source": base_model},
"tagger": {
"tagger2": {
"source": base_model,
"component": "tagger",
"replace_listeners": ["model.tok2vec"],
},
"ner": {"source": base_model},
"ner3": {
"source": base_model,
"component": "ner",
},
"tagger4": {
"source": base_model,
"component": "tagger",
},
},
}
new_nlp = util.load_model_from_config(new_config, auto_fill=True)
new_nlp.initialize(lambda: examples)
tok2vec = new_nlp.get_pipe("tok2vec")
tagger = new_nlp.get_pipe("tagger")
ner = new_nlp.get_pipe("ner")
assert tok2vec.listening_components == ["ner"]
tagger = new_nlp.get_pipe("tagger2")
ner = new_nlp.get_pipe("ner3")
assert "ner" not in new_nlp.pipe_names
assert "tagger" not in new_nlp.pipe_names
assert tok2vec.listening_components == ["ner3", "tagger4"]
assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg
assert new_nlp.config["components"]["tagger2"]["model"]["tok2vec"] == t2v_cfg
assert (
new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"]
new_nlp.config["components"]["ner3"]["model"]["tok2vec"]["@architectures"]
== "spacy.Tok2VecListener.v1"
)
assert (
new_nlp.config["components"]["tagger4"]["model"]["tok2vec"]["@architectures"]
== "spacy.Tok2VecListener.v1"
)
@@ -544,3 +559,57 @@ def test_tok2vec_listeners_textcat():
assert cats1["imperative"] < 0.9
assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
def test_tok2vec_listener_source_link_name():
"""The component's internal name and the tok2vec listener map correspond
to the most recently modified pipeline.
"""
orig_config = Config().from_str(cfg_string_multi)
nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
nlp2 = English()
nlp2.add_pipe("tok2vec", source=nlp1)
nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
# there is no way to have the component have the right name for both
# pipelines; right now, the most recently modified pipeline is prioritized
assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
# there is no way to have the tok2vec have the right listener map for both
# pipelines; right now, the most recently modified pipeline is prioritized
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
nlp2.add_pipe("ner", name="ner3", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
nlp2.remove_pipe("ner3")
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
nlp2.remove_pipe("tagger2")
assert nlp2.get_pipe("tok2vec").listening_components == []
# at this point the tok2vec component corresponds to nlp2
assert nlp1.get_pipe("tok2vec").listening_components == []
# modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
nlp1.add_pipe("sentencizer")
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
# modifying nlp2 syncs it back to nlp2
nlp2.add_pipe("sentencizer")
assert nlp1.get_pipe("tok2vec").listening_components == []
def test_tok2vec_listener_source_replace_listeners():
orig_config = Config().from_str(cfg_string_multi)
nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
nlp2 = English()
nlp2.add_pipe("tok2vec", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == []
nlp2.add_pipe("tagger", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == []
nlp2.add_pipe("ner", name="ner2", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]

View File: spacy/tests/serialize/test_serialize_config.py

@@ -13,6 +13,7 @@ from spacy.ml.models import (
build_Tok2Vec_model,
)
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
from spacy.training import Example
from spacy.util import (
load_config,
load_config_from_str,
@@ -422,6 +423,55 @@ def test_config_overrides():
assert nlp.pipe_names == ["tok2vec", "tagger"]
@pytest.mark.filterwarnings("ignore:\\[W036")
def test_config_overrides_registered_functions():
nlp = spacy.blank("en")
nlp.add_pipe("attribute_ruler")
with make_tempdir() as d:
nlp.to_disk(d)
nlp_re1 = spacy.load(
d,
config={
"components": {
"attribute_ruler": {
"scorer": {"@scorers": "spacy.tagger_scorer.v1"}
}
}
},
)
assert (
nlp_re1.config["components"]["attribute_ruler"]["scorer"]["@scorers"]
== "spacy.tagger_scorer.v1"
)
@registry.misc("test_some_other_key")
def misc_some_other_key():
return "some_other_key"
nlp_re2 = spacy.load(
d,
config={
"components": {
"attribute_ruler": {
"scorer": {
"@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
"spans_key": {"@misc": "test_some_other_key"},
}
}
}
},
)
assert nlp_re2.config["components"]["attribute_ruler"]["scorer"][
"spans_key"
] == {"@misc": "test_some_other_key"}
# run dummy evaluation (will return None scores) in order to test that
# the spans_key value in the nested override is working as intended in
# the config
example = Example.from_dict(nlp_re2.make_doc("a b c"), {})
scores = nlp_re2.evaluate([example])
assert "spans_some_other_key_f" in scores
def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}"

View File: spacy/tests/test_misc.py

@@ -252,6 +252,10 @@ def test_minor_version(a1, a2, b1, b2, is_match):
{"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
{"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
),
(
{"attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"},
{"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
),
],
)
def test_dot_to_dict(dot_notation, expected):
@@ -260,6 +264,29 @@ def test_dot_to_dict(dot_notation, expected):
assert util.dict_to_dot(result) == dot_notation
@pytest.mark.parametrize(
"dot_notation,expected",
[
(
{"token.pos": True, "token._.xyz": True},
{"token": {"pos": True, "_": {"xyz": True}}},
),
(
{"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
{"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
),
(
{"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
{"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
),
],
)
def test_dot_to_dict_overrides(dot_notation, expected):
result = util.dot_to_dict(dot_notation)
assert result == expected
assert util.dict_to_dot(result, for_overrides=True) == dot_notation
def test_set_dot_to_object():
config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}}
with pytest.raises(KeyError):

View File: spacy/tests/vocab_vectors/test_vectors.py

@@ -402,6 +402,7 @@ def test_vectors_serialize():
row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
assert row == row_r
assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
assert v.attr == v_r.attr
def test_vector_is_oov():
@@ -646,3 +647,32 @@ def test_equality():
vectors1.resize((5, 9))
vectors2.resize((5, 9))
assert vectors1 == vectors2
def test_vectors_attr():
data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
# default ORTH
nlp = English()
nlp.vocab.vectors = Vectors(data=data, keys=["A", "B", "C"])
assert nlp.vocab.strings["A"] in nlp.vocab.vectors.key2row
assert nlp.vocab.strings["a"] not in nlp.vocab.vectors.key2row
assert nlp.vocab["A"].has_vector is True
assert nlp.vocab["a"].has_vector is False
assert nlp("A")[0].has_vector is True
assert nlp("a")[0].has_vector is False
# custom LOWER
nlp = English()
nlp.vocab.vectors = Vectors(data=data, keys=["a", "b", "c"], attr="LOWER")
assert nlp.vocab.strings["A"] not in nlp.vocab.vectors.key2row
assert nlp.vocab.strings["a"] in nlp.vocab.vectors.key2row
assert nlp.vocab["A"].has_vector is True
assert nlp.vocab["a"].has_vector is True
assert nlp("A")[0].has_vector is True
assert nlp("a")[0].has_vector is True
# add a new vectors entry
assert nlp.vocab["D"].has_vector is False
assert nlp.vocab["d"].has_vector is False
nlp.vocab.set_vector("D", numpy.asarray([4, 5, 6]))
assert nlp.vocab["D"].has_vector is True
assert nlp.vocab["d"].has_vector is True

View File: spacy/tokens/doc.pyx

@@ -35,6 +35,7 @@ from ..attrs cimport (
LENGTH,
MORPH,
NORM,
ORTH,
POS,
SENT_START,
SPACY,
@@ -613,13 +614,26 @@ cdef class Doc:
"""
if "similarity" in self.user_hooks:
return self.user_hooks["similarity"](self, other)
if isinstance(other, (Lexeme, Token)) and self.length == 1:
if self.c[0].lex.orth == other.orth:
attr = getattr(self.vocab.vectors, "attr", ORTH)
cdef Token this_token
cdef Token other_token
cdef Lexeme other_lex
if len(self) == 1 and isinstance(other, Token):
this_token = self[0]
other_token = other
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
return 1.0
elif isinstance(other, (Span, Doc)) and len(self) == len(other):
elif len(self) == 1 and isinstance(other, Lexeme):
this_token = self[0]
other_lex = other
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
return 1.0
elif isinstance(other, (Doc, Span)) and len(self) == len(other):
similar = True
for i in range(self.length):
if self[i].orth != other[i].orth:
for i in range(len(self)):
this_token = self[i]
other_token = other[i]
if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
similar = False
break
if similar:
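
One observable effect of the rewritten comparison, sketched with toy vectors: single-token texts that differ only in case now compare as identical once the vectors use `attr="LOWER"`.

```python
import numpy
from spacy.lang.en import English
from spacy.vectors import Vectors

nlp = English()
nlp.vocab.vectors = Vectors(
    data=numpy.asarray([[1, 2, 3]], dtype="f"), keys=["cat"], attr="LOWER"
)
# The per-token comparison above now matches on the LOWER attribute.
assert nlp("Cat").similarity(nlp("cat")) == 1.0
```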

View File: spacy/tokens/span.pyx

@@ -8,13 +8,14 @@ import numpy
from thinc.api import get_array_module
from ..attrs cimport *
from ..attrs cimport attr_id_t
from ..attrs cimport ORTH, attr_id_t
from ..lexeme cimport Lexeme
from ..parts_of_speech cimport univ_pos_t
from ..structs cimport LexemeC, TokenC
from ..symbols cimport dep
from ..typedefs cimport attr_t, flags_t, hash_t
from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
from .token cimport Token
from ..errors import Errors, Warnings
from ..util import normalize_slice
@@ -341,13 +342,26 @@ cdef class Span:
"""
if "similarity" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["similarity"](self, other)
if len(self) == 1 and hasattr(other, "orth"):
if self[0].orth == other.orth:
attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
cdef Token this_token
cdef Token other_token
cdef Lexeme other_lex
if len(self) == 1 and isinstance(other, Token):
this_token = self[0]
other_token = other
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
return 1.0
elif len(self) == 1 and isinstance(other, Lexeme):
this_token = self[0]
other_lex = other
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
return 1.0
elif isinstance(other, (Doc, Span)) and len(self) == len(other):
similar = True
for i in range(len(self)):
if self[i].orth != getattr(other[i], "orth", None):
this_token = self[i]
other_token = other[i]
if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
similar = False
break
if similar:

View File: spacy/tokens/token.pyx

@@ -28,6 +28,7 @@ from ..attrs cimport (
LIKE_EMAIL,
LIKE_NUM,
LIKE_URL,
ORTH,
)
from ..lexeme cimport Lexeme
from ..symbols cimport conj
@@ -214,11 +215,17 @@ cdef class Token:
"""
if "similarity" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["similarity"](self, other)
if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"):
if self.c.lex.orth == getattr(other[0], "orth", None):
attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
cdef Token this_token = self
cdef Token other_token
cdef Lexeme other_lex
if isinstance(other, Token):
other_token = other
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
return 1.0
elif hasattr(other, "orth"):
if self.c.lex.orth == other.orth:
elif isinstance(other, Lexeme):
other_lex = other
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
return 1.0
if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Token"))
@@ -415,7 +422,7 @@ cdef class Token:
return self.doc.user_token_hooks["has_vector"](self)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return True
return self.vocab.has_vector(self.c.lex.orth)
return self.vocab.has_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
@property
def vector(self):
@@ -431,7 +438,7 @@ cdef class Token:
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return self.doc.tensor[self.i]
else:
return self.vocab.get_vector(self.c.lex.orth)
return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
@property
def vector_norm(self):

View File: spacy/training/initialize.py

@@ -76,7 +76,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
with nlp.select_pipes(enable=resume_components):
logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer)
# Make sure that listeners are defined before initializing further
# Make sure that internal component names are synced and listeners are
# defined before initializing further
nlp._link_components()
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
if T["max_epochs"] == -1:
@@ -215,9 +216,14 @@ def convert_vectors(
prune: int,
name: Optional[str] = None,
mode: str = VectorsMode.default,
attr: str = "ORTH",
) -> None:
vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
if attr != "ORTH":
raise ValueError(
"ORTH is the only attribute supported for vectors in .npz format."
)
nlp.vocab.vectors = Vectors(
strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb"))
)
@@ -245,11 +251,15 @@ def convert_vectors(
nlp.vocab.vectors = Vectors(
strings=nlp.vocab.strings,
data=vectors_data,
attr=attr,
**floret_settings,
)
else:
nlp.vocab.vectors = Vectors(
strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
strings=nlp.vocab.strings,
data=vectors_data,
keys=vector_keys,
attr=attr,
)
nlp.vocab.deduplicate_vectors()
if name is None:
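
A hypothetical usage sketch of the extended `convert_vectors`; the path `my_vectors.vec` is a placeholder for a real word-vectors text file.

```python
from spacy.lang.en import English
from spacy.training.initialize import convert_vectors

nlp = English()
# attr="LOWER" stores the vector keys by lowercase form. Note that .npz
# input would raise instead, since only ORTH is supported for that format.
convert_vectors(nlp, "my_vectors.vec", truncate=0, prune=-1, attr="LOWER")
```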

View File: spacy/util.py

@@ -547,7 +547,7 @@ def load_model_from_path(
if not meta:
meta = get_model_meta(model_path)
config_path = model_path / "config.cfg"
overrides = dict_to_dot(config)
overrides = dict_to_dot(config, for_overrides=True)
config = load_config(config_path, overrides=overrides)
nlp = load_model_from_config(
config,
@@ -1525,14 +1525,19 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
return result
def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
def dict_to_dot(obj: Dict[str, dict], *, for_overrides: bool = False) -> Dict[str, Any]:
"""Convert dot notation to a dict. For example: {"token": {"pos": True,
"_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.
values (Dict[str, dict]): The dict to convert.
obj (Dict[str, dict]): The dict to convert.
for_overrides (bool): Whether to enable special handling for registered
functions in overrides.
RETURNS (Dict[str, Any]): The key/value pairs.
"""
return {".".join(key): value for key, value in walk_dict(obj)}
return {
".".join(key): value
for key, value in walk_dict(obj, for_overrides=for_overrides)
}
def dot_to_object(config: Config, section: str):
@@ -1574,13 +1579,20 @@ def set_dot_to_object(config: Config, section: str, value: Any) -> None:
def walk_dict(
node: Dict[str, Any], parent: List[str] = []
node: Dict[str, Any], parent: List[str] = [], *, for_overrides: bool = False
) -> Iterator[Tuple[List[str], Any]]:
"""Walk a dict and yield the path and values of the leaves."""
"""Walk a dict and yield the path and values of the leaves.
for_overrides (bool): Whether to treat registered functions that start with
@ as final values rather than dicts to traverse.
"""
for key, value in node.items():
key_parent = [*parent, key]
if isinstance(value, dict):
yield from walk_dict(value, key_parent)
if isinstance(value, dict) and (
not for_overrides
or not any(value_key.startswith("@") for value_key in value)
):
yield from walk_dict(value, key_parent, for_overrides=for_overrides)
else:
yield (key_parent, value)
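
The effect of the new flag, sketched with the registry-reference case from the tests above:

```python
from spacy import util

cfg = {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}}
# The default behavior flattens through the registry reference...
assert util.dict_to_dot(cfg) == {
    "attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"
}
# ...while for_overrides=True treats the {"@scorers": ...} block as a leaf.
assert util.dict_to_dot(cfg, for_overrides=True) == {
    "attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}
}
```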

View File: spacy/vectors.pyx

@@ -15,9 +15,11 @@ from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d
from .attrs cimport ORTH, attr_id_t
from .strings cimport StringStore
from . import util
from .attrs import IDS
from .errors import Errors, Warnings
from .strings import get_string_id
@@ -64,8 +66,9 @@ cdef class Vectors:
cdef readonly uint32_t hash_seed
cdef readonly unicode bow
cdef readonly unicode eow
cdef readonly attr_id_t attr
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
"""Create a new vector store.
strings (StringStore): The string store.
@@ -80,6 +83,8 @@ cdef class Vectors:
hash_seed (int): The floret hash seed (default: 0).
bow (str): The floret BOW string (default: "<").
eow (str): The floret EOW string (default: ">").
attr (Union[int, str]): The token attribute for the vector keys
(default: "ORTH").
DOCS: https://spacy.io/api/vectors#init
"""
@@ -103,6 +108,14 @@ cdef class Vectors:
self.hash_seed = hash_seed
self.bow = bow
self.eow = eow
if isinstance(attr, (int, long)):
self.attr = attr
else:
attr = attr.upper()
if attr == "TEXT":
attr = "ORTH"
self.attr = IDS.get(attr, ORTH)
if self.mode == Mode.default:
if data is None:
if shape is None:
@@ -546,6 +559,7 @@ cdef class Vectors:
"hash_seed": self.hash_seed,
"bow": self.bow,
"eow": self.eow,
"attr": self.attr,
}
def _set_cfg(self, cfg):
@@ -556,6 +570,7 @@ cdef class Vectors:
self.hash_seed = cfg.get("hash_seed", 0)
self.bow = cfg.get("bow", "<")
self.eow = cfg.get("eow", ">")
self.attr = cfg.get("attr", ORTH)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.

View File: spacy/vocab.pyx

@@ -365,8 +365,13 @@ cdef class Vocab:
self[orth]
# Make prob negative so it sorts by rank ascending
# (key2row contains the rank)
priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
for lex in self if lex.orth in self.vectors.key2row]
priority = []
cdef Lexeme lex
cdef attr_t value
for lex in self:
value = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if value in self.vectors.key2row:
priority.append((-lex.prob, self.vectors.key2row[value], value))
priority.sort()
indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
@@ -399,8 +404,10 @@ cdef class Vocab:
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
if self.has_vector(orth):
return self.vectors[orth]
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.has_vector(key):
return self.vectors[key]
xp = get_array_module(self.vectors.data)
vectors = xp.zeros((self.vectors_length,), dtype="f")
return vectors
@@ -416,15 +423,16 @@ cdef class Vocab:
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
if self.vectors.is_full and orth not in self.vectors:
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.vectors.is_full and key not in self.vectors:
new_rows = max(100, int(self.vectors.shape[0]*1.3))
if self.vectors.shape[1] == 0:
width = vector.size
else:
width = self.vectors.shape[1]
self.vectors.resize((new_rows, width))
lex = self[orth] # Add word to vocab if necessary
row = self.vectors.add(orth, vector=vector)
row = self.vectors.add(key, vector=vector)
if row >= 0:
lex.rank = row
@@ -439,7 +447,9 @@ cdef class Vocab:
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
return orth in self.vectors
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
return key in self.vectors
property lookups:
def __get__(self):
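
Taken together, the vocab changes mean that `set_vector`, `get_vector` and `has_vector` all remap through the configured attribute; a short sketch with toy data:

```python
import numpy
from spacy.lang.en import English
from spacy.vectors import Vectors

nlp = English()
nlp.vocab.vectors = Vectors(
    data=numpy.asarray([[1, 2, 3]], dtype="f"), keys=["a"], attr="LOWER"
)
# Setting a vector for "D" keys it on the lexeme's LOWER form, "d".
nlp.vocab.set_vector("D", numpy.asarray([4, 5, 6], dtype="f"))
assert nlp.vocab.has_vector("d")
assert nlp.vocab.get_vector("D")[0] == nlp.vocab.get_vector("d")[0] == 4
```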

View File: website/docs/api/architectures.mdx

@@ -303,7 +303,7 @@ mapped to a zero vector. See the documentation on
| `nM` | The width of the static vectors. ~~Optional[int]~~ |
| `dropout` | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ |
| `init_W` | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]]], FloatsXd]~~ |
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
| `key_attr` | This setting is ignored in spaCy v3.6+. To set a custom key attribute for vectors, configure it through [`Vectors`](/api/vectors) or [`spacy init vectors`](/api/cli#init-vectors). Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
### spacy.FeatureExtractor.v1 {id="FeatureExtractor"}

View File: website/docs/api/cli.mdx

@@ -211,7 +211,8 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~str \(option)~~ |
| `--attr`, `-a` | Token attribute to use for vectors, e.g. `LOWER` or `NORM`. Defaults to `ORTH`. ~~str \(option)~~ |
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |

View File: website/docs/api/vectors.mdx

@@ -60,6 +60,7 @@ modified later.
| `hash_seed` <Tag variant="new">3.2</Tag> | The floret hash seed (default: `0`). ~~int~~ |
| `bow` <Tag variant="new">3.2</Tag> | The floret BOW string (default: `"<"`). ~~str~~ |
| `eow` <Tag variant="new">3.2</Tag> | The floret EOW string (default: `">"`). ~~str~~ |
| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys (default: `"ORTH"`). ~~Union[int, str]~~ |
## Vectors.\_\_getitem\_\_ {id="getitem",tag="method"}
@@ -453,8 +454,9 @@ Load state from a binary string.
## Attributes {id="attributes"}
| Name | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
| `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ |
| `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
| Name | Description |
| ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
| `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ |
| `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys. ~~int~~ |

View File: website/docs/usage/training.mdx

@@ -11,7 +11,6 @@ menu:
- ['Custom Functions', 'custom-functions']
- ['Initialization', 'initialization']
- ['Data Utilities', 'data']
- ['Parallel Training', 'parallel-training']
- ['Internal API', 'api']
---
@@ -1565,77 +1564,6 @@ token-based annotations like the dependency parse or entity labels, you'll need
to take care to adjust the `Example` object so its annotations match and remain
valid.
## Parallel & distributed training with Ray {id="parallel-training"}
> #### Installation
>
> ```bash
> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. Parallel
training won't always be faster, though: it depends on your batch size, models,
and hardware.
<Infobox variant="warning">
To use Ray with spaCy, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
setup. You can optionally set the `--address` option to point to your Ray
cluster. If it's not set, Ray will run locally.
```bash
python -m spacy ray train config.cfg --n-workers 2
```
<Project id="integrations/ray">
Get started with parallel training using our project template. It trains a
simple model on a Universal Dependencies Treebank and lets you parallelize the
training with Ray.
</Project>
### How parallel training works {id="parallel-training-details"}
Each worker receives a shard of the **data** and builds a copy of the **model
and optimizer** from the [`config.cfg`](#config). It also has a communication
channel to **pass gradients and parameters** to the other workers. Additionally,
each worker is given ownership of a subset of the parameter arrays. Every
parameter array is owned by exactly one worker, and the workers are given a
mapping so they know which worker owns which parameter.
![Illustration of setup](/images/spacy-ray.svg)
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the gradient.
Workers use the gradients they receive and the ones they compute locally to
update the parameters they own, and then broadcast the updated array and a new
version ID to the other workers.
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates, they do not have to pull
them and block on the result, so the transfers can happen in the background,
overlapped with the actual training work. The workers also do not have to stop
and wait for each other ("synchronize") at the start of each batch. This is very
useful for spaCy, because spaCy is often trained on long documents, which means
**batches can vary in size** significantly. Uneven workloads make synchronous
gradient descent inefficient, because if one batch is slow, all of the other
workers are stuck waiting for it to complete before they can continue.
## Internal training API {id="api"}
<Infobox variant="danger">

View File: website/meta/universe.json

@@ -4372,7 +4372,7 @@
"code_example": [
"import spacy",
"",
"nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
"nlp = spacy.load(\"en_core_web_sm\", exclude=[\"ner\"])",
"nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
"",
"text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",

View File: website/src/components/code.js

@@ -13,6 +13,8 @@ import 'prismjs/components/prism-json.min.js'
import 'prismjs/components/prism-markdown.min.js'
import 'prismjs/components/prism-python.min.js'
import 'prismjs/components/prism-yaml.min.js'
import 'prismjs/components/prism-docker.min.js'
import 'prismjs/components/prism-r.min.js'
import { isString } from './util'
import Link, { OptionalLink } from './link'
@@ -172,7 +174,7 @@ const convertLine = ({ line, prompt, lang }) => {
return handlePromot({ lineFlat, prompt })
}
return lang === 'none' || !lineFlat ? (
return lang === 'none' || !lineFlat || !(lang in Prism.languages) ? (
lineFlat
) : (
<span