Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

Merge branch 'master' into feat/add-pipe-instance

This commit is contained in: commit 8a79a71190

@@ -32,6 +32,7 @@ def init_vectors_cli(
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+    attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that

@@ -50,6 +51,7 @@ def init_vectors_cli(
         prune=prune,
         name=name,
         mode=mode,
+        attr=attr,
     )
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
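
Note: the merged change threads the new `--attr` option from `init vectors` through to `convert_vectors` (see the later hunk), e.g. `python -m spacy init vectors en vectors.txt ./vectors_out --attr LOWER`. A minimal sketch of the effect, assuming `./vectors_out` is a hypothetical directory produced by that command:

```python
import spacy

# Hypothetical pipeline directory created by `spacy init vectors ... --attr LOWER`
nlp = spacy.load("./vectors_out")

# The vectors table now keys rows by the LOWER attribute instead of ORTH, so
# differently-cased surface forms resolve to the same vector row.
print(nlp.vocab.vectors.attr)  # integer ID of the LOWER attribute
print(nlp.vocab["Dog"].has_vector and nlp.vocab["dog"].has_vector)
```
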
@@ -216,7 +216,10 @@ class Warnings(metaclass=ErrorsWithCodes):
     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
-    W125 = (
+    W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
+            "key attribute for vectors, configure it through Vectors(attr=) or "
+            "'spacy init vectors --attr'")
+    W126 = (
         "Pipe instance '{name}' is being added with a vocab "
         "instance that will not match other components. This is "
         "usually an error."
@@ -742,6 +742,11 @@ class Language:
                 )
             )
         pipe = source.get_pipe(source_name)
+        # There is no actual solution here. Either the component has the right
+        # name for the source pipeline or the component has the right name for
+        # the current pipeline. This prioritizes the current pipeline.
+        if hasattr(pipe, "name"):
+            pipe.name = name
         # Make sure the source config is interpolated so we don't end up with
         # orphaned variables in our final config
         source_config = source.config.interpolate()
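
Note: this is the name-sync behavior the tests further down assert. A sketch, assuming `en_core_web_sm` (or any installed pipeline with a `tagger`) is available:

```python
import spacy

base = spacy.load("en_core_web_sm")  # assumed to provide a "tagger"
nlp = spacy.blank("en")

# Sourcing a component under a new name also renames the component instance;
# the destination (most recently modified) pipeline wins.
nlp.add_pipe("tagger", source=base, name="tagger2")
assert nlp.get_pipe("tagger2").name == "tagger2"
assert base.get_pipe("tagger").name == "tagger2"  # the shared instance was renamed
```
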
@@ -822,6 +827,7 @@ class Language:
         self._pipe_meta[name] = self.get_factory_meta(factory_name)
         pipe_index = self._get_pipe_index(before, after, first, last)
         self._components.insert(pipe_index, (name, pipe_component))
+        self._link_components()
         return pipe_component

     def add_pipe_instance(

@@ -1006,6 +1012,7 @@ class Language:
         if old_name in self._config["initialize"]["components"]:
             init_cfg = self._config["initialize"]["components"].pop(old_name)
             self._config["initialize"]["components"][new_name] = init_cfg
+        self._link_components()

     def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
         """Remove a component from the pipeline.

@@ -1029,6 +1036,7 @@ class Language:
         # Make sure the name is also removed from the set of disabled components
         if name in self.disabled:
             self._disabled.remove(name)
+        self._link_components()
         return removed

     def disable_pipe(self, name: str) -> None:

@@ -1757,8 +1765,16 @@ class Language:
         # The problem is we need to do it during deserialization...And the
         # components don't receive the pipeline then. So this does have to be
         # here :(
+        # First, fix up all the internal component names in case they have
+        # gotten out of sync due to sourcing components from different
+        # pipelines, since find_listeners uses proc2.name for the listener
+        # map.
+        for name, proc in self.pipeline:
+            if hasattr(proc, "name"):
+                proc.name = name
         for i, (name1, proc1) in enumerate(self.pipeline):
             if isinstance(proc1, ty.ListenedToComponent):
                 proc1.listener_map = {}
                 for name2, proc2 in self.pipeline[i + 1 :]:
                     proc1.find_listeners(proc2)
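
Note: because `_link_components` now rewrites `proc.name` before rebuilding the listener map, the map always reflects whichever pipeline linked last. A sketch, again assuming an installed pipeline whose `tagger` listens to `tok2vec`:

```python
import spacy

nlp1 = spacy.load("en_core_web_sm")  # assumed: tok2vec with a listening tagger
nlp2 = spacy.blank("en")
nlp2.add_pipe("tok2vec", source=nlp1)
nlp2.add_pipe("tagger", source=nlp1, name="tagger2")

# The shared tok2vec instance now holds the listener map for nlp2 only:
print(nlp2.get_pipe("tok2vec").listening_components)  # ["tagger2"]
```
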
@@ -1913,6 +1929,7 @@ class Language:
                         raw_config=raw_config,
                     )
                 else:
                     assert "source" in pipe_cfg
                     # We need the sourced components to reference the same
                     # vocab without modifying the current vocab state **AND**
                     # we still want to load the source model vectors to perform

@@ -1932,6 +1949,10 @@ class Language:
                     source_name = pipe_cfg.get("component", pipe_name)
                     listeners_replaced = False
                     if "replace_listeners" in pipe_cfg:
+                        # Make sure that the listened-to component has the
+                        # state of the source pipeline listener map so that the
+                        # replace_listeners method below works as intended.
+                        source_nlps[model]._link_components()
                         for name, proc in source_nlps[model].pipeline:
                             if source_name in getattr(proc, "listening_components", []):
                                 source_nlps[model].replace_listeners(

@@ -1943,6 +1964,8 @@ class Language:
                         nlp.add_pipe(
                             source_name, source=source_nlps[model], name=pipe_name
                         )
+                        # At this point after nlp.add_pipe, the listener map
+                        # corresponds to the new pipeline.
                     if model not in source_nlp_vectors_hashes:
                         source_nlp_vectors_hashes[model] = hash(
                             source_nlps[model].vocab.vectors.to_bytes(

@@ -1997,27 +2020,6 @@ class Language:
                 raise ValueError(
                     Errors.E942.format(name="pipeline_creation", value=type(nlp))
                 )
-        # Detect components with listeners that are not frozen consistently
-        for name, proc in nlp.pipeline:
-            if isinstance(proc, ty.ListenedToComponent):
-                # Remove listeners not in the pipeline
-                listener_names = proc.listening_components
-                unused_listener_names = [
-                    ll for ll in listener_names if ll not in nlp.pipe_names
-                ]
-                for listener_name in unused_listener_names:
-                    for listener in proc.listener_map.get(listener_name, []):
-                        proc.remove_listener(listener, listener_name)
-
-                for listener_name in proc.listening_components:
-                    # e.g. tok2vec/transformer
-                    # If it's a component sourced from another pipeline, we check if
-                    # the tok2vec listeners should be replaced with standalone tok2vec
-                    # models (e.g. so component can be frozen without its performance
-                    # degrading when other components/tok2vec are updated)
-                    paths = sourced.get(listener_name, {}).get("replace_listeners", [])
-                    if paths:
-                        nlp.replace_listeners(name, listener_name, paths)
         return nlp

     def replace_listeners(
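
Note: the listener-pruning block removed above moves this bookkeeping out of `from_config`; explicit `replace_listeners` calls still behave as before. A sketch, reusing the `cfg_string_multi` fixture referenced in the tok2vec tests below (an assumption of this example):

```python
from thinc.api import Config

from spacy import util

config = Config().from_str(cfg_string_multi)  # fixture from the tests below
nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
assert nlp.get_pipe("tok2vec").listening_components == ["ner"]
```
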
@@ -1,3 +1,4 @@
+import warnings
 from typing import Callable, List, Optional, Sequence, Tuple, cast

 from thinc.api import Model, Ops, registry

@@ -5,7 +6,8 @@ from thinc.initializers import glorot_uniform_init
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from thinc.util import partial

-from ..errors import Errors
+from ..attrs import ORTH
+from ..errors import Errors, Warnings
 from ..tokens import Doc
 from ..vectors import Mode
 from ..vocab import Vocab

@@ -24,6 +26,8 @@ def StaticVectors(
     linear projection to control the dimensionality. If a dropout rate is
     specified, the dropout is applied per dimension over the whole batch.
     """
+    if key_attr != "ORTH":
+        warnings.warn(Warnings.W125, DeprecationWarning)
     return Model(
         "static_vectors",
         forward,

@@ -40,9 +44,9 @@ def forward(
     token_count = sum(len(doc) for doc in docs)
     if not token_count:
         return _handle_empty(model.ops, model.get_dim("nO"))
-    key_attr: int = model.attrs["key_attr"]
-    keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
     vocab: Vocab = docs[0].vocab
+    key_attr: int = getattr(vocab.vectors, "attr", ORTH)
+    keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
     if vocab.vectors.mode == Mode.default:
         V = model.ops.asarray(vocab.vectors.data)

			@ -53,9 +53,9 @@ DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model
 | 
			
		|||
        "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
 | 
			
		||||
    },
 | 
			
		||||
    default_score_weights={
 | 
			
		||||
        f"span_finder_{DEFAULT_SPANS_KEY}_f": 1.0,
 | 
			
		||||
        f"span_finder_{DEFAULT_SPANS_KEY}_p": 0.0,
 | 
			
		||||
        f"span_finder_{DEFAULT_SPANS_KEY}_r": 0.0,
 | 
			
		||||
        f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
 | 
			
		||||
        f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
 | 
			
		||||
        f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
 | 
			
		||||
    },
 | 
			
		||||
)
 | 
			
		||||
def make_span_finder(
 | 
			
		||||
| 
						 | 
				
			
@@ -104,7 +104,7 @@ def make_span_finder_scorer():

 def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     kwargs = dict(kwargs)
-    attr_prefix = "span_finder_"
+    attr_prefix = "spans_"
     key = kwargs["spans_key"]
     kwargs.setdefault("attr", f"{attr_prefix}{key}")
     kwargs.setdefault(
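
Note: with the `spans_` prefix, `span_finder` reports its scores under the same keys as `spancat`. A sketch for the default spans key `"sc"`:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("span_finder")  # default spans_key is "sc"
# After training, nlp.evaluate(...) reports "spans_sc_f", "spans_sc_p" and
# "spans_sc_r" instead of the old "span_finder_sc_*" keys.
```
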
@@ -230,10 +230,10 @@ def test_overfitting_IO():

     # Test scoring
     scores = nlp.evaluate(train_examples)
-    assert f"span_finder_{SPANS_KEY}_f" in scores
+    assert f"spans_{SPANS_KEY}_f" in scores
     # It's not perfect 1.0 F1 because it's designed to overgenerate for now.
-    assert scores[f"span_finder_{SPANS_KEY}_p"] == 0.75
-    assert scores[f"span_finder_{SPANS_KEY}_r"] == 1.0
+    assert scores[f"spans_{SPANS_KEY}_p"] == 0.75
+    assert scores[f"spans_{SPANS_KEY}_r"] == 1.0

     # also test that the spancat works for just a single entity in a sentence
     doc = nlp("London")

@@ -192,8 +192,7 @@ def test_tok2vec_listener(with_vectors):
         for tag in t[1]["tags"]:
             tagger.add_label(tag)

-    # Check that the Tok2Vec component finds it listeners
-    assert tok2vec.listeners == []
+    # Check that the Tok2Vec component finds its listeners
     optimizer = nlp.initialize(lambda: train_examples)
     assert tok2vec.listeners == [tagger_tok2vec]

@@ -221,7 +220,6 @@ def test_tok2vec_listener_callback():
     assert nlp.pipe_names == ["tok2vec", "tagger"]
     tagger = nlp.get_pipe("tagger")
     tok2vec = nlp.get_pipe("tok2vec")
-    nlp._link_components()
     docs = [nlp.make_doc("A random sentence")]
     tok2vec.model.initialize(X=docs)
     gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]

@@ -430,29 +428,46 @@ def test_replace_listeners_from_config():
         nlp.to_disk(dir_path)
         base_model = str(dir_path)
         new_config = {
-            "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
+            "nlp": {
+                "lang": "en",
+                "pipeline": ["tok2vec", "tagger2", "ner3", "tagger4"],
+            },
             "components": {
                 "tok2vec": {"source": base_model},
-                "tagger": {
+                "tagger2": {
                     "source": base_model,
+                    "component": "tagger",
                     "replace_listeners": ["model.tok2vec"],
                 },
-                "ner": {"source": base_model},
+                "ner3": {
+                    "source": base_model,
+                    "component": "ner",
+                },
+                "tagger4": {
+                    "source": base_model,
+                    "component": "tagger",
+                },
             },
         }
         new_nlp = util.load_model_from_config(new_config, auto_fill=True)
     new_nlp.initialize(lambda: examples)
     tok2vec = new_nlp.get_pipe("tok2vec")
-    tagger = new_nlp.get_pipe("tagger")
-    ner = new_nlp.get_pipe("ner")
-    assert tok2vec.listening_components == ["ner"]
+    tagger = new_nlp.get_pipe("tagger2")
+    ner = new_nlp.get_pipe("ner3")
+    assert "ner" not in new_nlp.pipe_names
+    assert "tagger" not in new_nlp.pipe_names
+    assert tok2vec.listening_components == ["ner3", "tagger4"]
     assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
     assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
     t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
     assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
-    assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg
+    assert new_nlp.config["components"]["tagger2"]["model"]["tok2vec"] == t2v_cfg
     assert (
-        new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"]
+        new_nlp.config["components"]["ner3"]["model"]["tok2vec"]["@architectures"]
         == "spacy.Tok2VecListener.v1"
     )
+    assert (
+        new_nlp.config["components"]["tagger4"]["model"]["tok2vec"]["@architectures"]
+        == "spacy.Tok2VecListener.v1"
+    )
@@ -544,3 +559,57 @@ def test_tok2vec_listeners_textcat():
     assert cats1["imperative"] < 0.9
     assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
     assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
+
+
+def test_tok2vec_listener_source_link_name():
+    """The component's internal name and the tok2vec listener map correspond
+    to the most recently modified pipeline.
+    """
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
+
+    # there is no way to have the component have the right name for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
+
+    # there is no way to have the tok2vec have the right listener map for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.add_pipe("ner", name="ner3", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
+    nlp2.remove_pipe("ner3")
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.remove_pipe("tagger2")
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+
+    # at this point the tok2vec component corresponds to nlp2
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+    # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
+    nlp1.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    # modifying nlp2 syncs it back to nlp2
+    nlp2.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+
+def test_tok2vec_listener_source_replace_listeners():
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+    nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
+    assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("tagger", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("ner", name="ner2", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
@@ -13,6 +13,7 @@ from spacy.ml.models import (
     build_Tok2Vec_model,
 )
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from spacy.training import Example
 from spacy.util import (
     load_config,
     load_config_from_str,

@@ -422,6 +423,55 @@ def test_config_overrides():
     assert nlp.pipe_names == ["tok2vec", "tagger"]


+@pytest.mark.filterwarnings("ignore:\\[W036")
+def test_config_overrides_registered_functions():
+    nlp = spacy.blank("en")
+    nlp.add_pipe("attribute_ruler")
+    with make_tempdir() as d:
+        nlp.to_disk(d)
+        nlp_re1 = spacy.load(
+            d,
+            config={
+                "components": {
+                    "attribute_ruler": {
+                        "scorer": {"@scorers": "spacy.tagger_scorer.v1"}
+                    }
+                }
+            },
+        )
+        assert (
+            nlp_re1.config["components"]["attribute_ruler"]["scorer"]["@scorers"]
+            == "spacy.tagger_scorer.v1"
+        )
+
+        @registry.misc("test_some_other_key")
+        def misc_some_other_key():
+            return "some_other_key"
+
+        nlp_re2 = spacy.load(
+            d,
+            config={
+                "components": {
+                    "attribute_ruler": {
+                        "scorer": {
+                            "@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
+                            "spans_key": {"@misc": "test_some_other_key"},
+                        }
+                    }
+                }
+            },
+        )
+        assert nlp_re2.config["components"]["attribute_ruler"]["scorer"][
+            "spans_key"
+        ] == {"@misc": "test_some_other_key"}
+        # run dummy evaluation (will return None scores) in order to test that
+        # the spans_key value in the nested override is working as intended in
+        # the config
+        example = Example.from_dict(nlp_re2.make_doc("a b c"), {})
+        scores = nlp_re2.evaluate([example])
+        assert "spans_some_other_key_f" in scores
+
+
 def test_config_interpolation():
     config = Config().from_str(nlp_config_string, interpolate=False)
     assert config["corpora"]["train"]["path"] == "${paths.train}"

@@ -252,6 +252,10 @@ def test_minor_version(a1, a2, b1, b2, is_match):
             {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
             {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
         ),
+        (
+            {"attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"},
+            {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
+        ),
     ],
 )
 def test_dot_to_dict(dot_notation, expected):

@@ -260,6 +264,29 @@ def test_dot_to_dict(dot_notation, expected):
     assert util.dict_to_dot(result) == dot_notation


+@pytest.mark.parametrize(
+    "dot_notation,expected",
+    [
+        (
+            {"token.pos": True, "token._.xyz": True},
+            {"token": {"pos": True, "_": {"xyz": True}}},
+        ),
+        (
+            {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
+            {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
+        ),
+        (
+            {"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
+            {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
+        ),
+    ],
+)
+def test_dot_to_dict_overrides(dot_notation, expected):
+    result = util.dot_to_dict(dot_notation)
+    assert result == expected
+    assert util.dict_to_dot(result, for_overrides=True) == dot_notation
+
+
 def test_set_dot_to_object():
     config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}}
     with pytest.raises(KeyError):

@@ -402,6 +402,7 @@ def test_vectors_serialize():
         row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
         assert row == row_r
         assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
+        assert v.attr == v_r.attr


 def test_vector_is_oov():

@@ -646,3 +647,32 @@ def test_equality():
     vectors1.resize((5, 9))
     vectors2.resize((5, 9))
     assert vectors1 == vectors2
+
+
+def test_vectors_attr():
+    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
+    # default ORTH
+    nlp = English()
+    nlp.vocab.vectors = Vectors(data=data, keys=["A", "B", "C"])
+    assert nlp.vocab.strings["A"] in nlp.vocab.vectors.key2row
+    assert nlp.vocab.strings["a"] not in nlp.vocab.vectors.key2row
+    assert nlp.vocab["A"].has_vector is True
+    assert nlp.vocab["a"].has_vector is False
+    assert nlp("A")[0].has_vector is True
+    assert nlp("a")[0].has_vector is False
+
+    # custom LOWER
+    nlp = English()
+    nlp.vocab.vectors = Vectors(data=data, keys=["a", "b", "c"], attr="LOWER")
+    assert nlp.vocab.strings["A"] not in nlp.vocab.vectors.key2row
+    assert nlp.vocab.strings["a"] in nlp.vocab.vectors.key2row
+    assert nlp.vocab["A"].has_vector is True
+    assert nlp.vocab["a"].has_vector is True
+    assert nlp("A")[0].has_vector is True
+    assert nlp("a")[0].has_vector is True
+    # add a new vectors entry
+    assert nlp.vocab["D"].has_vector is False
+    assert nlp.vocab["d"].has_vector is False
+    nlp.vocab.set_vector("D", numpy.asarray([4, 5, 6]))
+    assert nlp.vocab["D"].has_vector is True
+    assert nlp.vocab["d"].has_vector is True

@@ -35,6 +35,7 @@ from ..attrs cimport (
     LENGTH,
     MORPH,
     NORM,
+    ORTH,
     POS,
     SENT_START,
     SPACY,

@@ -613,13 +614,26 @@ cdef class Doc:
         """
         if "similarity" in self.user_hooks:
            return self.user_hooks["similarity"](self, other)
-        if isinstance(other, (Lexeme, Token)) and self.length == 1:
-            if self.c[0].lex.orth == other.orth:
+        attr = getattr(self.vocab.vectors, "attr", ORTH)
+        cdef Token this_token
+        cdef Token other_token
+        cdef Lexeme other_lex
+        if len(self) == 1 and isinstance(other, Token):
+            this_token = self[0]
+            other_token = other
+            if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
                 return 1.0
-        elif isinstance(other, (Span, Doc)) and len(self) == len(other):
+        elif len(self) == 1 and isinstance(other, Lexeme):
+            this_token = self[0]
+            other_lex = other
+            if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
+                return 1.0
+        elif isinstance(other, (Doc, Span)) and len(self) == len(other):
             similar = True
-            for i in range(self.length):
-                if self[i].orth != other[i].orth:
+            for i in range(len(self)):
+                this_token = self[i]
+                other_token = other[i]
+                if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
                     similar = False
                     break
             if similar:
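
Note: the lexical fast path in `similarity` now compares whatever attribute the vectors table is keyed on. A sketch with `LOWER`-keyed vectors:

```python
import numpy
from spacy.lang.en import English
from spacy.vectors import Vectors

nlp = English()
data = numpy.asarray([[1.0, 0.0], [0.0, 1.0]], dtype="f")
nlp.vocab.vectors = Vectors(data=data, keys=["dog", "cat"], attr="LOWER")

# With LOWER as the key attribute, "Dog" and "dog" compare equal in the
# fast path and similarity returns 1.0 without any vector math.
print(nlp("Dog").similarity(nlp("dog")[0]))  # 1.0
```
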
@@ -8,13 +8,14 @@ import numpy
 from thinc.api import get_array_module

 from ..attrs cimport *
-from ..attrs cimport attr_id_t
+from ..attrs cimport ORTH, attr_id_t
 from ..lexeme cimport Lexeme
 from ..parts_of_speech cimport univ_pos_t
 from ..structs cimport LexemeC, TokenC
 from ..symbols cimport dep
 from ..typedefs cimport attr_t, flags_t, hash_t
 from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from .token cimport Token

 from ..errors import Errors, Warnings
 from ..util import normalize_slice

@@ -341,13 +342,26 @@ cdef class Span:
         """
         if "similarity" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["similarity"](self, other)
-        if len(self) == 1 and hasattr(other, "orth"):
-            if self[0].orth == other.orth:
+        attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
+        cdef Token this_token
+        cdef Token other_token
+        cdef Lexeme other_lex
+        if len(self) == 1 and isinstance(other, Token):
+            this_token = self[0]
+            other_token = other
+            if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
                 return 1.0
+        elif len(self) == 1 and isinstance(other, Lexeme):
+            this_token = self[0]
+            other_lex = other
+            if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
+                return 1.0
         elif isinstance(other, (Doc, Span)) and len(self) == len(other):
             similar = True
             for i in range(len(self)):
-                if self[i].orth != getattr(other[i], "orth", None):
+                this_token = self[i]
+                other_token = other[i]
+                if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
                     similar = False
                     break
             if similar:

@@ -28,6 +28,7 @@ from ..attrs cimport (
     LIKE_EMAIL,
     LIKE_NUM,
     LIKE_URL,
+    ORTH,
 )
 from ..lexeme cimport Lexeme
 from ..symbols cimport conj

@@ -214,11 +215,17 @@ cdef class Token:
         """
         if "similarity" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["similarity"](self, other)
-        if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"):
-            if self.c.lex.orth == getattr(other[0], "orth", None):
+        attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
+        cdef Token this_token = self
+        cdef Token other_token
+        cdef Lexeme other_lex
+        if isinstance(other, Token):
+            other_token = other
+            if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
                 return 1.0
-        elif hasattr(other, "orth"):
-            if self.c.lex.orth == other.orth:
+        elif isinstance(other, Lexeme):
+            other_lex = other
+            if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
                 return 1.0
         if self.vocab.vectors.n_keys == 0:
             warnings.warn(Warnings.W007.format(obj="Token"))

@@ -415,7 +422,7 @@ cdef class Token:
             return self.doc.user_token_hooks["has_vector"](self)
         if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
             return True
-        return self.vocab.has_vector(self.c.lex.orth)
+        return self.vocab.has_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))

     @property
     def vector(self):

@@ -431,7 +438,7 @@ cdef class Token:
         if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
             return self.doc.tensor[self.i]
         else:
-            return self.vocab.get_vector(self.c.lex.orth)
+            return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))

     @property
     def vector_norm(self):

			@ -76,7 +76,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
 | 
			
		|||
        with nlp.select_pipes(enable=resume_components):
 | 
			
		||||
            logger.info("Resuming training for: %s", resume_components)
 | 
			
		||||
            nlp.resume_training(sgd=optimizer)
 | 
			
		||||
    # Make sure that listeners are defined before initializing further
 | 
			
		||||
    # Make sure that internal component names are synced and listeners are
 | 
			
		||||
    # defined before initializing further
 | 
			
		||||
    nlp._link_components()
 | 
			
		||||
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
 | 
			
		||||
        if T["max_epochs"] == -1:
 | 
			
		||||
| 
						 | 
				
			
@@ -215,9 +216,14 @@ def convert_vectors(
     prune: int,
     name: Optional[str] = None,
     mode: str = VectorsMode.default,
+    attr: str = "ORTH",
 ) -> None:
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
+        if attr != "ORTH":
+            raise ValueError(
+                "ORTH is the only attribute supported for vectors in .npz format."
+            )
         nlp.vocab.vectors = Vectors(
             strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb"))
         )

@@ -245,11 +251,15 @@ def convert_vectors(
                 nlp.vocab.vectors = Vectors(
                     strings=nlp.vocab.strings,
                     data=vectors_data,
+                    attr=attr,
                     **floret_settings,
                 )
             else:
                 nlp.vocab.vectors = Vectors(
-                    strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
+                    strings=nlp.vocab.strings,
+                    data=vectors_data,
+                    keys=vector_keys,
+                    attr=attr,
                 )
                 nlp.vocab.deduplicate_vectors()
     if name is None:

@@ -547,7 +547,7 @@ def load_model_from_path(
     if not meta:
         meta = get_model_meta(model_path)
     config_path = model_path / "config.cfg"
-    overrides = dict_to_dot(config)
+    overrides = dict_to_dot(config, for_overrides=True)
     config = load_config(config_path, overrides=overrides)
     nlp = load_model_from_config(
         config,

@@ -1525,14 +1525,19 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
     return result


-def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
+def dict_to_dot(obj: Dict[str, dict], *, for_overrides: bool = False) -> Dict[str, Any]:
     """Convert dot notation to a dict. For example: {"token": {"pos": True,
     "_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.

-    values (Dict[str, dict]): The dict to convert.
+    obj (Dict[str, dict]): The dict to convert.
+    for_overrides (bool): Whether to enable special handling for registered
+        functions in overrides.
     RETURNS (Dict[str, Any]): The key/value pairs.
     """
-    return {".".join(key): value for key, value in walk_dict(obj)}
+    return {
+        ".".join(key): value
+        for key, value in walk_dict(obj, for_overrides=for_overrides)
+    }


 def dot_to_object(config: Config, section: str):

@@ -1574,13 +1579,20 @@ def set_dot_to_object(config: Config, section: str, value: Any) -> None:


 def walk_dict(
-    node: Dict[str, Any], parent: List[str] = []
+    node: Dict[str, Any], parent: List[str] = [], *, for_overrides: bool = False
 ) -> Iterator[Tuple[List[str], Any]]:
-    """Walk a dict and yield the path and values of the leaves."""
+    """Walk a dict and yield the path and values of the leaves.
+
+    for_overrides (bool): Whether to treat registered functions that start with
+        @ as final values rather than dicts to traverse.
+    """
     for key, value in node.items():
         key_parent = [*parent, key]
-        if isinstance(value, dict):
-            yield from walk_dict(value, key_parent)
+        if isinstance(value, dict) and (
+            not for_overrides
+            or not any(value_key.startswith("@") for value_key in value)
+        ):
+            yield from walk_dict(value, key_parent, for_overrides=for_overrides)
         else:
             yield (key_parent, value)
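
Note: `for_overrides=True` stops the walk at registered-function blocks, so a `{"@scorers": ...}` dict survives as a single override value:

```python
from spacy import util

cfg = {
    "components": {
        "attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}
    }
}
# The registered-function block is kept whole instead of being flattened to
# "components.attribute_ruler.scorer.@scorers".
assert util.dict_to_dot(cfg, for_overrides=True) == {
    "components.attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}
}
```
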
@@ -15,9 +15,11 @@ from thinc.api import Ops, get_array_module, get_current_ops
 from thinc.backends import get_array_ops
 from thinc.types import Floats2d

+from .attrs cimport ORTH, attr_id_t
 from .strings cimport StringStore

 from . import util
+from .attrs import IDS
 from .errors import Errors, Warnings
 from .strings import get_string_id

@@ -64,8 +66,9 @@ cdef class Vectors:
     cdef readonly uint32_t hash_seed
     cdef readonly unicode bow
     cdef readonly unicode eow
+    cdef readonly attr_id_t attr

-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
         """Create a new vector store.

         strings (StringStore): The string store.

@@ -80,6 +83,8 @@ cdef class Vectors:
         hash_seed (int): The floret hash seed (default: 0).
         bow (str): The floret BOW string (default: "<").
         eow (str): The floret EOW string (default: ">").
+        attr (Union[int, str]): The token attribute for the vector keys
+            (default: "ORTH").

         DOCS: https://spacy.io/api/vectors#init
         """

@@ -103,6 +108,14 @@ cdef class Vectors:
         self.hash_seed = hash_seed
         self.bow = bow
         self.eow = eow
+        if isinstance(attr, (int, long)):
+            self.attr = attr
+        else:
+            attr = attr.upper()
+            if attr == "TEXT":
+                attr = "ORTH"
+            self.attr = IDS.get(attr, ORTH)
+
         if self.mode == Mode.default:
             if data is None:
                 if shape is None:
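
Note: the constructor accepts either a string attribute name (normalized, with `"TEXT"` mapped to `"ORTH"`) or an integer attribute ID. A minimal sketch:

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

data = numpy.zeros((3, 4), dtype="f")
# Keys in this table are stored under the LOWER attribute; attr="TEXT" would
# be normalized to ORTH, and integer IDs from spacy.attrs are accepted as-is.
vectors = Vectors(strings=StringStore(), data=data, keys=["a", "b", "c"], attr="LOWER")
```
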
@@ -546,6 +559,7 @@ cdef class Vectors:
                 "hash_seed": self.hash_seed,
                 "bow": self.bow,
                 "eow": self.eow,
+                "attr": self.attr,
             }

     def _set_cfg(self, cfg):

@@ -556,6 +570,7 @@ cdef class Vectors:
         self.hash_seed = cfg.get("hash_seed", 0)
         self.bow = cfg.get("bow", "<")
         self.eow = cfg.get("eow", ">")
+        self.attr = cfg.get("attr", ORTH)

     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.

@@ -365,8 +365,13 @@ cdef class Vocab:
             self[orth]
         # Make prob negative so it sorts by rank ascending
         # (key2row contains the rank)
-        priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
-                    for lex in self if lex.orth in self.vectors.key2row]
+        priority = []
+        cdef Lexeme lex
+        cdef attr_t value
+        for lex in self:
+            value = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
+            if value in self.vectors.key2row:
+                priority.append((-lex.prob, self.vectors.key2row[value], value))
         priority.sort()
         indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")

@@ -399,8 +404,10 @@ cdef class Vocab:
         """
         if isinstance(orth, str):
             orth = self.strings.add(orth)
-        if self.has_vector(orth):
-            return self.vectors[orth]
+        cdef Lexeme lex = self[orth]
+        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
+        if self.has_vector(key):
+            return self.vectors[key]
         xp = get_array_module(self.vectors.data)
         vectors = xp.zeros((self.vectors_length,), dtype="f")
         return vectors

@@ -416,15 +423,16 @@ cdef class Vocab:
         """
         if isinstance(orth, str):
             orth = self.strings.add(orth)
-        if self.vectors.is_full and orth not in self.vectors:
+        cdef Lexeme lex = self[orth]
+        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
+        if self.vectors.is_full and key not in self.vectors:
             new_rows = max(100, int(self.vectors.shape[0]*1.3))
             if self.vectors.shape[1] == 0:
                 width = vector.size
             else:
                 width = self.vectors.shape[1]
             self.vectors.resize((new_rows, width))
-        lex = self[orth]  # Add word to vocab if necessary
-        row = self.vectors.add(orth, vector=vector)
+        row = self.vectors.add(key, vector=vector)
         if row >= 0:
             lex.rank = row

@@ -439,7 +447,9 @@ cdef class Vocab:
         """
         if isinstance(orth, str):
             orth = self.strings.add(orth)
-        return orth in self.vectors
+        cdef Lexeme lex = self[orth]
+        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
+        return key in self.vectors

     property lookups:
         def __get__(self):
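
Note: with `get_vector`/`set_vector`/`has_vector` all resolving keys through `vectors.attr`, vocab lookups become effectively case-insensitive for a `LOWER`-keyed table, mirroring `test_vectors_attr` above:

```python
import numpy
from spacy.lang.en import English
from spacy.vectors import Vectors

nlp = English()
data = numpy.zeros((3, 3), dtype="f")
nlp.vocab.vectors = Vectors(data=data, keys=["a", "b", "c"], attr="LOWER")

# Setting a vector for "D" stores it under LOWER("D") == "d", so both the
# upper- and lowercase lexemes report a vector afterwards.
nlp.vocab.set_vector("D", numpy.asarray([1.0, 2.0, 3.0], dtype="f"))
assert nlp.vocab["D"].has_vector and nlp.vocab["d"].has_vector
```
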
@@ -303,7 +303,7 @@ mapped to a zero vector. See the documentation on
 | `nM`        | The width of the static vectors. ~~Optional[int]~~ |
 | `dropout`   | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ |
 | `init_W`    | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]]], FloatsXd]~~ |
-| `key_attr`  | Defaults to `"ORTH"`. ~~str~~ |
+| `key_attr`  | This setting is ignored in spaCy v3.6+. To set a custom key attribute for vectors, configure it through [`Vectors`](/api/vectors) or [`spacy init vectors`](/api/cli#init-vectors). Defaults to `"ORTH"`. ~~str~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |

 ### spacy.FeatureExtractor.v1 {id="FeatureExtractor"}

@@ -211,7 +211,8 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
 | `--prune`, `-p`    | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
-| `--mode`, `-m`     | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
+| `--mode`, `-m`     | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~str \(option)~~ |
+| `--attr`, `-a`     | Token attribute to use for vectors, e.g. `LOWER` or `NORM`. Defaults to `ORTH`. ~~str \(option)~~ |
 | `--name`, `-n`     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
 | `--verbose`, `-V`  | Print additional information and explanations. ~~bool (flag)~~ |
 | `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~ |

@@ -60,6 +60,7 @@ modified later.
 | `hash_seed` <Tag variant="new">3.2</Tag>  | The floret hash seed (default: `0`). ~~int~~ |
 | `bow` <Tag variant="new">3.2</Tag>        | The floret BOW string (default: `"<"`). ~~str~~ |
 | `eow` <Tag variant="new">3.2</Tag>        | The floret EOW string (default: `">"`). ~~str~~ |
+| `attr` <Tag variant="new">3.6</Tag>       | The token attribute for the vector keys (default: `"ORTH"`). ~~Union[int, str]~~ |

 ## Vectors.\_\_getitem\_\_ {id="getitem",tag="method"}

			@ -453,8 +454,9 @@ Load state from a binary string.
 | 
			
		|||
 | 
			
		||||
## Attributes {id="attributes"}
 | 
			
		||||
 | 
			
		||||

| Name      | Description                                                                                                                                                          |
| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `data`    | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~   |
| `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~                                                                               |
| `keys`    | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |

| Name                                | Description                                                                                                                                                          |
| ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `data`                              | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~   |
| `key2row`                           | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~                                                                               |
| `keys`                              | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys. ~~int~~                                                                                                                     |
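
The invariant between `keys`, `key2row` and `data` can be spelled out in a
few lines, assuming `vectors` is a populated, default-mode instance like the
one sketched earlier:

```python
# Look up one stored key and verify the key/row round trip.
key = vectors.strings["apple"]  # hash of the stored string
row = vectors.key2row[key]      # row index into the data table
assert vectors.keys[row] == key
vector = vectors.data[row]
```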

@ -11,7 +11,6 @@ menu:
  - ['Custom Functions', 'custom-functions']
  - ['Initialization', 'initialization']
  - ['Data Utilities', 'data']
  - ['Parallel Training', 'parallel-training']
  - ['Internal API', 'api']
---

@ -1565,77 +1564,6 @@ token-based annotations like the dependency parse or entity labels, you'll need
to take care to adjust the `Example` object so its annotations match and remain
valid.
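
For instance, a minimal sketch of rebuilding an `Example` from character
offsets so the entity annotation is re-aligned to the current tokenization
(the text and spans are hypothetical):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("Apple is opening a store in San Francisco")
# Character offsets are re-aligned against the current tokenization.
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG"), (28, 41, "GPE")]})
```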
## Parallel & distributed training with Ray {id="parallel-training"}

> #### Installation
>
> ```bash
> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```

[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. Parallel
training won't always be faster though – it depends on your batch size, models,
and hardware.

<Infobox variant="warning">

To use Ray with spaCy, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.

</Infobox>

The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
setup. You can optionally set the `--address` option to point to your Ray
cluster. If it's not set, Ray will run locally.

```bash
python -m spacy ray train config.cfg --n-workers 2
```

<Project id="integrations/ray">

Get started with parallel training using our project template. It trains a
simple model on a Universal Dependencies Treebank and lets you parallelize the
training with Ray.

</Project>

### How parallel training works {id="parallel-training-details"}

Each worker receives a shard of the **data** and builds a copy of the **model
and optimizer** from the [`config.cfg`](#config). It also has a communication
channel to **pass gradients and parameters** to the other workers. Additionally,
each worker is given ownership of a subset of the parameter arrays. Every
parameter array is owned by exactly one worker, and the workers are given a
mapping so they know which worker owns which parameter.
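
As a rough sketch of such a mapping (this is not the `spacy-ray`
implementation, and the parameter names are hypothetical), a deterministic
round-robin assignment is enough, since every worker can derive it
independently:

```python
from typing import Dict, List

def assign_owners(param_ids: List[str], n_workers: int) -> Dict[str, int]:
    """Round-robin over a sorted list, so every worker derives the
    identical parameter-to-owner mapping without any coordination."""
    return {pid: i % n_workers for i, pid in enumerate(sorted(param_ids))}

owners = assign_owners(["ner.lower", "tok2vec.embed", "tok2vec.encode"], 2)
assert owners["ner.lower"] in range(2)
```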
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the gradient.
Workers use the gradients they receive and the ones they compute locally to
update the parameters they own, and then broadcast the updated array and a new
version ID to the other workers.
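
A sketch of the version gate on the owning worker's side (again hypothetical,
not the actual `spacy-ray` code):

```python
import numpy

class OwnedParam:
    """A parameter array owned by this worker, plus a version counter."""

    def __init__(self, array: numpy.ndarray):
        self.array = array
        self.version = 0

    def receive_gradient(
        self, grad: numpy.ndarray, sender_version: int, lr: float = 0.001
    ) -> None:
        # Discard gradients computed against a stale parameter version.
        if sender_version != self.version:
            return
        self.array -= lr * grad
        self.version += 1  # then broadcast (array, version) to the peers
```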
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates; they do not have to pull
them and block on the result, so the transfers can happen in the background,
overlapped with the actual training work. The workers also do not have to stop
and wait for each other ("synchronize") at the start of each batch. This is very
useful for spaCy, because spaCy is often trained on long documents, which means
**batches can vary in size** significantly. Uneven workloads make synchronous
gradient descent inefficient, because if one batch is slow, all of the other
workers are stuck waiting for it to complete before they can continue.

## Internal training API {id="api"}

<Infobox variant="danger">

@ -4372,7 +4372,7 @@

            "code_example": [
 | 
			
		||||
                "import spacy",
 | 
			
		||||
                "",
 | 
			
		||||
                "nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
 | 
			
		||||
                "nlp = spacy.load(\"en_core_web_sm\", exclude=[\"ner\"])",
 | 
			
		||||
                "nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
 | 
			
		||||
                "",
 | 
			
		||||
                "text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",

@ -13,6 +13,8 @@ import 'prismjs/components/prism-json.min.js'
import 'prismjs/components/prism-markdown.min.js'
import 'prismjs/components/prism-python.min.js'
import 'prismjs/components/prism-yaml.min.js'
import 'prismjs/components/prism-docker.min.js'
import 'prismjs/components/prism-r.min.js'

import { isString } from './util'
import Link, { OptionalLink } from './link'

@ -172,7 +174,7 @@ const convertLine = ({ line, prompt, lang }) => {
        return handlePromot({ lineFlat, prompt })
    }

    return lang === 'none' || !lineFlat ? (
    return lang === 'none' || !lineFlat || !(lang in Prism.languages) ? (
        lineFlat
    ) : (
        <span