Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

Merge branch 'develop' into nightly.spacy.io

Commit 24f5fe8839
@@ -36,3 +36,44 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+scikit-learn
+------------
+
+* Files: scorer.py
+
+The following implementation of roc_auc_score() is adapted from
+scikit-learn, which is distributed under the following license:
+
+New BSD License
+
+Copyright (c) 2007–2019 The scikit-learn developers.
+All rights reserved.
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  a. Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+  b. Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+  c. Neither the name of the Scikit-learn Developers  nor the names of
+     its contributors may be used to endorse or promote products
+     derived from this software without specific prior written
+     permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.

@@ -35,7 +35,7 @@ def download_cli(
 
 
 def download(model: str, direct: bool = False, *pip_args) -> None:
-    if not is_package("spacy") and "--no-deps" not in pip_args:
+    if not (is_package("spacy") or is_package("spacy-nightly")) and "--no-deps" not in pip_args:
         msg.warn(
             "Skipping pipeline package dependencies and setting `--no-deps`. "
             "You don't seem to have the spaCy package itself installed "
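For orientation: the hunk above widens the CLI's installed-package check so that the nightly distribution also counts as having spaCy installed. A standalone sketch of what an `is_package`-style check does, using only the standard library (spaCy's real helper lives in `spacy.util`; the names below are illustrative):

```python
# Illustrative stand-in for an is_package-style check, built on the stdlib.
from importlib.metadata import version, PackageNotFoundError

def is_installed(name: str) -> bool:
    try:
        version(name)  # raises PackageNotFoundError if not installed
        return True
    except PackageNotFoundError:
        return False

# The patched condition: only skip dependency resolution when neither the
# stable nor the nightly spaCy distribution is present.
skip_deps = not (is_installed("spacy") or is_installed("spacy-nightly"))
```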

@@ -103,6 +103,9 @@ def package(
             )
     Path.mkdir(package_path, parents=True)
     shutil.copytree(str(input_dir), str(package_path / model_name_v))
+    license_path = package_path / model_name_v / "LICENSE"
+    if license_path.exists():
+        shutil.move(str(license_path), str(main_path))
     create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
     create_file(main_path / "setup.py", TEMPLATE_SETUP)
     create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)

@@ -238,7 +241,7 @@ if __name__ == '__main__':
 
 TEMPLATE_MANIFEST = """
 include meta.json
-include config.cfg
+include LICENSE
 """.strip()
 
 
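Taken together, these two hunks make the packaging step carry an existing LICENSE file from the trained pipeline directory up into the package root, where the updated MANIFEST.in template will include it in the built archive. A minimal sketch of the moved-file logic, with illustrative path arguments:

```python
# Sketch of the new LICENSE handling (illustrative names). If the trained
# pipeline directory ships a LICENSE file, it is hoisted to the package
# root, where MANIFEST.in's "include LICENSE" picks it up at build time.
import shutil
from pathlib import Path

def move_license(package_path: Path, model_name_v: str, main_path: Path) -> None:
    license_path = package_path / model_name_v / "LICENSE"
    if license_path.exists():  # only move it if the pipeline ships one
        shutil.move(str(license_path), str(main_path))
```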

@@ -125,8 +125,9 @@ class Warnings:
 class Errors:
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
-            "This usually happens when spaCy calls `nlp.{method}` with custom "
+            "This usually happens when spaCy calls `nlp.{method}` with a custom "
             "component name that's not registered on the current language class. "
+            "If you're using a Transformer, make sure to install 'spacy-transformers'. "
             "If you're using a custom component, make sure you've added the "
             "decorator `@Language.component` (for function components) or "
             "`@Language.factory` (for class components).\n\nAvailable "

@@ -67,9 +67,6 @@ class Morphologizer(Tagger):
         vocab: Vocab,
         model: Model,
         name: str = "morphologizer",
-        *,
-        labels_morph: Optional[dict] = None,
-        labels_pos: Optional[dict] = None,
     ):
         """Initialize a morphologizer.
 

@@ -77,8 +74,6 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        labels_morph (dict): Mapping of morph + POS tags to morph labels.
-        labels_pos (dict): Mapping of morph + POS tags to POS tags.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#init
         """

@@ -90,7 +85,7 @@ class Morphologizer(Tagger):
         # store mappings from morph+POS labels to token-level annotations:
         # 1) labels_morph stores a mapping from morph+POS->morph
         # 2) labels_pos stores a mapping from morph+POS->POS
-        cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
+        cfg = {"labels_morph": {}, "labels_pos": {}}
         self.cfg = dict(sorted(cfg.items()))
 
     @property

@@ -47,7 +47,7 @@ class MultitaskObjective(Tagger):
     side-objective.
     """
 
-    def __init__(self, vocab, model, name="nn_labeller", *, labels, target):
+    def __init__(self, vocab, model, name="nn_labeller", *, target):
         self.vocab = vocab
         self.model = model
         self.name = name

@@ -67,7 +67,7 @@ class MultitaskObjective(Tagger):
             self.make_label = target
         else:
             raise ValueError(Errors.E016)
-        cfg = {"labels": labels or {}, "target": target}
+        cfg = {"labels": {}, "target": target}
         self.cfg = dict(cfg)
 
     @property

@@ -81,15 +81,18 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def initialize(self, get_examples, nlp=None):
+    def initialize(self, get_examples, nlp=None, labels=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)
-        for example in get_examples():
-            for token in example.y:
-                label = self.make_label(token)
-                if label is not None and label not in self.labels:
-                    self.labels[label] = len(self.labels)
+        if labels is not None:
+            self.labels = labels
+        else:
+            for example in get_examples():
+                for token in example.y:
+                    label = self.make_label(token)
+                    if label is not None and label not in self.labels:
+                        self.labels[label] = len(self.labels)
         self.model.initialize()   # TODO: fix initialization by defining X and Y
 
     def predict(self, docs):
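The new `labels` argument to `initialize()` lets a caller hand over a precomputed label inventory instead of rescanning the training data. The pattern in isolation, as a runnable sketch with illustrative names:

```python
# Standalone sketch of the labels-override pattern introduced above.
def build_label_table(get_examples, make_label, labels=None):
    if labels is not None:
        return dict(labels)  # trust the caller-provided inventory
    table = {}
    for example in get_examples():
        for token in example.y:
            label = make_label(token)
            if label is not None and label not in table:
                table[label] = len(table)  # assign the next free index
    return table
```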

@@ -61,14 +61,13 @@ class Tagger(TrainablePipe):
 
     DOCS: https://nightly.spacy.io/api/tagger
     """
-    def __init__(self, vocab, model, name="tagger", *, labels=None):
+    def __init__(self, vocab, model, name="tagger"):
         """Initialize a part-of-speech tagger.
 
         vocab (Vocab): The shared vocabulary.
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        labels (List): The set of labels. Defaults to None.
 
         DOCS: https://nightly.spacy.io/api/tagger#init
         """

@@ -76,7 +75,7 @@ class Tagger(TrainablePipe):
         self.model = model
         self.name = name
         self._rehearsal_model = None
-        cfg = {"labels": labels or []}
+        cfg = {"labels": []}
         self.cfg = dict(sorted(cfg.items()))
 
     @property
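With `labels` gone from the constructor, a tagger now starts with an empty label set, and the inventory is supplied while the pipeline is assembled, for instance with `add_label()` before `nlp.initialize()`. A minimal usage sketch, assuming the v3 nightly API and illustrative tag names:

```python
import spacy

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
for tag in ("NOUN", "VERB"):   # illustrative tag set
    tagger.add_label(tag)
nlp.initialize()               # cfg["labels"] is populated here, not in __init__
```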

@@ -720,44 +720,10 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
         }
 
 
-#############################################################################
-#
 # The following implementation of roc_auc_score() is adapted from
-# scikit-learn, which is distributed under the following license:
-#
-# New BSD License
-#
+# scikit-learn, which is distributed under the New BSD License.
 # Copyright (c) 2007–2019 The scikit-learn developers.
-# All rights reserved.
+# See licenses/3rd_party_licenses.txt
-#
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#   a. Redistributions of source code must retain the above copyright notice,
-#      this list of conditions and the following disclaimer.
-#   b. Redistributions in binary form must reproduce the above copyright
-#      notice, this list of conditions and the following disclaimer in the
-#      documentation and/or other materials provided with the distribution.
-#   c. Neither the name of the Scikit-learn Developers  nor the names of
-#      its contributors may be used to endorse or promote products
-#      derived from this software without specific prior written
-#      permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-# DAMAGE.
-
-
 def _roc_auc_score(y_true, y_score):
     """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
     from prediction scores.
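This hunk only trims the inline license text down to a pointer at `licenses/3rd_party_licenses.txt`; the adapted `_roc_auc_score()` itself is unchanged. For orientation, a tiny self-contained illustration of what ROC AUC measures, via the rank-based definition (a sketch, not spaCy's or scikit-learn's implementation):

```python
# AUC as the probability that a random positive outranks a random negative,
# with ties counted as half. Binary labels only; illustrative code.
def roc_auc(y_true, y_score):
    pos = [s for t, s in zip(y_true, y_score) if t == 1]
    neg = [s for t, s in zip(y_true, y_score) if t == 0]
    pairs = [(p > n) + 0.5 * (p == n) for p in pos for n in neg]
    return sum(pairs) / len(pairs)

assert roc_auc([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]) == 0.75
```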

@@ -1,35 +1,38 @@
-from thinc.api import fix_random_seed
+import pytest
+from thinc.api import Config, fix_random_seed
+
 from spacy.lang.en import English
+from spacy.pipeline.textcat import default_model_config, bow_model_config
+from spacy.pipeline.textcat import cnn_model_config
 from spacy.tokens import Span
 from spacy import displacy
 from spacy.pipeline import merge_entities
+from spacy.training import Example
 
 
-def test_issue5551():
+@pytest.mark.parametrize(
+    "textcat_config", [default_model_config, bow_model_config, cnn_model_config]
+)
+def test_issue5551(textcat_config):
     """Test that after fixing the random seed, the results of the pipeline are truly identical"""
     component = "textcat"
-    pipe_cfg = {
-        "model": {
-            "@architectures": "spacy.TextCatBOW.v1",
-            "exclusive_classes": True,
-            "ngram_size": 2,
-            "no_output_layer": False,
-        }
-    }
+
+    pipe_cfg = Config().from_str(textcat_config)
     results = []
     for i in range(3):
         fix_random_seed(0)
         nlp = English()
-        example = (
-            "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
-            {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
-        )
+        text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
+        annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}
         pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
-        for label in set(example[1]["cats"]):
+        for label in set(annots["cats"]):
             pipe.add_label(label)
+        # Train
         nlp.initialize()
+        doc = nlp.make_doc(text)
+        nlp.update([Example.from_dict(doc, annots)])
         # Store the result of each iteration
-        result = pipe.model.predict([nlp.make_doc(example[0])])
+        result = pipe.model.predict([doc])
         results.append(list(result[0]))
     # All results should be the same because of the fixed seed
     assert len(results) == 3
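The regression test relies on `fix_random_seed` making repeated runs bit-for-bit identical. Reduced to its core, the determinism pattern looks like this (a sketch; `fix_random_seed` seeds Python's and NumPy's generators, plus GPU frameworks when available):

```python
import numpy
from thinc.api import fix_random_seed

draws = []
for _ in range(3):
    fix_random_seed(0)  # reset all RNGs before each run
    draws.append(numpy.random.random(5).tolist())
# Identical seeds must give identical results
assert draws[0] == draws[1] == draws[2]
```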

@@ -72,6 +72,10 @@ def test_readers():
 def test_cat_readers(reader, additional_config):
     nlp_config_string = """
     [training]
+    seed = 0
+
+    [training.score_weights]
+    cats_macro_auc = 1.0
 
     [corpora]
     @readers = "PLACEHOLDER"
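The embedded config string is parsed with thinc's `Config` elsewhere in the test; a minimal sketch of how the newly added `[training.score_weights]` section round-trips through it:

```python
from thinc.api import Config

# Config parses the INI-style string into nested, typed sections.
config = Config().from_str("""
[training]
seed = 0

[training.score_weights]
cats_macro_auc = 1.0
""")
assert config["training"]["seed"] == 0
assert config["training"]["score_weights"]["cats_macro_auc"] == 1.0
```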

@@ -92,9 +96,7 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    T = registry.resolve(
-        nlp.config["training"].interpolate(), schema=ConfigSchemaTraining
-    )
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     optimizer = T["optimizer"]

@@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..errors import Errors
-from ..util import registry, load_model_from_config, resolve_dot_names
+from ..util import registry, load_model_from_config, dot_to_object
 
 
 def pretrain(

@@ -38,7 +38,8 @@ def pretrain(
     _config = nlp.config.interpolate()
     T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
     P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
-    corpus = resolve_dot_names(_config, [P["corpus"]])[0]
+    corpus = dot_to_object(_config, P["corpus"])
+    corpus = registry.resolve({"corpus": corpus})["corpus"]
     batcher = P["batcher"]
     model = create_pretraining_model(nlp, P)
     optimizer = P["optimizer"]
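`dot_to_object` walks a dotted section path into the still-unresolved config, after which the corpus block is resolved on its own. A rough stand-in for the lookup on a toy config dict (the reader name is illustrative):

```python
# Rough stand-in for spacy.util.dot_to_object on a plain dict.
def dot_to_object(config: dict, path: str):
    node = config
    for key in path.split("."):
        node = node[key]
    return node

config = {"corpora": {"pretrain": {"@readers": "spacy.JsonlCorpus.v1"}}}
assert dot_to_object(config, "corpora.pretrain")["@readers"] == "spacy.JsonlCorpus.v1"
```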

@@ -143,10 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representation. The features used
-can be configured with the `attrs` argument. The suggested attributes are
-`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
-some subword information, without construction a fully character-based
+a feed-forward subnetwork to build a mixed representation. The features used can
+be configured with the `attrs` argument. The suggested attributes are `NORM`,
+`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
+subword information, without construction a fully character-based
 representation. If pretrained vectors are available, they can be included in the
 representation as well, with the vectors table will be kept static (i.e. it's
 not updated).

@@ -393,11 +393,12 @@ operate over wordpieces, which usually don't align one-to-one against spaCy
 tokens. The layer therefore requires a reduction operation in order to calculate
 a single token vector given zero or more wordpiece vectors.
 
 | Name          | Description                                                                                                                                                                                                                                                                                                                                      |
 | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `pooling`     | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~                                                                                               |
 | `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~                                                                    |
+| `upstream`    | A string to identify the "upstream" `Transformer` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Transformer` component. You'll almost never have multiple upstream `Transformer` components, so the wildcard string will almost always be fine. ~~str~~ |
 | **CREATES**   | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                           |
 
 ### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
 

@@ -563,7 +564,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`.
 
 <Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
 
-The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument.
+The v1 was functionally similar, but used an internal `tok2vec` instead of
+taking it as argument.
 
 | Name                 | Description                                                                                                                                                                                      |
 | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |

@@ -66,9 +66,6 @@ shortcut for this and instantiate the component using its string name and
 | `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                     |
 | `model`        | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                  |
-| _keyword-only_ |                                                                                                                      |
-| `labels_morph` | Mapping of morph + POS tags to morph labels. ~~Dict[str, str]~~                                                      |
-| `labels_pos`   | Mapping of morph + POS tags to POS tags. ~~Dict[str, str]~~                                                          |
 
 ## Morphologizer.\_\_call\_\_ {#call tag="method"}
 

@@ -21,16 +21,12 @@ architectures and their arguments and hyperparameters.
 >
 > ```python
 > from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
-> config = {
->    "set_morphology": False,
->    "model": DEFAULT_TAGGER_MODEL,
-> }
+> config = {"model": DEFAULT_TAGGER_MODEL}
 > nlp.add_pipe("tagger", config=config)
 > ```
 
 | Setting          | Description                                                                                                                                                                                                                                                                                            |
 | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `set_morphology` | Whether to set morphological features. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                   |
 | `model`          | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
 
 ```python

@@ -63,8 +59,6 @@ shortcut for this and instantiate the component using its string name and
 | `vocab`          | The shared vocabulary. ~~Vocab~~                                                                                                                                                                                                                      |
 | `model`          | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name`           | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                                                                                                                                   |
-| _keyword-only_   |                                                                                                                                                                                                                                                       |
-| `set_morphology` | Whether to set morphological features. ~~bool~~                                                                                                                                                                                                       |
 
 ## Tagger.\_\_call\_\_ {#call tag="method"}
 

@@ -502,7 +502,7 @@ with Model.define_operators({">>": chain}):
 
 ## Create new trainable components {#components}
 
-In addition to [swapping out](#swap-architectures) default models in built-in
+In addition to [swapping out](#swap-architectures) layers in existing
 components, you can also implement an entirely new,
 [trainable](/usage/processing-pipelines#trainable-components) pipeline component
 from scratch. This can be done by creating a new class inheriting from

@@ -523,20 +523,28 @@ overview of the `TrainablePipe` methods used by
 This section outlines an example use-case of implementing a **novel relation
 extraction component** from scratch. We'll implement a binary relation
 extraction method that determines whether or not **two entities** in a document
-are related, and if so, what type of relation. We'll allow multiple types of
-relations between two such entities (multi-label setting). There are two major
-steps required:
+are related, and if so, what type of relation connects them. We allow multiple
+types of relations between two such entities (a multi-label setting). There are
+two major steps required:
 
 1. Implement a [machine learning model](#component-rel-model) specific to this
-   task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
-   a relation for the available candidate pairs.
-2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
-   machine learning model that sets annotations on the [`Doc`](/api/doc) passing
-   through the pipeline.
+   task. It will have to extract candidate relation instances from a
+   [`Doc`](/api/doc) and predict the corresponding scores for each relation
+   label.
+2. Implement a custom [pipeline component](#component-rel-pipe) - powered by the
+   machine learning model from step 1 - that translates the predicted scores
+   into annotations that are stored on the [`Doc`](/api/doc) objects as they
+   pass through the `nlp` pipeline.
 
-<!-- TODO: <Project id="tutorials/ner-relations">
-
-</Project> -->
+<Project id="tutorials/rel_component">
+Run this example use-case by using our project template. It includes all the
+code to create the ML model and the pipeline component from scratch.
+It also contains two config files to train the model:
+one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer.
+The project applies the relation extraction component to identify biomolecular
+interactions in a sample dataset, but you can easily swap in your own dataset
+for your experiments in any other domain.
+</Project>
 
 #### Step 1: Implementing the Model {#component-rel-model}
 

@@ -552,41 +560,17 @@ matrix** (~~Floats2d~~) of predictions:
 > for details.
 
 ```python
-### Register the model architecture
-@registry.architectures.register("rel_model.v1")
+### The model architecture
+@spacy.registry.architectures.register("rel_model.v1")
 def create_relation_model(...) -> Model[List[Doc], Floats2d]:
     model = ...  # 👈 model will go here
     return model
 ```
 
-The first layer in this model will typically be an
-[embedding layer](/usage/embeddings-transformers) such as a
-[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
-layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
-transforms each **document into a list of tokens**, with each token being
-represented by its embedding in the vector space.
-
-Next, we need a method that **generates pairs of entities** that we want to
-classify as being related or not. As these candidate pairs are typically formed
-within one document, this function takes a [`Doc`](/api/doc) as input and
-outputs a `List` of `Span` tuples. For instance, a very straightforward
-implementation would be to just take any two entities from the same document:
-
-```python
-### Simple candiate generation
-def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
-    candidates = []
-    for ent1 in doc.ents:
-        for ent2 in doc.ents:
-            candidates.append((ent1, ent2))
-    return candidates
-```
-
-But we could also refine this further by **excluding relations** of an entity
-with itself, and posing a **maximum distance** (in number of tokens) between two
-entities. We register this function in the
-[`@misc` registry](/api/top-level#registry) so we can refer to it from the
-config, and easily swap it out for any other candidate generation function.
+We adapt a **modular approach** to the definition of this relation model, and
+define it as chaining two layers together: the first layer that generates an
+instance tensor from a given set of documents, and the second layer that
+transforms the instance tensor into a final tensor holding the predictions:
 
 > #### config.cfg (excerpt)
 >
| 
						 | 
					@ -594,18 +578,159 @@ config, and easily swap it out for any other candidate generation function.
 | 
				
			||||||
> [model]
 | 
					> [model]
 | 
				
			||||||
> @architectures = "rel_model.v1"
 | 
					> @architectures = "rel_model.v1"
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> [model.tok2vec]
 | 
					> [model.create_instance_tensor]
 | 
				
			||||||
> # ...
 | 
					> # ...
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> [model.get_candidates]
 | 
					> [model.classification_layer]
 | 
				
			||||||
> @misc = "rel_cand_generator.v1"
 | 
					> # ...
 | 
				
			||||||
> max_length = 20
 | 
					 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```python
 | 
					```python
 | 
				
			||||||
### Extended candidate generation {highlight="1,2,7,8"}
 | 
					### The model architecture {highlight="6"}
 | 
				
			||||||
@registry.misc.register("rel_cand_generator.v1")
 | 
					@spacy.registry.architectures.register("rel_model.v1")
 | 
				
			||||||
def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
 | 
					def create_relation_model(
 | 
				
			||||||
 | 
					    create_instance_tensor: Model[List[Doc], Floats2d],
 | 
				
			||||||
 | 
					    classification_layer: Model[Floats2d, Floats2d],
 | 
				
			||||||
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
 | 
					    model = chain(create_instance_tensor, classification_layer)
 | 
				
			||||||
 | 
					    return model
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The `classification_layer` could be something like a
 | 
				
			||||||
 | 
					[Linear](https://thinc.ai/docs/api-layers#linear) layer followed by a
 | 
				
			||||||
 | 
					[logistic](https://thinc.ai/docs/api-layers#logistic) activation function:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### config.cfg (excerpt)
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model.classification_layer]
 | 
				
			||||||
 | 
					> @architectures = "rel_classification_layer.v1"
 | 
				
			||||||
 | 
					> nI = null
 | 
				
			||||||
 | 
					> nO = null
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```python
 | 
				
			||||||
 | 
					### The classification layer
 | 
				
			||||||
 | 
					@spacy.registry.architectures.register("rel_classification_layer.v1")
 | 
				
			||||||
 | 
					def create_classification_layer(
 | 
				
			||||||
 | 
					    nO: int = None, nI: int = None
 | 
				
			||||||
 | 
					) -> Model[Floats2d, Floats2d]:
 | 
				
			||||||
 | 
					    return chain(Linear(nO=nO, nI=nI), Logistic())
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The first layer that **creates the instance tensor** can be defined by
 | 
				
			||||||
 | 
					implementing a
 | 
				
			||||||
 | 
					[custom forward function](https://thinc.ai/docs/usage-models#weights-layers-forward)
 | 
				
			||||||
 | 
					with an appropriate backpropagation callback. We also define an
 | 
				
			||||||
 | 
					[initialization method](https://thinc.ai/docs/usage-models#weights-layers-init)
 | 
				
			||||||
 | 
					that ensures that the layer is properly set up for training.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					We omit some of the implementation details here, and refer to the
 | 
				
			||||||
 | 
					[spaCy project](https://github.com/explosion/projects/tree/v3/tutorials/rel_component)
 | 
				
			||||||
 | 
					that has the full implementation.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### config.cfg (excerpt)
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model.create_instance_tensor]
 | 
				
			||||||
 | 
					> @architectures = "rel_instance_tensor.v1"
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.create_instance_tensor.tok2vec]
 | 
				
			||||||
 | 
					> @architectures = "spacy.HashEmbedCNN.v1"
 | 
				
			||||||
 | 
					> # ...
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.create_instance_tensor.pooling]
 | 
				
			||||||
 | 
					> @layers = "reduce_mean.v1"
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.create_instance_tensor.get_instances]
 | 
				
			||||||
 | 
					> # ...
 | 
				
			||||||
 | 
					> ```

```python
### The layer that creates the instance tensor
@spacy.registry.architectures.register("rel_instance_tensor.v1")
def create_tensors(
    tok2vec: Model[List[Doc], List[Floats2d]],
    pooling: Model[Ragged, Floats2d],
    get_instances: Callable[[Doc], List[Tuple[Span, Span]]],
) -> Model[List[Doc], Floats2d]:

    return Model(
        "instance_tensors",
        instance_forward,
        init=instance_init,
        layers=[tok2vec, pooling],
        refs={"tok2vec": tok2vec, "pooling": pooling},
        attrs={"get_instances": get_instances},
    )


# The custom forward function
def instance_forward(
    model: Model[List[Doc], Floats2d],
    docs: List[Doc],
    is_train: bool,
) -> Tuple[Floats2d, Callable]:
    tok2vec = model.get_ref("tok2vec")
    tokvecs, bp_tokvecs = tok2vec(docs, is_train)
    get_instances = model.attrs["get_instances"]
    all_instances = [get_instances(doc) for doc in docs]
    pooling = model.get_ref("pooling")
    relations = ...

    def backprop(d_relations: Floats2d) -> List[Doc]:
        d_tokvecs = ...
        return bp_tokvecs(d_tokvecs)

    return relations, backprop


# The custom initialization method
def instance_init(
    model: Model,
    X: List[Doc] = None,
    Y: Floats2d = None,
) -> Model:
    tok2vec = model.get_ref("tok2vec")
    tok2vec.initialize(X)
    return model
```

This custom layer uses an [embedding layer](/usage/embeddings-transformers) such
as a [`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer).
This layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
transforms each **document into a list of tokens**, with each token being
represented by its embedding in the vector space.

The `pooling` layer will be applied to summarize the token vectors into **entity
vectors**, as named entities (represented by ~~Span~~ objects) can consist of
one or multiple tokens. For instance, the pooling layer could compute the
average of all token vectors in an entity. Thinc provides several
[built-in pooling operators](https://thinc.ai/docs/api-layers#reduction-ops) for
this purpose.
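
To make the pooling step concrete, the sketch below (with made-up shapes,
independent of the relation model) shows how `reduce_mean.v1` turns a ragged
batch of token vectors into one fixed-size vector per entity:

```python
import numpy
from thinc.api import reduce_mean
from thinc.types import Ragged

# Two entities: the first spans 2 tokens, the second 3, with 4-dimensional
# token vectors. Ragged stores the flat (5, 4) array plus the span lengths.
tokvecs = numpy.random.rand(5, 4).astype("float32")
entities = Ragged(tokvecs, lengths=numpy.asarray([2, 3], dtype="int32"))

pooling = reduce_mean()
entity_vectors = pooling.predict(entities)  # shape (2, 4): one row per entity
```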

Finally, we need a `get_instances` method that **generates pairs of entities**
that we want to classify as being related or not. As these candidate pairs are
typically formed within one document, this function takes a [`Doc`](/api/doc) as
input and outputs a `List` of `Span` tuples. For instance, the following
implementation takes any two entities from the same document, as long as they
are within a **maximum distance** (in number of tokens) of each other:

> #### config.cfg (excerpt)
>
> ```ini
> [model.create_instance_tensor.get_instances]
> @misc = "rel_instance_generator.v1"
> max_length = 100
> ```

```python
### Candidate generation
@spacy.registry.misc.register("rel_instance_generator.v1")
def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
    def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
        candidates = []
        for ent1 in doc.ents:
            for ent2 in doc.ents:
                # keep pairs of distinct entities within max_length tokens
                if ent1 != ent2 and abs(ent2.start - ent1.start) <= max_length:
                    candidates.append((ent1, ent2))
        return candidates
    return get_candidates
```

This function is added to the [`@misc` registry](/api/top-level#registry) so we
can refer to it from the config, and easily swap it out for any other candidate
generation function.
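
As a quick check, the registered function can be resolved by name, which mirrors
what the config machinery does under the hood (a sketch; with a blank pipeline,
`doc.ents` is empty, so no candidates are produced):

```python
import spacy

# Resolve the factory from the registry and build the candidate generator
factory = spacy.registry.misc.get("rel_instance_generator.v1")
get_instances = factory(max_length=100)

nlp = spacy.blank("en")
doc = nlp("Amsterdam is the capital of the Netherlands.")
print(get_instances(doc))  # [] until an NER component sets doc.ents
```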

#### Intermezzo: define how to store the relations data {#component-rel-attribute}

> #### Example output
>
> ```python
> doc = nlp("Amsterdam is the capital of the Netherlands.")
> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
> for value, rel_dict in doc._.rel.items():
>     print(f"{value}: {rel_dict}")
>
> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
> ```

For our new relation extraction component, we will use a custom
[extension attribute](/usage/processing-pipelines#custom-components-attributes)
`doc._.rel` in which we store relation data. The attribute refers to a
dictionary, keyed by the **start offsets of each entity** involved in the
candidate relation. The values in the dictionary refer to another dictionary
where relation labels are mapped to values between 0 and 1. We assume anything
above 0.5 to be a `True` relation. The ~~Example~~ instances that we'll use as
training data will include their gold-standard relation annotations in
`example.reference._.rel`.
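
A sketch of consuming these predictions, reusing the `doc` from the example
output above and the 0.5 threshold:

```python
threshold = 0.5
for (start1, start2), label_scores in doc._.rel.items():
    for label, score in label_scores.items():
        if score >= threshold:
            print(f"tokens {start1} -> {start2}: {label} ({score:.2f})")
# tokens 0 -> 6: CAPITAL_OF (0.89)
# tokens 0 -> 6: LOCATED_IN (0.75)
```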

```python
### Registering the extension attribute
from spacy.tokens import Doc

Doc.set_extension("rel", default={})
```

#### Step 2: Implementing the pipeline component {#component-rel-pipe}

```python
### Pipeline component skeleton
from spacy.pipeline import TrainablePipe

class RelationExtractor(TrainablePipe):
    ...
```

Typically, the **constructor** defines the vocab, the machine learning model,
and the name of this component. Additionally, this component, just like the
`textcat` and the `tagger`, stores an **internal list of labels**. The ML model
will predict scores for each label. We add convenience methods to easily
retrieve and add to them.

```python
### The constructor (continued)
    def __init__(self, vocab, model, name="rel"):
        """Create a component instance."""
        # ...
        self.cfg = {"labels": []}

    @property
    def labels(self) -> Tuple[str]:
        """Returns the labels currently added to the component."""
        return tuple(self.cfg["labels"])

    def add_label(self, label: str):
        """Add a new label to the pipe."""
        self.cfg["labels"] = list(self.labels) + [label]
```
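
For example, assuming `component` is an instance of this class:

```python
component.add_label("CAPITAL_OF")
component.add_label("LOCATED_IN")
print(component.labels)  # ('CAPITAL_OF', 'LOCATED_IN')
```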

After creation, the component needs to be
[initialized](/usage/training#initialization). This method can define the
relevant labels in two ways: explicitly, by setting the `labels` argument in the
[`initialize` block](/api/data-formats#config-initialize) of the config, or
implicitly, by deducing them from the `get_examples` callback that generates the
full **training data set**, or a representative sample.
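
For instance, a hypothetical `initialize` block for a component named
`relation_extractor`, using the labels from the example output above, could look
like this:

```ini
[initialize.components.relation_extractor]
labels = ["CAPITAL_OF", "LOCATED_IN", "UNRELATED"]
```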

The final number of labels defines the output dimensionality of the network, and
will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
layers of the neural network. This is triggered by calling
[`Model.initialize`](https://thinc.ai/api/model#initialize).
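
As a minimal illustration of shape inference in Thinc (made-up dimensions,
separate from the relation model itself):

```python
import numpy
from thinc.api import Linear

model = Linear()  # input and output widths left unspecified
X = numpy.zeros((4, 16), dtype="float32")  # 4 samples, 16 features
Y = numpy.zeros((4, 3), dtype="float32")   # e.g. 3 relation labels
model.initialize(X=X, Y=Y)                 # missing dimensions are inferred
print(model.get_dim("nI"), model.get_dim("nO"))  # 16 3
```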

```python
### The initialize method {highlight="12,15,18,22"}
from itertools import islice

def initialize(
    self,
    get_examples: Callable[[], Iterable[Example]],
    *,
    nlp: Language = None,
    labels: Optional[List[str]] = None,
):
    # ... add the labels, either those passed in explicitly or those deduced
    # from the examples, then call self.model.initialize with a data sample
```

Typically, this happens when the pipeline is set up before training in
[`spacy train`](/api/cli#training). After initialization, the pipeline component
and its internal model can be trained and used to make predictions.

During training, the method [`update`](/api/pipe#update) is invoked, which
delegates to
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
batch of examples:

```python
### The update method
def update(
    self,
    examples: Iterable[Example],
    *,
    # ...
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    # ...
    docs = [eg.predicted for eg in examples]
    predictions, backprop = self.model.begin_update(docs)
    loss, gradient = self.get_loss(examples, predictions)
    backprop(gradient)
    losses[self.name] += loss
    # ...
    return losses
```
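
The `get_loss` method itself is elided in this excerpt. A minimal sketch,
assuming a squared-error loss and a hypothetical `_examples_to_truth` helper
that assembles the gold-standard scores from `example.reference._.rel`:

```python
def get_loss(self, examples, scores):
    # _examples_to_truth is a hypothetical helper returning a Floats2d of
    # gold-standard scores aligned with the rows of `scores`
    truths = self._examples_to_truth(examples)
    gradient = scores - truths
    loss = float((gradient ** 2).sum())
    return loss, gradient
```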

After training the model, the component can be used to make novel
**predictions**. The [`predict`](/api/pipe#predict) method needs to be
implemented for each subclass of `TrainablePipe`. In our case, we can simply
delegate to the internal model's
[predict](https://thinc.ai/docs/api-model#predict) function that takes a batch
of `Doc` objects.
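
The body of `predict` is elided in this excerpt; its signature comes from the
excerpt, and given the delegation described above, a minimal sketch could be:

```python
### The predict method
def predict(self, docs: Iterable[Doc]) -> Floats2d:
    # delegate directly to the internal Thinc model
    return self.model.predict(docs)
```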

The final method that needs to be implemented is
[`set_annotations`](/api/pipe#set_annotations). This function takes the
predictions, and modifies the given `Doc` object in place to store them. For our
relation extraction component, we store the data in the
[custom attribute](#component-rel-attribute) `doc._.rel`.

To interpret the scores predicted by the relation extraction model correctly, we
need to refer to the model's `get_instances` function that defined which pairs
of entities were relevant candidates, so that the predictions can be linked to
those exact entities:

```python
### The set_annotations method {highlight="5-6,10"}
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
    c = 0
    get_instances = self.model.attrs["get_instances"]
    for doc in docs:
        for (e1, e2) in get_instances(doc):
            offset = (e1.start, e2.start)
            if offset not in doc._.rel:
                doc._.rel[offset] = {}
            for j, label in enumerate(self.labels):
                doc._.rel[offset][label] = predictions[c, j]
            c += 1
```

Under the hood, when the pipe is applied to a document, it delegates to the
`predict` and `set_annotations` methods:

```python
### The __call__ method
def __call__(self, doc: Doc):
    predictions = self.predict([doc])
    self.set_annotations([doc], predictions)
    return doc
```

There is one more optional method to implement: [`score`](/api/pipe#score)
calculates the performance of your component on a set of examples, and returns
the results as a dictionary:

```python
### The score method
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
    # ... compute micro-averaged PRF scores over the predicted relations
    return {
        # ... e.g. "rel_micro_p", "rel_micro_r" and "rel_micro_f"
    }
```

This is particularly useful for calculating relevant scores on the development
corpus when training the component with [`spacy train`](/api/cli#training).

Once our `TrainablePipe` subclass is fully implemented, we can
[register](/usage/processing-pipelines#custom-components-factories) the
component with the `@Language.factory` decorator. This
assigns it a name and lets you create the component with
[`nlp.add_pipe`](/api/language#add_pipe) or via the config:

> #### config.cfg (excerpt)
>
> ```ini
> [components.relation_extractor.model]
> @architectures = "rel_model.v1"
> # ...
>
> [training.score_weights]
> rel_micro_p = 0.0
> rel_micro_r = 0.0
> rel_micro_f = 1.0
> ```

```python
### Registering the component factory
from spacy.language import Language

@Language.factory("relation_extractor")
def make_relation_extractor(nlp, name, model):
    return RelationExtractor(nlp.vocab, model, name)
```

<Project id="tutorials/rel_component">

Run this example use case by using our project template. It includes all the
code to create the ML model and the pipeline component from scratch. It
contains two config files to train the model: one to run on CPU with a Tok2Vec
layer, and one for the GPU using a transformer. The project applies the
relation extraction component to identify biomolecular interactions, but you
can easily swap in your own dataset for your experiments in any other domain.

</Project>

The [`Language.update`](/api/language#update) and related training methods now
take batches of [`Example`](/api/example) objects instead of
raw text and a dictionary of annotations.

```python
### Training loop {highlight="5-8,12"}
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
]
examples = []
for text, annots in TRAIN_DATA:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))
nlp.initialize(lambda: examples)
for i in range(20):
    random.shuffle(examples)
    for batch in minibatch(examples, size=8):
        nlp.update(batch)
```
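
For this loop to run, some setup is assumed; a sketch of the surrounding
imports and pipeline creation (an `ner` component is one plausible choice,
given the entity annotations above):

```python
import random

import spacy
from spacy.training import Example
from spacy.util import minibatch

nlp = spacy.blank("en")
nlp.add_pipe("ner")  # labels are deduced from the examples by nlp.initialize
```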

The [`nlp.initialize`](/api/language#initialize) method replaces
`nlp.begin_training` and can take a callback providing the example data. This is
used to initialize the weights of the network, setting up the label scheme:

```diff
- nlp.begin_training()
+ nlp.initialize(lambda: examples)
```