From 789fb3d1247c977887251254e5803bd7700fd970 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 9 Nov 2020 21:42:58 +0100 Subject: [PATCH 01/18] add docs for upstream argument of TransformerListener --- spacy/errors.py | 3 ++- spacy/ml/models/textcat.py | 2 +- website/docs/api/architectures.md | 22 ++++++++++++---------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index f4fd3731f..9cbc4ef1b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -125,8 +125,9 @@ class Warnings: class Errors: E001 = ("No component '{name}' found in pipeline. Available names: {opts}") E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). " - "This usually happens when spaCy calls `nlp.{method}` with custom " + "This usually happens when spaCy calls `nlp.{method}` with a custom " "component name that's not registered on the current language class. " + "If you're using a Transformer, make sure to install 'spacy-transformers'. " "If you're using a custom component, make sure you've added the " "decorator `@Language.component` (for function components) or " "`@Language.factory` (for class components).\n\nAvailable " diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index d4aed2839..181bbcf4c 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -94,7 +94,7 @@ def build_text_classifier( # TODO: move to legacy @registry.architectures.register("spacy.TextCatEnsemble.v1") -def build_text_classifier( +def build_text_classifier_v1( width: int, embed_size: int, pretrained_vectors: Optional[bool], diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 517ab9e7a..479e56f88 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -143,10 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline. Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through -a feed-forward subnetwork to build a mixed representation. The features used -can be configured with the `attrs` argument. The suggested attributes are -`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account -some subword information, without construction a fully character-based +a feed-forward subnetwork to build a mixed representation. The features used can +be configured with the `attrs` argument. The suggested attributes are `NORM`, +`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some +subword information, without construction a fully character-based representation. If pretrained vectors are available, they can be included in the representation as well, with the vectors table will be kept static (i.e. it's not updated). @@ -393,11 +393,12 @@ operate over wordpieces, which usually don't align one-to-one against spaCy tokens. The layer therefore requires a reduction operation in order to calculate a single token vector given zero or more wordpiece vectors. -| Name | Description | -| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. 
If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ | -| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ | +| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ | +| `upstream` | A string to identify the "upstream" `Transformer` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Transformer` component. You'll almost never have multiple upstream `Transformer` components, so the wildcard string will almost always be fine. ~~str~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer} @@ -563,7 +564,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`. -The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument. +The v1 was functionally similar, but used an internal `tok2vec` instead of +taking it as argument. | Name | Description | | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | From fcd79e0655727513a72b45dbd98651dff88244a9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 11 Nov 2020 21:32:34 +0100 Subject: [PATCH 02/18] remove set_morphology from docs --- website/docs/api/tagger.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 2123004b6..f337b51fd 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -21,16 +21,12 @@ architectures and their arguments and hyperparameters. 
> > ```python > from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL -> config = { -> "set_morphology": False, -> "model": DEFAULT_TAGGER_MODEL, -> } +> config = {"model": DEFAULT_TAGGER_MODEL} > nlp.add_pipe("tagger", config=config) > ``` | Setting | Description | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `set_morphology` | Whether to set morphological features. Defaults to `False`. ~~bool~~ | | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python @@ -63,8 +59,6 @@ shortcut for this and instantiate the component using its string name and | `vocab` | The shared vocabulary. ~~Vocab~~ | | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `set_morphology` | Whether to set morphological features. ~~bool~~ | ## Tagger.\_\_call\_\_ {#call tag="method"} From d5a920325f25939721f5895e3367c274cf1ecfe6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 11 Nov 2020 21:34:12 +0100 Subject: [PATCH 03/18] remove labels from constructor --- spacy/pipeline/multitask.pyx | 19 +++++++++++-------- spacy/pipeline/tagger.pyx | 5 ++--- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index e1ea49849..9c7bb5914 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -47,7 +47,7 @@ class MultitaskObjective(Tagger): side-objective. 
""" - def __init__(self, vocab, model, name="nn_labeller", *, labels, target): + def __init__(self, vocab, model, name="nn_labeller", *, target): self.vocab = vocab self.model = model self.name = name @@ -67,7 +67,7 @@ class MultitaskObjective(Tagger): self.make_label = target else: raise ValueError(Errors.E016) - cfg = {"labels": labels or {}, "target": target} + cfg = {"labels": {}, "target": target} self.cfg = dict(cfg) @property @@ -81,15 +81,18 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids): pass - def initialize(self, get_examples, nlp=None): + def initialize(self, get_examples, nlp=None, labels=None): if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) raise ValueError(err) - for example in get_examples(): - for token in example.y: - label = self.make_label(token) - if label is not None and label not in self.labels: - self.labels[label] = len(self.labels) + if labels is not None: + self.labels = labels + else: + for example in get_examples(): + for token in example.y: + label = self.make_label(token) + if label is not None and label not in self.labels: + self.labels[label] = len(self.labels) self.model.initialize() # TODO: fix initialization by defining X and Y def predict(self, docs): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 16633a7b8..08f09b002 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -61,14 +61,13 @@ class Tagger(TrainablePipe): DOCS: https://nightly.spacy.io/api/tagger """ - def __init__(self, vocab, model, name="tagger", *, labels=None): + def __init__(self, vocab, model, name="tagger"): """Initialize a part-of-speech tagger. vocab (Vocab): The shared vocabulary. model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - labels (List): The set of labels. Defaults to None. DOCS: https://nightly.spacy.io/api/tagger#init """ @@ -76,7 +75,7 @@ class Tagger(TrainablePipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": labels or []} + cfg = {"labels": []} self.cfg = dict(sorted(cfg.items())) @property From 73fc1ed963121ce48c78e576b3123bf6e49455a8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 11 Nov 2020 21:48:50 +0100 Subject: [PATCH 04/18] remove labels from morphologizer constructor --- spacy/pipeline/morphologizer.pyx | 7 +------ website/docs/api/morphologizer.md | 3 --- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 305f8f5df..66e0787ef 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -67,9 +67,6 @@ class Morphologizer(Tagger): vocab: Vocab, model: Model, name: str = "morphologizer", - *, - labels_morph: Optional[dict] = None, - labels_pos: Optional[dict] = None, ): """Initialize a morphologizer. @@ -77,8 +74,6 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - labels_morph (dict): Mapping of morph + POS tags to morph labels. - labels_pos (dict): Mapping of morph + POS tags to POS tags. 
DOCS: https://nightly.spacy.io/api/morphologizer#init """ @@ -90,7 +85,7 @@ class Morphologizer(Tagger): # store mappings from morph+POS labels to token-level annotations: # 1) labels_morph stores a mapping from morph+POS->morph # 2) labels_pos stores a mapping from morph+POS->POS - cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}} + cfg = {"labels_morph": {}, "labels_pos": {}} self.cfg = dict(sorted(cfg.items())) @property diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index d32514fb0..fed86ff5d 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -66,9 +66,6 @@ shortcut for this and instantiate the component using its string name and | `vocab` | The shared vocabulary. ~~Vocab~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `labels_morph` | Mapping of morph + POS tags to morph labels. ~~Dict[str, str]~~ | -| `labels_pos` | Mapping of morph + POS tags to POS tags. ~~Dict[str, str]~~ | ## Morphologizer.\_\_call\_\_ {#call tag="method"} From 99d0412b6ebf1af8e5bd4755f2ec8d1947d8cef5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 15 Nov 2020 18:35:56 +0100 Subject: [PATCH 05/18] add link to REL project --- website/docs/usage/layers-architectures.md | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 641db02f5..481a60574 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -534,9 +534,14 @@ steps required: machine learning model that sets annotations on the [`Doc`](/api/doc) passing through the pipeline. - + +Run this example use-case by using our project template. It includes all the +code to create the ML model and the pipeline component from scratch. +It contains two config files to train the model: +one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer. +The project applies the relation extraction component to identify biomolecular +interactions, but you can easily swap in your own dataset for your experiments. + #### Step 1: Implementing the Model {#component-rel-model} @@ -924,6 +929,11 @@ def make_relation_extractor(nlp, name, model): return RelationExtractor(nlp.vocab, model, name) ``` - + +Run this example use-case by using our project template. It includes all the +code to create the ML model and the pipeline component from scratch. +It contains two config files to train the model: +one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer. +The project applies the relation extraction component to identify biomolecular +interactions, but you can easily swap in your own dataset for your experiments. 
+ From 124f49feb6f44fdab65036f1413dba237ba14599 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 20 Nov 2020 15:25:20 +0100 Subject: [PATCH 06/18] update REL model code --- website/docs/usage/layers-architectures.md | 224 ++++++++++++++------- 1 file changed, 155 insertions(+), 69 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 481a60574..60d6224dc 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -540,7 +540,8 @@ code to create the ML model and the pipeline component from scratch. It contains two config files to train the model: one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer. The project applies the relation extraction component to identify biomolecular -interactions, but you can easily swap in your own dataset for your experiments. +interactions, but you can easily swap in your own dataset for your experiments +in any other domain. #### Step 1: Implementing the Model {#component-rel-model} @@ -558,40 +559,17 @@ matrix** (~~Floats2d~~) of predictions: ```python ### Register the model architecture -@registry.architectures.register("rel_model.v1") +@spacy.registry.architectures.register("rel_model.v1") def create_relation_model(...) -> Model[List[Doc], Floats2d]: model = ... # 👈 model will go here return model ``` -The first layer in this model will typically be an -[embedding layer](/usage/embeddings-transformers) such as a -[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This -layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it -transforms each **document into a list of tokens**, with each token being -represented by its embedding in the vector space. +We will adapt a **modular approach** to the definition of this relation model, and +define it as chaining to layers together: the first layer that generates an +instance tensor from a given set of documents, and the second layer that +transforms this tensor into a final tensor holding the predictions: -Next, we need a method that **generates pairs of entities** that we want to -classify as being related or not. As these candidate pairs are typically formed -within one document, this function takes a [`Doc`](/api/doc) as input and -outputs a `List` of `Span` tuples. For instance, a very straightforward -implementation would be to just take any two entities from the same document: - -```python -### Simple candiate generation -def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]: - candidates = [] - for ent1 in doc.ents: - for ent2 in doc.ents: - candidates.append((ent1, ent2)) - return candidates -``` - -But we could also refine this further by **excluding relations** of an entity -with itself, and posing a **maximum distance** (in number of tokens) between two -entities. We register this function in the -[`@misc` registry](/api/top-level#registry) so we can refer to it from the -config, and easily swap it out for any other candidate generation function. > #### config.cfg (excerpt) > @@ -599,17 +577,151 @@ config, and easily swap it out for any other candidate generation function. > [model] > @architectures = "rel_model.v1" > -> [model.tok2vec] +> [model.create_instance_tensor] > # ... > -> [model.get_candidates] -> @misc = "rel_cand_generator.v1" -> max_length = 20 +> [model.classification_layer] +> ... 
> ``` ```python -### Extended candidate generation {highlight="1,2,7,8"} -@registry.misc.register("rel_cand_generator.v1") +### Implement the model architecture +@spacy.registry.architectures.register("rel_model.v1") +def create_relation_model( + create_instance_tensor: Model[List[Doc], Floats2d], + classification_layer: Model[Floats2d, Floats2d], +) -> Model[List[Doc], Floats2d]: + model = chain(create_instance_tensor, classification_layer) + return model +``` + +The `classification_layer` could be something simple like a Linear layer +followed by a logistic activation function: + + +> #### config.cfg (excerpt) +> +> ```ini +> [model.classification_layer] +> @architectures = "rel_classification_layer.v1" +> nI = null +> nO = null +> ``` + +```python +### Implement the classification layer +@spacy.registry.architectures.register("rel_classification_layer.v1") +def create_classification_layer( + nO: int = None, nI: int = None +) -> Model[Floats2d, Floats2d]: + return chain(Linear(nO=nO, nI=nI), Logistic()) +``` + +The first layer that **creates the instance tensor** can be defined +by implementing a +[custom forward function](https://thinc.ai/docs/usage-models#weights-layers-forward) +with an appropriate backpropagation callback. We also define an +[initialization method](https://thinc.ai/docs/usage-models#weights-layers-init) +that ensures that the layer is properly set up for training. + +```python +### Implement the custom forward function +def instance_forward( + model: Model[List[Doc], Floats2d], + docs: List[Doc], + is_train: bool +) -> Tuple[Floats2d, Callable]: + ... + tok2vec = model.get_ref("tok2vec") + tokvecs, bp_tokvecs = tok2vec(docs, is_train) + relations = ... + + def backprop(d_relations: Floats2d) -> List[Doc]: + d_tokvecs = ... + return bp_tokvecs(d_tokvecs) + + return relations, backprop + + +### Implement the custom initialization method +def instance_init( + model: Model, + X: List[Doc] = None, + Y: Floats2d = None +) -> Model: + tok2vec = model.get_ref("tok2vec") + tok2vec.initialize(X) + return model + + +### Implement the layer that creates the instance tensor +@spacy.registry.architectures.register("rel_instance_tensor.v1") +def create_tensors( + tok2vec: Model[List[Doc], List[Floats2d]], + pooling: Model[Ragged, Floats2d], + get_instances: Callable[[Doc], List[Tuple[Span, Span]]], +) -> Model[List[Doc], Floats2d]: + + return Model( + "instance_tensors", + instance_forward, + layers=[tok2vec, pooling], + refs={"tok2vec": tok2vec, "pooling": pooling}, + attrs={"get_instances": get_instances}, + init=instance_init, + ) +``` + +> #### config.cfg (excerpt) +> +> ```ini +> [model.create_instance_tensor] +> @architectures = "rel_instance_tensor.v1" +> +> [model.create_instance_tensor.tok2vec] +> @architectures = "spacy.HashEmbedCNN.v1" +> ... +> +> [model.create_instance_tensor.pooling] +> @layers = "reduce_mean.v1" +> +> [model.create_instance_tensor.get_instances] +> ... +> ` + +This custom layer uses an +**[embedding layer](/usage/embeddings-transformers)** such as a +[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This +layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it +transforms each **document into a list of tokens**, with each token being +represented by its embedding in the vector space. + +The **`pooling`** layer will be applied to summarize the token vectors into entity +vectors, as named entities (represented by `Span` objects) can consist of one +or multiple tokens. 
For instance, the pooling layer could resort to calculating +the average of all token vectors in an entity. Thinc provides several +[built-in pooling operators](https://thinc.ai/docs/api-layers#reduction-ops) for +this purpose. + +> #### config.cfg (excerpt) +> +> ```ini +> +> [model.create_instance_tensor.get_instances] +> @misc = "rel_instance_generator.v1" +> max_length = 100 +> ``` + +Finally, we need a `get_instances` method that **generates pairs of entities** +that we want to classify as being related or not. As these candidate pairs are typically formed +within one document, this function takes a [`Doc`](/api/doc) as input and +outputs a `List` of `Span` tuples. For instance, this +implementation takes any two entities from the same document, as long as they +are within a **maximum distance** (in number of tokens) of eachother: + +```python +### Simple candiate generation +@spacy.registry.misc.register("rel_instance_generator.v1") def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: candidates = [] @@ -621,46 +733,19 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span return candidates return get_candidates ``` +This function in added to the +[`@misc` registry](/api/top-level#registry) so we can refer to it from the +config, and easily swap it out for any other candidate generation function. -Finally, we require a method that transforms the candidate entity pairs into a -2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or -[`Transformer`](/api/transformer). The resulting ~~Floats2~~ object will then be -processed by a final `output_layer` of the network. Putting all this together, -we can define our relation model in a config file as such: - -```ini -### config.cfg -[model] -@architectures = "rel_model.v1" -# ... - -[model.tok2vec] -# ... - -[model.get_candidates] -@misc = "rel_cand_generator.v1" -max_length = 20 - -[model.create_candidate_tensor] -@misc = "rel_cand_tensor.v1" - -[model.output_layer] -@architectures = "rel_output_layer.v1" -# ... -``` - - - When creating this model, we store the custom functions as [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as references, so we can access them easily: ```python -tok2vec_layer = model.get_ref("tok2vec") -output_layer = model.get_ref("output_layer") -create_candidate_tensor = model.attrs["create_candidate_tensor"] -get_candidates = model.attrs["get_candidates"] +pooling = model.get_ref("pooling") +tok2vec = model.get_ref("tok2vec") +get_instances = model.attrs["get_instances"] ``` #### Step 2: Implementing the pipeline component {#component-rel-pipe} @@ -935,5 +1020,6 @@ code to create the ML model and the pipeline component from scratch. It contains two config files to train the model: one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer. The project applies the relation extraction component to identify biomolecular -interactions, but you can easily swap in your own dataset for your experiments. +interactions, but you can easily swap in your own dataset for your experiments +in any other domain. 
From 4a3e611abc1d183da22c5a6c70852d2d21fee7b5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 20 Nov 2020 15:55:05 +0100 Subject: [PATCH 07/18] small fixes and formatting --- website/docs/usage/layers-architectures.md | 214 +++++++++++---------- 1 file changed, 110 insertions(+), 104 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 60d6224dc..2df4745db 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -502,7 +502,7 @@ with Model.define_operators({">>": chain}): ## Create new trainable components {#components} -In addition to [swapping out](#swap-architectures) default models in built-in +In addition to [swapping out](#swap-architectures) layers in existing components, you can also implement an entirely new, [trainable](/usage/processing-pipelines#trainable-components) pipeline component from scratch. This can be done by creating a new class inheriting from @@ -523,25 +523,27 @@ overview of the `TrainablePipe` methods used by This section outlines an example use-case of implementing a **novel relation extraction component** from scratch. We'll implement a binary relation extraction method that determines whether or not **two entities** in a document -are related, and if so, what type of relation. We'll allow multiple types of -relations between two such entities (multi-label setting). There are two major -steps required: +are related, and if so, what type of relation connects them. We allow multiple +types of relations between two such entities (a multi-label setting). There are +two major steps required: 1. Implement a [machine learning model](#component-rel-model) specific to this - task. It will have to extract candidates from a [`Doc`](/api/doc) and predict - a relation for the available candidate pairs. -2. Implement a custom [pipeline component](#component-rel-pipe) powered by the - machine learning model that sets annotations on the [`Doc`](/api/doc) passing - through the pipeline. + task. It will have to extract candidate relation instances from a + [`Doc`](/api/doc) and predict the corresponding scores for each relation + label. +2. Implement a custom [pipeline component](#component-rel-pipe) - powered by the + machine learning model from step 1 - that translates the predicted scores + into annotations that are stored on the [`Doc`](/api/doc) objects as they + pass through the `nlp` pipeline. Run this example use-case by using our project template. It includes all the code to create the ML model and the pipeline component from scratch. -It contains two config files to train the model: +It also contains two config files to train the model: one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer. The project applies the relation extraction component to identify biomolecular -interactions, but you can easily swap in your own dataset for your experiments -in any other domain. +interactions in a sample dataset, but you can easily swap in your own dataset +for your experiments in any other domain. #### Step 1: Implementing the Model {#component-rel-model} @@ -558,18 +560,17 @@ matrix** (~~Floats2d~~) of predictions: > for details. ```python -### Register the model architecture +### The model architecture @spacy.registry.architectures.register("rel_model.v1") def create_relation_model(...) -> Model[List[Doc], Floats2d]: model = ... 
# 👈 model will go here return model ``` -We will adapt a **modular approach** to the definition of this relation model, and -define it as chaining to layers together: the first layer that generates an -instance tensor from a given set of documents, and the second layer that -transforms this tensor into a final tensor holding the predictions: - +We will adapt a **modular approach** to the definition of this relation model, +and define it as chaining two layers together: the first layer that generates an +instance tensor from a given set of documents, and the second layer that +transforms the instance tensor into a final tensor holding the predictions. > #### config.cfg (excerpt) > @@ -581,11 +582,11 @@ transforms this tensor into a final tensor holding the predictions: > # ... > > [model.classification_layer] -> ... +> # ... > ``` ```python -### Implement the model architecture +### The model architecture @spacy.registry.architectures.register("rel_model.v1") def create_relation_model( create_instance_tensor: Model[List[Doc], Floats2d], @@ -595,9 +596,8 @@ def create_relation_model( return model ``` -The `classification_layer` could be something simple like a Linear layer -followed by a logistic activation function: - +The `classification_layer` could be something like a Linear layer followed by a +logistic activation function: > #### config.cfg (excerpt) > @@ -609,7 +609,7 @@ followed by a logistic activation function: > ``` ```python -### Implement the classification layer +### The classification layer @spacy.registry.architectures.register("rel_classification_layer.v1") def create_classification_layer( nO: int = None, nI: int = None @@ -617,60 +617,16 @@ def create_classification_layer( return chain(Linear(nO=nO, nI=nI), Logistic()) ``` -The first layer that **creates the instance tensor** can be defined -by implementing a -[custom forward function](https://thinc.ai/docs/usage-models#weights-layers-forward) -with an appropriate backpropagation callback. We also define an -[initialization method](https://thinc.ai/docs/usage-models#weights-layers-init) +The first layer that **creates the instance tensor** can be defined by +implementing a +[custom forward function](https://thinc.ai/docs/usage-models#weights-layers-forward) +with an appropriate backpropagation callback. We also define an +[initialization method](https://thinc.ai/docs/usage-models#weights-layers-init) that ensures that the layer is properly set up for training. -```python -### Implement the custom forward function -def instance_forward( - model: Model[List[Doc], Floats2d], - docs: List[Doc], - is_train: bool -) -> Tuple[Floats2d, Callable]: - ... - tok2vec = model.get_ref("tok2vec") - tokvecs, bp_tokvecs = tok2vec(docs, is_train) - relations = ... - - def backprop(d_relations: Floats2d) -> List[Doc]: - d_tokvecs = ... 
- return bp_tokvecs(d_tokvecs) - - return relations, backprop - - -### Implement the custom initialization method -def instance_init( - model: Model, - X: List[Doc] = None, - Y: Floats2d = None -) -> Model: - tok2vec = model.get_ref("tok2vec") - tok2vec.initialize(X) - return model - - -### Implement the layer that creates the instance tensor -@spacy.registry.architectures.register("rel_instance_tensor.v1") -def create_tensors( - tok2vec: Model[List[Doc], List[Floats2d]], - pooling: Model[Ragged, Floats2d], - get_instances: Callable[[Doc], List[Tuple[Span, Span]]], -) -> Model[List[Doc], Floats2d]: - - return Model( - "instance_tensors", - instance_forward, - layers=[tok2vec, pooling], - refs={"tok2vec": tok2vec, "pooling": pooling}, - attrs={"get_instances": get_instances}, - init=instance_init, - ) -``` +We omit some of the implementation details here, and refer to the spaCy project +that has the full implementation +[here](https://github.com/explosion/projects/tree/v3/tutorials/rel_component). > #### config.cfg (excerpt) > @@ -688,19 +644,69 @@ def create_tensors( > [model.create_instance_tensor.get_instances] > ... > ` +> ``` -This custom layer uses an -**[embedding layer](/usage/embeddings-transformers)** such as a -[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This -layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it +```python +### The layer that creates the instance tensor +@spacy.registry.architectures.register("rel_instance_tensor.v1") +def create_tensors( + tok2vec: Model[List[Doc], List[Floats2d]], + pooling: Model[Ragged, Floats2d], + get_instances: Callable[[Doc], List[Tuple[Span, Span]]], +) -> Model[List[Doc], Floats2d]: + + return Model( + "instance_tensors", + instance_forward, + layers=[tok2vec, pooling], + refs={"tok2vec": tok2vec, "pooling": pooling}, + attrs={"get_instances": get_instances}, + init=instance_init, + ) + + +### The custom forward function +def instance_forward( + model: Model[List[Doc], Floats2d], + docs: List[Doc], + is_train: bool, +) -> Tuple[Floats2d, Callable]: + # ... + tok2vec = model.get_ref("tok2vec") + tokvecs, bp_tokvecs = tok2vec(docs, is_train) + relations = ... + + def backprop(d_relations: Floats2d) -> List[Doc]: + d_tokvecs = ... + return bp_tokvecs(d_tokvecs) + + return relations, backprop + + +### The custom initialization method +def instance_init( + model: Model, + X: List[Doc] = None, + Y: Floats2d = None, +) -> Model: + tok2vec = model.get_ref("tok2vec") + tok2vec.initialize(X) + return model + +``` + +This custom layer uses an [embedding layer](/usage/embeddings-transformers) such +as a [`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). +This layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it transforms each **document into a list of tokens**, with each token being -represented by its embedding in the vector space. +represented by its embedding in the vector space. -The **`pooling`** layer will be applied to summarize the token vectors into entity -vectors, as named entities (represented by `Span` objects) can consist of one -or multiple tokens. For instance, the pooling layer could resort to calculating -the average of all token vectors in an entity. 
Thinc provides several -[built-in pooling operators](https://thinc.ai/docs/api-layers#reduction-ops) for +The `pooling` layer will be applied to summarize the token vectors into **entity +vectors**, as named entities (represented by ~~Span~~ objects) can consist of +one or multiple tokens. For instance, the pooling layer could resort to +calculating the average of all token vectors in an entity. Thinc provides +several +[built-in pooling operators](https://thinc.ai/docs/api-layers#reduction-ops) for this purpose. > #### config.cfg (excerpt) @@ -712,15 +718,15 @@ this purpose. > max_length = 100 > ``` -Finally, we need a `get_instances` method that **generates pairs of entities** -that we want to classify as being related or not. As these candidate pairs are typically formed -within one document, this function takes a [`Doc`](/api/doc) as input and -outputs a `List` of `Span` tuples. For instance, this +Finally, we need a `get_instances` method that **generates pairs of entities** +that we want to classify as being related or not. As these candidate pairs are +typically formed within one document, this function takes a [`Doc`](/api/doc) as +input and outputs a `List` of `Span` tuples. For instance, the following implementation takes any two entities from the same document, as long as they are within a **maximum distance** (in number of tokens) of eachother: ```python -### Simple candiate generation +### Candiate generation @spacy.registry.misc.register("rel_instance_generator.v1") def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: @@ -733,10 +739,10 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span return candidates return get_candidates ``` -This function in added to the -[`@misc` registry](/api/top-level#registry) so we can refer to it from the -config, and easily swap it out for any other candidate generation function. +This function in added to the [`@misc` registry](/api/top-level#registry) so we +can refer to it from the config, and easily swap it out for any other candidate +generation function. When creating this model, we store the custom functions as [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as @@ -851,13 +857,13 @@ def update( sgd: Optional[Optimizer] = None, losses: Optional[Dict[str, float]] = None, ) -> Dict[str, float]: - ... + # ... docs = [ex.predicted for ex in examples] predictions, backprop = self.model.begin_update(docs) loss, gradient = self.get_loss(examples, predictions) backprop(gradient) losses[self.name] += loss - ... + # ... return losses ``` @@ -933,9 +939,9 @@ def __call__(self, Doc doc): return doc ``` -There is one more optional method to implement: [`score`](/api/pipe#score) -calculates the performance of your component on a set of examples, and -returns the results as a dictionary: +There is one more optional method to implement: [`score`](/api/pipe#score) +calculates the performance of your component on a set of examples, and returns +the results as a dictionary: ```python ### The score method @@ -951,8 +957,8 @@ def score(self, examples: Iterable[Example]) -> Dict[str, Any]: } ``` -This is particularly useful to see the scores on the development corpus -when training the component with [`spacy train`](/api/cli#training). +This is particularly useful to see the scores on the development corpus when +training the component with [`spacy train`](/api/cli#training). 
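As a rough usage sketch outside of training, the same keys should also show up when you evaluate a trained pipeline directly, since [`Language.evaluate`](/api/language#evaluate) collects the scores returned by each component's `score` method. This assumes `nlp` already contains the trained relation component and that `dev_examples` is a list of `Example` objects carrying gold-standard `doc._.rel` annotations:

```python
### Using the scores outside of training (sketch)
# Assumes `nlp` is a trained pipeline with the relation component added, and
# `dev_examples` is a list of Example objects with gold doc._.rel annotations.
scores = nlp.evaluate(dev_examples)
print(scores["rel_micro_p"], scores["rel_micro_r"], scores["rel_micro_f"])
```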
Once our `TrainablePipe` subclass is fully implemented, we can [register](/usage/processing-pipelines#custom-components-factories) the @@ -976,7 +982,7 @@ assigns it a name and lets you create the component with > [components.relation_extractor.model.get_candidates] > @misc = "rel_cand_generator.v1" > max_length = 20 -> +> > [training.score_weights] > rel_micro_p = 0.0 > rel_micro_r = 0.0 @@ -992,8 +998,8 @@ def make_relation_extractor(nlp, name, model): return RelationExtractor(nlp.vocab, model, name) ``` -You can extend the decorator to include information such as the type of -annotations that are required for this component to run, the type of annotations +You can extend the decorator to include information such as the type of +annotations that are required for this component to run, the type of annotations it produces, and the scores that can be calculated: ```python From 331ec83493a4158c5eda98e7e23bb8ae8d492de2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 20 Nov 2020 21:41:52 +0100 Subject: [PATCH 08/18] edits and updates to implementing REL component docs --- website/docs/usage/layers-architectures.md | 139 ++++++++++++--------- 1 file changed, 80 insertions(+), 59 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 2df4745db..01108e5c8 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -624,9 +624,9 @@ with an appropriate backpropagation callback. We also define an [initialization method](https://thinc.ai/docs/usage-models#weights-layers-init) that ensures that the layer is properly set up for training. -We omit some of the implementation details here, and refer to the spaCy project -that has the full implementation -[here](https://github.com/explosion/projects/tree/v3/tutorials/rel_component). +We omit some of the implementation details here, and refer to the +[spaCy project](https://github.com/explosion/projects/tree/v3/tutorials/rel_component) +that has the full implementation. > #### config.cfg (excerpt) > @@ -636,13 +636,13 @@ that has the full implementation > > [model.create_instance_tensor.tok2vec] > @architectures = "spacy.HashEmbedCNN.v1" -> ... +> # ... > > [model.create_instance_tensor.pooling] > @layers = "reduce_mean.v1" > > [model.create_instance_tensor.get_instances] -> ... +> # ... > ` > ``` @@ -658,10 +658,10 @@ def create_tensors( return Model( "instance_tensors", instance_forward, + init=instance_init, layers=[tok2vec, pooling], refs={"tok2vec": tok2vec, "pooling": pooling}, attrs={"get_instances": get_instances}, - init=instance_init, ) @@ -671,9 +671,11 @@ def instance_forward( docs: List[Doc], is_train: bool, ) -> Tuple[Floats2d, Callable]: - # ... tok2vec = model.get_ref("tok2vec") tokvecs, bp_tokvecs = tok2vec(docs, is_train) + get_instances = model.attrs["get_instances"] + all_instances = [get_instances(doc) for doc in docs] + pooling = model.get_ref("pooling") relations = ... def backprop(d_relations: Floats2d) -> List[Doc]: @@ -744,14 +746,35 @@ This function in added to the [`@misc` registry](/api/top-level#registry) so we can refer to it from the config, and easily swap it out for any other candidate generation function. 
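As a quick way to sanity-check the generator outside of a config-driven run, you can also resolve it from the registry by hand and inspect the pairs it proposes. This is only a sketch: it assumes the decorated function above has been imported (so the registration has run) and that a pipeline with a named entity recognizer, such as `en_core_web_sm`, is available:

```python
### Inspecting the instance generator (sketch)
import spacy

nlp = spacy.load("en_core_web_sm")  # any pipeline with an NER component
doc = nlp("Amsterdam is the capital of the Netherlands.")
# Resolve the registered function and build the generator with max_length=100
get_instances = spacy.registry.misc.get("rel_instance_generator.v1")(max_length=100)
print([(ent1.text, ent2.text) for ent1, ent2 in get_instances(doc)])
```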
-When creating this model, we store the custom functions as -[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as -references, so we can access them easily: +#### Intermezzo: define how to store the relations data {#component-rel-attribute} + +For our new relation extraction component, we will use a custom +[extension attribute](/usage/processing-pipelines#custom-components-attributes) +`doc._.rel` in which we store relation data. The attribute refers to a +dictionary, keyed by the **start offsets of each entity** involved in the +candidate relation. The values in the dictionary refer to another dictionary +where relation labels are mapped to values between 0 and 1. We assume anything +above 0.5 to be a `True` relation. The ~~Example~~ instances that we'll use as +training data, will include their gold-standard relation annotations in +`example.reference._.rel`. + +> #### Example output +> +> ```python +> doc = nlp("Amsterdam is the capital of the Netherlands.") +> print("spans", [(e.start, e.text, e.label_) for e in doc.ents]) +> for value, rel_dict in doc._.rel.items(): +> print(f"{value}: {rel_dict}") +> +> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')] +> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002} +> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} +> ``` ```python -pooling = model.get_ref("pooling") -tok2vec = model.get_ref("tok2vec") -get_instances = model.attrs["get_instances"] +### Registering the extension attribute +from spacy.tokens import Doc +Doc.set_extension("rel", default={}) ``` #### Step 2: Implementing the pipeline component {#component-rel-pipe} @@ -794,19 +817,43 @@ class RelationExtractor(TrainablePipe): ... ``` -Before the model can be used, it needs to be -[initialized](/usage/training#initialization). This function receives a callback -to access the full **training data set**, or a representative sample. This data -set can be used to deduce all **relevant labels**. Alternatively, a list of -labels can be provided to `initialize`, or you can call -`RelationExtractor.add_label` directly. The number of labels defines the output -dimensionality of the network, and will be used to do +Typically, the constructor defines the vocab, the Machine Learning model, and +the name of this component. Additionally, this component, just like the +`textcat` and the `tagger`, stores an internal list of labels. The ML model will +predict scores for each label. We add convenience method to easily retrieve and +add to them. + +```python + def __init__(self, vocab, model, name="rel"): + """Create a component instance.""" + # ... + self.cfg = {"labels": []} + + @property + def labels(self) -> Tuple[str]: + """Returns the labels currently added to the component.""" + return tuple(self.cfg["labels"]) + + def add_label(self, label: str): + """Add a new label to the pipe.""" + self.cfg["labels"] = list(self.labels) + [label] +``` + +After creation, the component needs to be +[initialized](/usage/training#initialization). This method can define the +relevant labels in two ways: explicitely by setting the `labels` argument in the +[`initialize` block](/api/data-formats#config-initialize) of the config, or +implicately by deducing them from the `get_examples` callback that generates the +full **training data set**, or a representative sample. 
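For illustration, both routes could look roughly like this when calling `initialize` directly from Python. This is a sketch only: it assumes the `"relation_extractor"` factory from this section is registered, that `examples` is a list of `Example` objects, and it reuses the labels from the earlier `doc._.rel` example. In a config-driven run you would provide the labels in the `[initialize.components]` block instead:

```python
### Providing the labels explicitly or implicitly (sketch)
rel = nlp.add_pipe("relation_extractor")
# Explicitly, by passing the label set up front:
rel.initialize(lambda: examples, nlp=nlp, labels=["CAPITAL_OF", "LOCATED_IN", "UNRELATED"])
# Or implicitly, by leaving `labels` out so they are deduced from the gold
# annotations produced by the get_examples callback:
# rel.initialize(lambda: examples, nlp=nlp)
```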
+ +The final number of labels defines the output dimensionality of the network, and +will be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the layers of the neural network. This is triggered by calling [`Model.initialize`](https://thinc.ai/api/model#initialize). ```python -### The initialize method {highlight="12,18,22"} +### The initialize method {highlight="12,15,18,22"} from itertools import islice def initialize( @@ -837,7 +884,7 @@ Typically, this happens when the pipeline is set up before training in [`spacy train`](/api/cli#training). After initialization, the pipeline component and its internal model can be trained and used to make predictions. -During training, the function [`update`](/api/pipe#update) is invoked which +During training, the method [`update`](/api/pipe#update) is invoked which delegates to [`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a [`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a @@ -858,7 +905,7 @@ def update( losses: Optional[Dict[str, float]] = None, ) -> Dict[str, float]: # ... - docs = [ex.predicted for ex in examples] + docs = [eg.predicted for eg in examples] predictions, backprop = self.model.begin_update(docs) loss, gradient = self.get_loss(examples, predictions) backprop(gradient) @@ -867,8 +914,8 @@ def update( return losses ``` -When the internal model is trained, the component can be used to make novel -**predictions**. The [`predict`](/api/pipe#predict) function needs to be +After training the model, the component can be used to make novel +**predictions**. The [`predict`](/api/pipe#predict) method needs to be implemented for each subclass of `TrainablePipe`. In our case, we can simply delegate to the internal model's [predict](https://thinc.ai/docs/api-model#predict) function that takes a batch @@ -884,42 +931,21 @@ def predict(self, docs: Iterable[Doc]) -> Floats2d: The final method that needs to be implemented, is [`set_annotations`](/api/pipe#set_annotations). This function takes the predictions, and modifies the given `Doc` object in place to store them. For our -relation extraction component, we store the data as a dictionary in a custom -[extension attribute](/usage/processing-pipelines#custom-components-attributes) -`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of -each entity**, as this defines an entity pair uniquely within one document. +relation extraction component, we store the data in the +[custom attribute](#component-rel-attribute)`doc._.rel`. 
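Once the component has run, downstream code can read the stored predictions back from that attribute, for example to keep only the relations above the 0.5 threshold mentioned earlier. A small usage sketch, assuming `nlp` contains the trained relation component:

```python
### Consuming the stored predictions (sketch)
doc = nlp("Amsterdam is the capital of the Netherlands.")
for (start1, start2), scores in doc._.rel.items():
    labels = [label for label, score in scores.items() if score >= 0.5]
    if labels:
        print((start1, start2), labels)
```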
To interpret the scores predicted by the relation extraction model correctly, we -need to refer to the model's `get_candidates` function that defined which pairs +need to refer to the model's `get_instances` function that defined which pairs of entities were relevant candidates, so that the predictions can be linked to those exact entities: -> #### Example output -> -> ```python -> doc = nlp("Amsterdam is the capital of the Netherlands.") -> print("spans", [(e.start, e.text, e.label_) for e in doc.ents]) -> for value, rel_dict in doc._.rel.items(): -> print(f"{value}: {rel_dict}") -> -> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')] -> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002} -> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} -> ``` - -```python -### Registering the extension attribute -from spacy.tokens import Doc -Doc.set_extension("rel", default={}) -``` - ```python ### The set_annotations method {highlight="5-6,10"} def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c = 0 - get_candidates = self.model.attrs["get_candidates"] + get_instances = self.model.attrs["get_instances"] for doc in docs: - for (e1, e2) in get_candidates(doc): + for (e1, e2) in get_instances(doc): offset = (e1.start, e2.start) if offset not in doc._.rel: doc._.rel[offset] = {} @@ -933,7 +959,7 @@ Under the hood, when the pipe is applied to a document, it delegates to the ```python ### The __call__ method -def __call__(self, Doc doc): +def __call__(self, doc: Doc): predictions = self.predict([doc]) self.set_annotations([doc], predictions) return doc @@ -957,8 +983,8 @@ def score(self, examples: Iterable[Example]) -> Dict[str, Any]: } ``` -This is particularly useful to see the scores on the development corpus when -training the component with [`spacy train`](/api/cli#training). +This is particularly useful for calculating relevant scores on the development +corpus when training the component with [`spacy train`](/api/cli#training). Once our `TrainablePipe` subclass is fully implemented, we can [register](/usage/processing-pipelines#custom-components-factories) the @@ -975,13 +1001,8 @@ assigns it a name and lets you create the component with > > [components.relation_extractor.model] > @architectures = "rel_model.v1" -> -> [components.relation_extractor.model.tok2vec] > # ... > -> [components.relation_extractor.model.get_candidates] -> @misc = "rel_cand_generator.v1" -> max_length = 20 > > [training.score_weights] > rel_micro_p = 0.0 From 5ac0867427cc639ee3addd8af33560ae7a2b6ab6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 20 Nov 2020 22:18:53 +0100 Subject: [PATCH 09/18] final fixes --- website/docs/usage/layers-architectures.md | 44 +++++++++++----------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 01108e5c8..37d6afaf0 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -567,10 +567,10 @@ def create_relation_model(...) 
-> Model[List[Doc], Floats2d]: return model ``` -We will adapt a **modular approach** to the definition of this relation model, -and define it as chaining two layers together: the first layer that generates an +We adapt a **modular approach** to the definition of this relation model, and +define it as chaining two layers together: the first layer that generates an instance tensor from a given set of documents, and the second layer that -transforms the instance tensor into a final tensor holding the predictions. +transforms the instance tensor into a final tensor holding the predictions: > #### config.cfg (excerpt) > @@ -586,7 +586,7 @@ transforms the instance tensor into a final tensor holding the predictions. > ``` ```python -### The model architecture +### The model architecture {highlight="6"} @spacy.registry.architectures.register("rel_model.v1") def create_relation_model( create_instance_tensor: Model[List[Doc], Floats2d], @@ -596,8 +596,9 @@ def create_relation_model( return model ``` -The `classification_layer` could be something like a Linear layer followed by a -logistic activation function: +The `classification_layer` could be something like a +[Linear](https://thinc.ai/docs/api-layers#linear) layer followed by a +[logistic](https://thinc.ai/docs/api-layers#logistic) activation function: > #### config.cfg (excerpt) > @@ -748,16 +749,6 @@ generation function. #### Intermezzo: define how to store the relations data {#component-rel-attribute} -For our new relation extraction component, we will use a custom -[extension attribute](/usage/processing-pipelines#custom-components-attributes) -`doc._.rel` in which we store relation data. The attribute refers to a -dictionary, keyed by the **start offsets of each entity** involved in the -candidate relation. The values in the dictionary refer to another dictionary -where relation labels are mapped to values between 0 and 1. We assume anything -above 0.5 to be a `True` relation. The ~~Example~~ instances that we'll use as -training data, will include their gold-standard relation annotations in -`example.reference._.rel`. - > #### Example output > > ```python @@ -771,6 +762,16 @@ training data, will include their gold-standard relation annotations in > # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} > ``` +For our new relation extraction component, we will use a custom +[extension attribute](/usage/processing-pipelines#custom-components-attributes) +`doc._.rel` in which we store relation data. The attribute refers to a +dictionary, keyed by the **start offsets of each entity** involved in the +candidate relation. The values in the dictionary refer to another dictionary +where relation labels are mapped to values between 0 and 1. We assume anything +above 0.5 to be a `True` relation. The ~~Example~~ instances that we'll use as +training data, will include their gold-standard relation annotations in +`example.reference._.rel`. + ```python ### Registering the extension attribute from spacy.tokens import Doc @@ -817,11 +818,11 @@ class RelationExtractor(TrainablePipe): ... ``` -Typically, the constructor defines the vocab, the Machine Learning model, and -the name of this component. Additionally, this component, just like the -`textcat` and the `tagger`, stores an internal list of labels. The ML model will -predict scores for each label. We add convenience method to easily retrieve and -add to them. +Typically, the **constructor** defines the vocab, the Machine Learning model, +and the name of this component. 
Additionally, this component, just like the +`textcat` and the `tagger`, stores an **internal list of labels**. The ML model +will predict scores for each label. We add convenience methods to easily +retrieve and add to them. ```python def __init__(self, vocab, model, name="rel"): @@ -1003,7 +1004,6 @@ assigns it a name and lets you create the component with > @architectures = "rel_model.v1" > # ... > -> > [training.score_weights] > rel_micro_p = 0.0 > rel_micro_r = 0.0 From e861e928df5307fcd1e836d85b6e11c0b076eb25 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 20 Nov 2020 22:29:58 +0100 Subject: [PATCH 10/18] more small corrections --- website/docs/usage/layers-architectures.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 37d6afaf0..a2a6e3167 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -644,7 +644,6 @@ that has the full implementation. > > [model.create_instance_tensor.get_instances] > # ... -> ` > ``` ```python @@ -666,7 +665,7 @@ def create_tensors( ) -### The custom forward function +# The custom forward function def instance_forward( model: Model[List[Doc], Floats2d], docs: List[Doc], @@ -686,7 +685,7 @@ def instance_forward( return relations, backprop -### The custom initialization method +# The custom initialization method def instance_init( model: Model, X: List[Doc] = None, @@ -712,6 +711,13 @@ several [built-in pooling operators](https://thinc.ai/docs/api-layers#reduction-ops) for this purpose. +Finally, we need a `get_instances` method that **generates pairs of entities** +that we want to classify as being related or not. As these candidate pairs are +typically formed within one document, this function takes a [`Doc`](/api/doc) as +input and outputs a `List` of `Span` tuples. For instance, the following +implementation takes any two entities from the same document, as long as they +are within a **maximum distance** (in number of tokens) of eachother: + > #### config.cfg (excerpt) > > ```ini @@ -721,17 +727,10 @@ this purpose. > max_length = 100 > ``` -Finally, we need a `get_instances` method that **generates pairs of entities** -that we want to classify as being related or not. As these candidate pairs are -typically formed within one document, this function takes a [`Doc`](/api/doc) as -input and outputs a `List` of `Span` tuples. For instance, the following -implementation takes any two entities from the same document, as long as they -are within a **maximum distance** (in number of tokens) of eachother: - ```python ### Candiate generation @spacy.registry.misc.register("rel_instance_generator.v1") -def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: +def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: candidates = [] for ent1 in doc.ents: @@ -825,6 +824,7 @@ will predict scores for each label. We add convenience methods to easily retrieve and add to them. ```python +### The constructor (continued) def __init__(self, vocab, model, name="rel"): """Create a component instance.""" # ... 
From 218abaa69aed66c90fcd9d67659684832a96828e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 20 Nov 2020 22:36:49 +0100 Subject: [PATCH 11/18] typo --- website/docs/usage/layers-architectures.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index a2a6e3167..eb6f8b288 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -728,7 +728,7 @@ are within a **maximum distance** (in number of tokens) of eachother: > ``` ```python -### Candiate generation +### Candidate generation @spacy.registry.misc.register("rel_instance_generator.v1") def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: From 079f6ea474bfaf45b2ee126c32c986bb1ba6fba2 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 30 Nov 2020 02:34:29 +0100 Subject: [PATCH 12/18] avoid resolving the full config (#6465) --- spacy/training/pretrain.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index e5c41c70b..57fbc7781 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain from ..errors import Errors -from ..util import registry, load_model_from_config, resolve_dot_names +from ..util import registry, load_model_from_config, dot_to_object def pretrain( @@ -38,7 +38,8 @@ def pretrain( _config = nlp.config.interpolate() T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) - corpus = resolve_dot_names(_config, [P["corpus"]])[0] + corpus = dot_to_object(_config, P["corpus"]) + corpus = registry.resolve({"corpus": corpus})["corpus"] batcher = P["batcher"] model = create_pretraining_model(nlp, P) optimizer = P["optimizer"] From 1442d2f213416f903d948199d6f1723c7a419dda Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 30 Nov 2020 02:39:45 +0100 Subject: [PATCH 13/18] Improve simple training example in v3 migration (#6438) * Create the examples once * Use the examples in the initialization * Provide the batch size * Fix `begin_training` migration example --- website/docs/usage/v3.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index b25b28a6d..47ddcf53a 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -969,18 +969,18 @@ The [`Language.update`](/api/language#update), raw text and a dictionary of annotations. 
```python -### Training loop {highlight="11"} +### Training loop {highlight="5-8,12"} TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), ("I like London.", {"entities": [(7, 13, "LOC")]}), ] -nlp.initialize() +examples = [] +for text, annots in TRAIN_DATA: + examples.append(Example.from_dict(nlp.make_doc(text), annots)) +nlp.initialize(lambda: examples) for i in range(20): - random.shuffle(TRAIN_DATA) - for batch in minibatch(TRAIN_DATA): - examples = [] - for text, annots in batch: - examples.append(Example.from_dict(nlp.make_doc(text), annots)) + random.shuffle(examples) + for batch in minibatch(examples, size=8): nlp.update(examples) ``` @@ -995,7 +995,7 @@ network, setting up the label scheme. ```diff -- nlp.initialize(examples) +- nlp.begin_training() + nlp.initialize(lambda: examples) ``` From b0dd13e0ba9ae8b30f08ffc86b5fc25a9d86f662 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 30 Nov 2020 13:43:58 +0100 Subject: [PATCH 14/18] Support LICENSE in spacy package If present, include the file `input_dir/LICENSE` at the top level of the packaged model. --- spacy/cli/package.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 49a0ab75d..203163be9 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -103,6 +103,9 @@ def package( ) Path.mkdir(package_path, parents=True) shutil.copytree(str(input_dir), str(package_path / model_name_v)) + license_path = package_path / model_name_v / "LICENSE" + if license_path.exists(): + shutil.move(str(license_path), str(main_path)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) @@ -239,6 +242,7 @@ if __name__ == '__main__': TEMPLATE_MANIFEST = """ include meta.json include config.cfg +include LICENSE """.strip() From 591cd48aa8b5072364ed9677d6c0960367812bd6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Dec 2020 12:58:02 +0100 Subject: [PATCH 15/18] Remove config.cfg from MANIFEST --- spacy/cli/package.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 203163be9..4046c35d4 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -241,7 +241,6 @@ if __name__ == '__main__': TEMPLATE_MANIFEST = """ include meta.json -include config.cfg include LICENSE """.strip() From 31ec9a906eb258f9f84a6e45c6c1f8b83d20a0b5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 2 Dec 2020 10:15:23 +0100 Subject: [PATCH 16/18] Clean up 3rd party license info (#6478) Move scikit-learn license from `Scorer` to `licenses/3rd_party_licenses.txt`. --- licenses/3rd_party_licenses.txt | 41 +++++++++++++++++++++++++++++++++ spacy/scorer.py | 38 ++---------------------------- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/licenses/3rd_party_licenses.txt b/licenses/3rd_party_licenses.txt index 0aeef5507..3702ad131 100644 --- a/licenses/3rd_party_licenses.txt +++ b/licenses/3rd_party_licenses.txt @@ -36,3 +36,44 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +scikit-learn +------------ + +* Files: scorer.py + +The following implementation of roc_auc_score() is adapted from +scikit-learn, which is distributed under the following license: + +New BSD License + +Copyright (c) 2007–2019 The scikit-learn developers. +All rights reserved. + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. Neither the name of the Scikit-learn Developers nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. diff --git a/spacy/scorer.py b/spacy/scorer.py index 5cace8fda..371dbd776 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -720,44 +720,10 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: } -############################################################################# -# # The following implementation of roc_auc_score() is adapted from -# scikit-learn, which is distributed under the following license: -# -# New BSD License -# +# scikit-learn, which is distributed under the New BSD License. # Copyright (c) 2007–2019 The scikit-learn developers. -# All rights reserved. -# -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# a. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# b. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# c. Neither the name of the Scikit-learn Developers nor the names of -# its contributors may be used to endorse or promote products -# derived from this software without specific prior written -# permission. -# -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -# DAMAGE. - - +# See licenses/3rd_party_licenses.txt def _roc_auc_score(y_true, y_score): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. From d6c616a125b86ced08eb7ad2b4e0dfd8d89eb5e9 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 2 Dec 2020 12:57:08 +0100 Subject: [PATCH 17/18] Fixes in test suite (#6457) * fix slow test for textcat readers * cleanup test_issue5551 * add explicit score weight * cleanup --- spacy/tests/regression/test_issue5501-6000.py | 35 ++++++++++--------- spacy/tests/training/test_readers.py | 8 +++-- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py index f0b46cb83..65fc8dda7 100644 --- a/spacy/tests/regression/test_issue5501-6000.py +++ b/spacy/tests/regression/test_issue5501-6000.py @@ -1,35 +1,38 @@ -from thinc.api import fix_random_seed +import pytest +from thinc.api import Config, fix_random_seed + from spacy.lang.en import English +from spacy.pipeline.textcat import default_model_config, bow_model_config +from spacy.pipeline.textcat import cnn_model_config from spacy.tokens import Span from spacy import displacy from spacy.pipeline import merge_entities +from spacy.training import Example -def test_issue5551(): +@pytest.mark.parametrize( + "textcat_config", [default_model_config, bow_model_config, cnn_model_config] +) +def test_issue5551(textcat_config): """Test that after fixing the random seed, the results of the pipeline are truly identical""" component = "textcat" - pipe_cfg = { - "model": { - "@architectures": "spacy.TextCatBOW.v1", - "exclusive_classes": True, - "ngram_size": 2, - "no_output_layer": False, - } - } + + pipe_cfg = Config().from_str(textcat_config) results = [] for i in range(3): fix_random_seed(0) nlp = English() - example = ( - "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.", - {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}, - ) + text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g." 
+ annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}} pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) - for label in set(example[1]["cats"]): + for label in set(annots["cats"]): pipe.add_label(label) + # Train nlp.initialize() + doc = nlp.make_doc(text) + nlp.update([Example.from_dict(doc, annots)]) # Store the result of each iteration - result = pipe.model.predict([nlp.make_doc(example[0])]) + result = pipe.model.predict([doc]) results.append(list(result[0])) # All results should be the same because of the fixed seed assert len(results) == 3 diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index ff2559d2a..5669bdd11 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -72,6 +72,10 @@ def test_readers(): def test_cat_readers(reader, additional_config): nlp_config_string = """ [training] + seed = 0 + + [training.score_weights] + cats_macro_auc = 1.0 [corpora] @readers = "PLACEHOLDER" @@ -92,9 +96,7 @@ def test_cat_readers(reader, additional_config): config["corpora"]["@readers"] = reader config["corpora"].update(additional_config) nlp = load_model_from_config(config, auto_fill=True) - T = registry.resolve( - nlp.config["training"].interpolate(), schema=ConfigSchemaTraining - ) + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"]] train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) optimizer = T["optimizer"] From 78085fab1fba273ed8048c385a8927f2a6535bdc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Dec 2020 09:40:03 +0100 Subject: [PATCH 18/18] Check for spacy-nightly package in download (#6502) Also check for spacy-nightly in download so that `--no-deps` isn't set for normal nightly installs. --- spacy/cli/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0e7ec2ea5..d5686586b 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -35,7 +35,7 @@ def download_cli( def download(model: str, direct: bool = False, *pip_args) -> None: - if not is_package("spacy") and "--no-deps" not in pip_args: + if not (is_package("spacy") or is_package("spacy-nightly")) and "--no-deps" not in pip_args: msg.warn( "Skipping pipeline package dependencies and setting `--no-deps`. " "You don't seem to have the spaCy package itself installed "