From 2c4b2ee5e9b29442c119e9c8bb2b5bce761a78aa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 3 Oct 2020 23:27:05 +0200 Subject: [PATCH 01/12] REL intro and get_candidates function --- website/docs/usage/layers-architectures.md | 54 ++++++++++++++++++++++ website/docs/usage/processing-pipelines.md | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index b65c3d903..678f70667 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -486,6 +486,60 @@ with Model.define_operators({">>": chain}): ## Create new trainable components {#components} +In addition to [swapping out](#swap-architectures) default models in built-in +components, you can also implement an entirely new, +[trainable pipeline component](usage/processing-pipelines#trainable-components) +from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), +and linking it up to your custom model implementation. + +### Example: Pipeline component for relation extraction {#component-rel} + +This section will run through an example of implementing a novel relation extraction +component from scratch. As a first step, we need a method that will generate pairs of +entities that we want to classify as being related or not. These candidate pairs are +typically formed within one document, which means we'll have a function that takes a +`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus +on binary relation extraction, i.e. the tuple will be of length 2. + +We register this function in the 'misc' register so we can easily refer to it from the config, +and allow swapping it out for any candidate +generation function. 
For instance, a very straightforward implementation would be to just +take any two entities from the same document: + +```python +@registry.misc.register("rel_cand_generator.v1") +def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]: + def get_candidate_indices(doc: "Doc"): + indices = [] + for ent1 in doc.ents: + for ent2 in doc.ents: + indices.append((ent1, ent2)) + return indices + return get_candidate_indices +``` + +But we could also refine this further by excluding relations of an entity with itself, +and posing a maximum distance (in number of tokens) between two entities: + +```python +### {highlight="1,2,7,8"} +@registry.misc.register("rel_cand_generator.v2") +def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: + def get_candidate_indices(doc: "Doc"): + indices = [] + for ent1 in doc.ents: + for ent2 in doc.ents: + if ent1 != ent2: + if max_length and abs(ent2.start - ent1.start) <= max_length: + indices.append((ent1, ent2)) + return indices + return get_candidate_indices +``` + + + + + diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index c98bd08bc..3619993c5 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1035,7 +1035,7 @@ plug fully custom machine learning components into your pipeline. You'll need the following: 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This - can be a model using implemented in + can be a model implemented in [Thinc](/usage/layers-architectures#thinc), or a [wrapped model](/usage/layers-architectures#frameworks) implemented in PyTorch, TensorFlow, MXNet or a fully custom solution. 
The model must take a From 08ad349a1851c3310a4ae7f34170eea37c9e2e3b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 4 Oct 2020 00:08:02 +0200 Subject: [PATCH 02/12] tok2vec layer --- website/docs/usage/layers-architectures.md | 87 ++++++++++++++-------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 678f70667..6f79cc6e8 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -489,51 +489,80 @@ with Model.define_operators({">>": chain}): In addition to [swapping out](#swap-architectures) default models in built-in components, you can also implement an entirely new, [trainable pipeline component](usage/processing-pipelines#trainable-components) -from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), -and linking it up to your custom model implementation. +from scratch. This can be done by creating a new class inheriting from +[`Pipe`](/api/pipe), and linking it up to your custom model implementation. ### Example: Pipeline component for relation extraction {#component-rel} -This section will run through an example of implementing a novel relation extraction -component from scratch. As a first step, we need a method that will generate pairs of -entities that we want to classify as being related or not. These candidate pairs are -typically formed within one document, which means we'll have a function that takes a -`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus -on binary relation extraction, i.e. the tuple will be of length 2. - -We register this function in the 'misc' register so we can easily refer to it from the config, -and allow swapping it out for any candidate -generation function. 
For instance, a very straightforward implementation would be to just -take any two entities from the same document: +This section will run through an example of implementing a novel relation +extraction component from scratch. As a first step, we need a method that will +generate pairs of entities that we want to classify as being related or not. +These candidate pairs are typically formed within one document, which means +we'll have a function that takes a `Doc` as input and outputs a `List` of `Span` +tuples. In this example, we will focus on binary relation extraction, i.e. the +tuple will be of length 2. For instance, a very straightforward implementation +would be to just take any two entities from the same document: ```python -@registry.misc.register("rel_cand_generator.v1") -def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]: - def get_candidate_indices(doc: "Doc"): - indices = [] - for ent1 in doc.ents: - for ent2 in doc.ents: - indices.append((ent1, ent2)) - return indices - return get_candidate_indices +def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: + candidates = [] + for ent1 in doc.ents: + for ent2 in doc.ents: + candidates.append((ent1, ent2)) + return candidates ``` -But we could also refine this further by excluding relations of an entity with itself, -and posing a maximum distance (in number of tokens) between two entities: +But we could also refine this further by excluding relations of an entity with +itself, and posing a maximum distance (in number of tokens) between two +entities. We'll also register this function in the +[`@misc` registry](/api/top-level#registry) so we can refer to it from the +config, and easily swap it out for any other candidate generation function. 
+ +> ``` +> [get_candidates] +> @misc = "rel_cand_generator.v2" +> max_length = 6 +> ``` ```python ### {highlight="1,2,7,8"} @registry.misc.register("rel_cand_generator.v2") def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: - def get_candidate_indices(doc: "Doc"): - indices = [] + def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: + candidates = [] for ent1 in doc.ents: for ent2 in doc.ents: if ent1 != ent2: if max_length and abs(ent2.start - ent1.start) <= max_length: - indices.append((ent1, ent2)) - return indices - return get_candidate_indices + candidates.append((ent1, ent2)) + return candidates + return get_candidates +``` + +> ``` +> [tok2vec] +> @architectures = "spacy.HashEmbedCNN.v1" +> pretrained_vectors = null +> width = 96 +> depth = 2 +> embed_size = 300 +> window_size = 1 +> maxout_pieces = 3 +> subword_features = true +> ``` + +Next, we'll assume we have access to an +[embedding layer](/usage/embeddings-transformers) such as a +[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This +layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it +transforms a list of documents into a list of 2D vectors. 
Further, this +`tok2vec` component will be trainable, which means that, following the Thinc +paradigm, we'll apply it to some input, and receive the predicted results as +well as a callback to perform backpropagation: + +```python +tok2vec = model.get_ref("tok2vec") +tokvecs, bp_tokvecs = tok2vec(docs, is_train=True) ``` From 452b8309f9e34530e5f592699a3601400f40ffb0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 4 Oct 2020 13:26:46 +0200 Subject: [PATCH 03/12] slight rewrite to hide some thinc implementation details --- website/docs/usage/layers-architectures.md | 98 ++++++++++++++-------- 1 file changed, 63 insertions(+), 35 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 6f79cc6e8..25f9a568c 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -373,7 +373,7 @@ gpu_allocator = "pytorch" Of course it's also possible to define the `Model` from the previous section entirely in Thinc. The Thinc documentation provides details on the [various layers](https://thinc.ai/docs/api-layers) and helper functions -available. Combinators can also be used to +available. Combinators can be used to [overload operators](https://thinc.ai/docs/usage-models#operators) and a common usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our simple neural network would then become: @@ -494,13 +494,34 @@ from scratch. This can be done by creating a new class inheriting from ### Example: Pipeline component for relation extraction {#component-rel} -This section will run through an example of implementing a novel relation -extraction component from scratch. As a first step, we need a method that will +This section outlines an example use-case of implementing a novel relation +extraction component from scratch. 
We assume we want to implement a binary +relation extraction method that determines whether two entities in a document +are related or not, and if so, with what type of relation. We'll allow multiple +types of relations between two such entities - i.e. it is a multi-label setting. + +We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes +a list of documents as input, and outputs a two-dimensional matrix of scores: + +```python +@registry.architectures.register("rel_model.v1") +def create_relation_model(...) -> Model[List[Doc], Floats2d]: + model = _create_my_model() + return model +``` + +The first layer in this model will typically be an +[embedding layer](/usage/embeddings-transformers) such as a +[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This +layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it +transforms each document into a list of tokens, with each token being +represented by its embedding in the vector space. + +Next, we need a method that will generate pairs of entities that we want to classify as being related or not. These candidate pairs are typically formed within one document, which means we'll have a function that takes a `Doc` as input and outputs a `List` of `Span` -tuples. In this example, we will focus on binary relation extraction, i.e. the -tuple will be of length 2. For instance, a very straightforward implementation +tuples. For instance, a very straightforward implementation would be to just take any two entities from the same document: ```python @@ -512,18 +533,24 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: return candidates ``` -But we could also refine this further by excluding relations of an entity with -itself, and posing a maximum distance (in number of tokens) between two -entities. 
We'll also register this function in the -[`@misc` registry](/api/top-level#registry) so we can refer to it from the -config, and easily swap it out for any other candidate generation function. - > ``` -> [get_candidates] +> [model] +> @architectures = "rel_model.v1" +> +> [model.tok2vec] +> ... +> +> [model.get_candidates] > @misc = "rel_cand_generator.v2" > max_length = 6 > ``` +But we could also refine this further by excluding relations of an entity with +itself, and posing a maximum distance (in number of tokens) between two +entities. We'll register this function in the +[`@misc` registry](/api/top-level#registry) so we can refer to it from the +config, and easily swap it out for any other candidate generation function. + ```python ### {highlight="1,2,7,8"} @registry.misc.register("rel_cand_generator.v2") @@ -539,32 +566,33 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span return get_candidates ``` +Finally, we'll require a method that transforms the candidate pairs of entities into +a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be +processed by a final `output_layer` of the network. Taking all this together, we can define +our relation model like this in the config: + > ``` -> [tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" -> pretrained_vectors = null -> width = 96 -> depth = 2 -> embed_size = 300 -> window_size = 1 -> maxout_pieces = 3 -> subword_features = true +> [model] +> @architectures = "rel_model.v1" +> nO = null +> +> [model.tok2vec] +> ... 
+> +> [model.get_candidates] +> @misc = "rel_cand_generator.v2" +> max_length = 6 +> +> [components.relation_extractor.model.create_candidate_tensor] +> @misc = "rel_cand_tensor.v1" +> +> [components.relation_extractor.model.output_layer] +> @architectures = "rel_output_layer.v1" +> nI = null +> nO = null > ``` -Next, we'll assume we have access to an -[embedding layer](/usage/embeddings-transformers) such as a -[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This -layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it -transforms a list of documents into a list of 2D vectors. Further, this -`tok2vec` component will be trainable, which means that, following the Thinc -paradigm, we'll apply it to some input, and receive the predicted results as -well as a callback to perform backpropagation: - -```python -tok2vec = model.get_ref("tok2vec") -tokvecs, bp_tokvecs = tok2vec(docs, is_train=True) -``` - + From 9f40d963fd92d2dc5de04af2bda45d79d440113e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 4 Oct 2020 14:11:53 +0200 Subject: [PATCH 04/12] highlight the two steps: the model and the pipeline component --- website/docs/usage/layers-architectures.md | 126 ++++++++++++++------- 1 file changed, 88 insertions(+), 38 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 25f9a568c..c4b3fb9dc 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -495,12 +495,19 @@ from scratch. This can be done by creating a new class inheriting from ### Example: Pipeline component for relation extraction {#component-rel} This section outlines an example use-case of implementing a novel relation -extraction component from scratch. We assume we want to implement a binary -relation extraction method that determines whether two entities in a document -are related or not, and if so, with what type of relation. 
We'll allow multiple +extraction component from scratch. We assume we want to implement a binary +relation extraction method that determines whether two entities in a document +are related or not, and if so, with what type of relation. We'll allow multiple types of relations between two such entities - i.e. it is a multi-label setting. -We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes +There are two major steps required: first, we need to +[implement a machine learning model](#component-rel-model) specific to this +task, and then we'll use this model to +[implement a custom pipeline component](#component-rel-pipe). + +#### Step 1: Implementing the Model {#component-rel-model} + +We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a list of documents as input, and outputs a two-dimensional matrix of scores: ```python @@ -514,15 +521,15 @@ The first layer in this model will typically be an [embedding layer](/usage/embeddings-transformers) such as a [`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it -transforms each document into a list of tokens, with each token being +transforms each document into a list of tokens, with each token being represented by its embedding in the vector space. -Next, we need a method that will -generate pairs of entities that we want to classify as being related or not. -These candidate pairs are typically formed within one document, which means -we'll have a function that takes a `Doc` as input and outputs a `List` of `Span` -tuples. For instance, a very straightforward implementation -would be to just take any two entities from the same document: +Next, we need a method that will generate pairs of entities that we want to +classify as being related or not. 
These candidate pairs are typically formed +within one document, which means we'll have a function that takes a `Doc` as +input and outputs a `List` of `Span` tuples. For instance, a very +straightforward implementation would be to just take any two entities from the +same document: ```python def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: @@ -536,10 +543,10 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: > ``` > [model] > @architectures = "rel_model.v1" -> +> > [model.tok2vec] > ... -> +> > [model.get_candidates] > @misc = "rel_cand_generator.v2" > max_length = 6 @@ -566,33 +573,76 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span return get_candidates ``` -Finally, we'll require a method that transforms the candidate pairs of entities into -a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be -processed by a final `output_layer` of the network. Taking all this together, we can define -our relation model like this in the config: +Finally, we'll require a method that transforms the candidate pairs of entities +into a 2D tensor using the specified Tok2Vec function, and this `Floats2d` +object will then be processed by a final `output_layer` of the network. Taking +all this together, we can define our relation model like this in the config: -> ``` -> [model] -> @architectures = "rel_model.v1" -> nO = null -> -> [model.tok2vec] -> ... -> -> [model.get_candidates] -> @misc = "rel_cand_generator.v2" -> max_length = 6 -> -> [components.relation_extractor.model.create_candidate_tensor] -> @misc = "rel_cand_tensor.v1" -> -> [components.relation_extractor.model.output_layer] -> @architectures = "rel_output_layer.v1" -> nI = null -> nO = null -> ``` +``` +[model] +@architectures = "rel_model.v1" +... - +[model.tok2vec] +... 
+ +[model.get_candidates] +@misc = "rel_cand_generator.v2" +max_length = 6 + +[model.create_candidate_tensor] +@misc = "rel_cand_tensor.v1" + +[model.output_layer] +@architectures = "rel_output_layer.v1" +... +``` + + + +When creating this model, we'll store the custom functions as +[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as +references, so we can access them easily: + +```python +tok2vec_layer = model.get_ref("tok2vec") +output_layer = model.get_ref("output_layer") +create_candidate_tensor = model.attrs["create_candidate_tensor"] +get_candidates = model.attrs["get_candidates"] +``` + +#### Step 2: Implementing the pipeline component {#component-rel-pipe} + +To use our new relation extraction model as part of a custom component, we +create a subclass of [`Pipe`](/api/pipe) that will hold the model: + +```python +from spacy.pipeline import Pipe +from spacy.language import Language + +class RelationExtractor(Pipe): + def __init__(self, vocab, model, name="rel", labels=[]): + ... + + def predict(self, docs): + ... + + def set_annotations(self, docs, scores): + ... + +@Language.factory("relation_extractor") +def make_relation_extractor(nlp, name, model, labels): + return RelationExtractor(nlp.vocab, model, name, labels=labels) +``` + +The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. 
+In our case, we can simply delegate to the internal model's +[predict](https://thinc.ai/docs/api-model#predict) function: +```python +def predict(self, docs: Iterable[Doc]) -> Floats2d: + scores = self.model.predict(docs) + return self.model.ops.asarray(scores) +``` From b0463fbf75a83127352d52d6ac295bb73d16a6d0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 4 Oct 2020 14:56:48 +0200 Subject: [PATCH 05/12] set_annotations explanation --- website/docs/usage/layers-architectures.md | 48 ++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index c4b3fb9dc..7e563cb5c 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -613,7 +613,7 @@ get_candidates = model.attrs["get_candidates"] #### Step 2: Implementing the pipeline component {#component-rel-pipe} -To use our new relation extraction model as part of a custom component, we +To use our new relation extraction model as part of a custom component, we create a subclass of [`Pipe`](/api/pipe) that will hold the model: ```python @@ -635,15 +635,57 @@ def make_relation_extractor(nlp, name, model, labels): return RelationExtractor(nlp.vocab, model, name, labels=labels) ``` -The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. -In our case, we can simply delegate to the internal model's +The [`predict`](/api/pipe#predict) function needs to be implemented for each +subclass. In our case, we can simply delegate to the internal model's [predict](https://thinc.ai/docs/api-model#predict) function: + ```python def predict(self, docs: Iterable[Doc]) -> Floats2d: scores = self.model.predict(docs) return self.model.ops.asarray(scores) ``` +The other method that needs to be implemented, is +[`set_annotations`](/api/pipe#set_annotations). 
It takes the predicted scores, +and modifies the given `Doc` object in place to hold the predictions. For our +relation extraction component, we'll store the data as a dictionary in a custom +extension attribute `doc._.rel`. As keys, we represent the candidate pair by the +start offsets of each entity, as this defines an entity uniquely within one +document. + +To interpret the scores predicted by the REL model correctly, we need to +refer to the model's `get_candidates` function that originally defined which +pairs of entities would be run through the model, so that the scores can be +related to those exact entities: + +> #### Example output +> +> ```python +> doc = nlp("Amsterdam is the capital of the Netherlands.") +> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}") +> for value, rel_dict in doc._.rel.items(): +> print(f"{value}: {rel_dict}") +> ``` + +> ``` +> spans: [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')] +> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002} +> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} +> ``` + +```python +def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d): + c = 0 + get_candidates = self.model.attrs["get_candidates"] + for doc in docs: + for (e1, e2) in get_candidates(doc): + offset = (e1.start, e2.start) + if offset not in doc._.rel: + doc._.rel[offset] = {} + for j, label in enumerate(self.labels): + doc._.rel[offset][label] = rel_scores[c, j] + c += 1 +``` From 52b660e9dcc412fc1d4bbdf269c1bd31d9e7d3a4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 00:39:36 +0200 Subject: [PATCH 06/12] initialize and update explanation --- website/docs/api/pipe.md | 6 + website/docs/usage/layers-architectures.md | 149 ++++++++++++++++----- 2 files changed, 119 insertions(+), 36 deletions(-) diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 4f5ac6f61..de35f9eb4 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ 
-226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental. Find the loss and gradient of loss for the batch of documents and their predicted scores. + + +This method needs to be overwritten with your own custom `get_loss` method. + + + > #### Example > > ```python diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 7e563cb5c..130a7144e 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -618,31 +618,97 @@ create a subclass of [`Pipe`](/api/pipe) that will hold the model: ```python from spacy.pipeline import Pipe -from spacy.language import Language class RelationExtractor(Pipe): def __init__(self, vocab, model, name="rel", labels=[]): + self.model = model ... def predict(self, docs): ... - def set_annotations(self, docs, scores): + def set_annotations(self, docs, predictions): ... - -@Language.factory("relation_extractor") -def make_relation_extractor(nlp, name, model, labels): - return RelationExtractor(nlp.vocab, model, name, labels=labels) ``` +Before the model can be used however, it needs to be +[initialized](/api/pipe#initialize). This function receives either the full +training data set, or a representative sample. The training data can be used +to deduce all relevant labels. Alternatively, a list of labels can be provided, +or a script can call `rel_component.add_label()` to add each label separately. + +The number of labels will define the output dimensionality of the network, +and will be used to do +[shape inference](https://thinc.ai/docs/usage-models#validation) throughout +the layers of the neural network. This is triggered by calling `model.initialize`. 
+ +```python +from itertools import islice + +def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Language = None, + labels: Optional[List[str]] = None, +): + if labels is not None: + for label in labels: + self.add_label(label) + else: + for example in get_examples(): + relations = example.reference._.rel + for indices, label_dict in relations.items(): + for label in label_dict.keys(): + self.add_label(label) + subbatch = list(islice(get_examples(), 10)) + doc_sample = [eg.reference for eg in subbatch] + label_sample = self._examples_to_truth(subbatch) + self.model.initialize(X=doc_sample, Y=label_sample) +``` + +The `initialize` method will be triggered whenever this component is part of an +`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline +component and its internal model can be trained and used to make predictions. + +During training the function [`update`](/api/pipe#update) is invoked which delegates to +[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and +needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the +loss for a batch of examples, as well as the gradient of loss that will be used to update +the weights of the model layers. + +```python +def update( + self, + examples: Iterable[Example], + *, + drop: float = 0.0, + set_annotations: bool = False, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, +) -> Dict[str, float]: + ... + docs = [ex.predicted for ex in examples] + predictions, backprop = self.model.begin_update(docs) + loss, gradient = self.get_loss(examples, predictions) + backprop(gradient) + losses[self.name] += loss + ... + return losses +``` + +Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used +for the implementation of the `get_loss` function. + +When the internal model is trained, the component can be used to make novel predictions. 
The [`predict`](/api/pipe#predict) function needs to be implemented for each -subclass. In our case, we can simply delegate to the internal model's +subclass of `Pipe`. In our case, we can simply delegate to the internal model's [predict](https://thinc.ai/docs/api-model#predict) function: ```python def predict(self, docs: Iterable[Doc]) -> Floats2d: - scores = self.model.predict(docs) - return self.model.ops.asarray(scores) + predictions = self.model.predict(docs) + return self.model.ops.asarray(predictions) ``` The other method that needs to be implemented, is @@ -650,7 +716,7 @@ The other method that needs to be implemented, is and modifies the given `Doc` object in place to hold the predictions. For our relation extraction component, we'll store the data as a dictionary in a custom extension attribute `doc._.rel`. As keys, we represent the candidate pair by the -start offsets of each entity, as this defines an entity uniquely within one +start offsets of each entity, as this defines an entity pair uniquely within one document. To interpret the scores predicted by the REL model correctly, we need to @@ -674,7 +740,7 @@ related to those exact entities: > ``` ```python -def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d): +def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c = 0 get_candidates = self.model.attrs["get_candidates"] for doc in docs: @@ -683,34 +749,45 @@ def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d): if offset not in doc._.rel: doc._.rel[offset] = {} for j, label in enumerate(self.labels): - doc._.rel[offset][label] = rel_scores[c, j] + doc._.rel[offset][label] = predictions[c, j] c += 1 ``` - - - - - - - - + +Once our `Pipe` subclass is fully implemented, we can +[register](/usage/processing-pipelines#custom-components-factories) +the component with the +`Language.factory` decorator. This will enable the creation of the component with +`nlp.add_pipe`, or via the config. 
+ +> ``` +> +> [components.relation_extractor] +> factory = "relation_extractor" +> labels = [] +> +> [components.relation_extractor.model] +> @architectures = "rel_model.v1" +> ... +> ``` + +```python +from spacy.language import Language + +@Language.factory("relation_extractor") +def make_relation_extractor(nlp, name, model, labels): + return RelationExtractor(nlp.vocab, model, name, labels=labels) +``` + + + + From 9a6c9b133b796d4b766189740ef1fc88f6dbe3ee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 01:05:37 +0200 Subject: [PATCH 07/12] various small fixes --- website/docs/usage/layers-architectures.md | 142 +++++++++++---------- 1 file changed, 74 insertions(+), 68 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 130a7144e..414562d6d 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -288,7 +288,7 @@ those parts of the network. To use our custom model including the PyTorch subnetwork, all we need to do is register the architecture using the -[`architectures` registry](/api/top-level#registry). This will assign the +[`architectures` registry](/api/top-level#registry). This assigns the architecture a name so spaCy knows how to find it, and allows passing in arguments like hyperparameters via the [config](/usage/training#config). The full example then becomes: @@ -488,27 +488,27 @@ with Model.define_operators({">>": chain}): In addition to [swapping out](#swap-architectures) default models in built-in components, you can also implement an entirely new, -[trainable pipeline component](usage/processing-pipelines#trainable-components) +[trainable pipeline component](/usage/processing-pipelines#trainable-components) from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), and linking it up to your custom model implementation. 
### Example: Pipeline component for relation extraction {#component-rel} This section outlines an example use-case of implementing a novel relation -extraction component from scratch. We assume we want to implement a binary -relation extraction method that determines whether two entities in a document -are related or not, and if so, with what type of relation. We'll allow multiple -types of relations between two such entities - i.e. it is a multi-label setting. +extraction component from scratch. We'll implement a binary relation extraction +method that determines whether or not two entities in a document are related, +and if so, what type of relation. We'll allow multiple types of relations +between two such entities (multi-label setting). There are two major steps required: first, we need to [implement a machine learning model](#component-rel-model) specific to this -task, and then we'll use this model to +task, and subsequently we use this model to [implement a custom pipeline component](#component-rel-pipe). #### Step 1: Implementing the Model {#component-rel-model} -We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes -a list of documents as input, and outputs a two-dimensional matrix of scores: +We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a +list of documents as input, and outputs a two-dimensional matrix of predictions: ```python @registry.architectures.register("rel_model.v1") @@ -519,17 +519,16 @@ def create_relation_model(...) -> Model[List[Doc], Floats2d]: The first layer in this model will typically be an [embedding layer](/usage/embeddings-transformers) such as a -[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This -layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it +[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). 
This +layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it transforms each document into a list of tokens, with each token being represented by its embedding in the vector space. -Next, we need a method that will generate pairs of entities that we want to -classify as being related or not. These candidate pairs are typically formed -within one document, which means we'll have a function that takes a `Doc` as -input and outputs a `List` of `Span` tuples. For instance, a very -straightforward implementation would be to just take any two entities from the -same document: +Next, we need a method that generates pairs of entities that we want to classify +as being related or not. As these candidate pairs are typically formed within +one document, this function takes a `Doc` as input and outputs a `List` of +`Span` tuples. For instance, a very straightforward implementation would be to +just take any two entities from the same document: ```python def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: @@ -549,12 +548,12 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: > > [model.get_candidates] > @misc = "rel_cand_generator.v2" -> max_length = 6 +> max_length = 20 > ``` But we could also refine this further by excluding relations of an entity with itself, and posing a maximum distance (in number of tokens) between two -entities. We'll register this function in the +entities. We register this function in the [`@misc` registry](/api/top-level#registry) so we can refer to it from the config, and easily swap it out for any other candidate generation function. @@ -573,10 +572,10 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span return get_candidates ``` -Finally, we'll require a method that transforms the candidate pairs of entities -into a 2D tensor using the specified Tok2Vec function, and this `Floats2d` -object will then be processed by a final `output_layer` of the network. 
Taking -all this together, we can define our relation model like this in the config: +Finally, we require a method that transforms the candidate entity pairs into a +2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d` +object will then be processed by a final `output_layer` of the network. Putting +all this together, we can define our relation model in a config file as such: ``` [model] @@ -588,7 +587,7 @@ all this together, we can define our relation model like this in the config: [model.get_candidates] @misc = "rel_cand_generator.v2" -max_length = 6 +max_length = 20 [model.create_candidate_tensor] @misc = "rel_cand_tensor.v1" @@ -600,7 +599,7 @@ max_length = 6 -When creating this model, we'll store the custom functions as +When creating this model, we store the custom functions as [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as references, so we can access them easily: @@ -614,7 +613,7 @@ get_candidates = model.attrs["get_candidates"] #### Step 2: Implementing the pipeline component {#component-rel-pipe} To use our new relation extraction model as part of a custom component, we -create a subclass of [`Pipe`](/api/pipe) that will hold the model: +create a subclass of [`Pipe`](/api/pipe) that holds the model: ```python from spacy.pipeline import Pipe @@ -624,6 +623,9 @@ class RelationExtractor(Pipe): self.model = model ... + def update(self, examples, ...): + ... + def predict(self, docs): ... @@ -631,18 +633,19 @@ class RelationExtractor(Pipe): ... ``` -Before the model can be used however, it needs to be -[initialized](/api/pipe#initialize). This function recieves either the full -training data set, or a representative sample. The training data can be used -to deduce all relevant labels. Alternatively, a list of labels can be provided, -or a script can call `rel_component.add_label()` to add each label separately. +Before the model can be used, it needs to be +[initialized](/api/pipe#initialize). 
This function receives either the full +training data set, or a representative sample. This data set can be used to +deduce all relevant labels. Alternatively, a list of labels can be provided, or +a script can call `rel_component.add_label()` directly. -The number of labels will define the output dimensionality of the network, -and will be used to do -[shape inference](https://thinc.ai/docs/usage-models#validation) throughout -the layers of the neural network. This is triggerd by calling `model.initialize`. +The number of labels defines the output dimensionality of the network, and will +be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) +throughout the layers of the neural network. This is triggered by calling +`model.initialize`. ```python +### {highlight="12,18,22"} from itertools import islice def initialize( @@ -666,18 +669,21 @@ def initialize( label_sample = self._examples_to_truth(subbatch) self.model.initialize(X=doc_sample, Y=label_sample) ``` - -The `initialize` method will be triggered whenever this component is part of an -`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline -component and its internal model can be trained and used to make predictions. -During training the function [`update`](/api/pipe#update) is invoked which delegates to -[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and -needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the -loss for a batch of examples, as well as the gradient of loss that will be used to update -the weights of the model layers. +The `initialize` method is triggered whenever this component is part of an `nlp` +pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After +doing so, the pipeline component and its internal model can be trained and used +to make predictions. 
+ +During training, the function [`update`](/api/pipe#update) is invoked which +delegates to +[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a +[`get_loss`](/api/pipe#get_loss) function that calculate the loss for a batch of +examples, as well as the gradient of loss that will be used to update the +weights of the model layers. ```python +### {highlight="12-14"} def update( self, examples: Iterable[Example], @@ -697,13 +703,13 @@ def update( return losses ``` -Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used -for the implementation of the `get_loss` function. +Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can +be used for the implementation of the `get_loss` function. -When the internal model is trained, the component can be used to make novel predictions. -The [`predict`](/api/pipe#predict) function needs to be implemented for each -subclass of `Pipe`. In our case, we can simply delegate to the internal model's -[predict](https://thinc.ai/docs/api-model#predict) function: +When the internal model is trained, the component can be used to make novel +predictions. The [`predict`](/api/pipe#predict) function needs to be implemented +for each subclass of `Pipe`. In our case, we can simply delegate to the internal +model's [predict](https://thinc.ai/docs/api-model#predict) function: ```python def predict(self, docs: Iterable[Doc]) -> Floats2d: @@ -711,24 +717,24 @@ def predict(self, docs: Iterable[Doc]) -> Floats2d: return self.model.ops.asarray(predictions) ``` -The other method that needs to be implemented, is -[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores, -and modifies the given `Doc` object in place to hold the predictions. For our -relation extraction component, we'll store the data as a dictionary in a custom +The final method that needs to be implemented, is +[`set_annotations`](/api/pipe#set_annotations). 
This function takes the +predictions, and modifies the given `Doc` object in place to store them. For our +relation extraction component, we store the data as a dictionary in a custom extension attribute `doc._.rel`. As keys, we represent the candidate pair by the start offsets of each entity, as this defines an entity pair uniquely within one document. -To interpret the scores predicted by the REL model correctly, we need to -refer to the model's `get_candidates` function that originally defined which -pairs of entities would be run through the model, so that the scores can be -related to those exact entities: +To interpret the scores predicted by the REL model correctly, we need to refer +to the model's `get_candidates` function that defined which pairs of entities +were relevant candidates, so that the predictions can be linked to those exact +entities: > #### Example output > > ```python > doc = nlp("Amsterdam is the capital of the Netherlands.") -> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}") +> print(f"spans: [(e.start, e.text, e.label_) for e in doc.ents]") > for value, rel_dict in doc._.rel.items(): > print(f"{value}: {rel_dict}") > ``` @@ -740,6 +746,7 @@ related to those exact entities: > ``` ```python +### {highlight="5-6,10"} def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c = 0 get_candidates = self.model.attrs["get_candidates"] @@ -753,8 +760,8 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c += 1 ``` -Under the hood, when the pipe is applied to a document, it will delegate to these -two methods: +Under the hood, when the pipe is applied to a document, it delegates to the +`predict` and `set_annotations` functions: ```python def __call__(self, Doc doc): @@ -763,18 +770,17 @@ def __call__(self, Doc doc): return doc ``` -Once our `Pipe` subclass is fully implemented, we can -[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) -the component with 
the -`Language.factory` decorator. This will enable the creation of the component with -`nlp.add_pipe`, or via the config. +Once our `Pipe` subclass is fully implemented, we can +[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) +the component with the `Language.factory` decorator. This enables the creation +of the component with `nlp.add_pipe`, or via the config. > ``` -> +> > [components.relation_extractor] > factory = "relation_extractor" > labels = [] -> +> > [components.relation_extractor.model] > @architectures = "rel_model.v1" > ... From b0b93854cb2c522090c87544e33a19e6b361ed19 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 09:26:43 +0200 Subject: [PATCH 08/12] Update ru/uk lemmatizers for new nlp.initialize --- spacy/lang/ru/__init__.py | 10 ++++++++-- spacy/lang/ru/lemmatizer.py | 5 ++--- spacy/lang/uk/__init__.py | 4 ++-- spacy/lang/uk/lemmatizer.py | 5 ++--- spacy/tests/conftest.py | 1 - 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 1d59ca043..2f3965fcc 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -25,8 +25,14 @@ class Russian(Language): default_config={"model": None, "mode": "pymorphy2"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return RussianLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool = False, +): + return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Russian"] diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 8d7996c63..3bcac8730 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple from thinc.api import Model -from ...lookups import Lookups from 
...pipeline import Lemmatizer from ...symbols import POS from ...tokens import Token @@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer): name: str = "lemmatizer", *, mode: str = "pymorphy2", - lookups: Optional[Lookups] = None, + overwrite: bool = False, ) -> None: - super().__init__(vocab, model, name, mode=mode, lookups=lookups) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) try: from pymorphy2 import MorphAnalyzer diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 73c065379..0abe9170e 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -26,8 +26,8 @@ class Ukrainian(Language): default_config={"model": None, "mode": "pymorphy2"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False,): + return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Ukrainian"] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 0d6febce6..009ec5044 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -3,7 +3,6 @@ from typing import Optional from thinc.api import Model from ..ru.lemmatizer import RussianLemmatizer -from ...lookups import Lookups from ...vocab import Vocab @@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer): name: str = "lemmatizer", *, mode: str = "pymorphy2", - lookups: Optional[Lookups] = None, + overwrite: bool = False, ) -> None: - super().__init__(vocab, model, name, mode=mode, lookups=lookups) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) try: from pymorphy2 import MorphAnalyzer except ImportError: diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4a3d126d7..67860b7e4 100644 --- a/spacy/tests/conftest.py +++ 
b/spacy/tests/conftest.py @@ -248,7 +248,6 @@ def tt_tokenizer(): @pytest.fixture(scope="session") def uk_tokenizer(): pytest.importorskip("pymorphy2") - pytest.importorskip("pymorphy2.lang") return get_lang_class("uk")().tokenizer From 1c641e41c3d46c5b555891427833200c0f0087b5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 11:50:11 +0200 Subject: [PATCH 09/12] Remove unused import [ci skip] --- spacy/tests/regression/test_issue5918.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py index e4ee0135d..d25323ef6 100644 --- a/spacy/tests/regression/test_issue5918.py +++ b/spacy/tests/regression/test_issue5918.py @@ -1,6 +1,5 @@ from spacy.lang.en import English from spacy.pipeline import merge_entities -import pytest def test_issue5918(): From e3acad626443c9cf0b81f600aae2b3b9529b63cd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 13:06:20 +0200 Subject: [PATCH 10/12] Update docs [ci skip] --- website/docs/usage/layers-architectures.md | 261 +++++++++++++-------- 1 file changed, 162 insertions(+), 99 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 414562d6d..24c7bf1cf 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -86,7 +86,8 @@ see are: ​ | ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. | | ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. | -The model type signatures help you figure out which model architectures and +See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The +model type signatures help you figure out which model architectures and components can **fit together**. 
For instance, the [`TextCategorizer`](/api/textcategorizer) class expects a model typed ~~Model[List[Doc], Floats2d]~~, because the model will predict one row of @@ -488,32 +489,57 @@ with Model.define_operators({">>": chain}): In addition to [swapping out](#swap-architectures) default models in built-in components, you can also implement an entirely new, -[trainable pipeline component](/usage/processing-pipelines#trainable-components) +[trainable](/usage/processing-pipelines#trainable-components) pipeline component from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), and linking it up to your custom model implementation. -### Example: Pipeline component for relation extraction {#component-rel} + -This section outlines an example use-case of implementing a novel relation -extraction component from scratch. We'll implement a binary relation extraction -method that determines whether or not two entities in a document are related, -and if so, what type of relation. We'll allow multiple types of relations -between two such entities (multi-label setting). +For details on how to implement pipeline components, check out the usage guide +on [custom components](/usage/processing-pipelines#custom-component) and the +overview of the `Pipe` methods used by +[trainable components](/usage/processing-pipelines#trainable-components). -There are two major steps required: first, we need to -[implement a machine learning model](#component-rel-model) specific to this -task, and subsequently we use this model to -[implement a custom pipeline component](#component-rel-pipe). + + +### Example: Entity relation extraction component {#component-rel} + +This section outlines an example use-case of implementing a **novel relation +extraction component** from scratch. We'll implement a binary relation +extraction method that determines whether or not **two entities** in a document +are related, and if so, what type of relation.
We'll allow multiple types of +relations between two such entities (multi-label setting). There are two major +steps required: + +1. Implement a [machine learning model](#component-rel-model) specific to this + task. It will have to extract candidates from a [`Doc`](/api/doc) and predict + a relation for the available candidate pairs. +2. Implement a custom [pipeline component](#component-rel-pipe) powered by the + machine learning model that sets annotations on the [`Doc`](/api/doc) passing + through the pipeline. + + #### Step 1: Implementing the Model {#component-rel-model} We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a -list of documents as input, and outputs a two-dimensional matrix of predictions: +**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional +matrix** (~~Floats2d~~) of predictions: + +> #### Model type annotations +> +> The `Model` class is a generic type that can specify its input and output +> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static +> type checks and validation. See the section on [type signatures](#type-sigs) +> for details. ```python +### Register the model architecture @registry.architectures.register("rel_model.v1") def create_relation_model(...) -> Model[List[Doc], Floats2d]: - model = _create_my_model() + model = ... # 👈 model will go here return model ``` @@ -521,17 +547,18 @@ The first layer in this model will typically be an [embedding layer](/usage/embeddings-transformers) such as a [`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it -transforms each document into a list of tokens, with each token being +transforms each **document into a list of tokens**, with each token being represented by its embedding in the vector space. -Next, we need a method that generates pairs of entities that we want to classify -as being related or not. 
As these candidate pairs are typically formed within -one document, this function takes a `Doc` as input and outputs a `List` of -`Span` tuples. For instance, a very straightforward implementation would be to -just take any two entities from the same document: +Next, we need a method that **generates pairs of entities** that we want to +classify as being related or not. As these candidate pairs are typically formed +within one document, this function takes a [`Doc`](/api/doc) as input and +outputs a `List` of `Span` tuples. For instance, a very straightforward +implementation would be to just take any two entities from the same document: ```python -def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: +### Simple candidate generation +def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]: candidates = [] for ent1 in doc.ents: for ent2 in doc.ents: @@ -539,27 +566,29 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: return candidates ``` -> ``` -> [model] -> @architectures = "rel_model.v1" -> -> [model.tok2vec] -> ... -> -> [model.get_candidates] -> @misc = "rel_cand_generator.v2" -> max_length = 20 -> ``` - -But we could also refine this further by excluding relations of an entity with -itself, and posing a maximum distance (in number of tokens) between two +But we could also refine this further by **excluding relations** of an entity +with itself, and posing a **maximum distance** (in number of tokens) between two entities. We register this function in the [`@misc` registry](/api/top-level#registry) so we can refer to it from the config, and easily swap it out for any other candidate generation function. +> #### config.cfg (excerpt) +> +> ```ini +> [model] +> @architectures = "rel_model.v1" +> +> [model.tok2vec] +> # ...
+> +> [model.get_candidates] +> @misc = "rel_cand_generator.v1" +> max_length = 20 +> ``` + ```python -### {highlight="1,2,7,8"} -@registry.misc.register("rel_cand_generator.v2") +### Extended candidate generation {highlight="1,2,7,8"} +@registry.misc.register("rel_cand_generator.v1") def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: candidates = [] @@ -573,17 +602,19 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span ``` Finally, we require a method that transforms the candidate entity pairs into a -2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d` -object will then be processed by a final `output_layer` of the network. Putting -all this together, we can define our relation model in a config file as such: +2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or +[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be +processed by a final `output_layer` of the network. Putting all this together, +we can define our relation model in a config file as such: -``` +```ini +### config.cfg [model] @architectures = "rel_model.v1" -... +# ... [model.tok2vec] -... +# ... [model.get_candidates] @misc = "rel_cand_generator.v2" @@ -594,10 +625,11 @@ max_length = 20 [model.output_layer] @architectures = "rel_output_layer.v1" -... +# ...
``` - + + When creating this model, we store the custom functions as [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as @@ -612,40 +644,55 @@ get_candidates = model.attrs["get_candidates"] #### Step 2: Implementing the pipeline component {#component-rel-pipe} -To use our new relation extraction model as part of a custom component, we +To use our new relation extraction model as part of a custom +[trainable component](/usage/processing-pipelines#trainable-components), we create a subclass of [`Pipe`](/api/pipe) that holds the model: ```python +### Pipeline component skeleton from spacy.pipeline import Pipe class RelationExtractor(Pipe): - def __init__(self, vocab, model, name="rel", labels=[]): + def __init__(self, vocab, model, name="rel"): + """Create a component instance.""" self.model = model - ... + self.vocab = vocab + self.name = name - def update(self, examples, ...): + def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None): + """Learn from a batch of Example objects.""" ... def predict(self, docs): + """Apply the model to a batch of Doc objects.""" ... def set_annotations(self, docs, predictions): + """Modify a batch of Doc objects using the predictions.""" ... + + def initialize(self, get_examples, nlp=None, labels=None): + """Initialize the model before training.""" + ... + + def add_label(self, label): + """Add a label to the component.""" + ... ``` Before the model can be used, it needs to be -[initialized](/api/pipe#initialize). This function receives either the full -training data set, or a representative sample. This data set can be used to -deduce all relevant labels. Alternatively, a list of labels can be provided, or -a script can call `rel_component.add_label()` directly. - -The number of labels defines the output dimensionality of the network, and will -be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) -throughout the layers of the neural network. 
This is triggered by calling -`model.initialize`. +[initialized](/usage/training#initialization). This function receives a callback +to access the full **training data set**, or a representative sample. This data +set can be used to deduce all **relevant labels**. Alternatively, a list of +labels can be provided to `initialize`, or you can call the +`RelationExtractor.add_label` method directly. The number of labels defines the output +dimensionality of the network, and will be used to do +[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the +layers of the neural network. This is triggered by calling +[`Model.initialize`](https://thinc.ai/docs/api-model#initialize). ```python -### {highlight="12,18,22"} +### The initialize method {highlight="12,18,22"} from itertools import islice def initialize( @@ -671,19 +718,22 @@ def initialize( ``` The `initialize` method is triggered whenever this component is part of an `nlp` -pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After -doing so, the pipeline component and its internal model can be trained and used -to make predictions. +pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked. +Typically, this happens when the pipeline is set up before training in +[`spacy train`](/api/cli#training). After initialization, the pipeline component +and its internal model can be trained and used to make predictions. During training, the function [`update`](/api/pipe#update) is invoked which delegates to -[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a -[`get_loss`](/api/pipe#get_loss) function that calculate the loss for a batch of -examples, as well as the gradient of loss that will be used to update the -weights of the model layers.
+[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a +[`get_loss`](/api/pipe#get_loss) function that **calculate the loss** for a +batch of examples, as well as the **gradient** of loss that will be used to +update the weights of the model layers. Thinc provides several +[loss functions](https://thinc.ai/docs/api-loss) that can be used for the +implementation of the `get_loss` function. ```python -### {highlight="12-14"} +### The update method {highlight="12-14"} def update( self, examples: Iterable[Example], @@ -703,15 +753,14 @@ def update( return losses ``` -Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can -be used for the implementation of the `get_loss` function. - When the internal model is trained, the component can be used to make novel -predictions. The [`predict`](/api/pipe#predict) function needs to be implemented -for each subclass of `Pipe`. In our case, we can simply delegate to the internal -model's [predict](https://thinc.ai/docs/api-model#predict) function: +**predictions**. The [`predict`](/api/pipe#predict) function needs to be +implemented for each subclass of `Pipe`. In our case, we can simply delegate to +the internal model's [predict](https://thinc.ai/docs/api-model#predict) function +that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array: ```python +### The predict method def predict(self, docs: Iterable[Doc]) -> Floats2d: predictions = self.model.predict(docs) return self.model.ops.asarray(predictions) @@ -721,32 +770,36 @@ The final method that needs to be implemented, is [`set_annotations`](/api/pipe#set_annotations). This function takes the predictions, and modifies the given `Doc` object in place to store them. For our relation extraction component, we store the data as a dictionary in a custom -extension attribute `doc._.rel`. As keys, we represent the candidate pair by the -start offsets of each entity, as this defines an entity pair uniquely within one -document. 
+[extension attribute](/usage/processing-pipelines#custom-components-attributes) +`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of +each entity**, as this defines an entity pair uniquely within one document. -To interpret the scores predicted by the REL model correctly, we need to refer -to the model's `get_candidates` function that defined which pairs of entities -were relevant candidates, so that the predictions can be linked to those exact -entities: +To interpret the scores predicted by the relation extraction model correctly, we +need to refer to the model's `get_candidates` function that defined which pairs +of entities were relevant candidates, so that the predictions can be linked to +those exact entities: > #### Example output > > ```python > doc = nlp("Amsterdam is the capital of the Netherlands.") -> print(f"spans: [(e.start, e.text, e.label_) for e in doc.ents]") +> print("spans", [(e.start, e.text, e.label_) for e in doc.ents]) > for value, rel_dict in doc._.rel.items(): > print(f"{value}: {rel_dict}") -> ``` - -> ``` -> spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')] -> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002} -> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} +> +> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')] +> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002} +> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} > ``` ```python -### {highlight="5-6,10"} +### Registering the extension attribute +from spacy.tokens import Doc +Doc.set_extension("rel", default={}) +``` + +```python +### The set_annotations method {highlight="5-6,10"} def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c = 0 get_candidates = self.model.attrs["get_candidates"] @@ -761,9 +814,10 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): ``` Under the hood, when the pipe is applied to a 
document, it delegates to the -`predict` and `set_annotations` functions: +`predict` and `set_annotations` methods: ```python +### The __call__ method def __call__(self, Doc doc): predictions = self.predict([doc]) self.set_annotations([doc], predictions) @@ -771,29 +825,38 @@ def __call__(self, Doc doc): ``` Once our `Pipe` subclass is fully implemented, we can -[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) -the component with the `Language.factory` decorator. This enables the creation -of the component with `nlp.add_pipe`, or via the config. +[register](/usage/processing-pipelines#custom-components-factories) the +component with the [`@Language.factory`](/api/language#factory) decorator. This +assigns it a name and lets you create the component with +[`nlp.add_pipe`](/api/language#add_pipe) and via the +[config](/usage/training#config). -> ``` +> #### config.cfg (excerpt) > +> ```ini > [components.relation_extractor] > factory = "relation_extractor" -> labels = [] > > [components.relation_extractor.model] > @architectures = "rel_model.v1" -> ... +> +> [components.relation_extractor.model.tok2vec] +> # ... 
+> +> [components.relation_extractor.model.get_candidates] +> @misc = "rel_cand_generator.v2" +> max_length = 20 > ``` ```python +### Registering the pipeline component from spacy.language import Language @Language.factory("relation_extractor") -def make_relation_extractor(nlp, name, model, labels): - return RelationExtractor(nlp.vocab, model, name, labels=labels) +def make_relation_extractor(nlp, name, model): + return RelationExtractor(nlp.vocab, model, name) ``` - + + --> From fd2d48556c1e77f4492693e4a69dc8f4a34cfe34 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 13:43:32 +0200 Subject: [PATCH 11/12] fix E902 and E903 numbering --- spacy/errors.py | 4 ++-- spacy/training/converters/conll_ner_to_docs.py | 2 +- spacy/training/converters/iob_to_docs.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 20edf45b5..9d9a716d2 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,10 +456,10 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master - E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " + E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " "Try checking whitespace and delimiters. See " "https://nightly.spacy.io/api/cli#convert") - E093 = ("The token-per-line NER file is not formatted correctly. Try checking " + E903 = ("The token-per-line NER file is not formatted correctly. Try checking " "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert") E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. 
This " "dimension refers to the output width, after the linear projection " diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index 28f0f87c3..c01686aee 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -103,7 +103,7 @@ def conll_ner_to_docs( lines = [line.strip() for line in conll_sent.split("\n") if line.strip()] cols = list(zip(*[line.split() for line in lines])) if len(cols) < 2: - raise ValueError(Errors.E093) + raise ValueError(Errors.E903) length = len(cols[0]) words.extend(cols[0]) sent_starts.extend([True] + [False] * (length - 1)) diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py index 73ad8953d..a2185fef7 100644 --- a/spacy/training/converters/iob_to_docs.py +++ b/spacy/training/converters/iob_to_docs.py @@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents): sent_words, sent_iob = zip(*sent_tokens) sent_tags = ["-"] * len(sent_words) else: - raise ValueError(Errors.E092) + raise ValueError(Errors.E902) words.extend(sent_words) tags.extend(sent_tags) iob.extend(sent_iob) From 20f2a17a09dc053b5f2f06cff637fb92647137ad Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 13:45:57 +0200 Subject: [PATCH 12/12] Merge test_misc and test_util --- spacy/tests/test_misc.py | 134 ++++++++++++++++++++++++++++++++++++++ spacy/tests/test_util.py | 137 --------------------------------------- 2 files changed, 134 insertions(+), 137 deletions(-) delete mode 100644 spacy/tests/test_util.py diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index e6ef45f90..bdf54ad6a 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -7,6 +7,15 @@ from spacy import util from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding +from spacy.util 
import dot_to_object, SimpleFrozenList +from thinc.api import Config, Optimizer, ConfigValidationError +from spacy.training.batchers import minibatch_by_words +from spacy.lang.en import English +from spacy.lang.nl import Dutch +from spacy.language import DEFAULT_CONFIG_PATH +from spacy.schemas import ConfigSchemaTraining + +from .util import get_random_doc @pytest.fixture @@ -157,3 +166,128 @@ def test_dot_to_dict(dot_notation, expected): result = util.dot_to_dict(dot_notation) assert result == expected assert util.dict_to_dot(result) == dot_notation + + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 400, 199], [3]), + ([400, 400, 199, 3], [4]), + ([400, 400, 199, 3, 200], [3, 2]), + ([400, 400, 199, 3, 1], [5]), + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded + ([400, 400, 199, 3, 1, 200], [3, 3]), + ([400, 400, 199, 3, 1, 999], [3, 3]), + ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), + ([1, 2, 999], [3]), + ([1, 2, 999, 1], [4]), + ([1, 200, 999, 1], [2, 2]), + ([1, 999, 200, 1], [2, 2]), + ], +) +def test_util_minibatch(doc_sizes, expected_batches): + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] + tol = 0.2 + batch_size = 1000 + batches = list( + minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True) + ) + assert [len(batch) for batch in batches] == expected_batches + + max_size = batch_size + batch_size * tol + for batch in batches: + assert sum([len(doc) for doc in batch]) < max_size + + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 4000, 199], [1, 2]), + ([400, 400, 199, 3000, 200], [1, 4]), + ([400, 400, 199, 3, 1, 1500], [1, 5]), + ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]), + ([1, 2, 9999], [1, 2]), + ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]), + ], +) +def test_util_minibatch_oversize(doc_sizes, expected_batches): + """ Test that oversized documents are returned in their own batch""" + docs = [get_random_doc(doc_size) for doc_size 
in doc_sizes] + tol = 0.2 + batch_size = 1000 + batches = list( + minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False) + ) + assert [len(batch) for batch in batches] == expected_batches + + +def test_util_dot_section(): + cfg_string = """ + [nlp] + lang = "en" + pipeline = ["textcat"] + + [components] + + [components.textcat] + factory = "textcat" + + [components.textcat.model] + @architectures = "spacy.TextCatBOW.v1" + exclusive_classes = true + ngram_size = 1 + no_output_layer = false + """ + nlp_config = Config().from_str(cfg_string) + en_nlp = util.load_model_from_config(nlp_config, auto_fill=True) + default_config = Config().from_disk(DEFAULT_CONFIG_PATH) + default_config["nlp"]["lang"] = "nl" + nl_nlp = util.load_model_from_config(default_config, auto_fill=True) + # Test that creation went OK + assert isinstance(en_nlp, English) + assert isinstance(nl_nlp, Dutch) + assert nl_nlp.pipe_names == [] + assert en_nlp.pipe_names == ["textcat"] + # not exclusive_classes + assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False + # Test that default values got overwritten + assert en_nlp.config["nlp"]["pipeline"] == ["textcat"] + assert nl_nlp.config["nlp"]["pipeline"] == [] # default value [] + # Test proper functioning of 'dot_to_object' + with pytest.raises(KeyError): + dot_to_object(en_nlp.config, "nlp.pipeline.tagger") + with pytest.raises(KeyError): + dot_to_object(en_nlp.config, "nlp.unknownattribute") + T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining) + assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer) + + +def test_simple_frozen_list(): + t = SimpleFrozenList(["foo", "bar"]) + assert t == ["foo", "bar"] + assert t.index("bar") == 1 # okay method + with pytest.raises(NotImplementedError): + t.append("baz") + with pytest.raises(NotImplementedError): + t.sort() + with pytest.raises(NotImplementedError): + t.extend(["baz"]) + with 
pytest.raises(NotImplementedError): + t.pop() + t = SimpleFrozenList(["foo", "bar"], error="Error!") + with pytest.raises(NotImplementedError): + t.append("baz") + + +def test_resolve_dot_names(): + config = { + "training": {"optimizer": {"@optimizers": "Adam.v1"}}, + "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, + } + result = util.resolve_dot_names(config, ["training.optimizer"]) + assert isinstance(result[0], Optimizer) + with pytest.raises(ConfigValidationError) as e: + util.resolve_dot_names(config, ["training.xyz", "training.optimizer"]) + errors = e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ["training", "xyz"] diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py deleted file mode 100644 index f710a38eb..000000000 --- a/spacy/tests/test_util.py +++ /dev/null @@ -1,137 +0,0 @@ -import pytest - -from spacy import util -from spacy.util import dot_to_object, SimpleFrozenList -from thinc.api import Config, Optimizer, ConfigValidationError -from spacy.training.batchers import minibatch_by_words -from spacy.lang.en import English -from spacy.lang.nl import Dutch -from spacy.language import DEFAULT_CONFIG_PATH -from spacy.schemas import ConfigSchemaTraining - -from .util import get_random_doc - - -@pytest.mark.parametrize( - "doc_sizes, expected_batches", - [ - ([400, 400, 199], [3]), - ([400, 400, 199, 3], [4]), - ([400, 400, 199, 3, 200], [3, 2]), - ([400, 400, 199, 3, 1], [5]), - ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded - ([400, 400, 199, 3, 1, 200], [3, 3]), - ([400, 400, 199, 3, 1, 999], [3, 3]), - ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), - ([1, 2, 999], [3]), - ([1, 2, 999, 1], [4]), - ([1, 200, 999, 1], [2, 2]), - ([1, 999, 200, 1], [2, 2]), - ], -) -def test_util_minibatch(doc_sizes, expected_batches): - docs = [get_random_doc(doc_size) for doc_size in doc_sizes] - tol = 0.2 - batch_size = 1000 - batches = list( - minibatch_by_words(docs, size=batch_size, tolerance=tol, 
discard_oversize=True) - ) - assert [len(batch) for batch in batches] == expected_batches - - max_size = batch_size + batch_size * tol - for batch in batches: - assert sum([len(doc) for doc in batch]) < max_size - - -@pytest.mark.parametrize( - "doc_sizes, expected_batches", - [ - ([400, 4000, 199], [1, 2]), - ([400, 400, 199, 3000, 200], [1, 4]), - ([400, 400, 199, 3, 1, 1500], [1, 5]), - ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]), - ([1, 2, 9999], [1, 2]), - ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]), - ], -) -def test_util_minibatch_oversize(doc_sizes, expected_batches): - """ Test that oversized documents are returned in their own batch""" - docs = [get_random_doc(doc_size) for doc_size in doc_sizes] - tol = 0.2 - batch_size = 1000 - batches = list( - minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False) - ) - assert [len(batch) for batch in batches] == expected_batches - - -def test_util_dot_section(): - cfg_string = """ - [nlp] - lang = "en" - pipeline = ["textcat"] - - [components] - - [components.textcat] - factory = "textcat" - - [components.textcat.model] - @architectures = "spacy.TextCatBOW.v1" - exclusive_classes = true - ngram_size = 1 - no_output_layer = false - """ - nlp_config = Config().from_str(cfg_string) - en_nlp = util.load_model_from_config(nlp_config, auto_fill=True) - default_config = Config().from_disk(DEFAULT_CONFIG_PATH) - default_config["nlp"]["lang"] = "nl" - nl_nlp = util.load_model_from_config(default_config, auto_fill=True) - # Test that creation went OK - assert isinstance(en_nlp, English) - assert isinstance(nl_nlp, Dutch) - assert nl_nlp.pipe_names == [] - assert en_nlp.pipe_names == ["textcat"] - # not exclusive_classes - assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False - # Test that default values got overwritten - assert en_nlp.config["nlp"]["pipeline"] == ["textcat"] - assert nl_nlp.config["nlp"]["pipeline"] == [] # default value [] - # Test proper functioning of 
'dot_to_object' - with pytest.raises(KeyError): - dot_to_object(en_nlp.config, "nlp.pipeline.tagger") - with pytest.raises(KeyError): - dot_to_object(en_nlp.config, "nlp.unknownattribute") - T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining) - assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer) - - -def test_simple_frozen_list(): - t = SimpleFrozenList(["foo", "bar"]) - assert t == ["foo", "bar"] - assert t.index("bar") == 1 # okay method - with pytest.raises(NotImplementedError): - t.append("baz") - with pytest.raises(NotImplementedError): - t.sort() - with pytest.raises(NotImplementedError): - t.extend(["baz"]) - with pytest.raises(NotImplementedError): - t.pop() - t = SimpleFrozenList(["foo", "bar"], error="Error!") - with pytest.raises(NotImplementedError): - t.append("baz") - - -def test_resolve_dot_names(): - config = { - "training": {"optimizer": {"@optimizers": "Adam.v1"}}, - "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, - } - result = util.resolve_dot_names(config, ["training.optimizer"]) - assert isinstance(result[0], Optimizer) - with pytest.raises(ConfigValidationError) as e: - util.resolve_dot_names(config, ["training.xyz", "training.optimizer"]) - errors = e.value.errors - assert len(errors) == 1 - assert errors[0]["loc"] == ["training", "xyz"]