From 2c4b2ee5e9b29442c119e9c8bb2b5bce761a78aa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 3 Oct 2020 23:27:05 +0200 Subject: [PATCH 01/55] REL intro and get_candidates function --- website/docs/usage/layers-architectures.md | 54 ++++++++++++++++++++++ website/docs/usage/processing-pipelines.md | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index b65c3d903..678f70667 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -486,6 +486,60 @@ with Model.define_operators({">>": chain}): ## Create new trainable components {#components} +In addition to [swapping out](#swap-architectures) default models in built-in +components, you can also implement an entirely new, +[trainable pipeline component](usage/processing-pipelines#trainable-components) +from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), +and linking it up to your custom model implementation. + +### Example: Pipeline component for relation extraction {#component-rel} + +This section will run through an example of implementing a novel relation extraction +component from scratch. As a first step, we need a method that will generate pairs of +entities that we want to classify as being related or not. These candidate pairs are +typically formed within one document, which means we'll have a function that takes a +`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus +on binary relation extraction, i.e. the tuple will be of length 2. + +We register this function in the 'misc' register so we can easily refer to it from the config, +and allow swapping it out for any candidate +generation function. For instance, a very straightforward implementation would be to just +take any two entities from the same document: + +```python +@registry.misc.register("rel_cand_generator.v1") +def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]: + def get_candidate_indices(doc: "Doc"): + indices = [] + for ent1 in doc.ents: + for ent2 in doc.ents: + indices.append((ent1, ent2)) + return indices + return get_candidate_indices +``` + +But we could also refine this further by excluding relations of an entity with itself, +and posing a maximum distance (in number of tokens) between two entities: + +```python +### {highlight="1,2,7,8"} +@registry.misc.register("rel_cand_generator.v2") +def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: + def get_candidate_indices(doc: "Doc"): + indices = [] + for ent1 in doc.ents: + for ent2 in doc.ents: + if ent1 != ent2: + if max_length and abs(ent2.start - ent1.start) <= max_length: + indices.append((ent1, ent2)) + return indices + return get_candidate_indices +``` + + + + + diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index c98bd08bc..3619993c5 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1035,7 +1035,7 @@ plug fully custom machine learning components into your pipeline. You'll need the following: 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This - can be a model using implemented in + can be a model implemented in [Thinc](/usage/layers-architectures#thinc), or a [wrapped model](/usage/layers-architectures#frameworks) implemented in PyTorch, TensorFlow, MXNet or a fully custom solution. 
The model must take a From 08ad349a1851c3310a4ae7f34170eea37c9e2e3b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 4 Oct 2020 00:08:02 +0200 Subject: [PATCH 02/55] tok2vec layer --- website/docs/usage/layers-architectures.md | 87 ++++++++++++++-------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 678f70667..6f79cc6e8 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -489,51 +489,80 @@ with Model.define_operators({">>": chain}): In addition to [swapping out](#swap-architectures) default models in built-in components, you can also implement an entirely new, [trainable pipeline component](usage/processing-pipelines#trainable-components) -from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), -and linking it up to your custom model implementation. +from scratch. This can be done by creating a new class inheriting from +[`Pipe`](/api/pipe), and linking it up to your custom model implementation. ### Example: Pipeline component for relation extraction {#component-rel} -This section will run through an example of implementing a novel relation extraction -component from scratch. As a first step, we need a method that will generate pairs of -entities that we want to classify as being related or not. These candidate pairs are -typically formed within one document, which means we'll have a function that takes a -`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus -on binary relation extraction, i.e. the tuple will be of length 2. - -We register this function in the 'misc' register so we can easily refer to it from the config, -and allow swapping it out for any candidate -generation function. For instance, a very straightforward implementation would be to just -take any two entities from the same document: +This section will run through an example of implementing a novel relation +extraction component from scratch. As a first step, we need a method that will +generate pairs of entities that we want to classify as being related or not. +These candidate pairs are typically formed within one document, which means +we'll have a function that takes a `Doc` as input and outputs a `List` of `Span` +tuples. In this example, we will focus on binary relation extraction, i.e. the +tuple will be of length 2. For instance, a very straightforward implementation +would be to just take any two entities from the same document: ```python -@registry.misc.register("rel_cand_generator.v1") -def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]: - def get_candidate_indices(doc: "Doc"): - indices = [] - for ent1 in doc.ents: - for ent2 in doc.ents: - indices.append((ent1, ent2)) - return indices - return get_candidate_indices +def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: + candidates = [] + for ent1 in doc.ents: + for ent2 in doc.ents: + candidates.append((ent1, ent2)) + return candidates ``` -But we could also refine this further by excluding relations of an entity with itself, -and posing a maximum distance (in number of tokens) between two entities: +But we could also refine this further by excluding relations of an entity with +itself, and posing a maximum distance (in number of tokens) between two +entities. 
We'll also register this function in the +[`@misc` registry](/api/top-level#registry) so we can refer to it from the +config, and easily swap it out for any other candidate generation function. + +> ``` +> [get_candidates] +> @misc = "rel_cand_generator.v2" +> max_length = 6 +> ``` ```python ### {highlight="1,2,7,8"} @registry.misc.register("rel_cand_generator.v2") def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: - def get_candidate_indices(doc: "Doc"): - indices = [] + def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: + candidates = [] for ent1 in doc.ents: for ent2 in doc.ents: if ent1 != ent2: if max_length and abs(ent2.start - ent1.start) <= max_length: - indices.append((ent1, ent2)) - return indices - return get_candidate_indices + candidates.append((ent1, ent2)) + return candidates + return get_candidates +``` + +> ``` +> [tok2vec] +> @architectures = "spacy.HashEmbedCNN.v1" +> pretrained_vectors = null +> width = 96 +> depth = 2 +> embed_size = 300 +> window_size = 1 +> maxout_pieces = 3 +> subword_features = true +> ``` + +Next, we'll assume we have access to an +[embedding layer](/usage/embeddings-transformers) such as a +[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This +layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it +transforms a list of documents into a list of 2D vectors. Further, this +`tok2vec` component will be trainable, which means that, following the Thinc +paradigm, we'll apply it to some input, and receive the predicted results as +well as a callback to perform backpropagation: + +```python +tok2vec = model.get_ref("tok2vec") +tokvecs, bp_tokvecs = tok2vec(docs, is_train=True) ``` From 452b8309f9e34530e5f592699a3601400f40ffb0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 4 Oct 2020 13:26:46 +0200 Subject: [PATCH 03/55] slight rewrite to hide some thinc implementation details --- website/docs/usage/layers-architectures.md | 98 ++++++++++++++-------- 1 file changed, 63 insertions(+), 35 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 6f79cc6e8..25f9a568c 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -373,7 +373,7 @@ gpu_allocator = "pytorch" Of course it's also possible to define the `Model` from the previous section entirely in Thinc. The Thinc documentation provides details on the [various layers](https://thinc.ai/docs/api-layers) and helper functions -available. Combinators can also be used to +available. Combinators can be used to [overload operators](https://thinc.ai/docs/usage-models#operators) and a common usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our simple neural network would then become: @@ -494,13 +494,34 @@ from scratch. This can be done by creating a new class inheriting from ### Example: Pipeline component for relation extraction {#component-rel} -This section will run through an example of implementing a novel relation -extraction component from scratch. As a first step, we need a method that will +This section outlines an example use-case of implementing a novel relation +extraction component from scratch. We assume we want to implement a binary +relation extraction method that determines whether two entities in a document +are related or not, and if so, with what type of relation. We'll allow multiple +types of relations between two such entities - i.e. it is a multi-label setting. 
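To make the multi-label setting concrete, the sketch below shows what a gold-standard annotation for a single document could look like: every candidate pair of entities (keyed here by the start offsets of the two entities) maps each relation label to a 0/1 value, and several labels can apply to the same pair at once. The label names and offsets are purely illustrative and not part of the original patch.

```python
# Illustrative only: gold relations for one document, keyed by the start
# offsets of the two entities in a candidate pair, with one value per label
gold_relations = {
    (0, 6): {"CAPITAL_OF": 1.0, "LOCATED_IN": 1.0},
    (6, 0): {"CAPITAL_OF": 0.0, "LOCATED_IN": 0.0},
}
```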
+ +We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes +a list of documents as input, and outputs a two-dimensional matrix of scores: + +```python +@registry.architectures.register("rel_model.v1") +def create_relation_model(...) -> Model[List[Doc], Floats2d]: + model = _create_my_model() + return model +``` + +The first layer in this model will typically be an +[embedding layer](/usage/embeddings-transformers) such as a +[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This +layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it +transforms each document into a list of tokens, with each token being +represented by its embedding in the vector space. + +Next, we need a method that will generate pairs of entities that we want to classify as being related or not. These candidate pairs are typically formed within one document, which means we'll have a function that takes a `Doc` as input and outputs a `List` of `Span` -tuples. In this example, we will focus on binary relation extraction, i.e. the -tuple will be of length 2. For instance, a very straightforward implementation +tuples. For instance, a very straightforward implementation would be to just take any two entities from the same document: ```python @@ -512,18 +533,24 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: return candidates ``` -But we could also refine this further by excluding relations of an entity with -itself, and posing a maximum distance (in number of tokens) between two -entities. We'll also register this function in the -[`@misc` registry](/api/top-level#registry) so we can refer to it from the -config, and easily swap it out for any other candidate generation function. - > ``` -> [get_candidates] +> [model] +> @architectures = "rel_model.v1" +> +> [model.tok2vec] +> ... +> +> [model.get_candidates] > @misc = "rel_cand_generator.v2" > max_length = 6 > ``` +But we could also refine this further by excluding relations of an entity with +itself, and posing a maximum distance (in number of tokens) between two +entities. We'll register this function in the +[`@misc` registry](/api/top-level#registry) so we can refer to it from the +config, and easily swap it out for any other candidate generation function. + ```python ### {highlight="1,2,7,8"} @registry.misc.register("rel_cand_generator.v2") @@ -539,32 +566,33 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span return get_candidates ``` +Finally, we'll require a method that transforms the candidate pairs of entities into +a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be +processed by a final `output_layer` of the network. Taking all this together, we can define +our relation model like this in the config: + > ``` -> [tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" -> pretrained_vectors = null -> width = 96 -> depth = 2 -> embed_size = 300 -> window_size = 1 -> maxout_pieces = 3 -> subword_features = true +> [model] +> @architectures = "rel_model.v1" +> nO = null +> +> [model.tok2vec] +> ... 
+> +> [model.get_candidates] +> @misc = "rel_cand_generator.v2" +> max_length = 6 +> +> [components.relation_extractor.model.create_candidate_tensor] +> @misc = "rel_cand_tensor.v1" +> +> [components.relation_extractor.model.output_layer] +> @architectures = "rel_output_layer.v1" +> nI = null +> nO = null > ``` -Next, we'll assume we have access to an -[embedding layer](/usage/embeddings-transformers) such as a -[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This -layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it -transforms a list of documents into a list of 2D vectors. Further, this -`tok2vec` component will be trainable, which means that, following the Thinc -paradigm, we'll apply it to some input, and receive the predicted results as -well as a callback to perform backpropagation: - -```python -tok2vec = model.get_ref("tok2vec") -tokvecs, bp_tokvecs = tok2vec(docs, is_train=True) -``` - + From 9f40d963fd92d2dc5de04af2bda45d79d440113e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 4 Oct 2020 14:11:53 +0200 Subject: [PATCH 04/55] highlight the two steps: the model and the pipeline component --- website/docs/usage/layers-architectures.md | 126 ++++++++++++++------- 1 file changed, 88 insertions(+), 38 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 25f9a568c..c4b3fb9dc 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -495,12 +495,19 @@ from scratch. This can be done by creating a new class inheriting from ### Example: Pipeline component for relation extraction {#component-rel} This section outlines an example use-case of implementing a novel relation -extraction component from scratch. We assume we want to implement a binary -relation extraction method that determines whether two entities in a document -are related or not, and if so, with what type of relation. We'll allow multiple +extraction component from scratch. We assume we want to implement a binary +relation extraction method that determines whether two entities in a document +are related or not, and if so, with what type of relation. We'll allow multiple types of relations between two such entities - i.e. it is a multi-label setting. -We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes +There are two major steps required: first, we need to +[implement a machine learning model](#component-rel-model) specific to this +task, and then we'll use this model to +[implement a custom pipeline component](#component-rel-pipe). + +#### Step 1: Implementing the Model {#component-rel-model} + +We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a list of documents as input, and outputs a two-dimensional matrix of scores: ```python @@ -514,15 +521,15 @@ The first layer in this model will typically be an [embedding layer](/usage/embeddings-transformers) such as a [`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it -transforms each document into a list of tokens, with each token being +transforms each document into a list of tokens, with each token being represented by its embedding in the vector space. -Next, we need a method that will -generate pairs of entities that we want to classify as being related or not. 
-These candidate pairs are typically formed within one document, which means -we'll have a function that takes a `Doc` as input and outputs a `List` of `Span` -tuples. For instance, a very straightforward implementation -would be to just take any two entities from the same document: +Next, we need a method that will generate pairs of entities that we want to +classify as being related or not. These candidate pairs are typically formed +within one document, which means we'll have a function that takes a `Doc` as +input and outputs a `List` of `Span` tuples. For instance, a very +straightforward implementation would be to just take any two entities from the +same document: ```python def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: @@ -536,10 +543,10 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: > ``` > [model] > @architectures = "rel_model.v1" -> +> > [model.tok2vec] > ... -> +> > [model.get_candidates] > @misc = "rel_cand_generator.v2" > max_length = 6 @@ -566,33 +573,76 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span return get_candidates ``` -Finally, we'll require a method that transforms the candidate pairs of entities into -a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be -processed by a final `output_layer` of the network. Taking all this together, we can define -our relation model like this in the config: +Finally, we'll require a method that transforms the candidate pairs of entities +into a 2D tensor using the specified Tok2Vec function, and this `Floats2d` +object will then be processed by a final `output_layer` of the network. Taking +all this together, we can define our relation model like this in the config: -> ``` -> [model] -> @architectures = "rel_model.v1" -> nO = null -> -> [model.tok2vec] -> ... -> -> [model.get_candidates] -> @misc = "rel_cand_generator.v2" -> max_length = 6 -> -> [components.relation_extractor.model.create_candidate_tensor] -> @misc = "rel_cand_tensor.v1" -> -> [components.relation_extractor.model.output_layer] -> @architectures = "rel_output_layer.v1" -> nI = null -> nO = null -> ``` +``` +[model] +@architectures = "rel_model.v1" +... - +[model.tok2vec] +... + +[model.get_candidates] +@misc = "rel_cand_generator.v2" +max_length = 6 + +[model.create_candidate_tensor] +@misc = "rel_cand_tensor.v1" + +[model.output_layer] +@architectures = "rel_output_layer.v1" +... +``` + + + +When creating this model, we'll store the custom functions as +[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as +references, so we can access them easily: + +```python +tok2vec_layer = model.get_ref("tok2vec") +output_layer = model.get_ref("output_layer") +create_candidate_tensor = model.attrs["create_candidate_tensor"] +get_candidates = model.attrs["get_candidates"] +``` + +#### Step 2: Implementing the pipeline component {#component-rel-pipe} + +To use our new relation extraction model as part of a custom component, we +create a subclass of [`Pipe`](/api/pipe) that will hold the model: + +```python +from spacy.pipeline import Pipe +from spacy.language import Language + +class RelationExtractor(Pipe): + def __init__(self, vocab, model, name="rel", labels=[]): + ... + + def predict(self, docs): + ... + + def set_annotations(self, docs, scores): + ... 
+ +@Language.factory("relation_extractor") +def make_relation_extractor(nlp, name, model, labels): + return RelationExtractor(nlp.vocab, model, name, labels=labels) +``` + +The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. +In our case, we can simply delegate to the internal model's +[predict](https://thinc.ai/docs/api-model#predict) function: +```python +def predict(self, docs: Iterable[Doc]) -> Floats2d: + scores = self.model.predict(docs) + return self.model.ops.asarray(scores) +``` From b0463fbf75a83127352d52d6ac295bb73d16a6d0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 4 Oct 2020 14:56:48 +0200 Subject: [PATCH 05/55] set_annotations explanation --- website/docs/usage/layers-architectures.md | 48 ++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index c4b3fb9dc..7e563cb5c 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -613,7 +613,7 @@ get_candidates = model.attrs["get_candidates"] #### Step 2: Implementing the pipeline component {#component-rel-pipe} -To use our new relation extraction model as part of a custom component, we +To use our new relation extraction model as part of a custom component, we create a subclass of [`Pipe`](/api/pipe) that will hold the model: ```python @@ -635,15 +635,57 @@ def make_relation_extractor(nlp, name, model, labels): return RelationExtractor(nlp.vocab, model, name, labels=labels) ``` -The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. -In our case, we can simply delegate to the internal model's +The [`predict`](/api/pipe#predict) function needs to be implemented for each +subclass. In our case, we can simply delegate to the internal model's [predict](https://thinc.ai/docs/api-model#predict) function: + ```python def predict(self, docs: Iterable[Doc]) -> Floats2d: scores = self.model.predict(docs) return self.model.ops.asarray(scores) ``` +The other method that needs to be implemented, is +[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores, +and modifies the given `Doc` object in place to hold the predictions. For our +relation extraction component, we'll store the data as a dictionary in a custom +extension attribute `doc._.rel`. As keys, we represent the candidate pair by the +start offsets of each entity, as this defines an entity uniquely within one +document. 
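Note that writing to `doc._.rel` assumes this custom extension attribute has been registered up front, which the patch itself doesn't show. A minimal way to do so, for instance in the component's `__init__`, would be:

```python
from spacy.tokens import Doc

# Register the custom extension attribute once, before any Doc is annotated
if not Doc.has_extension("rel"):
    Doc.set_extension("rel", default={})
```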
+ +To interpret the scores predicted by the REL model correctly, we need to +refer to the model's `get_candidates` function that originally defined which +pairs of entities would be run through the model, so that the scores can be +related to those exact entities: + +> #### Example output +> +> ```python +> doc = nlp("Amsterdam is the capital of the Netherlands.") +> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}") +> for value, rel_dict in doc._.rel.items(): +> print(f"{value}: {rel_dict}") +> ``` + +> ``` +> spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')] +> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002} +> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} +> ``` + +```python +def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d): + c = 0 + get_candidates = self.model.attrs["get_candidates"] + for doc in docs: + for (e1, e2) in get_candidates(doc): + offset = (e1.start, e2.start) + if offset not in doc._.rel: + doc._.rel[offset] = {} + for j, label in enumerate(self.labels): + doc._.rel[offset][label] = rel_scores[c, j] + c += 1 +``` From f1d1f78636059abcbd680cd283d643c11310df30 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 22:44:21 +0200 Subject: [PATCH 06/55] Make warning debug log [ci skip] --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3404274ce..0499dc4a7 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1528,7 +1528,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: while not heads_within_sents: heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count) if loop_count > 10: - warnings.warn(Warnings.W026) + util.logger.debug(Warnings.W026) break loop_count += 1 # Set sentence starts From 4b15ff7504a6af94b8e98f8406e430b437a889c5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 22:47:04 +0200 Subject: [PATCH 07/55] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 037ca6bcb..dce627a38 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a32" +__version__ = "3.0.0a33" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 549758f67dea544ec64271fe88513dbc4117fed8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 23:16:09 +0200 Subject: [PATCH 08/55] Adjust test for now --- spacy/tests/regression/test_issue5918.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py index db957709c..e4ee0135d 100644 --- a/spacy/tests/regression/test_issue5918.py +++ b/spacy/tests/regression/test_issue5918.py @@ -23,7 +23,8 @@ def test_issue5918(): assert len(doc.ents) == 3 # make it so that the third span's head is within the entity (ent_iob=I) # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents. 
- with pytest.warns(UserWarning): - doc[29].head = doc[33] + # TODO: test for logging here + # with pytest.warns(UserWarning): + # doc[29].head = doc[33] doc = merge_entities(doc) assert len(doc.ents) == 3 From 52b660e9dcc412fc1d4bbdf269c1bd31d9e7d3a4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 00:39:36 +0200 Subject: [PATCH 09/55] initialize and update explanation --- website/docs/api/pipe.md | 6 + website/docs/usage/layers-architectures.md | 149 ++++++++++++++++----- 2 files changed, 119 insertions(+), 36 deletions(-) diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 4f5ac6f61..de35f9eb4 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental. Find the loss and gradient of loss for the batch of documents and their predicted scores. + + +This method needs to be overwritten with your own custom `get_loss` method. + + + > #### Example > > ```python diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 7e563cb5c..130a7144e 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -618,31 +618,97 @@ create a subclass of [`Pipe`](/api/pipe) that will hold the model: ```python from spacy.pipeline import Pipe -from spacy.language import Language class RelationExtractor(Pipe): def __init__(self, vocab, model, name="rel", labels=[]): + self.model = model ... def predict(self, docs): ... - def set_annotations(self, docs, scores): + def set_annotations(self, docs, predictions): ... - -@Language.factory("relation_extractor") -def make_relation_extractor(nlp, name, model, labels): - return RelationExtractor(nlp.vocab, model, name, labels=labels) ``` +Before the model can be used however, it needs to be +[initialized](/api/pipe#initialize). This function recieves either the full +training data set, or a representative sample. The training data can be used +to deduce all relevant labels. Alternatively, a list of labels can be provided, +or a script can call `rel_component.add_label()` to add each label separately. + +The number of labels will define the output dimensionality of the network, +and will be used to do +[shape inference](https://thinc.ai/docs/usage-models#validation) throughout +the layers of the neural network. This is triggerd by calling `model.initialize`. + +```python +from itertools import islice + +def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Language = None, + labels: Optional[List[str]] = None, +): + if labels is not None: + for label in labels: + self.add_label(label) + else: + for example in get_examples(): + relations = example.reference._.rel + for indices, label_dict in relations.items(): + for label in label_dict.keys(): + self.add_label(label) + subbatch = list(islice(get_examples(), 10)) + doc_sample = [eg.reference for eg in subbatch] + label_sample = self._examples_to_truth(subbatch) + self.model.initialize(X=doc_sample, Y=label_sample) +``` + +The `initialize` method will be triggered whenever this component is part of an +`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline +component and its internal model can be trained and used to make predictions. 
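The `initialize` code above also calls a `self._examples_to_truth` helper that isn't shown in this patch, and the `update` method discussed next will need a `get_loss` implementation alongside it. The sketch below is one possible way to fill in both, assuming (as above) that the gold relations are stored in `example.reference._.rel`, keyed by entity start offsets, and that the scored docs carry the same entities as the reference docs; the squared-error gradient is only an illustration, not necessarily the loss used in the real component:

```python
def _examples_to_truth(self, examples: Iterable[Example]) -> Floats2d:
    # One row per candidate entity pair, one column per label, in the same
    # order that get_candidates() and self.labels produce them
    get_candidates = self.model.attrs["get_candidates"]
    truths = []
    for eg in examples:
        for (e1, e2) in get_candidates(eg.reference):
            gold = eg.reference._.rel.get((e1.start, e2.start), {})
            truths.append([gold.get(label, 0.0) for label in self.labels])
    return self.model.ops.asarray(truths)

def get_loss(self, examples: Iterable[Example], scores: Floats2d) -> Tuple[float, Floats2d]:
    # Squared-error loss: the gradient is the difference between the
    # predicted scores and the gold truth matrix
    truths = self._examples_to_truth(examples)
    gradient = scores - truths
    loss = float((gradient ** 2).sum())
    return loss, gradient
```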
+ +During training the function [`update`](/api/pipe#update) is invoked which delegates to +[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and +needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the +loss for a batch of examples, as well as the gradient of loss that will be used to update +the weights of the model layers. + +```python +def update( + self, + examples: Iterable[Example], + *, + drop: float = 0.0, + set_annotations: bool = False, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, +) -> Dict[str, float]: + ... + docs = [ex.predicted for ex in examples] + predictions, backprop = self.model.begin_update(docs) + loss, gradient = self.get_loss(examples, predictions) + backprop(gradient) + losses[self.name] += loss + ... + return losses +``` + +Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used +for the implementation of the `get_loss` function. + +When the internal model is trained, the component can be used to make novel predictions. The [`predict`](/api/pipe#predict) function needs to be implemented for each -subclass. In our case, we can simply delegate to the internal model's +subclass of `Pipe`. In our case, we can simply delegate to the internal model's [predict](https://thinc.ai/docs/api-model#predict) function: ```python def predict(self, docs: Iterable[Doc]) -> Floats2d: - scores = self.model.predict(docs) - return self.model.ops.asarray(scores) + predictions = self.model.predict(docs) + return self.model.ops.asarray(predictions) ``` The other method that needs to be implemented, is @@ -650,7 +716,7 @@ The other method that needs to be implemented, is and modifies the given `Doc` object in place to hold the predictions. For our relation extraction component, we'll store the data as a dictionary in a custom extension attribute `doc._.rel`. As keys, we represent the candidate pair by the -start offsets of each entity, as this defines an entity uniquely within one +start offsets of each entity, as this defines an entity pair uniquely within one document. To interpret the scores predicted by the REL model correctly, we need to @@ -674,7 +740,7 @@ related to those exact entities: > ``` ```python -def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d): +def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c = 0 get_candidates = self.model.attrs["get_candidates"] for doc in docs: @@ -683,34 +749,45 @@ def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d): if offset not in doc._.rel: doc._.rel[offset] = {} for j, label in enumerate(self.labels): - doc._.rel[offset][label] = rel_scores[c, j] + doc._.rel[offset][label] = predictions[c, j] c += 1 ``` - - - - - - - - + +Once our `Pipe` subclass is fully implemented, we can +[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) +the component with the +`Language.factory` decorator. This will enable the creation of the component with +`nlp.add_pipe`, or via the config. + +> ``` +> +> [components.relation_extractor] +> factory = "relation_extractor" +> labels = [] +> +> [components.relation_extractor.model] +> @architectures = "rel_model.v1" +> ... 
+> ``` + +```python +from spacy.language import Language + +@Language.factory("relation_extractor") +def make_relation_extractor(nlp, name, model, labels): + return RelationExtractor(nlp.vocab, model, name, labels=labels) +``` + + + + From 9a6c9b133b796d4b766189740ef1fc88f6dbe3ee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 01:05:37 +0200 Subject: [PATCH 10/55] various small fixes --- website/docs/usage/layers-architectures.md | 142 +++++++++++---------- 1 file changed, 74 insertions(+), 68 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 130a7144e..414562d6d 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -288,7 +288,7 @@ those parts of the network. To use our custom model including the PyTorch subnetwork, all we need to do is register the architecture using the -[`architectures` registry](/api/top-level#registry). This will assign the +[`architectures` registry](/api/top-level#registry). This assigns the architecture a name so spaCy knows how to find it, and allows passing in arguments like hyperparameters via the [config](/usage/training#config). The full example then becomes: @@ -488,27 +488,27 @@ with Model.define_operators({">>": chain}): In addition to [swapping out](#swap-architectures) default models in built-in components, you can also implement an entirely new, -[trainable pipeline component](usage/processing-pipelines#trainable-components) +[trainable pipeline component](/usage/processing-pipelines#trainable-components) from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), and linking it up to your custom model implementation. ### Example: Pipeline component for relation extraction {#component-rel} This section outlines an example use-case of implementing a novel relation -extraction component from scratch. We assume we want to implement a binary -relation extraction method that determines whether two entities in a document -are related or not, and if so, with what type of relation. We'll allow multiple -types of relations between two such entities - i.e. it is a multi-label setting. +extraction component from scratch. We'll implement a binary relation extraction +method that determines whether or not two entities in a document are related, +and if so, what type of relation. We'll allow multiple types of relations +between two such entities (multi-label setting). There are two major steps required: first, we need to [implement a machine learning model](#component-rel-model) specific to this -task, and then we'll use this model to +task, and subsequently we use this model to [implement a custom pipeline component](#component-rel-pipe). #### Step 1: Implementing the Model {#component-rel-model} -We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes -a list of documents as input, and outputs a two-dimensional matrix of scores: +We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a +list of documents as input, and outputs a two-dimensional matrix of predictions: ```python @registry.architectures.register("rel_model.v1") @@ -519,17 +519,16 @@ def create_relation_model(...) -> Model[List[Doc], Floats2d]: The first layer in this model will typically be an [embedding layer](/usage/embeddings-transformers) such as a -[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). 
This -layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it +[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This +layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it transforms each document into a list of tokens, with each token being represented by its embedding in the vector space. -Next, we need a method that will generate pairs of entities that we want to -classify as being related or not. These candidate pairs are typically formed -within one document, which means we'll have a function that takes a `Doc` as -input and outputs a `List` of `Span` tuples. For instance, a very -straightforward implementation would be to just take any two entities from the -same document: +Next, we need a method that generates pairs of entities that we want to classify +as being related or not. As these candidate pairs are typically formed within +one document, this function takes a `Doc` as input and outputs a `List` of +`Span` tuples. For instance, a very straightforward implementation would be to +just take any two entities from the same document: ```python def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: @@ -549,12 +548,12 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: > > [model.get_candidates] > @misc = "rel_cand_generator.v2" -> max_length = 6 +> max_length = 20 > ``` But we could also refine this further by excluding relations of an entity with itself, and posing a maximum distance (in number of tokens) between two -entities. We'll register this function in the +entities. We register this function in the [`@misc` registry](/api/top-level#registry) so we can refer to it from the config, and easily swap it out for any other candidate generation function. @@ -573,10 +572,10 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span return get_candidates ``` -Finally, we'll require a method that transforms the candidate pairs of entities -into a 2D tensor using the specified Tok2Vec function, and this `Floats2d` -object will then be processed by a final `output_layer` of the network. Taking -all this together, we can define our relation model like this in the config: +Finally, we require a method that transforms the candidate entity pairs into a +2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d` +object will then be processed by a final `output_layer` of the network. Putting +all this together, we can define our relation model in a config file as such: ``` [model] @@ -588,7 +587,7 @@ all this together, we can define our relation model like this in the config: [model.get_candidates] @misc = "rel_cand_generator.v2" -max_length = 6 +max_length = 20 [model.create_candidate_tensor] @misc = "rel_cand_tensor.v1" @@ -600,7 +599,7 @@ max_length = 6 -When creating this model, we'll store the custom functions as +When creating this model, we store the custom functions as [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as references, so we can access them easily: @@ -614,7 +613,7 @@ get_candidates = model.attrs["get_candidates"] #### Step 2: Implementing the pipeline component {#component-rel-pipe} To use our new relation extraction model as part of a custom component, we -create a subclass of [`Pipe`](/api/pipe) that will hold the model: +create a subclass of [`Pipe`](/api/pipe) that holds the model: ```python from spacy.pipeline import Pipe @@ -624,6 +623,9 @@ class RelationExtractor(Pipe): self.model = model ... 
+ def update(self, examples, ...): + ... + def predict(self, docs): ... @@ -631,18 +633,19 @@ class RelationExtractor(Pipe): ... ``` -Before the model can be used however, it needs to be -[initialized](/api/pipe#initialize). This function recieves either the full -training data set, or a representative sample. The training data can be used -to deduce all relevant labels. Alternatively, a list of labels can be provided, -or a script can call `rel_component.add_label()` to add each label separately. +Before the model can be used, it needs to be +[initialized](/api/pipe#initialize). This function receives either the full +training data set, or a representative sample. This data set can be used to +deduce all relevant labels. Alternatively, a list of labels can be provided, or +a script can call `rel_component.add_label()` directly. -The number of labels will define the output dimensionality of the network, -and will be used to do -[shape inference](https://thinc.ai/docs/usage-models#validation) throughout -the layers of the neural network. This is triggerd by calling `model.initialize`. +The number of labels defines the output dimensionality of the network, and will +be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) +throughout the layers of the neural network. This is triggered by calling +`model.initialize`. ```python +### {highlight="12,18,22"} from itertools import islice def initialize( @@ -666,18 +669,21 @@ def initialize( label_sample = self._examples_to_truth(subbatch) self.model.initialize(X=doc_sample, Y=label_sample) ``` - -The `initialize` method will be triggered whenever this component is part of an -`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline -component and its internal model can be trained and used to make predictions. -During training the function [`update`](/api/pipe#update) is invoked which delegates to -[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and -needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the -loss for a batch of examples, as well as the gradient of loss that will be used to update -the weights of the model layers. +The `initialize` method is triggered whenever this component is part of an `nlp` +pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After +doing so, the pipeline component and its internal model can be trained and used +to make predictions. + +During training, the function [`update`](/api/pipe#update) is invoked which +delegates to +[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a +[`get_loss`](/api/pipe#get_loss) function that calculate the loss for a batch of +examples, as well as the gradient of loss that will be used to update the +weights of the model layers. ```python +### {highlight="12-14"} def update( self, examples: Iterable[Example], @@ -697,13 +703,13 @@ def update( return losses ``` -Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used -for the implementation of the `get_loss` function. +Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can +be used for the implementation of the `get_loss` function. -When the internal model is trained, the component can be used to make novel predictions. -The [`predict`](/api/pipe#predict) function needs to be implemented for each -subclass of `Pipe`. 
In our case, we can simply delegate to the internal model's -[predict](https://thinc.ai/docs/api-model#predict) function: +When the internal model is trained, the component can be used to make novel +predictions. The [`predict`](/api/pipe#predict) function needs to be implemented +for each subclass of `Pipe`. In our case, we can simply delegate to the internal +model's [predict](https://thinc.ai/docs/api-model#predict) function: ```python def predict(self, docs: Iterable[Doc]) -> Floats2d: @@ -711,24 +717,24 @@ def predict(self, docs: Iterable[Doc]) -> Floats2d: return self.model.ops.asarray(predictions) ``` -The other method that needs to be implemented, is -[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores, -and modifies the given `Doc` object in place to hold the predictions. For our -relation extraction component, we'll store the data as a dictionary in a custom +The final method that needs to be implemented, is +[`set_annotations`](/api/pipe#set_annotations). This function takes the +predictions, and modifies the given `Doc` object in place to store them. For our +relation extraction component, we store the data as a dictionary in a custom extension attribute `doc._.rel`. As keys, we represent the candidate pair by the start offsets of each entity, as this defines an entity pair uniquely within one document. -To interpret the scores predicted by the REL model correctly, we need to -refer to the model's `get_candidates` function that originally defined which -pairs of entities would be run through the model, so that the scores can be -related to those exact entities: +To interpret the scores predicted by the REL model correctly, we need to refer +to the model's `get_candidates` function that defined which pairs of entities +were relevant candidates, so that the predictions can be linked to those exact +entities: > #### Example output > > ```python > doc = nlp("Amsterdam is the capital of the Netherlands.") -> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}") +> print(f"spans: [(e.start, e.text, e.label_) for e in doc.ents]") > for value, rel_dict in doc._.rel.items(): > print(f"{value}: {rel_dict}") > ``` @@ -740,6 +746,7 @@ related to those exact entities: > ``` ```python +### {highlight="5-6,10"} def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c = 0 get_candidates = self.model.attrs["get_candidates"] @@ -753,8 +760,8 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c += 1 ``` -Under the hood, when the pipe is applied to a document, it will delegate to these -two methods: +Under the hood, when the pipe is applied to a document, it delegates to the +`predict` and `set_annotations` functions: ```python def __call__(self, Doc doc): @@ -763,18 +770,17 @@ def __call__(self, Doc doc): return doc ``` -Once our `Pipe` subclass is fully implemented, we can -[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) -the component with the -`Language.factory` decorator. This will enable the creation of the component with -`nlp.add_pipe`, or via the config. +Once our `Pipe` subclass is fully implemented, we can +[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) +the component with the `Language.factory` decorator. This enables the creation +of the component with `nlp.add_pipe`, or via the config. 
> ``` -> +> > [components.relation_extractor] > factory = "relation_extractor" > labels = [] -> +> > [components.relation_extractor.model] > @architectures = "rel_model.v1" > ... From b0b93854cb2c522090c87544e33a19e6b361ed19 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 09:26:43 +0200 Subject: [PATCH 11/55] Update ru/uk lemmatizers for new nlp.initialize --- spacy/lang/ru/__init__.py | 10 ++++++++-- spacy/lang/ru/lemmatizer.py | 5 ++--- spacy/lang/uk/__init__.py | 4 ++-- spacy/lang/uk/lemmatizer.py | 5 ++--- spacy/tests/conftest.py | 1 - 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 1d59ca043..2f3965fcc 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -25,8 +25,14 @@ class Russian(Language): default_config={"model": None, "mode": "pymorphy2"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return RussianLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool = False, +): + return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Russian"] diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 8d7996c63..3bcac8730 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple from thinc.api import Model -from ...lookups import Lookups from ...pipeline import Lemmatizer from ...symbols import POS from ...tokens import Token @@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer): name: str = "lemmatizer", *, mode: str = "pymorphy2", - lookups: Optional[Lookups] = None, + overwrite: bool = False, ) -> None: - super().__init__(vocab, model, name, mode=mode, lookups=lookups) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) try: from pymorphy2 import MorphAnalyzer diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 73c065379..0abe9170e 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -26,8 +26,8 @@ class Ukrainian(Language): default_config={"model": None, "mode": "pymorphy2"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): - return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False,): + return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) __all__ = ["Ukrainian"] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 0d6febce6..009ec5044 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -3,7 +3,6 @@ from typing import Optional from thinc.api import Model from ..ru.lemmatizer import RussianLemmatizer -from ...lookups import Lookups from ...vocab import Vocab @@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer): name: str = "lemmatizer", *, mode: str = "pymorphy2", - lookups: Optional[Lookups] = None, + overwrite: bool = False, ) -> None: - super().__init__(vocab, model, name, mode=mode, lookups=lookups) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) try: from pymorphy2 import MorphAnalyzer except ImportError: diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4a3d126d7..67860b7e4 100644 
--- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -248,7 +248,6 @@ def tt_tokenizer(): @pytest.fixture(scope="session") def uk_tokenizer(): pytest.importorskip("pymorphy2") - pytest.importorskip("pymorphy2.lang") return get_lang_class("uk")().tokenizer From 03cfb2d2f4afbcc96f99757010ce3263cbc28ebd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 09:33:05 +0200 Subject: [PATCH 12/55] Always serialize lookups and vectors to disk --- spacy/lookups.py | 13 ++++++------- spacy/vocab.pyx | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/spacy/lookups.py b/spacy/lookups.py index fb5e3d748..133cb0672 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -289,13 +289,12 @@ class Lookups: DOCS: https://nightly.spacy.io/api/lookups#to_disk """ - if len(self._tables): - path = ensure_path(path) - if not path.exists(): - path.mkdir() - filepath = path / filename - with filepath.open("wb") as file_: - file_.write(self.to_bytes()) + path = ensure_path(path) + if not path.exists(): + path.mkdir() + filepath = path / filename + with filepath.open("wb") as file_: + file_.write(self.to_bytes()) def from_disk( self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a22f12c65..93918250b 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -445,9 +445,9 @@ cdef class Vocab: setters = ["strings", "vectors"] if "strings" not in exclude: self.strings.to_disk(path / "strings.json") - if "vectors" not in "exclude" and self.vectors is not None: + if "vectors" not in "exclude": self.vectors.to_disk(path) - if "lookups" not in "exclude" and self.lookups is not None: + if "lookups" not in "exclude": self.lookups.to_disk(path) def from_disk(self, path, *, exclude=tuple()): From 1c641e41c3d46c5b555891427833200c0f0087b5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 11:50:11 +0200 Subject: [PATCH 13/55] Remove unused import [ci skip] --- spacy/tests/regression/test_issue5918.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py index e4ee0135d..d25323ef6 100644 --- a/spacy/tests/regression/test_issue5918.py +++ b/spacy/tests/regression/test_issue5918.py @@ -1,6 +1,5 @@ from spacy.lang.en import English from spacy.pipeline import merge_entities -import pytest def test_issue5918(): From e3acad626443c9cf0b81f600aae2b3b9529b63cd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 13:06:20 +0200 Subject: [PATCH 14/55] Update docs [ci skip] --- website/docs/usage/layers-architectures.md | 261 +++++++++++++-------- 1 file changed, 162 insertions(+), 99 deletions(-) diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 414562d6d..24c7bf1cf 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -86,7 +86,8 @@ see are: ​ | ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. | | ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. | -The model type signatures help you figure out which model architectures and +See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The +model type signatures help you figure out which model architectures and components can **fit together**. 
For instance, the [`TextCategorizer`](/api/textcategorizer) class expects a model typed ~~Model[List[Doc], Floats2d]~~, because the model will predict one row of @@ -488,32 +489,57 @@ with Model.define_operators({">>": chain}): In addition to [swapping out](#swap-architectures) default models in built-in components, you can also implement an entirely new, -[trainable pipeline component](/usage/processing-pipelines#trainable-components) +[trainable](/usage/processing-pipelines#trainable-components) pipeline component from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), and linking it up to your custom model implementation. -### Example: Pipeline component for relation extraction {#component-rel} + -This section outlines an example use-case of implementing a novel relation -extraction component from scratch. We'll implement a binary relation extraction -method that determines whether or not two entities in a document are related, -and if so, what type of relation. We'll allow multiple types of relations -between two such entities (multi-label setting). +For details on how to implement pipeline components, check out the usage guide +on [custom components](/usage/processing-pipelines#custom-component) and the +overview of the `Pipe` methods used by +[trainable components](/usage/processing-pipelines#trainable-components). -There are two major steps required: first, we need to -[implement a machine learning model](#component-rel-model) specific to this -task, and subsequently we use this model to -[implement a custom pipeline component](#component-rel-pipe). + + +### Example: Entity elation extraction component {#component-rel} + +This section outlines an example use-case of implementing a **novel relation +extraction component** from scratch. We'll implement a binary relation +extraction method that determines whether or not **two entities** in a document +are related, and if so, what type of relation. We'll allow multiple types of +relations between two such entities (multi-label setting). There are two major +steps required: + +1. Implement a [machine learning model](#component-rel-model) specific to this + task. It will have to extract candidates from a [`Doc`](/api/doc) and predict + a relation for the available candidate pairs. +2. Implement a custom [pipeline component](#component-rel-pipe) powered by the + machine learning model that sets annotations on the [`Doc`](/api/doc) passing + through the pipeline. + + #### Step 1: Implementing the Model {#component-rel-model} We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a -list of documents as input, and outputs a two-dimensional matrix of predictions: +**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional +matrix** (~~Floats2d~~) of predictions: + +> #### Model type annotations +> +> The `Model` class is a generic type that can specify its input and output +> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static +> type checks and validation. See the section on [type signatures](#type-sigs) +> for details. ```python +### Register the model architecture @registry.architectures.register("rel_model.v1") def create_relation_model(...) -> Model[List[Doc], Floats2d]: - model = _create_my_model() + model = ... 
# 👈 model will go here return model ``` @@ -521,17 +547,18 @@ The first layer in this model will typically be an [embedding layer](/usage/embeddings-transformers) such as a [`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it -transforms each document into a list of tokens, with each token being +transforms each **document into a list of tokens**, with each token being represented by its embedding in the vector space. -Next, we need a method that generates pairs of entities that we want to classify -as being related or not. As these candidate pairs are typically formed within -one document, this function takes a `Doc` as input and outputs a `List` of -`Span` tuples. For instance, a very straightforward implementation would be to -just take any two entities from the same document: +Next, we need a method that **generates pairs of entities** that we want to +classify as being related or not. As these candidate pairs are typically formed +within one document, this function takes a [`Doc`](/api/doc) as input and +outputs a `List` of `Span` tuples. For instance, a very straightforward +implementation would be to just take any two entities from the same document: ```python -def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: +### Simple candiate generation +def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]: candidates = [] for ent1 in doc.ents: for ent2 in doc.ents: @@ -539,27 +566,29 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: return candidates ``` -> ``` -> [model] -> @architectures = "rel_model.v1" -> -> [model.tok2vec] -> ... -> -> [model.get_candidates] -> @misc = "rel_cand_generator.v2" -> max_length = 20 -> ``` - -But we could also refine this further by excluding relations of an entity with -itself, and posing a maximum distance (in number of tokens) between two +But we could also refine this further by **excluding relations** of an entity +with itself, and posing a **maximum distance** (in number of tokens) between two entities. We register this function in the [`@misc` registry](/api/top-level#registry) so we can refer to it from the config, and easily swap it out for any other candidate generation function. +> #### config.cfg (excerpt) +> +> ```ini +> [model] +> @architectures = "rel_model.v1" +> +> [model.tok2vec] +> # ... +> +> [model.get_candidates] +> @misc = "rel_cand_generator.v1" +> max_length = 20 +> ``` + ```python -### {highlight="1,2,7,8"} -@registry.misc.register("rel_cand_generator.v2") +### Extended candidate generation {highlight="1,2,7,8"} +@registry.misc.register("rel_cand_generator.v1") def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: candidates = [] @@ -573,17 +602,19 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span ``` Finally, we require a method that transforms the candidate entity pairs into a -2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d` -object will then be processed by a final `output_layer` of the network. Putting -all this together, we can define our relation model in a config file as such: +2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or +[`Transformer`](/api/transformer). The resulting ~~Floats2~~ object will then be +processed by a final `output_layer` of the network. 
Putting all this together, +we can define our relation model in a config file as such: -``` +```ini +### config.cfg [model] @architectures = "rel_model.v1" -... +# ... [model.tok2vec] -... +# ... [model.get_candidates] @misc = "rel_cand_generator.v2" @@ -594,10 +625,11 @@ max_length = 20 [model.output_layer] @architectures = "rel_output_layer.v1" -... +# ... ``` - + + When creating this model, we store the custom functions as [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as @@ -612,40 +644,55 @@ get_candidates = model.attrs["get_candidates"] #### Step 2: Implementing the pipeline component {#component-rel-pipe} -To use our new relation extraction model as part of a custom component, we +To use our new relation extraction model as part of a custom +[trainable component](/usage/processing-pipelines#trainable-components), we create a subclass of [`Pipe`](/api/pipe) that holds the model: ```python +### Pipeline component skeleton from spacy.pipeline import Pipe class RelationExtractor(Pipe): - def __init__(self, vocab, model, name="rel", labels=[]): + def __init__(self, vocab, model, name="rel"): + """Create a component instance.""" self.model = model - ... + self.vocab = vocab + self.name = name - def update(self, examples, ...): + def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None): + """Learn from a batch of Example objects.""" ... def predict(self, docs): + """Apply the model to a batch of Doc objects.""" ... def set_annotations(self, docs, predictions): + """Modify a batch of Doc objects using the predictions.""" ... + + def initialize(self, get_examples, nlp=None, labels=None): + """Initialize the model before training.""" + ... + + def add_label(self, label): + """Add a label to the component.""" + ... ``` Before the model can be used, it needs to be -[initialized](/api/pipe#initialize). This function receives either the full -training data set, or a representative sample. This data set can be used to -deduce all relevant labels. Alternatively, a list of labels can be provided, or -a script can call `rel_component.add_label()` directly. - -The number of labels defines the output dimensionality of the network, and will -be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) -throughout the layers of the neural network. This is triggered by calling -`model.initialize`. +[initialized](/usage/training#initialization). This function receives a callback +to access the full **training data set**, or a representative sample. This data +set can be used to deduce all **relevant labels**. Alternatively, a list of +labels can be provided to `initialize`, or you can call the +`RelationExtractoradd_label` directly. The number of labels defines the output +dimensionality of the network, and will be used to do +[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the +layers of the neural network. This is triggered by calling +[`Model.initialize`](https://thinc.ai/api/model#initialize). ```python -### {highlight="12,18,22"} +### The initialize method {highlight="12,18,22"} from itertools import islice def initialize( @@ -671,19 +718,22 @@ def initialize( ``` The `initialize` method is triggered whenever this component is part of an `nlp` -pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After -doing so, the pipeline component and its internal model can be trained and used -to make predictions. +pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked. 
+Typically, this happens when the pipeline is set up before training in +[`spacy train`](/api/cli#training). After initialization, the pipeline component +and its internal model can be trained and used to make predictions. During training, the function [`update`](/api/pipe#update) is invoked which delegates to -[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a -[`get_loss`](/api/pipe#get_loss) function that calculate the loss for a batch of -examples, as well as the gradient of loss that will be used to update the -weights of the model layers. +[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a +[`get_loss`](/api/pipe#get_loss) function that **calculate the loss** for a +batch of examples, as well as the **gradient** of loss that will be used to +update the weights of the model layers. Thinc provides several +[loss functions](https://thinc.ai/docs/api-loss) that can be used for the +implementation of the `get_loss` function. ```python -### {highlight="12-14"} +### The update method {highlight="12-14"} def update( self, examples: Iterable[Example], @@ -703,15 +753,14 @@ def update( return losses ``` -Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can -be used for the implementation of the `get_loss` function. - When the internal model is trained, the component can be used to make novel -predictions. The [`predict`](/api/pipe#predict) function needs to be implemented -for each subclass of `Pipe`. In our case, we can simply delegate to the internal -model's [predict](https://thinc.ai/docs/api-model#predict) function: +**predictions**. The [`predict`](/api/pipe#predict) function needs to be +implemented for each subclass of `Pipe`. In our case, we can simply delegate to +the internal model's [predict](https://thinc.ai/docs/api-model#predict) function +that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array: ```python +### The predict method def predict(self, docs: Iterable[Doc]) -> Floats2d: predictions = self.model.predict(docs) return self.model.ops.asarray(predictions) @@ -721,32 +770,36 @@ The final method that needs to be implemented, is [`set_annotations`](/api/pipe#set_annotations). This function takes the predictions, and modifies the given `Doc` object in place to store them. For our relation extraction component, we store the data as a dictionary in a custom -extension attribute `doc._.rel`. As keys, we represent the candidate pair by the -start offsets of each entity, as this defines an entity pair uniquely within one -document. +[extension attribute](/usage/processing-pipelines#custom-components-attributes) +`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of +each entity**, as this defines an entity pair uniquely within one document. 
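
Storing the scores this way also makes them straightforward to consume
downstream. As a minimal sketch, assuming a hypothetical `threshold` to filter
out low-confidence labels, later code could read the predictions back from
`doc._.rel` like this:

```python
### Reading doc._.rel (illustrative sketch)
def get_relations(doc, threshold: float = 0.5):
    # doc._.rel maps (ent1.start, ent2.start) -> {label: score}, as set by
    # the set_annotations method shown below
    ents_by_offset = {ent.start: ent for ent in doc.ents}
    relations = []
    for (start1, start2), scores in doc._.rel.items():
        for label, score in scores.items():
            if score >= threshold:
                relations.append(
                    (ents_by_offset[start1], ents_by_offset[start2], label, score)
                )
    return relations
```

With the example output shown further down, a threshold of `0.5` would keep the
`CAPITAL_OF` and `LOCATED_IN` predictions for the pair `(0, 6)` and drop the
rest.
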
-To interpret the scores predicted by the REL model correctly, we need to refer -to the model's `get_candidates` function that defined which pairs of entities -were relevant candidates, so that the predictions can be linked to those exact -entities: +To interpret the scores predicted by the relation extraction model correctly, we +need to refer to the model's `get_candidates` function that defined which pairs +of entities were relevant candidates, so that the predictions can be linked to +those exact entities: > #### Example output > > ```python > doc = nlp("Amsterdam is the capital of the Netherlands.") -> print(f"spans: [(e.start, e.text, e.label_) for e in doc.ents]") +> print("spans", [(e.start, e.text, e.label_) for e in doc.ents]) > for value, rel_dict in doc._.rel.items(): > print(f"{value}: {rel_dict}") -> ``` - -> ``` -> spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')] -> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002} -> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} +> +> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')] +> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002} +> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017} > ``` ```python -### {highlight="5-6,10"} +### Registering the extension attribute +from spacy.tokens import Doc +Doc.set_extension("rel", default={}) +``` + +```python +### The set_annotations method {highlight="5-6,10"} def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): c = 0 get_candidates = self.model.attrs["get_candidates"] @@ -761,9 +814,10 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d): ``` Under the hood, when the pipe is applied to a document, it delegates to the -`predict` and `set_annotations` functions: +`predict` and `set_annotations` methods: ```python +### The __call__ method def __call__(self, Doc doc): predictions = self.predict([doc]) self.set_annotations([doc], predictions) @@ -771,29 +825,38 @@ def __call__(self, Doc doc): ``` Once our `Pipe` subclass is fully implemented, we can -[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) -the component with the `Language.factory` decorator. This enables the creation -of the component with `nlp.add_pipe`, or via the config. +[register](/usage/processing-pipelines#custom-components-factories) the +component with the [`@Language.factory`](/api/lnguage#factory) decorator. This +assigns it a name and lets you create the component with +[`nlp.add_pipe`](/api/language#add_pipe) and via the +[config](/usage/training#config). -> ``` +> #### config.cfg (excerpt) > +> ```ini > [components.relation_extractor] > factory = "relation_extractor" -> labels = [] > > [components.relation_extractor.model] > @architectures = "rel_model.v1" -> ... +> +> [components.relation_extractor.model.tok2vec] +> # ... 
+> +> [components.relation_extractor.model.get_candidates] +> @misc = "rel_cand_generator.v1" +> max_length = 20 > ``` ```python +### Registering the pipeline component from spacy.language import Language @Language.factory("relation_extractor") -def make_relation_extractor(nlp, name, model, labels): - return RelationExtractor(nlp.vocab, model, name, labels=labels) +def make_relation_extractor(nlp, name, model): + return RelationExtractor(nlp.vocab, model, name) ``` - + + --> From fd2d48556c1e77f4492693e4a69dc8f4a34cfe34 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 13:43:32 +0200 Subject: [PATCH 15/55] fix E902 and E903 numbering --- spacy/errors.py | 4 ++-- spacy/training/converters/conll_ner_to_docs.py | 2 +- spacy/training/converters/iob_to_docs.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 20edf45b5..9d9a716d2 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,10 +456,10 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master - E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " + E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " "Try checking whitespace and delimiters. See " "https://nightly.spacy.io/api/cli#convert") - E093 = ("The token-per-line NER file is not formatted correctly. Try checking " + E903 = ("The token-per-line NER file is not formatted correctly. Try checking " "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert") E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This " "dimension refers to the output width, after the linear projection " diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index 28f0f87c3..c01686aee 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -103,7 +103,7 @@ def conll_ner_to_docs( lines = [line.strip() for line in conll_sent.split("\n") if line.strip()] cols = list(zip(*[line.split() for line in lines])) if len(cols) < 2: - raise ValueError(Errors.E093) + raise ValueError(Errors.E903) length = len(cols[0]) words.extend(cols[0]) sent_starts.extend([True] + [False] * (length - 1)) diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py index 73ad8953d..a2185fef7 100644 --- a/spacy/training/converters/iob_to_docs.py +++ b/spacy/training/converters/iob_to_docs.py @@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents): sent_words, sent_iob = zip(*sent_tokens) sent_tags = ["-"] * len(sent_words) else: - raise ValueError(Errors.E092) + raise ValueError(Errors.E902) words.extend(sent_words) tags.extend(sent_tags) iob.extend(sent_iob) From 20f2a17a09dc053b5f2f06cff637fb92647137ad Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 13:45:57 +0200 Subject: [PATCH 16/55] Merge test_misc and test_util --- spacy/tests/test_misc.py | 134 ++++++++++++++++++++++++++++++++++++++ spacy/tests/test_util.py | 137 --------------------------------------- 2 files changed, 134 insertions(+), 137 deletions(-) delete mode 100644 spacy/tests/test_util.py diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index e6ef45f90..bdf54ad6a 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -7,6 +7,15 @@ from spacy import util from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import 
PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding +from spacy.util import dot_to_object, SimpleFrozenList +from thinc.api import Config, Optimizer, ConfigValidationError +from spacy.training.batchers import minibatch_by_words +from spacy.lang.en import English +from spacy.lang.nl import Dutch +from spacy.language import DEFAULT_CONFIG_PATH +from spacy.schemas import ConfigSchemaTraining + +from .util import get_random_doc @pytest.fixture @@ -157,3 +166,128 @@ def test_dot_to_dict(dot_notation, expected): result = util.dot_to_dict(dot_notation) assert result == expected assert util.dict_to_dot(result) == dot_notation + + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 400, 199], [3]), + ([400, 400, 199, 3], [4]), + ([400, 400, 199, 3, 200], [3, 2]), + ([400, 400, 199, 3, 1], [5]), + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded + ([400, 400, 199, 3, 1, 200], [3, 3]), + ([400, 400, 199, 3, 1, 999], [3, 3]), + ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), + ([1, 2, 999], [3]), + ([1, 2, 999, 1], [4]), + ([1, 200, 999, 1], [2, 2]), + ([1, 999, 200, 1], [2, 2]), + ], +) +def test_util_minibatch(doc_sizes, expected_batches): + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] + tol = 0.2 + batch_size = 1000 + batches = list( + minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True) + ) + assert [len(batch) for batch in batches] == expected_batches + + max_size = batch_size + batch_size * tol + for batch in batches: + assert sum([len(doc) for doc in batch]) < max_size + + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 4000, 199], [1, 2]), + ([400, 400, 199, 3000, 200], [1, 4]), + ([400, 400, 199, 3, 1, 1500], [1, 5]), + ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]), + ([1, 2, 9999], [1, 2]), + ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]), + ], +) +def test_util_minibatch_oversize(doc_sizes, expected_batches): + """ Test that oversized documents are returned in their own batch""" + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] + tol = 0.2 + batch_size = 1000 + batches = list( + minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False) + ) + assert [len(batch) for batch in batches] == expected_batches + + +def test_util_dot_section(): + cfg_string = """ + [nlp] + lang = "en" + pipeline = ["textcat"] + + [components] + + [components.textcat] + factory = "textcat" + + [components.textcat.model] + @architectures = "spacy.TextCatBOW.v1" + exclusive_classes = true + ngram_size = 1 + no_output_layer = false + """ + nlp_config = Config().from_str(cfg_string) + en_nlp = util.load_model_from_config(nlp_config, auto_fill=True) + default_config = Config().from_disk(DEFAULT_CONFIG_PATH) + default_config["nlp"]["lang"] = "nl" + nl_nlp = util.load_model_from_config(default_config, auto_fill=True) + # Test that creation went OK + assert isinstance(en_nlp, English) + assert isinstance(nl_nlp, Dutch) + assert nl_nlp.pipe_names == [] + assert en_nlp.pipe_names == ["textcat"] + # not exclusive_classes + assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False + # Test that default values got overwritten + assert en_nlp.config["nlp"]["pipeline"] == ["textcat"] + assert nl_nlp.config["nlp"]["pipeline"] == [] # default value [] + # Test proper functioning of 'dot_to_object' + with pytest.raises(KeyError): + dot_to_object(en_nlp.config, "nlp.pipeline.tagger") + with pytest.raises(KeyError): + 
dot_to_object(en_nlp.config, "nlp.unknownattribute") + T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining) + assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer) + + +def test_simple_frozen_list(): + t = SimpleFrozenList(["foo", "bar"]) + assert t == ["foo", "bar"] + assert t.index("bar") == 1 # okay method + with pytest.raises(NotImplementedError): + t.append("baz") + with pytest.raises(NotImplementedError): + t.sort() + with pytest.raises(NotImplementedError): + t.extend(["baz"]) + with pytest.raises(NotImplementedError): + t.pop() + t = SimpleFrozenList(["foo", "bar"], error="Error!") + with pytest.raises(NotImplementedError): + t.append("baz") + + +def test_resolve_dot_names(): + config = { + "training": {"optimizer": {"@optimizers": "Adam.v1"}}, + "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, + } + result = util.resolve_dot_names(config, ["training.optimizer"]) + assert isinstance(result[0], Optimizer) + with pytest.raises(ConfigValidationError) as e: + util.resolve_dot_names(config, ["training.xyz", "training.optimizer"]) + errors = e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ["training", "xyz"] diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py deleted file mode 100644 index f710a38eb..000000000 --- a/spacy/tests/test_util.py +++ /dev/null @@ -1,137 +0,0 @@ -import pytest - -from spacy import util -from spacy.util import dot_to_object, SimpleFrozenList -from thinc.api import Config, Optimizer, ConfigValidationError -from spacy.training.batchers import minibatch_by_words -from spacy.lang.en import English -from spacy.lang.nl import Dutch -from spacy.language import DEFAULT_CONFIG_PATH -from spacy.schemas import ConfigSchemaTraining - -from .util import get_random_doc - - -@pytest.mark.parametrize( - "doc_sizes, expected_batches", - [ - ([400, 400, 199], [3]), - ([400, 400, 199, 3], [4]), - ([400, 400, 199, 3, 200], [3, 2]), - ([400, 400, 199, 3, 1], [5]), - ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded - ([400, 400, 199, 3, 1, 200], [3, 3]), - ([400, 400, 199, 3, 1, 999], [3, 3]), - ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), - ([1, 2, 999], [3]), - ([1, 2, 999, 1], [4]), - ([1, 200, 999, 1], [2, 2]), - ([1, 999, 200, 1], [2, 2]), - ], -) -def test_util_minibatch(doc_sizes, expected_batches): - docs = [get_random_doc(doc_size) for doc_size in doc_sizes] - tol = 0.2 - batch_size = 1000 - batches = list( - minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True) - ) - assert [len(batch) for batch in batches] == expected_batches - - max_size = batch_size + batch_size * tol - for batch in batches: - assert sum([len(doc) for doc in batch]) < max_size - - -@pytest.mark.parametrize( - "doc_sizes, expected_batches", - [ - ([400, 4000, 199], [1, 2]), - ([400, 400, 199, 3000, 200], [1, 4]), - ([400, 400, 199, 3, 1, 1500], [1, 5]), - ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]), - ([1, 2, 9999], [1, 2]), - ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]), - ], -) -def test_util_minibatch_oversize(doc_sizes, expected_batches): - """ Test that oversized documents are returned in their own batch""" - docs = [get_random_doc(doc_size) for doc_size in doc_sizes] - tol = 0.2 - batch_size = 1000 - batches = list( - minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False) - ) - assert [len(batch) for batch in batches] == expected_batches - - -def test_util_dot_section(): - cfg_string = """ - [nlp] - lang = "en" - 
pipeline = ["textcat"] - - [components] - - [components.textcat] - factory = "textcat" - - [components.textcat.model] - @architectures = "spacy.TextCatBOW.v1" - exclusive_classes = true - ngram_size = 1 - no_output_layer = false - """ - nlp_config = Config().from_str(cfg_string) - en_nlp = util.load_model_from_config(nlp_config, auto_fill=True) - default_config = Config().from_disk(DEFAULT_CONFIG_PATH) - default_config["nlp"]["lang"] = "nl" - nl_nlp = util.load_model_from_config(default_config, auto_fill=True) - # Test that creation went OK - assert isinstance(en_nlp, English) - assert isinstance(nl_nlp, Dutch) - assert nl_nlp.pipe_names == [] - assert en_nlp.pipe_names == ["textcat"] - # not exclusive_classes - assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False - # Test that default values got overwritten - assert en_nlp.config["nlp"]["pipeline"] == ["textcat"] - assert nl_nlp.config["nlp"]["pipeline"] == [] # default value [] - # Test proper functioning of 'dot_to_object' - with pytest.raises(KeyError): - dot_to_object(en_nlp.config, "nlp.pipeline.tagger") - with pytest.raises(KeyError): - dot_to_object(en_nlp.config, "nlp.unknownattribute") - T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining) - assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer) - - -def test_simple_frozen_list(): - t = SimpleFrozenList(["foo", "bar"]) - assert t == ["foo", "bar"] - assert t.index("bar") == 1 # okay method - with pytest.raises(NotImplementedError): - t.append("baz") - with pytest.raises(NotImplementedError): - t.sort() - with pytest.raises(NotImplementedError): - t.extend(["baz"]) - with pytest.raises(NotImplementedError): - t.pop() - t = SimpleFrozenList(["foo", "bar"], error="Error!") - with pytest.raises(NotImplementedError): - t.append("baz") - - -def test_resolve_dot_names(): - config = { - "training": {"optimizer": {"@optimizers": "Adam.v1"}}, - "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, - } - result = util.resolve_dot_names(config, ["training.optimizer"]) - assert isinstance(result[0], Optimizer) - with pytest.raises(ConfigValidationError) as e: - util.resolve_dot_names(config, ["training.xyz", "training.optimizer"]) - errors = e.value.errors - assert len(errors) == 1 - assert errors[0]["loc"] == ["training", "xyz"] From 6958510bdaaa279c8b4f5184bbdbbe6cf3c7cf8a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 13:53:07 +0200 Subject: [PATCH 17/55] Include spaCy version check in project CLI --- spacy/cli/_util.py | 7 +++++-- spacy/cli/project/remote_storage.py | 7 +++++-- spacy/cli/project/run.py | 31 +++++++++++++++++++++++++---- spacy/tests/test_misc.py | 15 ++++++++++++++ spacy/util.py | 27 +++++++++++++++++++++++++ 5 files changed, 79 insertions(+), 8 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c959c9861..676a7c8d7 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING +from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING import sys import shutil from pathlib import Path @@ -193,12 +193,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None: ) -def get_hash(data) -> str: +def get_hash(data, exclude: Iterable[str] = tuple()) -> str: """Get the hash for a JSON-serializable object. data: The data to hash. + exclude (Iterable[str]): Top-level keys to exclude if data is a dict. RETURNS (str): The hash. 
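+    Keys listed in `exclude` are only applied when the data is a dict, so
+    hashing {"a": 1, "b": 2} with exclude=["b"] gives the same hash as {"a": 1}.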
""" + if isinstance(data, dict): + data = {k: v for k, v in data.items() if k not in exclude} data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") return hashlib.md5(data_str).hexdigest() diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py index e7e7cbbe8..7e2caa8d7 100644 --- a/spacy/cli/project/remote_storage.py +++ b/spacy/cli/project/remote_storage.py @@ -7,7 +7,8 @@ import tarfile from pathlib import Path from .._util import get_hash, get_checksum, download_file, ensure_pathy -from ...util import make_tempdir +from ...util import make_tempdir, get_minor_version +from ... import about if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -129,7 +130,9 @@ def get_command_hash( currently installed packages, whatever environment variables have been marked as relevant, and the command. """ - hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)] + spacy_v = get_minor_version(about.__version__) + dep_checksums = [get_checksum(dep) for dep in sorted(deps)] + hashes = [spacy_v, site_hash, env_hash] + dep_checksums hashes.extend(cmd) creation_bytes = "".join(hashes).encode("utf8") return hashlib.md5(creation_bytes).hexdigest() diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 69c49fba7..94d4371d0 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -4,8 +4,10 @@ from wasabi import msg import sys import srsly +from ... import about +from ...git_info import GIT_VERSION from ...util import working_dir, run_command, split_command, is_cwd, join_command -from ...util import SimpleFrozenList +from ...util import SimpleFrozenList, is_minor_version_match from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import get_checksum, project_cli, Arg, Opt, COMMAND @@ -63,11 +65,11 @@ def project_run( err_kwargs = {"exits": 1} if not dry else {} msg.fail(err, err_help, **err_kwargs) with working_dir(project_dir) as current_dir: + msg.divider(subcommand) rerun = check_rerun(current_dir, cmd) if not rerun and not force: msg.info(f"Skipping '{cmd['name']}': nothing changed") else: - msg.divider(subcommand) run_commands(cmd["script"], dry=dry) if not dry: update_lockfile(current_dir, cmd) @@ -171,12 +173,18 @@ def validate_subcommand( ) -def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: +def check_rerun( + project_dir: Path, + command: Dict[str, Any], + check_spacy_version: bool = True, + check_spacy_commit: bool = False, +) -> bool: """Check if a command should be rerun because its settings or inputs/outputs changed. project_dir (Path): The current project directory. command (Dict[str, Any]): The command, as defined in the project.yml. + strict_version (bool): RETURNS (bool): Whether to re-run the command. 
""" lock_path = project_dir / PROJECT_LOCK @@ -189,10 +197,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: # Always run commands with no outputs (otherwise they'd always be skipped) if not entry.get("outs", []): return True + # Always rerun if spaCy version or commit hash changed + spacy_v = entry.get("spacy_version") + commit = entry.get("spacy_git_version") + if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__): + info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)" + msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}") + return True + if check_spacy_commit and commit != GIT_VERSION: + info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)" + msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}") + return True # If the entry in the lockfile matches the lockfile entry that would be # generated from the current command, we don't rerun because it means that # all inputs/outputs, hashes and scripts are the same and nothing changed - return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry) + lock_entry = get_lock_entry(project_dir, command) + exclude = ["spacy_version", "spacy_git_version"] + return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude) def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: @@ -231,6 +252,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any] "script": command["script"], "deps": deps, "outs": [*outs, *outs_nc], + "spacy_version": about.__version__, + "spacy_git_version": GIT_VERSION, } diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index bdf54ad6a..b9a0a9d05 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -149,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected): assert util.is_unconstrained_version(constraint) is expected +@pytest.mark.parametrize( + "a1,a2,b1,b2,is_match", + [ + ("3.0.0", "3.0", "3.0.1", "3.0", True), + ("3.1.0", "3.1", "3.2.1", "3.2", False), + ("xxx", None, "1.2.3.dev0", "1.2", False), + ], +) +def test_minor_version(a1, a2, b1, b2, is_match): + assert util.get_minor_version(a1) == a2 + assert util.get_minor_version(b1) == b2 + assert util.is_minor_version_match(a1, b1) is is_match + assert util.is_minor_version_match(a2, b2) is is_match + + @pytest.mark.parametrize( "dot_notation,expected", [ diff --git a/spacy/util.py b/spacy/util.py index 4d68e829c..4b2cb018a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -584,6 +584,33 @@ def get_base_version(version: str) -> str: return Version(version).base_version +def get_minor_version(version: str) -> Optional[str]: + """Get the major + minor version (without patch or prerelease identifiers). + + version (str): The version. + RETURNS (str): The major + minor version or None if version is invalid. + """ + try: + v = Version(version) + except (TypeError, InvalidVersion): + return None + return f"{v.major}.{v.minor}" + + +def is_minor_version_match(version_a: str, version_b: str) -> bool: + """Compare two versions and check if they match in major and minor, without + patch or prerelease identifiers. Used internally for compatibility checks + that should be insensitive to patch releases. + + version_a (str): The first version + version_b (str): The second version. + RETURNS (bool): Whether the versions match. 
+ """ + a = get_minor_version(version_a) + b = get_minor_version(version_b) + return a is not None and b is not None and a == b + + def load_meta(path: Union[str, Path]) -> Dict[str, Any]: """Load a model meta.json from a path and validate its contents. From d2b9aafb8c8d91ea74c2418d9fb32f1ce8812bbf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 14:14:49 +0200 Subject: [PATCH 18/55] Fix augmenter --- spacy/training/augment.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index e6d10a195..06656bdd8 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -120,8 +120,8 @@ def make_orth_variants( ndsv = orth_variants.get("single", []) ndpv = orth_variants.get("paired", []) logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants") - words = token_dict.get("words", []) - tags = token_dict.get("tags", []) + words = token_dict.get("ORTH", []) + tags = token_dict.get("TAG", []) # keep unmodified if words or tags are not defined if words and tags: if lower: @@ -131,7 +131,7 @@ def make_orth_variants( for word_idx in range(len(words)): for punct_idx in range(len(ndsv)): if ( - tags[word_idx] in ndsv[punct_idx]["tags"] + tags[word_idx] in ndsv[punct_idx]["TAG"] and words[word_idx] in ndsv[punct_idx]["variants"] ): words[word_idx] = punct_choices[punct_idx] @@ -139,14 +139,14 @@ def make_orth_variants( punct_choices = [random.choice(x["variants"]) for x in ndpv] for word_idx in range(len(words)): for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] and words[ + if tags[word_idx] in ndpv[punct_idx]["TAG"] and words[ word_idx ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): # backup option: random left vs. right from pair pair_idx = random.choice([0, 1]) # best option: rely on paired POS tags like `` / '' - if len(ndpv[punct_idx]["tags"]) == 2: - pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + if len(ndpv[punct_idx]["TAG"]) == 2: + pair_idx = ndpv[punct_idx]["TAG"].index(tags[word_idx]) # next best option: rely on position in variants # (may not be unambiguous, so order of variants matters) else: @@ -154,8 +154,8 @@ def make_orth_variants( if words[word_idx] in pair: pair_idx = pair.index(words[word_idx]) words[word_idx] = punct_choices[punct_idx][pair_idx] - token_dict["words"] = words - token_dict["tags"] = tags + token_dict["ORTH"] = words + token_dict["TAG"] = tags # modify raw if raw is not None: variants = [] From 5d19dfc9d32c7fd039118d9fe0f8cf713e7af471 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 14:21:53 +0200 Subject: [PATCH 19/55] Update Chinese tokenizer for spacy-pkuseg fork --- spacy/lang/zh/__init__.py | 62 +++++++++++---------------------------- spacy/tests/conftest.py | 5 ++-- 2 files changed, 19 insertions(+), 48 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 55a77330a..8864ae119 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -17,8 +17,7 @@ from ... import util # fmt: off -_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`" -_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7." 
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`" # fmt: on DEFAULT_CONFIG = """ @@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer): if self.segmenter == Segmenter.pkuseg: if reset: try: - import pkuseg + import spacy_pkuseg - self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) + self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None) except ImportError: msg = ( - "pkuseg not installed: unable to reset pkuseg " + "spacy_pkuseg not installed: unable to reset pkuseg " "user dict. Please " + _PKUSEG_INSTALL_MSG ) raise ImportError(msg) from None @@ -156,22 +155,6 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.feature_extractor.save(tempdir) self.pkuseg_seg.model.save(tempdir) tempdir = Path(tempdir) - # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which - # means that it will be saved with pickle protocol 5 with - # python 3.8, which can't be reloaded with python 3.6-3.7. - # To try to make the model compatible with python 3.6+, reload - # the data with pickle5 and convert it back to protocol 4. - try: - import pickle5 - - with open(tempdir / "features.pkl", "rb") as fileh: - features = pickle5.load(fileh) - with open(tempdir / "features.pkl", "wb") as fileh: - pickle5.dump(features, fileh, protocol=4) - except ImportError as e: - raise e - except Exception: - warnings.warn(_PKUSEG_PICKLE_WARNING) with open(tempdir / "features.pkl", "rb") as fileh: pkuseg_features_b = fileh.read() with open(tempdir / "weights.npz", "rb") as fileh: @@ -218,17 +201,17 @@ class ChineseTokenizer(DummyTokenizer): with open(tempdir / "weights.npz", "wb") as fileh: fileh.write(pkuseg_data["weights_b"]) try: - import pkuseg + import spacy_pkuseg except ImportError: raise ImportError( - "pkuseg not installed. To use this model, " + "spacy_pkuseg not installed. To use this model, " + _PKUSEG_INSTALL_MSG ) from None - self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) + self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir)) if pkuseg_data["processors_data"]: processors_data = pkuseg_data["processors_data"] (user_dict, do_process, common_words, other_words) = processors_data - self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) + self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) self.pkuseg_seg.postprocesser.other_words = set(other_words) @@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer): path.mkdir(parents=True) self.pkuseg_seg.model.save(path) self.pkuseg_seg.feature_extractor.save(path) - # try to convert features.pkl to pickle protocol 4 - try: - import pickle5 - - with open(path / "features.pkl", "rb") as fileh: - features = pickle5.load(fileh) - with open(path / "features.pkl", "wb") as fileh: - pickle5.dump(features, fileh, protocol=4) - except ImportError as e: - raise e - except Exception: - warnings.warn(_PKUSEG_PICKLE_WARNING) def save_pkuseg_processors(path): if self.pkuseg_seg: @@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer): def load_pkuseg_model(path): try: - import pkuseg + import spacy_pkuseg except ImportError: if self.segmenter == Segmenter.pkuseg: raise ImportError( - "pkuseg not installed. To use this model, " + "spacy_pkuseg not installed. 
To use this model, " + _PKUSEG_INSTALL_MSG ) from None if path.exists(): - self.pkuseg_seg = pkuseg.pkuseg(path) + self.pkuseg_seg = spacy_pkuseg.pkuseg(path) def load_pkuseg_processors(path): try: - import pkuseg + import spacy_pkuseg except ImportError: if self.segmenter == Segmenter.pkuseg: raise ImportError(self._pkuseg_install_msg) from None if self.segmenter == Segmenter.pkuseg: data = srsly.read_msgpack(path) (user_dict, do_process, common_words, other_words) = data - self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) + self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) self.pkuseg_seg.postprocesser.other_words = set(other_words) @@ -341,12 +312,13 @@ def try_jieba_import() -> None: def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None: try: - import pkuseg + import spacy_pkuseg - return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) except ImportError: - msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG + msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG raise ImportError(msg) from None + try: + return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) except FileNotFoundError: msg = "Unable to load pkuseg model from: " + pkuseg_model raise FileNotFoundError(msg) from None diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4a3d126d7..bb9f770bc 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -285,8 +285,7 @@ def zh_tokenizer_jieba(): @pytest.fixture(scope="session") def zh_tokenizer_pkuseg(): - pytest.importorskip("pkuseg") - pytest.importorskip("pickle5") + pytest.importorskip("spacy_pkuseg") config = { "nlp": { "tokenizer": { @@ -296,7 +295,7 @@ def zh_tokenizer_pkuseg(): }, "initialize": { "tokenizer": { - "pkuseg_model": "default", + "pkuseg_model": "web", } }, } From f4f49f5877d4a0cca4ef9e03ea1c39aa742ba797 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 5 Oct 2020 14:58:56 +0200 Subject: [PATCH 20/55] update blis (#6198) * allow higher blis version * fix typo * bump to 3.0.0a34 * fix pins in other files --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 2 +- spacy/about.py | 2 +- spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/pipe.pyx | 2 +- spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/tagger.pyx | 2 +- spacy/pipeline/textcat.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 611a95d27..d48886e0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", "thinc>=8.0.0a43,<8.0.0a50", - "blis>=0.4.0,<0.5.0", + "blis>=0.4.0,<0.8.0", "pytokenizations", "pathy" ] diff --git a/requirements.txt b/requirements.txt index 44dad38e3..29695e9b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0a43,<8.0.0a50 -blis>=0.4.0,<0.5.0 +blis>=0.4.0,<0.8.0 ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 7192ba9d4..d8362c4bd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0a43,<8.0.0a50 - blis>=0.4.0,<0.5.0 + blis>=0.4.0,<0.8.0 wasabi>=0.8.0,<1.1.0 srsly>=2.3.0,<3.0.0 catalogue>=2.0.1,<2.1.0 diff --git a/spacy/about.py b/spacy/about.py index dce627a38..392bfd589 100644 --- 
a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a33" +__version__ = "3.0.0a34" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 82f3bf37d..6d97b062f 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -210,7 +210,7 @@ class Morphologizer(Tagger): examples (Iterable[Examples]): The batch of examples. scores: Scores representing the model's predictions. - RETUTNRS (Tuple[float, float]): The loss and the gradient. + RETURNS (Tuple[float, float]): The loss and the gradient. DOCS: https://nightly.spacy.io/api/morphologizer#get_loss """ diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 41ca23ace..8e103a638 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -162,7 +162,7 @@ cdef class Pipe: examples (Iterable[Examples]): The batch of examples. scores: Scores representing the model's predictions. - RETUTNRS (Tuple[float, float]): The loss and the gradient. + RETURNS (Tuple[float, float]): The loss and the gradient. DOCS: https://nightly.spacy.io/api/pipe#get_loss """ diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 0bfef7c7b..8fb1e664f 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger): examples (Iterable[Examples]): The batch of examples. scores: Scores representing the model's predictions. - RETUTNRS (Tuple[float, float]): The loss and the gradient. + RETURNS (Tuple[float, float]): The loss and the gradient. DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss """ diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 6cb582b36..94ac0c082 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -249,7 +249,7 @@ class Tagger(Pipe): examples (Iterable[Examples]): The batch of examples. scores: Scores representing the model's predictions. - RETUTNRS (Tuple[float, float]): The loss and the gradient. + RETURNS (Tuple[float, float]): The loss and the gradient. DOCS: https://nightly.spacy.io/api/tagger#get_loss """ diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index fc60ebf89..292598e3a 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -281,7 +281,7 @@ class TextCategorizer(Pipe): examples (Iterable[Examples]): The batch of examples. scores: Scores representing the model's predictions. - RETUTNRS (Tuple[float, float]): The loss and the gradient. + RETURNS (Tuple[float, float]): The loss and the gradient. 
DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss """ From 8171e28b20aafc52ccf571b813b142b3355e550b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 15:09:52 +0200 Subject: [PATCH 21/55] Remove logging [ci skip] This would be fired on each example, which is wrong --- spacy/training/augment.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index e6d10a195..ee5992b36 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -5,7 +5,7 @@ import copy from functools import partial from pydantic import BaseModel, StrictStr -from ..util import registry, logger +from ..util import registry from ..tokens import Doc from .example import Example @@ -119,7 +119,6 @@ def make_orth_variants( orig_token_dict = copy.deepcopy(token_dict) ndsv = orth_variants.get("single", []) ndpv = orth_variants.get("paired", []) - logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants") words = token_dict.get("words", []) tags = token_dict.get("tags", []) # keep unmodified if words or tags are not defined From 8ec79ad3fadd97f39b220c874e0df46921646fd0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:22:00 +0200 Subject: [PATCH 22/55] Allow configuration of MultiHashEmbed features Update arguments to MultiHashEmbed layer so that the attributes can be controlled. A kind of tricky scheme is used to allow optional specification of the rows. I think it's an okay balance between flexibility and convenience. --- spacy/ml/models/tok2vec.py | 100 +++++++++++++++++++++++++------------ spacy/tests/test_models.py | 32 +++++++++++- 2 files changed, 98 insertions(+), 34 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 1a0979cab..4abc1bee6 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Union +from typing import Optional, List, Union, Dict from thinc.types import Floats2d from thinc.api import chain, clone, concatenate, with_array, with_padded from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed @@ -11,7 +11,7 @@ from ...ml import _character_embed from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor from ...pipeline.tok2vec import Tok2VecListener -from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr +from ...attrs import ORTH, NORM, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr @registry.architectures.register("spacy.Tok2VecListener.v1") @@ -54,12 +54,16 @@ def build_hash_embed_cnn_tok2vec( a language such as Chinese. pretrained_vectors (bool): Whether to also use static vectors. """ + if subword_features: + attrs = {"NORM": 1.0, "PREFIX": 0.5, "SUFFIX": 0.5, "SHAPE": 0.5} + else: + attrs = {"NORM": 1.0} return build_Tok2Vec_model( embed=MultiHashEmbed( width=width, rows=embed_size, - also_embed_subwords=subword_features, - also_use_static_vectors=bool(pretrained_vectors), + attrs=attrs, + include_static_vectors=bool(pretrained_vectors), ), encode=MaxoutWindowEncoder( width=width, @@ -92,59 +96,89 @@ def build_Tok2Vec_model( @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed( +def MultiHashEmbed_v1( width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool +) -> Model[List[Doc], List[Floats2d]]: + """Previous interface for MultiHashEmbed. 
This should be removed, it's only + here as a temporary compatibility.""" + return MultiHashEmbed( + width=width, + rows=rows, + attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM], + include_static_vectors=also_use_static_vectors + ) + +@registry.architectures.register("spacy.MultiHashEmbed.v2") +def MultiHashEmbed( + width: int, + rows: int, + attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]], + include_static_vectors: bool ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through a feed-forward subnetwork to build a mixed representations. - The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have - varying definitions depending on the Vocab of the Doc object passed in. - Vectors from pretrained static vectors can also be incorporated into the - concatenated representation. + The features used can be configured with the 'attrs' argument. The suggested + attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into + account some subword information, without contruction a fully character-based + representation. If pretrained vectors are available, they can be included in + the representation as well, with the vectors table will be kept static + (i.e. it's not updated). + + The `width` parameter specifices the output width of the layer and the widths + of all embedding tables. If static vectors are included, a learned linear + layer is used to map the vectors to the specified width before concatenating + it with the other embedding outputs. A single Maxout layer is then used to + reduce the concatenated vectors to the final width. + + The `rows` parameter controls the number of rows used by the `HashEmbed` + tables. The HashEmbed layer needs surprisingly few rows, due to its use of + the hashing trick. Generally between 2000 and 10000 rows is sufficient, + even for very large vocabularies. You can vary the number of rows per + attribute by specifying the attrs as a dict, mapping the keys to float + values which are interpreted as factors of `rows`. For instance, + attrs={"NORM": 1.0, PREFIX: 0.2} will use rows*1 for the NORM table and + rows*0.2 for the PREFIX table. If `attrs` is a list, factors of 1.0 are + assumed for all attributes. width (int): The output width. Also used as the width of the embedding tables. Recommended values are between 64 and 300. - rows (int): The number of rows for the embedding tables. Can be low, due - to the hashing trick. Embeddings for prefix, suffix and word shape - use half as many rows. Recommended values are between 2000 and 10000. - also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE - features in the embeddings. If not using these, you may need more - rows in your hash embeddings, as there will be increased chance of - collisions. - also_use_static_vectors (bool): Whether to also use static word vectors. + rows (int): The base number of rows for the embedding tables. Can be low, due + to the hashing trick. The rows can be varied per attribute by providing + a dictionary as the value of `attrs`. + attrs (dict or list of attr IDs): The token attributes to embed. A separate + embedding table will be constructed for each attribute. Attributes + can be specified as a list or as a dictionary, which lets you control + the number of rows used for each table. + include_static_vectors (bool): Whether to also use static word vectors. 
Requires a vectors table to be loaded in the Doc objects' vocab. """ - cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH] + if isinstance(attrs, dict): + # Exclude tables that would have 0 rows. + attrs = {key: value for key, value in attrs.items() if value > 0.0} + indices = {attr: i for i, attr in enumerate(attrs)} seed = 7 def make_hash_embed(feature): nonlocal seed + row_factor = attrs[feature] if isinstance(attrs, dict) else 1.0 seed += 1 return HashEmbed( width, - rows if feature == LOWER else rows // 2, - column=cols.index(feature), + int(rows * row_factor), + column=indices[feature], seed=seed, dropout=0.0, ) - if also_embed_subwords: - embeddings = [ - make_hash_embed(LOWER), - make_hash_embed(PREFIX), - make_hash_embed(SUFFIX), - make_hash_embed(SHAPE), - ] - else: - embeddings = [make_hash_embed(LOWER)] - concat_size = width * (len(embeddings) + also_use_static_vectors) - if also_use_static_vectors: + embeddings = [make_hash_embed(attr) for attr in attrs] + concat_size = width * (len(embeddings) + include_static_vectors) + if include_static_vectors: model = chain( concatenate( chain( - FeatureExtractor(cols), + FeatureExtractor(list(attrs)), list2ragged(), with_array(concatenate(*embeddings)), ), @@ -155,7 +189,7 @@ def MultiHashEmbed( ) else: model = chain( - FeatureExtractor(cols), + FeatureExtractor(list(attrs)), list2ragged(), with_array(concatenate(*embeddings)), with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index a123f459d..3bd3b903d 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -6,6 +6,7 @@ from numpy.testing import assert_array_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder +from spacy.ml.models import MultiHashEmbed_v1 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES @@ -61,7 +62,10 @@ def get_tok2vec_kwargs(): # This actually creates models, so seems best to put it in a function. return { "embed": MultiHashEmbed( - width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False + width=32, + rows=500, + attrs=["NORM", "PREFIX", "SHAPE"], + include_static_vectors=False ), "encode": MaxoutWindowEncoder( width=32, depth=2, maxout_pieces=2, window_size=1 @@ -73,6 +77,32 @@ def test_tok2vec(): return build_Tok2Vec_model(**get_tok2vec_kwargs()) +def test_multi_hash_embed(): + embed = MultiHashEmbed( + width=32, + rows=500, + attrs=["NORM", "PREFIX", "SHAPE"], + include_static_vectors=False + ) + hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] + assert len(hash_embeds) == 3 + # Check they look at different columns. 
+ assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2] + # Check they use different seeds + assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3 + # Check they all have the same number of rows + assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500] + # Now try with different row factors + embed = MultiHashEmbed( + width=32, + rows=500, + attrs={"NORM": 2.0, "PREFIX": 0.1, "SHAPE": 0.5}, + include_static_vectors=False + ) + hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] + assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250] + + @pytest.mark.parametrize( "seed,model_func,kwargs", [ From f2f1deca662a197c8e605e32238bfa015851f2ad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:24:33 +0200 Subject: [PATCH 23/55] spacy/tests/ --- spacy/tests/pipeline/test_tok2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 06212e351..78a677acf 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -25,8 +25,8 @@ def test_empty_doc(): MultiHashEmbed( width=width, rows=embed_size, - also_use_static_vectors=False, - also_embed_subwords=True, + include_static_vectors=False, + attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"], ), MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3), ) From f4ca9a39cb5245da78f01d39f95efa53924ae15a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:27:06 +0200 Subject: [PATCH 24/55] spacy/tests/ --- spacy/tests/pipeline/test_tok2vec.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 78a677acf..df844365b 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -45,8 +45,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): MultiHashEmbed( width=width, rows=embed_size, - also_use_static_vectors=False, - also_embed_subwords=True, + include_static_vectors=False, + attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"], ), MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3), ) @@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): @pytest.mark.parametrize( "width,embed_arch,embed_config,encode_arch,encode_config", [ - (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), + (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], From 7d93575f35a7fb8484096b772ce71834bfd1914a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:28:12 +0200 Subject: [PATCH 
25/55] spacy/tests/ --- spacy/tests/pipeline/test_tok2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index df844365b..aa60faf5b 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -62,7 +62,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): "width,embed_arch,embed_config,encode_arch,encode_config", [ (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), + (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], From eb9ba61517e4e7f39b5521313e797bdbbf6740af Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:29:49 +0200 Subject: [PATCH 26/55] Format --- spacy/ml/models/tok2vec.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 4abc1bee6..6e5aed77b 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -103,17 +103,18 @@ def MultiHashEmbed_v1( here as a temporary compatibility.""" return MultiHashEmbed( width=width, - rows=rows, + rows=rows, attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM], - include_static_vectors=also_use_static_vectors + include_static_vectors=also_use_static_vectors, ) + @registry.architectures.register("spacy.MultiHashEmbed.v2") def MultiHashEmbed( width: int, rows: int, attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]], - include_static_vectors: bool + include_static_vectors: bool, ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it From 90040aacec90f18d7e5a0c5f051352316f9e5cd0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 16:12:01 +0200 Subject: [PATCH 27/55] Fix merge --- spacy/training/augment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index bbe164aed..685016b62 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -119,7 +119,6 @@ def make_orth_variants( orig_token_dict = copy.deepcopy(token_dict) ndsv = orth_variants.get("single", []) ndpv = orth_variants.get("paired", []) - logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants") words = token_dict.get("ORTH", []) tags = token_dict.get("TAG", []) # keep unmodified if words or tags are not defined From 187234648cfb20974cdbf79b0d8a477c0aaf36b3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 16:24:28 +0200 Subject: [PATCH 28/55] Revert back to "default" as default for pkuseg_user_dict --- spacy/lang/zh/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py 
index 8864ae119..5d4d55aed 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -81,9 +81,11 @@ class ChineseTokenizer(DummyTokenizer): *, nlp: Optional[Language] = None, pkuseg_model: Optional[str] = None, - pkuseg_user_dict: str = "default", + pkuseg_user_dict: Optional[str] = "default", ): if self.segmenter == Segmenter.pkuseg: + if pkuseg_user_dict is None: + pkuseg_user_dict = pkuseg_model self.pkuseg_seg = try_pkuseg_import( pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict, ) From 9f1bc3f24c6c9f0412f815abe044274d3840fa23 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 16:40:23 +0200 Subject: [PATCH 29/55] Fix augment --- spacy/training/augment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 685016b62..c538f02d0 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -130,7 +130,7 @@ def make_orth_variants( for word_idx in range(len(words)): for punct_idx in range(len(ndsv)): if ( - tags[word_idx] in ndsv[punct_idx]["TAG"] + tags[word_idx] in ndsv[punct_idx]["tags"] and words[word_idx] in ndsv[punct_idx]["variants"] ): words[word_idx] = punct_choices[punct_idx] From 4ed3e037df766aa2f2827a4b1a63a1f80a79485b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 16:40:23 +0200 Subject: [PATCH 30/55] Fix augment --- spacy/training/augment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 06656bdd8..7db8919e9 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -131,7 +131,7 @@ def make_orth_variants( for word_idx in range(len(words)): for punct_idx in range(len(ndsv)): if ( - tags[word_idx] in ndsv[punct_idx]["TAG"] + tags[word_idx] in ndsv[punct_idx]["tags"] and words[word_idx] in ndsv[punct_idx]["variants"] ): words[word_idx] = punct_choices[punct_idx] From 3ee3649b525a9bc1ddd8f531a10ffc213d185e46 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 16:59:49 +0200 Subject: [PATCH 31/55] Fix augment --- spacy/training/augment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index b6e22542a..e76ee49f7 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -144,8 +144,8 @@ def make_orth_variants( # backup option: random left vs. 
right from pair pair_idx = random.choice([0, 1]) # best option: rely on paired POS tags like `` / '' - if len(ndpv[punct_idx]["TAG"]) == 2: - pair_idx = ndpv[punct_idx]["TAG"].index(tags[word_idx]) + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) # next best option: rely on position in variants # (may not be unambiguous, so order of variants matters) else: From 84fedcebab288a19aebb4dc4462f346bf2cecc8f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 17:07:35 +0200 Subject: [PATCH 32/55] Make args keyword-only [ci skip] Co-authored-by: Matthew Honnibal --- spacy/cli/project/run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 94d4371d0..ea4675d60 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -176,6 +176,7 @@ def validate_subcommand( def check_rerun( project_dir: Path, command: Dict[str, Any], + *, check_spacy_version: bool = True, check_spacy_commit: bool = False, ) -> bool: From f102ef6b54bbc0ddaf7c093dee7fcacaf667c2ed Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 17:47:39 +0200 Subject: [PATCH 33/55] Read features.msgpack instead of features.pkl --- spacy/lang/zh/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 5d4d55aed..f9065f92c 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -157,7 +157,7 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.feature_extractor.save(tempdir) self.pkuseg_seg.model.save(tempdir) tempdir = Path(tempdir) - with open(tempdir / "features.pkl", "rb") as fileh: + with open(tempdir / "features.msgpack", "rb") as fileh: pkuseg_features_b = fileh.read() with open(tempdir / "weights.npz", "rb") as fileh: pkuseg_weights_b = fileh.read() @@ -198,7 +198,7 @@ class ChineseTokenizer(DummyTokenizer): if pkuseg_data["features_b"] and pkuseg_data["weights_b"]: with tempfile.TemporaryDirectory() as tempdir: tempdir = Path(tempdir) - with open(tempdir / "features.pkl", "wb") as fileh: + with open(tempdir / "features.msgpack", "wb") as fileh: fileh.write(pkuseg_data["features_b"]) with open(tempdir / "weights.npz", "wb") as fileh: fileh.write(pkuseg_data["weights_b"]) From d2806f11f2ad87b97a6571b6b71d5fe33f544ae0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 18:08:32 +0200 Subject: [PATCH 34/55] Update to spacy-pkuseg==0.0.26 in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a4df0f8c8..3f10e79cc 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ SHELL := /bin/bash ifndef SPACY_EXTRAS -override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core +override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core endif ifndef PYVER From 6dcc4a0ba63370f2b27713b5f7e86e6a8de6c825 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 19:57:45 +0200 Subject: [PATCH 35/55] Simplify MultiHashEmbed signature --- spacy/ml/models/tok2vec.py | 48 +++++++++++----------------- spacy/tests/pipeline/test_tok2vec.py | 16 +++++----- spacy/tests/test_models.py | 8 ++--- 3 files changed, 31 insertions(+), 41 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 6e5aed77b..f0e846bac 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -55,13 +55,15 @@ def 
build_hash_embed_cnn_tok2vec( pretrained_vectors (bool): Whether to also use static vectors. """ if subword_features: - attrs = {"NORM": 1.0, "PREFIX": 0.5, "SUFFIX": 0.5, "SHAPE": 0.5} + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + row_sizes = [embed_size, embed_size//2, embed_size//2, embed_size//2] else: - attrs = {"NORM": 1.0} + attrs = ["NORM"] + row_sizes = [embed_size] return build_Tok2Vec_model( embed=MultiHashEmbed( width=width, - rows=embed_size, + rows=row_sizes, attrs=attrs, include_static_vectors=bool(pretrained_vectors), ), @@ -103,7 +105,7 @@ def MultiHashEmbed_v1( here as a temporary compatibility.""" return MultiHashEmbed( width=width, - rows=rows, + rows=[rows, rows//2, rows//2, rows//2] if also_embed_subwords else [rows], attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM], include_static_vectors=also_use_static_vectors, ) @@ -112,8 +114,8 @@ def MultiHashEmbed_v1( @registry.architectures.register("spacy.MultiHashEmbed.v2") def MultiHashEmbed( width: int, - rows: int, - attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]], + attrs: List[Union[str, int]], + rows: List[int], include_static_vectors: bool, ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical @@ -136,50 +138,38 @@ def MultiHashEmbed( The `rows` parameter controls the number of rows used by the `HashEmbed` tables. The HashEmbed layer needs surprisingly few rows, due to its use of the hashing trick. Generally between 2000 and 10000 rows is sufficient, - even for very large vocabularies. You can vary the number of rows per - attribute by specifying the attrs as a dict, mapping the keys to float - values which are interpreted as factors of `rows`. For instance, - attrs={"NORM": 1.0, PREFIX: 0.2} will use rows*1 for the NORM table and - rows*0.2 for the PREFIX table. If `attrs` is a list, factors of 1.0 are - assumed for all attributes. + even for very large vocabularies. A number of rows must be specified for each + table, so the `rows` list must be of the same length as the `attrs` parameter. width (int): The output width. Also used as the width of the embedding tables. Recommended values are between 64 and 300. - rows (int): The base number of rows for the embedding tables. Can be low, due - to the hashing trick. The rows can be varied per attribute by providing - a dictionary as the value of `attrs`. - attrs (dict or list of attr IDs): The token attributes to embed. A separate - embedding table will be constructed for each attribute. Attributes - can be specified as a list or as a dictionary, which lets you control - the number of rows used for each table. + attrs (list of attr IDs): The token attributes to embed. A separate + embedding table will be constructed for each attribute. + rows (List[int]): The number of rows in the embedding tables. Must have the + same length as attrs. include_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. """ - if isinstance(attrs, dict): - # Exclude tables that would have 0 rows. 
- attrs = {key: value for key, value in attrs.items() if value > 0.0} - indices = {attr: i for i, attr in enumerate(attrs)} seed = 7 - def make_hash_embed(feature): + def make_hash_embed(index): nonlocal seed - row_factor = attrs[feature] if isinstance(attrs, dict) else 1.0 seed += 1 return HashEmbed( width, - int(rows * row_factor), - column=indices[feature], + rows[index], + column=index, seed=seed, dropout=0.0, ) - embeddings = [make_hash_embed(attr) for attr in attrs] + embeddings = [make_hash_embed(i) for i in range(len(attrs))] concat_size = width * (len(embeddings) + include_static_vectors) if include_static_vectors: model = chain( concatenate( chain( - FeatureExtractor(list(attrs)), + FeatureExtractor(attrs), list2ragged(), with_array(concatenate(*embeddings)), ), diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index aa60faf5b..e86d97a54 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -24,7 +24,7 @@ def test_empty_doc(): tok2vec = build_Tok2Vec_model( MultiHashEmbed( width=width, - rows=embed_size, + rows=[embed_size, embed_size, embed_size, embed_size], include_static_vectors=False, attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"], ), @@ -44,7 +44,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): tok2vec = build_Tok2Vec_model( MultiHashEmbed( width=width, - rows=embed_size, + rows=[embed_size] * 4, include_static_vectors=False, attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"], ), @@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): @pytest.mark.parametrize( "width,embed_arch,embed_config,encode_arch,encode_config", [ - (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), + (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], @@ -116,11 +116,11 @@ cfg_string = """ @architectures = "spacy.Tok2Vec.v1" [components.tok2vec.model.embed] - @architectures = "spacy.MultiHashEmbed.v1" + @architectures = "spacy.MultiHashEmbed.v2" width = ${components.tok2vec.model.encode.width} - rows = 2000 - also_embed_subwords = true - also_use_static_vectors = false + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false [components.tok2vec.model.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 3bd3b903d..d621be0ba 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -63,7 +63,7 @@ def get_tok2vec_kwargs(): return { "embed": MultiHashEmbed( width=32, - rows=500, + rows=[500, 500, 500], attrs=["NORM", "PREFIX", "SHAPE"], include_static_vectors=False ), @@ -80,7 +80,7 @@ def 
test_tok2vec(): def test_multi_hash_embed(): embed = MultiHashEmbed( width=32, - rows=500, + rows=[500, 500, 500], attrs=["NORM", "PREFIX", "SHAPE"], include_static_vectors=False ) @@ -95,8 +95,8 @@ def test_multi_hash_embed(): # Now try with different row factors embed = MultiHashEmbed( width=32, - rows=500, - attrs={"NORM": 2.0, "PREFIX": 0.1, "SHAPE": 0.5}, + rows=[1000, 50, 250], + attrs=["NORM", "PREFIX", "SHAPE"], include_static_vectors=False ) hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] From cdd2b79b6d2a87db04f59d478dfa0fd8c2d3abdb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 19:58:18 +0200 Subject: [PATCH 36/55] Remove deprecated MultiHashEmbed --- spacy/ml/models/tok2vec.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index f0e846bac..3a7da4a8e 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -98,20 +98,6 @@ def build_Tok2Vec_model( @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed_v1( - width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool -) -> Model[List[Doc], List[Floats2d]]: - """Previous interface for MultiHashEmbed. This should be removed, it's only - here as a temporary compatibility.""" - return MultiHashEmbed( - width=width, - rows=[rows, rows//2, rows//2, rows//2] if also_embed_subwords else [rows], - attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM], - include_static_vectors=also_use_static_vectors, - ) - - -@registry.architectures.register("spacy.MultiHashEmbed.v2") def MultiHashEmbed( width: int, attrs: List[Union[str, int]], From db84d175c3e5d661f9358b6d8b85cd2fe9316392 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 19:59:30 +0200 Subject: [PATCH 37/55] Fix test --- spacy/tests/pipeline/test_tok2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index e86d97a54..90882ae3f 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -116,7 +116,7 @@ cfg_string = """ @architectures = "spacy.Tok2Vec.v1" [components.tok2vec.model.embed] - @architectures = "spacy.MultiHashEmbed.v2" + @architectures = "spacy.MultiHashEmbed.v1" width = ${components.tok2vec.model.encode.width} rows = [2000, 1000, 1000, 1000] attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] From d58fb4270748b9a4d96b077d69532af2ee7ded05 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 20:00:42 +0200 Subject: [PATCH 38/55] Add spacy_version option and validation for project.yml --- spacy/cli/_util.py | 21 ++++++++++++++++++++- spacy/schemas.py | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 676a7c8d7..373650172 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -16,7 +16,8 @@ import os from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import ENV_VARS +from ..util import is_compatible_version, ENV_VARS +from .. 
import about if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: msg.fail(invalid_err) print("\n".join(errors)) sys.exit(1) + validate_project_version(config) validate_project_commands(config) # Make sure directories defined in config exist for subdir in config.get("directories", []): @@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}): return dict(interpolated["project"]) +def validate_project_version(config: Dict[str, Any]) -> None: + """If the project defines a compatible spaCy version range, chec that it's + compatible with the current version of spaCy. + + config (Dict[str, Any]): The loaded config. + """ + spacy_version = config.get("spacy_version", None) + if spacy_version and not is_compatible_version(about.__version__, spacy_version): + err = ( + f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) " + f"that's not compatible with the version of spaCy you're running " + f"({about.__version__}). You can edit version requirement in the " + f"{PROJECT_FILE} to load it, but the project may not run as expected." + ) + msg.fail(err, exits=1) + + def validate_project_commands(config: Dict[str, Any]) -> None: """Check that project commands and workflows are valid, don't contain duplicates, don't clash and only refer to commands that exist. diff --git a/spacy/schemas.py b/spacy/schemas.py index 591b7e134..0d88d4090 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel): workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") title: Optional[str] = Field(None, title="Project title") + spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with") # fmt: on class Config: From 582701519eb8454b60b138559a1f5c9e6684fbef Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 20:00:49 +0200 Subject: [PATCH 39/55] Remove __release__ flag --- spacy/about.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index dce627a38..9329b48e6 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,7 +1,6 @@ # fmt: off __title__ = "spacy-nightly" __version__ = "3.0.0a33" -__release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From e50047f1c5e9949894bbba0a3183295fc79f2f2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 20:02:45 +0200 Subject: [PATCH 40/55] Check lengths match --- spacy/ml/models/tok2vec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 3a7da4a8e..65d2bffbb 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -136,6 +136,8 @@ def MultiHashEmbed( include_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. 
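For reference, a minimal sketch of the updated `MultiHashEmbed` call described in the docstring above (the attribute names, width and row counts below are illustrative, not part of the patch): `attrs` and `rows` must have the same length, otherwise the check added in this patch raises a `ValueError`.

```python
# Minimal sketch, assuming a spaCy build that includes the signature above.
from spacy.ml.models import MultiHashEmbed

# One row count per attribute: lengths match, so construction succeeds.
embed = MultiHashEmbed(
    width=96,
    attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
    rows=[5000, 2500, 2500, 2500],
    include_static_vectors=False,
)

# Passing four attrs with only two row counts would instead trigger the
# "Mismatched lengths" ValueError introduced by this patch.
```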
""" + if len(rows) != len(attrs): + raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}") seed = 7 def make_hash_embed(index): From be99f1e4de604417bcee07602ae08178a23f6ede Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 20:11:16 +0200 Subject: [PATCH 41/55] Remove output dirs before training (#6204) * Remove output dirs before training * Re-raise error if cleaning fails --- spacy/errors.py | 4 ++++ spacy/training/loop.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 9d9a716d2..bf3628ce9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,6 +456,10 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E901 = ("Failed to remove existing output directory: {path}. If your " + "config and the components you train change between runs, a " + "non-empty output directory can lead to stale pipeline data. To " + "solve this, remove the existing directories in the output directory.") E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " "Try checking whitespace and delimiters. See " "https://nightly.spacy.io/api/cli#convert") diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 0d4414964..67f61567e 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -3,19 +3,24 @@ from typing import Optional, TYPE_CHECKING from pathlib import Path from timeit import default_timer as timer from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator +from wasabi import Printer import random -import wasabi import sys +import shutil from .example import Example from ..schemas import ConfigSchemaTraining from ..errors import Errors -from ..util import resolve_dot_names, registry +from ..util import resolve_dot_names, registry, logger if TYPE_CHECKING: from ..language import Language # noqa: F401 +DIR_MODEL_BEST = "model-best" +DIR_MODEL_LAST = "model-last" + + def train( nlp: "Language", output_path: Optional[Path] = None, @@ -38,7 +43,7 @@ def train( RETURNS (Path / None): The path to the final exported model. """ # We use no_print here so we can respect the stdout/stderr options. - msg = wasabi.Printer(no_print=True) + msg = Printer(no_print=True) # Create iterator, which yields out info after each optimization step. 
config = nlp.config.interpolate() if config["training"]["seed"] is not None: @@ -69,6 +74,7 @@ def train( eval_frequency=T["eval_frequency"], exclude=frozen_components, ) + clean_output_dir(output_path) stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n") if frozen_components: stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n") @@ -83,7 +89,7 @@ def train( update_meta(T, nlp, info) with nlp.use_params(optimizer.averages): nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-best") + nlp.to_disk(output_path / DIR_MODEL_BEST) except Exception as e: if output_path is not None: # We don't want to swallow the traceback if we don't have a @@ -100,7 +106,7 @@ def train( finally: finalize_logger() if output_path is not None: - final_model_path = output_path / "model-last" + final_model_path = output_path / DIR_MODEL_LAST if optimizer.averages: with nlp.use_params(optimizer.averages): nlp.to_disk(final_model_path) @@ -305,3 +311,19 @@ def create_before_to_disk_callback( return modified_nlp return before_to_disk + + +def clean_output_dir(path: Union[str, Path]) -> None: + """Remove an existing output directory. Typically used to ensure that that + a directory like model-best and its contents aren't just being overwritten + by nlp.to_disk, which could preserve existing subdirectories (e.g. + components that don't exist anymore). + """ + if path is not None and path.exists(): + for subdir in [path / DIR_MODEL_BEST, path / DIR_MODEL_LAST]: + if subdir.exists(): + try: + shutil.rmtree(str(subdir)) + logger.debug(f"Removed existing output directory: {subdir}") + except Exception as e: + raise IOError(Errors.E901.format(path=path)) from e From b392d48e7667b95d820bf120dae4ab4a719af497 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 20:17:07 +0200 Subject: [PATCH 42/55] Fix test --- spacy/tests/test_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index d621be0ba..bad964786 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -6,7 +6,6 @@ from numpy.testing import assert_array_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder -from spacy.ml.models import MultiHashEmbed_v1 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES From 919790cb47b408c827e4cb40a1c6d3343fe0a28f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 20:28:21 +0200 Subject: [PATCH 43/55] Upd MultiHashEmbed docs --- website/docs/api/architectures.md | 51 +++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 5cee45ba5..cea390bb1 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -136,25 +136,50 @@ argument that connects to the shared `tok2vec` component in the pipeline. > [model] > @architectures = "spacy.MultiHashEmbed.v1" > width = 64 -> rows = 2000 -> also_embed_subwords = false -> also_use_static_vectors = false +> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +> rows = [2000, 1000, 1000, 1000] +> include_static_vectors = true > ``` Construct an embedding layer that separately embeds a number of lexical -attributes using hash embedding, concatenates the results, and passes it through -a feed-forward subnetwork to build mixed representations. 
The features used are -the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a -[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static -vectors can also be incorporated into the concatenated representation. +attributes using hash embedding, concatenates the results, and passes it +through a feed-forward subnetwork to build a mixed representations. + +The features used can be configured with the 'attrs' argument. The suggested +attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into +account some subword information, without contruction a fully character-based +representation. If pretrained vectors are available, they can be included in +the representation as well, with the vectors table will be kept static +(i.e. it's not updated). + +The `width` parameter specifices the output width of the layer and the widths +of all embedding tables. If static vectors are included, a learned linear +layer is used to map the vectors to the specified width before concatenating +it with the other embedding outputs. A single Maxout layer is then used to +reduce the concatenated vectors to the final width. + +The `rows` parameter controls the number of rows used by the `HashEmbed` +tables. The HashEmbed layer needs surprisingly few rows, due to its use of +the hashing trick. Generally between 2000 and 10000 rows is sufficient, +even for very large vocabularies. A number of rows must be specified for each +table, so the `rows` list must be of the same length as the `attrs` parameter. + + attrs (list of attr IDs): The token attributes to embed. A separate + embedding table will be constructed for each attribute. + rows (List[int]): The number of rows in the embedding tables. Must have the + same length as attrs. + include_static_vectors (bool): Whether to also use static word vectors. + Requires a vectors table to be loaded in the Doc objects' vocab. + | Name | Description | | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ | -| `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ | -| `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ | -| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | +| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ | +| `attrs` | The token attributes to embed. A separate | +embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ | +| `rows` | The number of rows for each embedding tables. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. 
~~List[int]~~ | +| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.CharacterEmbed.v1 {#CharacterEmbed} From 0135f6ed95de6cc2bd7639f491f7a43c4e693116 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 20:51:15 +0200 Subject: [PATCH 44/55] Enable commit check via env var --- spacy/cli/project/remote_storage.py | 6 ++++-- spacy/cli/project/run.py | 6 ++++-- spacy/util.py | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py index 7e2caa8d7..6056458e2 100644 --- a/spacy/cli/project/remote_storage.py +++ b/spacy/cli/project/remote_storage.py @@ -7,7 +7,8 @@ import tarfile from pathlib import Path from .._util import get_hash, get_checksum, download_file, ensure_pathy -from ...util import make_tempdir, get_minor_version +from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var +from ...git_info import GIT_VERSION from ... import about if TYPE_CHECKING: @@ -130,7 +131,8 @@ def get_command_hash( currently installed packages, whatever environment variables have been marked as relevant, and the command. """ - spacy_v = get_minor_version(about.__version__) + check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) + spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__) dep_checksums = [get_checksum(dep) for dep in sorted(deps)] hashes = [spacy_v, site_hash, env_hash] + dep_checksums hashes.extend(cmd) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index ea4675d60..1a9b447ea 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -7,7 +7,8 @@ import srsly from ... import about from ...git_info import GIT_VERSION from ...util import working_dir, run_command, split_command, is_cwd, join_command -from ...util import SimpleFrozenList, is_minor_version_match +from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS +from ...util import check_bool_env_var from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import get_checksum, project_cli, Arg, Opt, COMMAND @@ -64,9 +65,10 @@ def project_run( err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" err_kwargs = {"exits": 1} if not dry else {} msg.fail(err, err_help, **err_kwargs) + check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) with working_dir(project_dir) as current_dir: msg.divider(subcommand) - rerun = check_rerun(current_dir, cmd) + rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit) if not rerun and not force: msg.info(f"Skipping '{cmd['name']}': nothing changed") else: diff --git a/spacy/util.py b/spacy/util.py index 4b2cb018a..aa321b22f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -73,6 +73,7 @@ logger = logging.getLogger("spacy") class ENV_VARS: CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES" + PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION" class registry(thinc.registry): @@ -1342,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool: cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] return hasattr(cls_func, attr) return False + + +def check_bool_env_var(env_var: str) -> bool: + """Convert the value of an environment variable to a boolean. 
Add special + check for "0" (falsy) and consider everything else truthy, except unset. + + env_var (str): The name of the environment variable to check. + RETURNS (bool): Its boolean value. + """ + value = os.environ.get(env_var, False) + if value == "0": + return False + return bool(value) From 706b7f6973e4f62622bf96370016b25878ec950f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 20:51:22 +0200 Subject: [PATCH 45/55] Update docs --- website/docs/usage/projects.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 6d5746308..5fced922d 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -216,15 +216,16 @@ pipelines. %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` -| Section | Description | -| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | -| `description` | An optional project description used in [auto-generated docs](#custom-docs). | -| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | -| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | -| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | -| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | -| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. 
| +| Section | Description | +| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | +| `description` | An optional project description used in [auto-generated docs](#custom-docs). | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | +| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | +| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | +| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. | +| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. 
| ### Data assets {#data-assets} From 9aa07ad0018cb1e912aeeb97b9a0bde0ead7edfb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 21:05:41 +0200 Subject: [PATCH 46/55] Update quickstarts [ci skip] --- website/src/styles/quickstart.module.sass | 11 ++++-- website/src/widgets/quickstart-install.js | 47 +++++++++++++++++++---- website/src/widgets/quickstart-models.js | 22 +++++++---- 3 files changed, 62 insertions(+), 18 deletions(-) diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass index a08d6bcb6..8ad106a78 100644 --- a/website/src/styles/quickstart.module.sass +++ b/website/src/styles/quickstart.module.sass @@ -38,7 +38,7 @@ cursor: pointer display: inline-block padding: 0.35rem 0.5rem 0.25rem 0 - margin: 0 1rem 0.75rem 0 + margin: 0 1rem 0.5rem 0 font-size: var(--font-size-xs) font-weight: bold @@ -73,16 +73,19 @@ background: var(--color-theme) .checkbox + &:before + $size: 18px content: "" display: inline-block - width: 20px - height: 20px + width: $size + height: $size border: 1px solid var(--color-subtle) vertical-align: middle margin-right: 0.5rem cursor: pointer - border-radius: var(--border-radius) + border-radius: $size / 4 background: var(--color-back) + position: relative + top: -1px .checkbox:checked + &:before // Embed "check" icon here for simplicity diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 741973945..a8bdf21dc 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -4,6 +4,8 @@ import { StaticQuery, graphql } from 'gatsby' import { Quickstart, QS } from '../components/quickstart' import { repo } from '../components/util' +const DEFAULT_MODELS = ['en'] +const DEFAULT_OPT = 'efficiency' const DEFAULT_HARDWARE = 'cpu' const DEFAULT_CUDA = 'cuda100' const CUDA = { @@ -68,9 +70,13 @@ const QuickstartInstall = ({ id, title }) => { const [train, setTrain] = useState(false) const [hardware, setHardware] = useState(DEFAULT_HARDWARE) const [cuda, setCuda] = useState(DEFAULT_CUDA) + const [selectedModels, setModels] = useState(DEFAULT_MODELS) + const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency') const setters = { hardware: v => (Array.isArray(v) ? 
setHardware(v[0]) : setCuda(v)), config: v => setTrain(v.includes('train')), + models: setModels, + optimize: v => setEfficiency(v.includes('efficiency')), } const showDropdown = { hardware: () => hardware === 'gpu', @@ -89,13 +95,37 @@ const QuickstartInstall = ({ id, title }) => { ...DATA, { id: 'models', - title: 'Trained Pipelines', + title: 'Trained pipelines', multiple: true, options: models .sort((a, b) => a.name.localeCompare(b.name)) - .map(({ code, name }) => ({ id: code, title: name })), + .map(({ code, name }) => ({ + id: code, + title: name, + checked: DEFAULT_MODELS.includes(code), + })), }, ] + if (selectedModels.length) { + data.push({ + id: 'optimize', + title: 'Select pipeline for', + options: [ + { + id: 'efficiency', + title: 'efficiency', + checked: DEFAULT_OPT === 'efficiency', + help: 'Faster and smaller pipeline, but less accurate', + }, + { + id: 'accuracy', + title: 'accuracy', + checked: DEFAULT_OPT === 'accuracy', + help: 'Larger and slower pipeline, but more accurate', + }, + ], + }) + } return ( { conda install -c conda-forge spacy-lookups-data - {models.map(({ code, models: modelOptions }) => ( - - python -m spacy download {modelOptions[0]} - - ))} + {models.map(({ code, models: modelOptions }) => { + const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1] + return ( + + python -m spacy download {pkg} + + ) + })} ) }} diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js index ffd1b3df9..5f94c60cb 100644 --- a/website/src/widgets/quickstart-models.js +++ b/website/src/widgets/quickstart-models.js @@ -31,25 +31,33 @@ const data = [ }, { id: 'optimize', - title: 'Optimize for', - help: - 'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)', + title: 'Select for', options: [ - { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' }, - { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' }, + { + id: 'efficiency', + title: 'efficiency', + checked: DEFAULT_OPT === 'efficiency', + help: 'Faster and smaller pipeline, but less accurate', + }, + { + id: 'accuracy', + title: 'accuracy', + checked: DEFAULT_OPT === 'accuracy', + help: 'Larger and slower pipeline, but more accurate', + }, ], }, { id: 'config', title: 'Options', multiple: true, - options: [{ id: 'example', title: 'Show usage example' }], + options: [{ id: 'example', title: 'Show text example' }], }, ] const QuickstartInstall = ({ id, title, description, children }) => { const [lang, setLang] = useState(DEFAULT_LANG) - const [efficiency, setEfficiency] = useState(DEFAULT_OPT) + const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency') const setters = { lang: setLang, optimize: v => setEfficiency(v.includes('efficiency')), From 91d0fbb58821fcecf4b4af3d2bb32d12b490c565 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 21:13:53 +0200 Subject: [PATCH 47/55] Fix test --- spacy/tests/serialize/test_serialize_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index da048f3d6..8b3f5c2b8 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -89,9 +89,9 @@ def my_parser(): tok2vec = build_Tok2Vec_model( MultiHashEmbed( width=321, - rows=5432, - also_embed_subwords=True, - also_use_static_vectors=False, + attrs=["LOWER", "SHAPE"], + rows=[5432, 5432], + 
include_static_vectors=False, ), MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) From ff8b9807750e045f40c9a40208eba8c575c714cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 21:19:41 +0200 Subject: [PATCH 48/55] Upd quickstart template --- spacy/cli/templates/quickstart_training.jinja | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 3bd237b0a..c3419e67d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -171,8 +171,13 @@ factory = "tok2vec" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v1" width = ${components.tok2vec.model.encode.width} -rows = {{ 2000 if optimize == "efficiency" else 7000 }} -also_embed_subwords = {{ "true" if has_letters else "false" }} +{% if has_letters -%} +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +rows = [5000, 2500, 2500, 2500] +{% else -%} +attrs = ["ORTH", "SHAPE"] +rows = [5000, 2500] +{% endif -%} also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} [components.tok2vec.model.encode] From b7e01d20246efbeeb1c6f9babbb08ac965a45582 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 21:21:30 +0200 Subject: [PATCH 49/55] Fix quickstart --- spacy/cli/templates/quickstart_training.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index c3419e67d..d92de9c15 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -178,7 +178,7 @@ rows = [5000, 2500, 2500, 2500] attrs = ["ORTH", "SHAPE"] rows = [5000, 2500] {% endif -%} -also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} +include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} [components.tok2vec.model.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" From 4cf73d85bc86e2b31a517437ef68ed8dd87f5038 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 21:37:09 +0200 Subject: [PATCH 50/55] Add [zh] to extras [ci skip] --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index d8362c4bd..e77bda2fc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -92,6 +92,8 @@ ko = natto-py==0.9.0 th = pythainlp>=2.0 +zh = + spacy-pkuseg==0.0.26 [bdist_wheel] universal = false From 2d0c0134bcaa2527a40d13e62be594bf05ac389b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 21:38:23 +0200 Subject: [PATCH 51/55] Adjust message [ci skip] --- spacy/lang/zh/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index f9065f92c..ed988c1ba 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -206,7 +206,7 @@ class ChineseTokenizer(DummyTokenizer): import spacy_pkuseg except ImportError: raise ImportError( - "spacy_pkuseg not installed. To use this model, " + "spacy-pkuseg not installed. To use this model, " + _PKUSEG_INSTALL_MSG ) from None self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir)) @@ -256,7 +256,7 @@ class ChineseTokenizer(DummyTokenizer): except ImportError: if self.segmenter == Segmenter.pkuseg: raise ImportError( - "spacy_pkuseg not installed. To use this model, " + "spacy-pkuseg not installed. 
To use this model, " + _PKUSEG_INSTALL_MSG ) from None if path.exists(): @@ -317,7 +317,7 @@ def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None: import spacy_pkuseg except ImportError: - msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG + msg = "spacy-pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG raise ImportError(msg) from None try: return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) From 8a39d5414e536d3ff5c3cde1fae71f604d1b3762 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 21:43:51 +0200 Subject: [PATCH 52/55] Update quickstart [ci skip] --- website/src/widgets/quickstart-install.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index a8bdf21dc..ab91b8e30 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -17,6 +17,7 @@ const CUDA = { '10.1': 'cuda101', '10.2': 'cuda102', } +const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models const DATA = [ { id: 'os', @@ -81,7 +82,13 @@ const QuickstartInstall = ({ id, title }) => { const showDropdown = { hardware: () => hardware === 'gpu', } - const pipExtras = [hardware === 'gpu' && cuda, train && 'transformers', train && 'lookups'] + const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : [] + const pipExtras = [ + hardware === 'gpu' && cuda, + train && 'transformers', + train && 'lookups', + ...modelExtras, + ] .filter(e => e) .join(',') return ( From 9614e53b02749e8fec394c0f8a7f965a392918d2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 21:55:18 +0200 Subject: [PATCH 53/55] Tidy up and auto-format --- spacy/ml/models/tok2vec.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 65d2bffbb..61edb86c4 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Union, Dict +from typing import Optional, List, Union from thinc.types import Floats2d from thinc.api import chain, clone, concatenate, with_array, with_padded from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed @@ -11,7 +11,7 @@ from ...ml import _character_embed from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor from ...pipeline.tok2vec import Tok2VecListener -from ...attrs import ORTH, NORM, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr +from ...attrs import intify_attr @registry.architectures.register("spacy.Tok2VecListener.v1") @@ -29,7 +29,7 @@ def build_hash_embed_cnn_tok2vec( window_size: int, maxout_pieces: int, subword_features: bool, - pretrained_vectors: Optional[bool] + pretrained_vectors: Optional[bool], ) -> Model[List[Doc], List[Floats2d]]: """Build spaCy's 'standard' tok2vec layer, which uses hash embedding with subword features and a CNN with layer-normalized maxout. @@ -56,7 +56,7 @@ def build_hash_embed_cnn_tok2vec( """ if subword_features: attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] - row_sizes = [embed_size, embed_size//2, embed_size//2, embed_size//2] + row_sizes = [embed_size, embed_size // 2, embed_size // 2, embed_size // 2] else: attrs = ["NORM"] row_sizes = [embed_size] @@ -120,7 +120,7 @@ def MultiHashEmbed( layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. 
A single Maxout layer is then used to reduce the concatenated vectors to the final width. - + The `rows` parameter controls the number of rows used by the `HashEmbed` tables. The HashEmbed layer needs surprisingly few rows, due to its use of the hashing trick. Generally between 2000 and 10000 rows is sufficient, @@ -143,13 +143,7 @@ def MultiHashEmbed( def make_hash_embed(index): nonlocal seed seed += 1 - return HashEmbed( - width, - rows[index], - column=index, - seed=seed, - dropout=0.0, - ) + return HashEmbed(width, rows[index], column=index, seed=seed, dropout=0.0) embeddings = [make_hash_embed(i) for i in range(len(attrs))] concat_size = width * (len(embeddings) + include_static_vectors) From 1a554bdcb14f7409bf4111092962b0f9ba0000c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 21:55:27 +0200 Subject: [PATCH 54/55] Update docs and docstring [ci skip] --- spacy/ml/models/tok2vec.py | 4 +-- website/docs/api/architectures.md | 52 +++++++++---------------------- 2 files changed, 17 insertions(+), 39 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 61edb86c4..23cfe883b 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -110,12 +110,12 @@ def MultiHashEmbed( The features used can be configured with the 'attrs' argument. The suggested attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into - account some subword information, without contruction a fully character-based + account some subword information, without construction a fully character-based representation. If pretrained vectors are available, they can be included in the representation as well, with the vectors table will be kept static (i.e. it's not updated). - The `width` parameter specifices the output width of the layer and the widths + The `width` parameter specifies the output width of the layer and the widths of all embedding tables. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single Maxout layer is then used to diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index cea390bb1..5246a3ed6 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -142,44 +142,22 @@ argument that connects to the shared `tok2vec` component in the pipeline. > ``` Construct an embedding layer that separately embeds a number of lexical -attributes using hash embedding, concatenates the results, and passes it -through a feed-forward subnetwork to build a mixed representations. +attributes using hash embedding, concatenates the results, and passes it through +a feed-forward subnetwork to build a mixed representations. The features used +can be configured with the `attrs` argument. The suggested attributes are +`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account +some subword information, without construction a fully character-based +representation. If pretrained vectors are available, they can be included in the +representation as well, with the vectors table will be kept static (i.e. it's +not updated). -The features used can be configured with the 'attrs' argument. The suggested -attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into -account some subword information, without contruction a fully character-based -representation. 
If pretrained vectors are available, they can be included in -the representation as well, with the vectors table will be kept static -(i.e. it's not updated). - -The `width` parameter specifices the output width of the layer and the widths -of all embedding tables. If static vectors are included, a learned linear -layer is used to map the vectors to the specified width before concatenating -it with the other embedding outputs. A single Maxout layer is then used to -reduce the concatenated vectors to the final width. - -The `rows` parameter controls the number of rows used by the `HashEmbed` -tables. The HashEmbed layer needs surprisingly few rows, due to its use of -the hashing trick. Generally between 2000 and 10000 rows is sufficient, -even for very large vocabularies. A number of rows must be specified for each -table, so the `rows` list must be of the same length as the `attrs` parameter. - - attrs (list of attr IDs): The token attributes to embed. A separate - embedding table will be constructed for each attribute. - rows (List[int]): The number of rows in the embedding tables. Must have the - same length as attrs. - include_static_vectors (bool): Whether to also use static word vectors. - Requires a vectors table to be loaded in the Doc objects' vocab. - - -| Name | Description | -| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ | -| `attrs` | The token attributes to embed. A separate | -embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ | -| `rows` | The number of rows for each embedding tables. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. ~~List[int]~~ | -| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | +| Name | Description | +| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ | +| `attrs` | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ | +| `rows` | The number of rows for each embedding tables. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. The layer needs surprisingly few rows, due to its use of the hashing trick. 
Generally between 2000 and 10000 rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ | +| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.CharacterEmbed.v1 {#CharacterEmbed} From 126268ce50d08d38aefa15e7925632c156c792d4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 21:58:18 +0200 Subject: [PATCH 55/55] Auto-format [ci skip] --- spacy/lang/uk/__init__.py | 4 +++- spacy/lang/zh/__init__.py | 6 ++---- spacy/tests/doc/test_retokenize_split.py | 10 +++++++--- spacy/tests/pipeline/test_pipe_methods.py | 2 +- spacy/tests/test_models.py | 8 ++++---- spacy/training/augment.py | 2 +- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 0abe9170e..24c88e5a7 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -26,7 +26,9 @@ class Ukrainian(Language): default_config={"model": None, "mode": "pymorphy2"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False,): +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False +): return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index ed988c1ba..30560ed0d 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -54,9 +54,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char): class ChineseTokenizer(DummyTokenizer): - def __init__( - self, nlp: Language, segmenter: Segmenter = Segmenter.char, - ): + def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char): self.vocab = nlp.vocab if isinstance(segmenter, Segmenter): segmenter = segmenter.value @@ -87,7 +85,7 @@ class ChineseTokenizer(DummyTokenizer): if pkuseg_user_dict is None: pkuseg_user_dict = pkuseg_model self.pkuseg_seg = try_pkuseg_import( - pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict, + pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict ) def __call__(self, text: str) -> Doc: diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index da4a46a47..30f945165 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -209,9 +209,13 @@ def test_doc_retokenizer_split_norm(en_vocab): # Retokenize to split out the words in the token at doc[2]. 
token = doc[2] with doc.retokenize() as retokenizer: - retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)]) + retokenizer.split( + token, + ["brown", "fox", "jumps", "over", "the"], + heads=[(token, idx) for idx in range(5)], + ) - assert doc[9].text == "w/" + assert doc[9].text == "w/" assert doc[9].norm_ == "with" - assert doc[5].text == "over" + assert doc[5].text == "over" assert doc[5].norm_ == "over" diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index a4297a1d1..4b96992e1 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -350,7 +350,7 @@ def test_pipe_methods_frozen(): @pytest.mark.parametrize( - "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"], + "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"] ) def test_pipe_label_data_exports_labels(pipe): nlp = Language() diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index bad964786..17408f7e8 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -64,7 +64,7 @@ def get_tok2vec_kwargs(): width=32, rows=[500, 500, 500], attrs=["NORM", "PREFIX", "SHAPE"], - include_static_vectors=False + include_static_vectors=False, ), "encode": MaxoutWindowEncoder( width=32, depth=2, maxout_pieces=2, window_size=1 @@ -81,7 +81,7 @@ def test_multi_hash_embed(): width=32, rows=[500, 500, 500], attrs=["NORM", "PREFIX", "SHAPE"], - include_static_vectors=False + include_static_vectors=False, ) hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] assert len(hash_embeds) == 3 @@ -96,11 +96,11 @@ def test_multi_hash_embed(): width=32, rows=[1000, 50, 250], attrs=["NORM", "PREFIX", "SHAPE"], - include_static_vectors=False + include_static_vectors=False, ) hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250] - + @pytest.mark.parametrize( "seed,model_func,kwargs", diff --git a/spacy/training/augment.py b/spacy/training/augment.py index e76ee49f7..13ae45bd2 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -64,7 +64,7 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]: def lower_casing_augmenter( - nlp: "Language", example: Example, *, level: float, + nlp: "Language", example: Example, *, level: float ) -> Iterator[Example]: if random.random() >= level: yield example
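
As a quick sketch of the `MultiHashEmbed` layer described in the `tok2vec.py` docstring and `architectures.md` changes above: the snippet below builds the layer, confirms that each attribute gets its own `HashEmbed` table with the requested number of rows, and runs it over a `Doc`. The parameter values and the example sentence are arbitrary, adapted from the `spacy/tests/test_models.py` assertions in this series rather than taken from the patches themselves.

```python
import spacy
from spacy.ml.models.tok2vec import MultiHashEmbed

# Build the embedding layer: one HashEmbed table per attribute, concatenated
# and mixed down to `width` by a single Maxout layer.
embed = MultiHashEmbed(
    width=32,
    rows=[1000, 50, 250],
    attrs=["NORM", "PREFIX", "SHAPE"],
    include_static_vectors=False,
)

# Each attribute gets its own table with the requested number of rows
# (this mirrors the assertions in spacy/tests/test_models.py above).
hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]

# Running the layer over a Doc should yield one (n_tokens, width) array per doc.
nlp = spacy.blank("en")
doc = nlp("The quick brown fox")
embed.initialize()
assert embed.predict([doc])[0].shape == (4, 32)
```

In practice the documentation table above recommends larger settings, roughly a width between 64 and 300 and between 1000 and 10000 rows per table; the tables can stay that small even for large vocabularies because of the hashing trick.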