From 59b0b26cc6a973e528bd190e8f19397ceab81ee6 Mon Sep 17 00:00:00 2001 From: richardpaulhudson Date: Fri, 9 Dec 2022 21:06:16 +0100 Subject: [PATCH] Changes based on review comments --- spacy/ml/models/lemmatizer.py | 5 ++++- spacy/pipeline/edit_tree_lemmatizer.py | 13 ++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/lemmatizer.py b/spacy/ml/models/lemmatizer.py index c42e0756e..f537983b2 100644 --- a/spacy/ml/models/lemmatizer.py +++ b/spacy/ml/models/lemmatizer.py @@ -13,6 +13,7 @@ def build_lemmatizer_model( nO: Optional[int] = None, normalize=False, lowercasing=True, + lowercasing_relu_width: Optional[int] = 50, ) -> Model[List[Doc], Union[List[Floats2d]]]: """Build a model for the edit-tree lemmatizer, using a provided token-to-vector component. A linear layer with softmax activation is added to predict scores @@ -38,7 +39,9 @@ def build_lemmatizer_model( model = tok2vec >> with_array(softmax) if lowercasing: lowercasing_output = Sigmoid(1) - sigmoid_appendage = Relu(50) >> Dropout(0.2) >> lowercasing_output + sigmoid_appendage = ( + Relu(lowercasing_relu_width) >> Dropout() >> lowercasing_output + ) model |= tok2vec >> with_array(sigmoid_appendage) model.set_ref("lowercasing_output", lowercasing_output) return model diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index b19511790..58fc1ad87 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -25,6 +25,7 @@ default_model_config = """ [model] @architectures = "spacy.Lemmatizer.v1" lowercasing = true +lowercasing_relu_width = 50 [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" @@ -204,7 +205,7 @@ class EditTreeLemmatizer(TrainablePipe): self, docs, scores, lowercasing_flags: Optional[List[Floats2d]] ): guesses = [] - for i, (doc, doc_scores) in enumerate(zip(docs, scores)): + for (i, doc, doc_scores) in zip(range(len(docs)), docs, scores): if self.top_k == 1: doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1) else: @@ -214,11 +215,13 @@ class EditTreeLemmatizer(TrainablePipe): doc_guesses = doc_guesses.get() doc_compat_guesses = [] - for j, (token, candidates) in enumerate(zip(doc, doc_guesses)): - to_lowercase = False + for (j, token, candidates) in zip(range(len(doc)), doc, doc_guesses): if lowercasing_flags is not None and lowercasing_flags[i][j] > 0.5: - to_lowercase = True - text = token.lower_ if to_lowercase else token.text + to_lowercase = 1 + text = token.lower_ + else: + to_lowercase = 0 + text = token.text tree_id = -1 for candidate in candidates: candidate_tree_id = self.cfg["labels"][candidate]