Changes based on review comments

This commit is contained in:
richardpaulhudson 2022-12-09 21:06:16 +01:00
parent 449559cc2d
commit 59b0b26cc6
2 changed files with 12 additions and 6 deletions

View File

@ -13,6 +13,7 @@ def build_lemmatizer_model(
nO: Optional[int] = None, nO: Optional[int] = None,
normalize=False, normalize=False,
lowercasing=True, lowercasing=True,
lowercasing_relu_width: Optional[int] = 50,
) -> Model[List[Doc], Union[List[Floats2d]]]: ) -> Model[List[Doc], Union[List[Floats2d]]]:
"""Build a model for the edit-tree lemmatizer, using a provided token-to-vector component. """Build a model for the edit-tree lemmatizer, using a provided token-to-vector component.
A linear layer with softmax activation is added to predict scores A linear layer with softmax activation is added to predict scores
@ -38,7 +39,9 @@ def build_lemmatizer_model(
model = tok2vec >> with_array(softmax) model = tok2vec >> with_array(softmax)
if lowercasing: if lowercasing:
lowercasing_output = Sigmoid(1) lowercasing_output = Sigmoid(1)
sigmoid_appendage = Relu(50) >> Dropout(0.2) >> lowercasing_output sigmoid_appendage = (
Relu(lowercasing_relu_width) >> Dropout() >> lowercasing_output
)
model |= tok2vec >> with_array(sigmoid_appendage) model |= tok2vec >> with_array(sigmoid_appendage)
model.set_ref("lowercasing_output", lowercasing_output) model.set_ref("lowercasing_output", lowercasing_output)
return model return model

View File

@ -25,6 +25,7 @@ default_model_config = """
[model] [model]
@architectures = "spacy.Lemmatizer.v1" @architectures = "spacy.Lemmatizer.v1"
lowercasing = true lowercasing = true
lowercasing_relu_width = 50
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2" @architectures = "spacy.HashEmbedCNN.v2"
@ -204,7 +205,7 @@ class EditTreeLemmatizer(TrainablePipe):
self, docs, scores, lowercasing_flags: Optional[List[Floats2d]] self, docs, scores, lowercasing_flags: Optional[List[Floats2d]]
): ):
guesses = [] guesses = []
for i, (doc, doc_scores) in enumerate(zip(docs, scores)): for (i, doc, doc_scores) in zip(range(len(docs)), docs, scores):
if self.top_k == 1: if self.top_k == 1:
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1) doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
else: else:
@ -214,11 +215,13 @@ class EditTreeLemmatizer(TrainablePipe):
doc_guesses = doc_guesses.get() doc_guesses = doc_guesses.get()
doc_compat_guesses = [] doc_compat_guesses = []
for j, (token, candidates) in enumerate(zip(doc, doc_guesses)): for (j, token, candidates) in zip(range(len(doc)), doc, doc_guesses):
to_lowercase = False
if lowercasing_flags is not None and lowercasing_flags[i][j] > 0.5: if lowercasing_flags is not None and lowercasing_flags[i][j] > 0.5:
to_lowercase = True to_lowercase = 1
text = token.lower_ if to_lowercase else token.text text = token.lower_
else:
to_lowercase = 0
text = token.text
tree_id = -1 tree_id = -1
for candidate in candidates: for candidate in candidates:
candidate_tree_id = self.cfg["labels"][candidate] candidate_tree_id = self.cfg["labels"][candidate]