Tidy up and auto-format

This commit is contained in:
Ines Montani 2019-03-08 13:28:53 +01:00
parent d260aa17fd
commit ad834be494
4 changed files with 61 additions and 58 deletions

View File

@ -465,17 +465,16 @@ def getitem(i):
@describe.attributes( @describe.attributes(
W=Synapses("Weights matrix", W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
lambda obj: (obj.nO, obj.nI),
lambda W, ops: None)
) )
class MultiSoftmax(Affine): class MultiSoftmax(Affine):
'''Neural network layer that predicts several multi-class attributes at once. """Neural network layer that predicts several multi-class attributes at once.
For instance, we might predict one class with 6 variables, and another with 5. For instance, we might predict one class with 6 variables, and another with 5.
We predict the 11 neurons required for this, and then softmax them such We predict the 11 neurons required for this, and then softmax them such
that columns 0-6 make a probability distribution and columns 6-11 make another. that columns 0-6 make a probability distribution and columns 6-11 make another.
''' """
name = 'multisoftmax'
name = "multisoftmax"
def __init__(self, out_sizes, nI=None, **kwargs): def __init__(self, out_sizes, nI=None, **kwargs):
Model.__init__(self, **kwargs) Model.__init__(self, **kwargs)
@ -487,12 +486,13 @@ class MultiSoftmax(Affine):
output__BO = self.ops.affine(self.W, self.b, input__BI) output__BO = self.ops.affine(self.W, self.b, input__BI)
i = 0 i = 0
for out_size in self.out_sizes: for out_size in self.out_sizes:
self.ops.softmax(output__BO[:, i : i+out_size], inplace=True) self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
i += out_size i += out_size
return output__BO return output__BO
def begin_update(self, input__BI, drop=0.): def begin_update(self, input__BI, drop=0.0):
output__BO = self.predict(input__BI) output__BO = self.predict(input__BI)
def finish_update(grad__BO, sgd=None): def finish_update(grad__BO, sgd=None):
self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True) self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
self.d_b += grad__BO.sum(axis=0) self.d_b += grad__BO.sum(axis=0)
@ -500,6 +500,7 @@ class MultiSoftmax(Affine):
if sgd is not None: if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id) sgd(self._mem.weights, self._mem.gradient, key=self.id)
return grad__BI return grad__BI
return output__BO, finish_update return output__BO, finish_update
@ -515,41 +516,41 @@ def build_tagger_model(nr_class, **cfg):
if "tok2vec" in cfg: if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"] tok2vec = cfg["tok2vec"]
else: else:
tok2vec = Tok2Vec(token_vector_width, embed_size, tok2vec = Tok2Vec(
token_vector_width,
embed_size,
subword_features=subword_features, subword_features=subword_features,
pretrained_vectors=pretrained_vectors) pretrained_vectors=pretrained_vectors,
softmax = with_flatten(
Softmax(nr_class, token_vector_width))
model = (
tok2vec
>> softmax
) )
softmax = with_flatten(Softmax(nr_class, token_vector_width))
model = tok2vec >> softmax
model.nI = None model.nI = None
model.tok2vec = tok2vec model.tok2vec = tok2vec
model.softmax = softmax model.softmax = softmax
return model return model
def build_morphologizer_model(class_nums, **cfg): def build_morphologizer_model(class_nums, **cfg):
embed_size = util.env_opt('embed_size', 7000) embed_size = util.env_opt("embed_size", 7000)
if 'token_vector_width' in cfg: if "token_vector_width" in cfg:
token_vector_width = cfg['token_vector_width'] token_vector_width = cfg["token_vector_width"]
else: else:
token_vector_width = util.env_opt('token_vector_width', 128) token_vector_width = util.env_opt("token_vector_width", 128)
pretrained_vectors = cfg.get('pretrained_vectors') pretrained_vectors = cfg.get("pretrained_vectors")
subword_features = cfg.get('subword_features', True) subword_features = cfg.get("subword_features", True)
with Model.define_operators({'>>': chain, '+': add}): with Model.define_operators({">>": chain, "+": add}):
if 'tok2vec' in cfg: if "tok2vec" in cfg:
tok2vec = cfg['tok2vec'] tok2vec = cfg["tok2vec"]
else: else:
tok2vec = Tok2Vec(token_vector_width, embed_size, tok2vec = Tok2Vec(
token_vector_width,
embed_size,
subword_features=subword_features, subword_features=subword_features,
pretrained_vectors=pretrained_vectors) pretrained_vectors=pretrained_vectors,
)
softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
softmax.out_sizes = class_nums softmax.out_sizes = class_nums
model = ( model = tok2vec >> softmax
tok2vec
>> softmax
)
model.nI = None model.nI = None
model.tok2vec = tok2vec model.tok2vec = tok2vec
model.softmax = softmax model.softmax = softmax
@ -630,17 +631,13 @@ def build_text_classifier(nr_class, width=64, **cfg):
) )
linear_model = _preprocess_doc >> LinearModel(nr_class) linear_model = _preprocess_doc >> LinearModel(nr_class)
if cfg.get('exclusive_classes'): if cfg.get("exclusive_classes"):
output_layer = Softmax(nr_class, nr_class * 2) output_layer = Softmax(nr_class, nr_class * 2)
else: else:
output_layer = ( output_layer = (
zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
>> logistic
)
model = (
(linear_model | cnn_model)
>> output_layer
) )
model = (linear_model | cnn_model) >> output_layer
model.tok2vec = chain(tok2vec, flatten) model.tok2vec = chain(tok2vec, flatten)
model.nO = nr_class model.nO = nr_class
model.lsuv = False model.lsuv = False
@ -658,7 +655,9 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
if exclusive_classes: if exclusive_classes:
output_layer = Softmax(nr_class, tok2vec.nO) output_layer = Softmax(nr_class, tok2vec.nO)
else: else:
output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic output_layer = (
zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
)
model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
model.tok2vec = chain(tok2vec, flatten) model.tok2vec = chain(tok2vec, flatten)
model.nO = nr_class model.nO = nr_class

View File

@ -350,7 +350,6 @@ class Errors(object):
"is likely a bug in spaCy.") "is likely a bug in spaCy.")
@add_codes @add_codes
class TempErrors(object): class TempErrors(object):
T003 = ("Resizing pre-trained Tagger models is not currently supported.") T003 = ("Resizing pre-trained Tagger models is not currently supported.")

View File

@ -2,11 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
import numpy
from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
from spacy.symbols import VERB
from spacy.vocab import Vocab
from spacy.tokens import Doc
@pytest.fixture @pytest.fixture
def i_has(en_tokenizer): def i_has(en_tokenizer):
@ -15,11 +11,13 @@ def i_has(en_tokenizer):
doc[1].tag_ = "VBZ" doc[1].tag_ = "VBZ"
return doc return doc
def test_token_morph_id(i_has): def test_token_morph_id(i_has):
assert i_has[0].morph.id assert i_has[0].morph.id
assert i_has[1].morph.id != 0 assert i_has[1].morph.id != 0
assert i_has[0].morph.id != i_has[1].morph.id assert i_has[0].morph.id != i_has[1].morph.id
def test_morph_props(i_has): def test_morph_props(i_has):
assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"] assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"]
assert i_has[0].morph.pron_type_ == "PronType_prs" assert i_has[0].morph.pron_type_ == "PronType_prs"

View File

@ -1,38 +1,45 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from ...morphology import Morphology import pytest
from ...strings import StringStore, get_string_id from spacy.morphology import Morphology
from ...lemmatizer import Lemmatizer from spacy.strings import StringStore, get_string_id
from ...morphology import * from spacy.lemmatizer import Lemmatizer
@pytest.fixture @pytest.fixture
def morphology(): def morphology():
return Morphology(StringStore(), {}, Lemmatizer()) return Morphology(StringStore(), {}, Lemmatizer())
def test_init(morphology): def test_init(morphology):
pass pass
def test_add_morphology_with_string_names(morphology): def test_add_morphology_with_string_names(morphology):
morphology.add({"Case_gen", "Number_sing"}) morphology.add({"Case_gen", "Number_sing"})
def test_add_morphology_with_int_ids(morphology): def test_add_morphology_with_int_ids(morphology):
morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")}) morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")})
def test_add_morphology_with_mix_strings_and_ints(morphology): def test_add_morphology_with_mix_strings_and_ints(morphology):
morphology.add({get_string_id("PunctSide_ini"), 'VerbType_aux'}) morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"})
def test_morphology_tags_hash_distinctly(morphology): def test_morphology_tags_hash_distinctly(morphology):
tag1 = morphology.add({"PunctSide_ini", 'VerbType_aux'}) tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"})
tag2 = morphology.add({"Case_gen", 'Number_sing'}) tag2 = morphology.add({"Case_gen", "Number_sing"})
assert tag1 != tag2 assert tag1 != tag2
def test_morphology_tags_hash_independent_of_order(morphology): def test_morphology_tags_hash_independent_of_order(morphology):
tag1 = morphology.add({"Case_gen", 'Number_sing'}) tag1 = morphology.add({"Case_gen", "Number_sing"})
tag2 = morphology.add({"Number_sing", "Case_gen"}) tag2 = morphology.add({"Number_sing", "Case_gen"})
assert tag1 == tag2 assert tag1 == tag2
def test_update_morphology_tag(morphology): def test_update_morphology_tag(morphology):
tag1 = morphology.add({"Case_gen"}) tag1 = morphology.add({"Case_gen"})
tag2 = morphology.update(tag1, {"Number_sing"}) tag2 = morphology.update(tag1, {"Number_sing"})