diff --git a/spacy/_ml.py b/spacy/_ml.py
index 422bbe66a..68dedc0b3 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -465,17 +465,16 @@ def getitem(i):
 @describe.attributes(
-    W=Synapses("Weights matrix",
-        lambda obj: (obj.nO, obj.nI),
-        lambda W, ops: None)
+    W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
 )
 class MultiSoftmax(Affine):
-    '''Neural network layer that predicts several multi-class attributes at once.
+    """Neural network layer that predicts several multi-class attributes at once.
 
     For instance, we might predict one class with 6 variables, and another with
     5. We predict the 11 neurons required for this, and then softmax them such
     that columns 0-6 make a probability distribution and columns 6-11 make
     another.
-    '''
-    name = 'multisoftmax'
+    """
+
+    name = "multisoftmax"
 
     def __init__(self, out_sizes, nI=None, **kwargs):
         Model.__init__(self, **kwargs)
@@ -487,12 +486,13 @@ class MultiSoftmax(Affine):
         output__BO = self.ops.affine(self.W, self.b, input__BI)
         i = 0
         for out_size in self.out_sizes:
-            self.ops.softmax(output__BO[:, i : i+out_size], inplace=True)
+            self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
             i += out_size
         return output__BO
 
-    def begin_update(self, input__BI, drop=0.):
+    def begin_update(self, input__BI, drop=0.0):
         output__BO = self.predict(input__BI)
+
         def finish_update(grad__BO, sgd=None):
             self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
             self.d_b += grad__BO.sum(axis=0)
@@ -500,6 +500,7 @@ class MultiSoftmax(Affine):
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
             return grad__BI
+
         return output__BO, finish_update
 
 
@@ -515,41 +516,41 @@ def build_tagger_model(nr_class, **cfg):
     if "tok2vec" in cfg:
         tok2vec = cfg["tok2vec"]
     else:
-        tok2vec = Tok2Vec(token_vector_width, embed_size,
-                          subword_features=subword_features,
-                          pretrained_vectors=pretrained_vectors)
-    softmax = with_flatten(
-        Softmax(nr_class, token_vector_width))
-    model = (
-        tok2vec
-        >> softmax
-    )
+        tok2vec = Tok2Vec(
+            token_vector_width,
+            embed_size,
+            subword_features=subword_features,
+            pretrained_vectors=pretrained_vectors,
+        )
+    softmax = with_flatten(Softmax(nr_class, token_vector_width))
+    model = tok2vec >> softmax
     model.nI = None
     model.tok2vec = tok2vec
     model.softmax = softmax
     return model
 
+
 def build_morphologizer_model(class_nums, **cfg):
-    embed_size = util.env_opt('embed_size', 7000)
-    if 'token_vector_width' in cfg:
-        token_vector_width = cfg['token_vector_width']
+    embed_size = util.env_opt("embed_size", 7000)
+    if "token_vector_width" in cfg:
+        token_vector_width = cfg["token_vector_width"]
     else:
-        token_vector_width = util.env_opt('token_vector_width', 128)
-    pretrained_vectors = cfg.get('pretrained_vectors')
-    subword_features = cfg.get('subword_features', True)
-    with Model.define_operators({'>>': chain, '+': add}):
-        if 'tok2vec' in cfg:
-            tok2vec = cfg['tok2vec']
+        token_vector_width = util.env_opt("token_vector_width", 128)
+    pretrained_vectors = cfg.get("pretrained_vectors")
+    subword_features = cfg.get("subword_features", True)
+    with Model.define_operators({">>": chain, "+": add}):
+        if "tok2vec" in cfg:
+            tok2vec = cfg["tok2vec"]
         else:
-            tok2vec = Tok2Vec(token_vector_width, embed_size,
-                              subword_features=subword_features,
-                              pretrained_vectors=pretrained_vectors)
+            tok2vec = Tok2Vec(
+                token_vector_width,
+                embed_size,
+                subword_features=subword_features,
+                pretrained_vectors=pretrained_vectors,
+            )
         softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
         softmax.out_sizes = class_nums
-        model = (
-            tok2vec
-            >> softmax
-        )
+        model = tok2vec >> softmax
         model.nI = None
         model.tok2vec = tok2vec
         model.softmax = softmax
@@ -630,17 +631,13 @@ def build_text_classifier(nr_class, width=64, **cfg):
         )
 
     linear_model = _preprocess_doc >> LinearModel(nr_class)
-    if cfg.get('exclusive_classes'):
+    if cfg.get("exclusive_classes"):
         output_layer = Softmax(nr_class, nr_class * 2)
     else:
         output_layer = (
-            zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
-            >> logistic
+            zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
         )
-    model = (
-        (linear_model | cnn_model)
-        >> output_layer
-    )
+    model = (linear_model | cnn_model) >> output_layer
     model.tok2vec = chain(tok2vec, flatten)
     model.nO = nr_class
     model.lsuv = False
@@ -658,7 +655,9 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
     if exclusive_classes:
         output_layer = Softmax(nr_class, tok2vec.nO)
     else:
-        output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
+        output_layer = (
+            zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
+        )
     model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
     model.tok2vec = chain(tok2vec, flatten)
     model.nO = nr_class
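Editor's note on the MultiSoftmax layer above: `predict` normalizes each column block independently, so with `out_sizes=(6, 5)` the first six columns of every row sum to 1 and the remaining five sum to 1. A standalone NumPy sketch of that grouped-softmax technique, for illustration only (`multi_softmax` and the shapes are hypothetical, not spaCy API):

    import numpy as np

    def multi_softmax(scores, out_sizes):
        # Softmax each column block of `scores` independently,
        # mirroring the loop in MultiSoftmax.predict.
        out = np.empty_like(scores)
        i = 0
        for size in out_sizes:
            block = scores[:, i : i + size]
            # Subtract the row-wise max for numerical stability.
            exps = np.exp(block - block.max(axis=1, keepdims=True))
            out[:, i : i + size] = exps / exps.sum(axis=1, keepdims=True)
            i += size
        return out

    scores = np.random.randn(3, 11)  # batch of 3; 6 + 5 = 11 output neurons
    probs = multi_softmax(scores, (6, 5))
    assert np.allclose(probs[:, :6].sum(axis=1), 1.0)
    assert np.allclose(probs[:, 6:].sum(axis=1), 1.0)

This is the piece that lets `build_morphologizer_model` predict several morphological attributes from a single affine layer: each entry of `class_nums` becomes one independently normalized block.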
diff --git a/spacy/errors.py b/spacy/errors.py
index 13382d146..f9dd8535e 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -350,7 +350,6 @@ class Errors(object):
             "is likely a bug in spaCy.")
 
 
-
 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2a8acbde5..686743a6a 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -14,6 +14,7 @@ __all__ = [
     "TextCategorizer",
     "Tensorizer",
     "Pipe",
+    "Morphologizer",
     "EntityRuler",
     "SentenceSegmenter",
     "SimilarityHook",
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index ffee5694a..5d570af53 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -2,11 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
-import numpy
 
-from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
-from spacy.symbols import VERB
-from spacy.vocab import Vocab
-from spacy.tokens import Doc
+
 
 @pytest.fixture
 def i_has(en_tokenizer):
@@ -15,11 +11,13 @@
     doc[1].tag_ = "VBZ"
     return doc
 
+
 def test_token_morph_id(i_has):
     assert i_has[0].morph.id
     assert i_has[1].morph.id != 0
     assert i_has[0].morph.id != i_has[1].morph.id
 
+
 def test_morph_props(i_has):
     assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"]
     assert i_has[0].morph.pron_type_ == "PronType_prs"
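The test_morphanalysis.py hunks double as a compact reference for the token-level API: assigning a fine-grained tag gives each token a morphological analysis that hashes to a stable ID and exposes paired ID/string accessors. A condensed pytest-style restatement follows; the top of the `i_has` fixture is outside the hunk, so its body here is an assumption based on the fixture name (only `doc[1].tag_ = "VBZ"` and the `return` are actually in the diff):

    import pytest


    @pytest.fixture
    def i_has(en_tokenizer):
        doc = en_tokenizer("I has")  # assumed text; not shown in the hunk
        doc[0].tag_ = "PRP"          # assumed; only doc[1] is visible in the diff
        doc[1].tag_ = "VBZ"
        return doc


    def test_morph_api_summary(i_has):
        # Every analysis hashes to a nonzero ID, and different analyses
        # hash to different IDs.
        assert i_has[0].morph.id
        assert i_has[1].morph.id != 0
        assert i_has[0].morph.id != i_has[1].morph.id
        # ID and string accessors are paired through the vocab's StringStore.
        assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"]
        assert i_has[0].morph.pron_type_ == "PronType_prs"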
diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py
index dcb0b32ff..4b8f0d754 100644
--- a/spacy/tests/morphology/test_morph_features.py
+++ b/spacy/tests/morphology/test_morph_features.py
@@ -1,41 +1,48 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
-import pytest
-from ...morphology import Morphology
-from ...strings import StringStore, get_string_id
-from ...lemmatizer import Lemmatizer
-from ...morphology import *
+import pytest
+from spacy.morphology import Morphology
+from spacy.strings import StringStore, get_string_id
+from spacy.lemmatizer import Lemmatizer
+
 
 @pytest.fixture
 def morphology():
     return Morphology(StringStore(), {}, Lemmatizer())
 
+
 def test_init(morphology):
     pass
 
+
 def test_add_morphology_with_string_names(morphology):
     morphology.add({"Case_gen", "Number_sing"})
 
+
 def test_add_morphology_with_int_ids(morphology):
     morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")})
 
+
 def test_add_morphology_with_mix_strings_and_ints(morphology):
-    morphology.add({get_string_id("PunctSide_ini"), 'VerbType_aux'})
+    morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"})
 
 
 def test_morphology_tags_hash_distinctly(morphology):
-    tag1 = morphology.add({"PunctSide_ini", 'VerbType_aux'})
-    tag2 = morphology.add({"Case_gen", 'Number_sing'})
+    tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"})
+    tag2 = morphology.add({"Case_gen", "Number_sing"})
     assert tag1 != tag2
 
+
 def test_morphology_tags_hash_independent_of_order(morphology):
-    tag1 = morphology.add({"Case_gen", 'Number_sing'})
-    tag2 = morphology.add({"Number_sing", "Case_gen"})
+    tag1 = morphology.add({"Case_gen", "Number_sing"})
+    tag2 = morphology.add({"Number_sing", "Case_gen"})
     assert tag1 == tag2
 
+
 def test_update_morphology_tag(morphology):
     tag1 = morphology.add({"Case_gen"})
     tag2 = morphology.update(tag1, {"Number_sing"})
     assert tag1 != tag2
-    tag3 = morphology.add({"Number_sing", "Case_gen"}) 
+    tag3 = morphology.add({"Number_sing", "Case_gen"})
     assert tag2 == tag3
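Taken together, the test_morph_features.py changes document the contract the rest of this patch relies on: a tag added to `Morphology` is a hash of an unordered feature set, and `update` extends an existing tag as if the union of both feature sets had been added. A condensed sketch of that behavior, using only the constructor and calls these tests themselves exercise:

    from spacy.morphology import Morphology
    from spacy.strings import StringStore
    from spacy.lemmatizer import Lemmatizer

    morphology = Morphology(StringStore(), {}, Lemmatizer())

    # Feature order does not affect the tag hash.
    tag_a = morphology.add({"Case_gen", "Number_sing"})
    tag_b = morphology.add({"Number_sing", "Case_gen"})
    assert tag_a == tag_b

    # Updating a tag with extra features hashes like adding the union
    # of both feature sets, and yields a new tag.
    partial = morphology.add({"Case_gen"})
    updated = morphology.update(partial, {"Number_sing"})
    assert updated != partial
    assert updated == tag_a

This order-independence is what lets the Morphologizer treat a bundle of per-attribute predictions (one block per `MultiSoftmax` group) as a single stable tag key.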