Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)

Commit 8a9181d95a: Merge __init__

spacy/_ml.py | 81 changed lines (hunks from the commit's other files follow after the _ml.py hunks)

@@ -465,17 +465,16 @@ def getitem(i):
 @describe.attributes(
-    W=Synapses("Weights matrix",
-        lambda obj: (obj.nO, obj.nI),
-        lambda W, ops: None)
+    W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
 )
 class MultiSoftmax(Affine):
-    '''Neural network layer that predicts several multi-class attributes at once.
+    """Neural network layer that predicts several multi-class attributes at once.
     For instance, we might predict one class with 6 variables, and another with 5.
     We predict the 11 neurons required for this, and then softmax them such
     that columns 0-6 make a probability distribution and columns 6-11 make another.
-    '''
-    name = 'multisoftmax'
+    """
+
+    name = "multisoftmax"

     def __init__(self, out_sizes, nI=None, **kwargs):
         Model.__init__(self, **kwargs)
@@ -487,12 +486,13 @@ class MultiSoftmax(Affine):
         output__BO = self.ops.affine(self.W, self.b, input__BI)
         i = 0
         for out_size in self.out_sizes:
-            self.ops.softmax(output__BO[:, i : i+out_size], inplace=True)
+            self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
             i += out_size
         return output__BO

-    def begin_update(self, input__BI, drop=0.):
+    def begin_update(self, input__BI, drop=0.0):
         output__BO = self.predict(input__BI)

         def finish_update(grad__BO, sgd=None):
             self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
             self.d_b += grad__BO.sum(axis=0)

@@ -500,6 +500,7 @@ class MultiSoftmax(Affine):
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
             return grad__BI

         return output__BO, finish_update
+
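For orientation while reading this diff: MultiSoftmax predicts one block of logits per attribute and normalises each block of columns separately, which is what the loop in predict() above does via ops.softmax. A minimal numpy sketch of that forward pass (illustrative only, not spaCy's actual implementation):

    import numpy as np

    def multi_softmax(scores, out_sizes):
        # scores: (batch, sum(out_sizes)) raw activations.
        # out_sizes: e.g. (6, 5) -> columns 0:6 become one probability
        # distribution per row, columns 6:11 another.
        out = scores.astype(float).copy()
        i = 0
        for size in out_sizes:
            block = out[:, i : i + size]               # view into `out`
            block -= block.max(axis=1, keepdims=True)  # numerical stability
            np.exp(block, out=block)
            block /= block.sum(axis=1, keepdims=True)
            i += size
        return out

    probs = multi_softmax(np.random.randn(2, 11), (6, 5))
    assert np.allclose(probs[:, :6].sum(axis=1), 1.0)
    assert np.allclose(probs[:, 6:].sum(axis=1), 1.0)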
@@ -515,41 +516,41 @@ def build_tagger_model(nr_class, **cfg):
         if "tok2vec" in cfg:
             tok2vec = cfg["tok2vec"]
         else:
-            tok2vec = Tok2Vec(token_vector_width, embed_size,
-                              subword_features=subword_features,
-                              pretrained_vectors=pretrained_vectors)
-        softmax = with_flatten(
-            Softmax(nr_class, token_vector_width))
-        model = (
-            tok2vec
-            >> softmax
-        )
+            tok2vec = Tok2Vec(
+                token_vector_width,
+                embed_size,
+                subword_features=subword_features,
+                pretrained_vectors=pretrained_vectors,
+            )
+        softmax = with_flatten(Softmax(nr_class, token_vector_width))
+        model = tok2vec >> softmax
     model.nI = None
     model.tok2vec = tok2vec
     model.softmax = softmax
     return model


 def build_morphologizer_model(class_nums, **cfg):
-    embed_size = util.env_opt('embed_size', 7000)
-    if 'token_vector_width' in cfg:
-        token_vector_width = cfg['token_vector_width']
+    embed_size = util.env_opt("embed_size", 7000)
+    if "token_vector_width" in cfg:
+        token_vector_width = cfg["token_vector_width"]
     else:
-        token_vector_width = util.env_opt('token_vector_width', 128)
-    pretrained_vectors = cfg.get('pretrained_vectors')
-    subword_features = cfg.get('subword_features', True)
-    with Model.define_operators({'>>': chain, '+': add}):
-        if 'tok2vec' in cfg:
-            tok2vec = cfg['tok2vec']
+        token_vector_width = util.env_opt("token_vector_width", 128)
+    pretrained_vectors = cfg.get("pretrained_vectors")
+    subword_features = cfg.get("subword_features", True)
+    with Model.define_operators({">>": chain, "+": add}):
+        if "tok2vec" in cfg:
+            tok2vec = cfg["tok2vec"]
         else:
-            tok2vec = Tok2Vec(token_vector_width, embed_size,
-                              subword_features=subword_features,
-                              pretrained_vectors=pretrained_vectors)
+            tok2vec = Tok2Vec(
+                token_vector_width,
+                embed_size,
+                subword_features=subword_features,
+                pretrained_vectors=pretrained_vectors,
+            )
         softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
         softmax.out_sizes = class_nums
-        model = (
-            tok2vec
-            >> softmax
-        )
+        model = tok2vec >> softmax
     model.nI = None
     model.tok2vec = tok2vec
     model.softmax = softmax
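A note on the model = tok2vec >> softmax lines above: inside Model.define_operators({">>": chain, "+": add}), thinc binds >> to its chain combinator, so the expression is left-to-right layer composition. A rough sketch of the idea in plain Python (not thinc's actual code):

    def chain(*layers):
        # chain(f, g)(x) == g(f(x)): feed each layer's output
        # into the next, left to right.
        def composed(x):
            for layer in layers:
                x = layer(x)
            return x
        return composed

    double = lambda x: 2 * x
    inc = lambda x: x + 1
    assert chain(double, inc)(3) == 7  # inc(double(3))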
@@ -630,17 +631,13 @@ def build_text_classifier(nr_class, width=64, **cfg):
     )

     linear_model = _preprocess_doc >> LinearModel(nr_class)
-    if cfg.get('exclusive_classes'):
+    if cfg.get("exclusive_classes"):
         output_layer = Softmax(nr_class, nr_class * 2)
     else:
         output_layer = (
-            zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
-            >> logistic
+            zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
         )
-    model = (
-        (linear_model | cnn_model)
-        >> output_layer
-    )
+    model = (linear_model | cnn_model) >> output_layer
     model.tok2vec = chain(tok2vec, flatten)
     model.nO = nr_class
     model.lsuv = False
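The exclusive_classes branch above encodes the usual single-label vs. multi-label distinction: Softmax turns each row into one probability distribution, while Affine >> logistic squashes every class score independently. A small numpy illustration of the difference (toy scores, not spaCy code):

    import numpy as np

    def softmax(x):
        e = np.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    def logistic(x):
        return 1.0 / (1.0 + np.exp(-x))

    scores = np.array([[2.0, -1.0, 0.5]])
    print(softmax(scores))   # row sums to 1: pick exactly one class
    print(logistic(scores))  # each class scored on its own: multi-label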
@@ -658,7 +655,9 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
     if exclusive_classes:
         output_layer = Softmax(nr_class, tok2vec.nO)
     else:
-        output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
+        output_layer = (
+            zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
+        )
     model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
     model.tok2vec = chain(tok2vec, flatten)
     model.nO = nr_class
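Before the output layer, this model pools each document's variable-length token matrix into a single fixed-size vector; Pooling(mean_pool) averages over the token axis. A hedged numpy sketch of that step (shapes assumed: one (n_tokens, width) array per doc):

    import numpy as np

    def mean_pool(docs):
        # docs: list of (n_tokens, width) arrays, one per document.
        # Returns an (n_docs, width) matrix of per-document means.
        return np.stack([d.mean(axis=0) for d in docs])

    pooled = mean_pool([np.ones((3, 4)), np.zeros((5, 4))])
    assert pooled.shape == (2, 4)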
@@ -350,7 +350,6 @@ class Errors(object):
             "is likely a bug in spaCy.")


-
 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")
@@ -14,6 +14,7 @@ __all__ = [
     "TextCategorizer",
     "Tensorizer",
     "Pipe",
+    "Morphologizer",
     "EntityRuler",
     "SentenceSegmenter",
     "SimilarityHook",
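Listing "Morphologizer" in __all__ declares the new component as part of the package's public API, so wildcard imports and API listings pick it up (assuming, as the rest of this commit arranges, that the name is bound in the module body):

    from spacy.pipeline import Morphologizer  # now re-exported via __all__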
@@ -2,11 +2,7 @@
 from __future__ import unicode_literals

 import pytest
-import numpy
-from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
-from spacy.symbols import VERB
-from spacy.vocab import Vocab
-from spacy.tokens import Doc
+

 @pytest.fixture
 def i_has(en_tokenizer):
@@ -15,11 +11,13 @@ def i_has(en_tokenizer):
     doc[1].tag_ = "VBZ"
     return doc

+
 def test_token_morph_id(i_has):
     assert i_has[0].morph.id
     assert i_has[1].morph.id != 0
     assert i_has[0].morph.id != i_has[1].morph.id

+
 def test_morph_props(i_has):
     assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"]
     assert i_has[0].morph.pron_type_ == "PronType_prs"
@@ -1,41 +1,48 @@
+# coding: utf-8
 from __future__ import unicode_literals
-import pytest

-from ...morphology import Morphology
-from ...strings import StringStore, get_string_id
-from ...lemmatizer import Lemmatizer
-from ...morphology import *
+import pytest
+
+from spacy.morphology import Morphology
+from spacy.strings import StringStore, get_string_id
+from spacy.lemmatizer import Lemmatizer


 @pytest.fixture
 def morphology():
     return Morphology(StringStore(), {}, Lemmatizer())

+
 def test_init(morphology):
     pass

+
 def test_add_morphology_with_string_names(morphology):
     morphology.add({"Case_gen", "Number_sing"})

+
 def test_add_morphology_with_int_ids(morphology):
     morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")})

+
 def test_add_morphology_with_mix_strings_and_ints(morphology):
-    morphology.add({get_string_id("PunctSide_ini"), 'VerbType_aux'})
+    morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"})

+
 def test_morphology_tags_hash_distinctly(morphology):
-    tag1 = morphology.add({"PunctSide_ini", 'VerbType_aux'})
-    tag2 = morphology.add({"Case_gen", 'Number_sing'})
+    tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"})
+    tag2 = morphology.add({"Case_gen", "Number_sing"})
     assert tag1 != tag2

+
 def test_morphology_tags_hash_independent_of_order(morphology):
-    tag1 = morphology.add({"Case_gen", 'Number_sing'})
+    tag1 = morphology.add({"Case_gen", "Number_sing"})
     tag2 = morphology.add({"Number_sing", "Case_gen"})
     assert tag1 == tag2

+
 def test_update_morphology_tag(morphology):
     tag1 = morphology.add({"Case_gen"})
     tag2 = morphology.update(tag1, {"Number_sing"})
     assert tag1 != tag2
     tag3 = morphology.add({"Number_sing", "Case_gen"})
     assert tag2 == tag3
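These tests pin down a useful invariant: a morphological tag is identified by its set of features, so the order in which features are listed must not affect the resulting tag, while adding a feature must. One way to obtain that property, sketched in plain Python (illustrative, not spaCy's actual implementation):

    def tag_key(features):
        # Hash the frozen *set* of features: insertion order cannot
        # change the key, but membership does.
        return hash(frozenset(features))

    assert tag_key({"Case_gen", "Number_sing"}) == tag_key({"Number_sing", "Case_gen"})
    assert tag_key({"Case_gen"}) != tag_key({"Case_gen", "Number_sing"})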