diff --git a/spacy/_ml.py b/spacy/_ml.py index 97660f8f9..d81ceccc1 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -348,7 +348,7 @@ def Tok2Vec(width, embed_size, **kwargs): if pretrained_vectors is not None: glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID)) - if subword_features: + if subword_features: embed = uniqued( (glove | norm | prefix | suffix | shape) >> LN(Maxout(width, width * 5, pieces=3)), @@ -363,14 +363,16 @@ def Tok2Vec(width, embed_size, **kwargs): embed = uniqued( (norm | prefix | suffix | shape) >> LN(Maxout(width, width * 4, pieces=3)), - column=cols.index(ORTH) + column=cols.index(ORTH), ) - elif char_embed: + elif char_embed: embed = concatenate_lists( CharacterEmbed(nM=64, nC=8), - FeatureExtracter(cols) >> with_flatten(norm) + FeatureExtracter(cols) >> with_flatten(norm), + ) + reduce_dimensions = LN( + Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces) ) - reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces)) else: embed = norm @@ -379,22 +381,14 @@ def Tok2Vec(width, embed_size, **kwargs): >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) ) if char_embed: - tok2vec = ( - embed - >> with_flatten( - reduce_dimensions - >> convolution ** conv_depth, pad=conv_depth - ) + tok2vec = embed >> with_flatten( + reduce_dimensions >> convolution ** conv_depth, pad=conv_depth ) else: - tok2vec = ( - FeatureExtracter(cols) - >> with_flatten( - embed - >> convolution ** conv_depth, pad=conv_depth - ) + tok2vec = FeatureExtracter(cols) >> with_flatten( + embed >> convolution ** conv_depth, pad=conv_depth ) - + if bilstm_depth >= 1: tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth) # Work around thinc API limitations :(. TODO: Revise in Thinc 7 @@ -611,9 +605,7 @@ def build_morphologizer_model(class_nums, **cfg): char_embed=char_embed, pretrained_vectors=pretrained_vectors, ) - softmax = with_flatten( - MultiSoftmax(class_nums, token_vector_width) - ) + softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) softmax.out_sizes = class_nums model = tok2vec >> softmax model.nI = None @@ -906,16 +898,17 @@ def _replace_word(word, random_words, mask="[MASK]"): def _uniform_init(lo, hi): def wrapped(W, ops): copy_array(W, ops.xp.random.uniform(lo, hi, W.shape)) + return wrapped @describe.attributes( nM=Dimension("Vector dimensions"), nC=Dimension("Number of characters per word"), - vectors=Synapses("Embed matrix", - lambda obj: (obj.nC, obj.nV, obj.nM), - _uniform_init(-0.1, 0.1)), - d_vectors=Gradient("vectors") + vectors=Synapses( + "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1) + ), + d_vectors=Gradient("vectors"), ) class CharacterEmbed(Model): def __init__(self, nM=None, nC=None, **kwargs): @@ -926,12 +919,12 @@ class CharacterEmbed(Model): @property def nO(self): return self.nM * self.nC - + @property def nV(self): return 256 - def begin_update(self, docs, drop=0.): + def begin_update(self, docs, drop=0.0): if not docs: return [] ids = [] @@ -959,6 +952,7 @@ class CharacterEmbed(Model): if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) return None + return output, backprop_character_embed @@ -974,4 +968,4 @@ def get_cossim_loss(yh, y): cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2)) loss = xp.abs(cosine - 1).sum() - return loss, -d_yh \ No newline at end of file + return loss, -d_yh diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 365e7ea44..8d162362c 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -64,7 +64,12 @@ from .. import about str, ), noise_level=("Amount of corruption for data augmentation", "option", "nl", float), - orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float), + orth_variant_level=( + "Amount of orthography variation for data augmentation", + "option", + "ovl", + float, + ), eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str), gold_preproc=("Use gold preprocessing", "flag", "G", bool), learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), @@ -245,7 +250,11 @@ def train( best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( - nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0 + nlp, + noise_level=noise_level, + orth_variant_level=orth_variant_level, + gold_preproc=gold_preproc, + max_length=0, ) if raw_text: random.shuffle(raw_text) diff --git a/spacy/errors.py b/spacy/errors.py index c0868800d..b8a8dccba 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,6 +456,7 @@ class Errors(object): E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}") E160 = ("Can't find language data file: {path}") + @add_codes class TempErrors(object): T003 = ("Resizing pre-trained Tagger models is not currently supported.") diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1ddee54b3..b96069235 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -27,10 +27,20 @@ class GermanDefaults(Language.Defaults): stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS resources = {"lemma_lookup": "lemma_lookup.json"} - single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]}, - {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}] - paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")]}, - {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")]}] + single_orth_variants = [ + {"tags": ["$("], "variants": ["…", "..."]}, + {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}, + ] + paired_orth_variants = [ + { + "tags": ["$("], + "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")], + }, + { + "tags": ["$("], + "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")], + }, + ] class German(Language): diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 2f391de0b..e4c745c83 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -38,10 +38,14 @@ class EnglishDefaults(Language.Defaults): "lemma_index": "lemmatizer/lemma_index.json", "lemma_exc": "lemmatizer/lemma_exc.json", } - single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]}, - {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}] - paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]}, - {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}] + single_orth_variants = [ + {"tags": ["NFP"], "variants": ["…", "..."]}, + {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}, + ] + paired_orth_variants = [ + {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]}, + {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}, + ] class English(Language): diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index f910e42b8..5ed4eac59 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -12,50 +12,50 @@ _subordinating_conjunctions = [ "if", "as", "because", - #"of", - #"for", - #"before", - #"in", + # "of", + # "for", + # "before", + # "in", "while", - #"after", + # "after", "since", "like", - #"with", + # "with", "so", - #"to", - #"by", - #"on", - #"about", + # "to", + # "by", + # "on", + # "about", "than", "whether", "although", - #"from", + # "from", "though", - #"until", + # "until", "unless", "once", - #"without", - #"at", - #"into", + # "without", + # "at", + # "into", "cause", - #"over", + # "over", "upon", "till", "whereas", - #"beyond", + # "beyond", "whilst", "except", "despite", "wether", - #"then", + # "then", "but", "becuse", "whie", - #"below", - #"against", + # "below", + # "against", "it", "w/out", - #"toward", + # "toward", "albeit", "save", "besides", @@ -67,17 +67,17 @@ _subordinating_conjunctions = [ "out", "near", "seince", - #"towards", + # "towards", "tho", "sice", "will", ] # This seems kind of wrong too? -#_relative_pronouns = ["this", "that", "those", "these"] +# _relative_pronouns = ["this", "that", "those", "these"] MORPH_RULES = { - #"DT": {word: {"POS": "PRON"} for word in _relative_pronouns}, + # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns}, "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions}, "NN": { "something": {"POS": "PRON"}, diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 91c29c9e0..c45197771 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -30,12 +30,7 @@ for pron in ["i"]: for orth in [pron, pron.title()]: _exc[orth + "'m"] = [ {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - { - ORTH: "'m", - LEMMA: "be", - NORM: "am", - TAG: "VBP", - }, + {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"}, ] _exc[orth + "m"] = [ diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index c9ccbcd0d..d14f5292e 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -2,8 +2,7 @@ from __future__ import unicode_literals from collections import OrderedDict -from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN -from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos +from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN class Lemmatizer(object): @@ -71,13 +70,13 @@ class Lemmatizer(object): return True elif univ_pos == "adj" and morphology.get("Degree") == "pos": return True - elif morphology.get('VerbForm') == 'inf': + elif morphology.get("VerbForm") == "inf": return True - elif morphology.get('VerbForm') == 'none': + elif morphology.get("VerbForm") == "none": return True - elif morphology.get('VerbForm') == 'inf': + elif morphology.get("VerbForm") == "inf": return True - elif morphology.get('Degree') == 'pos': + elif morphology.get("Degree") == "pos": return True else: return False diff --git a/spacy/lookups.py b/spacy/lookups.py index 801b4d00d..741d40330 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -137,6 +137,7 @@ class Table(OrderedDict): """A table in the lookups. Subclass of builtin dict that implements a slightly more consistent and unified API. """ + @classmethod def from_dict(cls, data, name=None): self = cls(name=name) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index c39491ecf..db911dba0 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -119,28 +119,8 @@ def test_oracle_moves_missing_B(en_vocab): def test_oracle_moves_whitespace(en_vocab): - words = [ - "production", - "\n", - "of", - "Northrop", - "\n", - "Corp.", - "\n", - "'s", - "radar", - ] - biluo_tags = [ - "O", - "O", - "O", - "B-ORG", - None, - "I-ORG", - "L-ORG", - "O", - "O", - ] + words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"] + biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"] doc = Doc(en_vocab, words=words) gold = GoldParse(doc, words=words, entities=biluo_tags) @@ -156,4 +136,4 @@ def test_oracle_moves_whitespace(en_vocab): action, label = tag.split("-") moves.add_action(move_types.index(action), label) moves.preprocess_gold(gold) - seq = moves.get_oracle_sequence(doc, gold) + moves.get_oracle_sequence(doc, gold)