Tidy up and auto-format

Ines Montani 2019-09-11 14:00:36 +02:00
parent 178d010b25
commit af25323653
10 changed files with 90 additions and 97 deletions

View File

@@ -348,7 +348,7 @@ def Tok2Vec(width, embed_size, **kwargs):
         if pretrained_vectors is not None:
             glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
             if subword_features:
                 embed = uniqued(
                     (glove | norm | prefix | suffix | shape)
                     >> LN(Maxout(width, width * 5, pieces=3)),
@@ -363,14 +363,16 @@ def Tok2Vec(width, embed_size, **kwargs):
             embed = uniqued(
                 (norm | prefix | suffix | shape)
                 >> LN(Maxout(width, width * 4, pieces=3)),
-                column=cols.index(ORTH)
+                column=cols.index(ORTH),
             )
         elif char_embed:
             embed = concatenate_lists(
                 CharacterEmbed(nM=64, nC=8),
-                FeatureExtracter(cols) >> with_flatten(norm)
+                FeatureExtracter(cols) >> with_flatten(norm),
+            )
+            reduce_dimensions = LN(
+                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
             )
-            reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces))
         else:
             embed = norm
@@ -379,22 +381,14 @@ def Tok2Vec(width, embed_size, **kwargs):
             >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
         )
         if char_embed:
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    reduce_dimensions
-                    >> convolution ** conv_depth, pad=conv_depth
-                )
+            tok2vec = embed >> with_flatten(
+                reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
             )
         else:
-            tok2vec = (
-                FeatureExtracter(cols)
-                >> with_flatten(
-                    embed
-                    >> convolution ** conv_depth, pad=conv_depth
-                )
+            tok2vec = FeatureExtracter(cols) >> with_flatten(
+                embed >> convolution ** conv_depth, pad=conv_depth
             )
         if bilstm_depth >= 1:
             tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
@@ -611,9 +605,7 @@ def build_morphologizer_model(class_nums, **cfg):
         char_embed=char_embed,
         pretrained_vectors=pretrained_vectors,
     )
-    softmax = with_flatten(
-        MultiSoftmax(class_nums, token_vector_width)
-    )
+    softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
    softmax.out_sizes = class_nums
    model = tok2vec >> softmax
    model.nI = None
@@ -906,16 +898,17 @@ def _replace_word(word, random_words, mask="[MASK]"):
 def _uniform_init(lo, hi):
     def wrapped(W, ops):
         copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
+
     return wrapped


 @describe.attributes(
     nM=Dimension("Vector dimensions"),
     nC=Dimension("Number of characters per word"),
-    vectors=Synapses("Embed matrix",
-        lambda obj: (obj.nC, obj.nV, obj.nM),
-        _uniform_init(-0.1, 0.1)),
-    d_vectors=Gradient("vectors")
+    vectors=Synapses(
+        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
+    ),
+    d_vectors=Gradient("vectors"),
 )
 class CharacterEmbed(Model):
     def __init__(self, nM=None, nC=None, **kwargs):
@@ -926,12 +919,12 @@ class CharacterEmbed(Model):
     @property
     def nO(self):
         return self.nM * self.nC

     @property
     def nV(self):
         return 256

-    def begin_update(self, docs, drop=0.):
+    def begin_update(self, docs, drop=0.0):
         if not docs:
             return []
         ids = []
@@ -959,6 +952,7 @@ class CharacterEmbed(Model):
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
             return None
+
         return output, backprop_character_embed
@@ -974,4 +968,4 @@ def get_cossim_loss(yh, y):
     cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
     d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
     loss = xp.abs(cosine - 1).sum()
     return loss, -d_yh

View File

@@ -64,7 +64,12 @@ from .. import about
         str,
     ),
     noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
-    orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
+    orth_variant_level=(
+        "Amount of orthography variation for data augmentation",
+        "option",
+        "ovl",
+        float,
+    ),
     eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
@@ -245,7 +250,11 @@ def train(
     best_score = 0.0
     for i in range(n_iter):
         train_docs = corpus.train_docs(
-            nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0
+            nlp,
+            noise_level=noise_level,
+            orth_variant_level=orth_variant_level,
+            gold_preproc=gold_preproc,
+            max_length=0,
         )
         if raw_text:
             random.shuffle(raw_text)

View File

@@ -456,6 +456,7 @@ class Errors(object):
     E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
     E160 = ("Can't find language data file: {path}")

+
 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")

View File

@@ -27,10 +27,20 @@ class GermanDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     resources = {"lemma_lookup": "lemma_lookup.json"}
-    single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]},
-                            {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("", ""), ("", ""), ("", "")]},
-                            {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("", ""), ("»", "«"), ("«", "»")]}]
+    single_orth_variants = [
+        {"tags": ["$("], "variants": ["…", "..."]},
+        {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {
+            "tags": ["$("],
+            "variants": [("'", "'"), (",", "'"), ("", ""), ("", ""), ("", "")],
+        },
+        {
+            "tags": ["$("],
+            "variants": [("``", "''"), ('"', '"'), ("", ""), ("»", "«"), ("«", "»")],
+        },
+    ]


 class German(Language):

View File

@@ -38,10 +38,14 @@ class EnglishDefaults(Language.Defaults):
         "lemma_index": "lemmatizer/lemma_index.json",
         "lemma_exc": "lemmatizer/lemma_exc.json",
     }
-    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]},
-                            {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
-                            {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]
+    single_orth_variants = [
+        {"tags": ["NFP"], "variants": ["…", "..."]},
+        {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
+        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
+    ]


 class English(Language):

View File

@@ -12,50 +12,50 @@ _subordinating_conjunctions = [
     "if",
     "as",
     "because",
-    #"of",
-    #"for",
-    #"before",
-    #"in",
+    # "of",
+    # "for",
+    # "before",
+    # "in",
     "while",
-    #"after",
+    # "after",
     "since",
     "like",
-    #"with",
+    # "with",
     "so",
-    #"to",
-    #"by",
-    #"on",
-    #"about",
+    # "to",
+    # "by",
+    # "on",
+    # "about",
     "than",
     "whether",
     "although",
-    #"from",
+    # "from",
     "though",
-    #"until",
+    # "until",
     "unless",
     "once",
-    #"without",
-    #"at",
-    #"into",
+    # "without",
+    # "at",
+    # "into",
     "cause",
-    #"over",
+    # "over",
     "upon",
     "till",
     "whereas",
-    #"beyond",
+    # "beyond",
     "whilst",
     "except",
     "despite",
     "wether",
-    #"then",
+    # "then",
     "but",
     "becuse",
     "whie",
-    #"below",
-    #"against",
+    # "below",
+    # "against",
     "it",
     "w/out",
-    #"toward",
+    # "toward",
     "albeit",
     "save",
     "besides",
@@ -67,17 +67,17 @@ _subordinating_conjunctions = [
     "out",
     "near",
     "seince",
-    #"towards",
+    # "towards",
     "tho",
     "sice",
     "will",
 ]

 # This seems kind of wrong too?
-#_relative_pronouns = ["this", "that", "those", "these"]
+# _relative_pronouns = ["this", "that", "those", "these"]

 MORPH_RULES = {
-    #"DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+    # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
     "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
     "NN": {
         "something": {"POS": "PRON"},

View File

@@ -30,12 +30,7 @@ for pron in ["i"]:
     for orth in [pron, pron.title()]:
         _exc[orth + "'m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
-            {
-                ORTH: "'m",
-                LEMMA: "be",
-                NORM: "am",
-                TAG: "VBP",
-            },
+            {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
         ]

         _exc[orth + "m"] = [

View File

@@ -2,8 +2,7 @@
 from __future__ import unicode_literals

 from collections import OrderedDict
-from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
-from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
+from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN


 class Lemmatizer(object):
@@ -71,13 +70,13 @@ class Lemmatizer(object):
             return True
         elif univ_pos == "adj" and morphology.get("Degree") == "pos":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('VerbForm') == 'none':
+        elif morphology.get("VerbForm") == "none":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('Degree') == 'pos':
+        elif morphology.get("Degree") == "pos":
             return True
         else:
             return False

View File

@@ -137,6 +137,7 @@ class Table(OrderedDict):
     """A table in the lookups. Subclass of builtin dict that implements a
     slightly more consistent and unified API.
     """
+
     @classmethod
     def from_dict(cls, data, name=None):
         self = cls(name=name)

View File

@@ -119,28 +119,8 @@ def test_oracle_moves_missing_B(en_vocab):
 def test_oracle_moves_whitespace(en_vocab):
-    words = [
-        "production",
-        "\n",
-        "of",
-        "Northrop",
-        "\n",
-        "Corp.",
-        "\n",
-        "'s",
-        "radar",
-    ]
-    biluo_tags = [
-        "O",
-        "O",
-        "O",
-        "B-ORG",
-        None,
-        "I-ORG",
-        "L-ORG",
-        "O",
-        "O",
-    ]
+    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
+    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
     doc = Doc(en_vocab, words=words)
     gold = GoldParse(doc, words=words, entities=biluo_tags)
@@ -156,4 +136,4 @@ def test_oracle_moves_whitespace(en_vocab):
             action, label = tag.split("-")
             moves.add_action(move_types.index(action), label)
     moves.preprocess_gold(gold)
-    seq = moves.get_oracle_sequence(doc, gold)
+    moves.get_oracle_sequence(doc, gold)