Tidy up and auto-format

Ines Montani 2019-09-11 14:00:36 +02:00
parent 178d010b25
commit af25323653
10 changed files with 90 additions and 97 deletions

View File

@@ -363,14 +363,16 @@ def Tok2Vec(width, embed_size, **kwargs):
             embed = uniqued(
                 (norm | prefix | suffix | shape)
                 >> LN(Maxout(width, width * 4, pieces=3)),
-                column=cols.index(ORTH)
+                column=cols.index(ORTH),
             )
         elif char_embed:
             embed = concatenate_lists(
                 CharacterEmbed(nM=64, nC=8),
-                FeatureExtracter(cols) >> with_flatten(norm)
+                FeatureExtracter(cols) >> with_flatten(norm),
             )
-            reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces))
+            reduce_dimensions = LN(
+                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
+            )
         else:
             embed = norm
 
@@ -379,20 +381,12 @@ def Tok2Vec(width, embed_size, **kwargs):
             >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
         )
         if char_embed:
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    reduce_dimensions
-                    >> convolution ** conv_depth, pad=conv_depth
-                )
-            )
+            tok2vec = embed >> with_flatten(
+                reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
+            )
         else:
-            tok2vec = (
-                FeatureExtracter(cols)
-                >> with_flatten(
-                    embed
-                    >> convolution ** conv_depth, pad=conv_depth
-                )
-            )
+            tok2vec = FeatureExtracter(cols) >> with_flatten(
+                embed >> convolution ** conv_depth, pad=conv_depth
+            )
 
         if bilstm_depth >= 1:
@@ -611,9 +605,7 @@ def build_morphologizer_model(class_nums, **cfg):
         char_embed=char_embed,
         pretrained_vectors=pretrained_vectors,
     )
-    softmax = with_flatten(
-        MultiSoftmax(class_nums, token_vector_width)
-    )
+    softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
     softmax.out_sizes = class_nums
     model = tok2vec >> softmax
     model.nI = None
@@ -906,16 +898,17 @@ def _replace_word(word, random_words, mask="[MASK]"):
 def _uniform_init(lo, hi):
     def wrapped(W, ops):
         copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
+
     return wrapped
 
 
 @describe.attributes(
     nM=Dimension("Vector dimensions"),
     nC=Dimension("Number of characters per word"),
-    vectors=Synapses("Embed matrix",
-                     lambda obj: (obj.nC, obj.nV, obj.nM),
-                     _uniform_init(-0.1, 0.1)),
-    d_vectors=Gradient("vectors")
+    vectors=Synapses(
+        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
+    ),
+    d_vectors=Gradient("vectors"),
 )
 class CharacterEmbed(Model):
     def __init__(self, nM=None, nC=None, **kwargs):
@@ -931,7 +924,7 @@ class CharacterEmbed(Model):
     def nV(self):
         return 256
 
-    def begin_update(self, docs, drop=0.):
+    def begin_update(self, docs, drop=0.0):
         if not docs:
             return []
         ids = []
@@ -959,6 +952,7 @@ class CharacterEmbed(Model):
         if sgd is not None:
             sgd(self._mem.weights, self._mem.gradient, key=self.id)
         return None
+
     return output, backprop_character_embed
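
A note on the operators used throughout this file: spaCy binds Thinc's combinators to Python operators inside a `Model.define_operators` block, so `a >> b` pipes `a`'s output into `b`, `a | b` concatenates outputs, and `layer ** n` stacks `n` clones of a layer. A minimal sketch, assuming thinc 7.x (the version this code targets):

    from thinc.v2v import Model, Maxout
    from thinc.api import chain, clone, concatenate

    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        # Two 128-wide Maxout layers side by side, piped into a stack of two
        # more; equivalent to chain(concatenate(...), clone(Maxout(128), 2)).
        model = (Maxout(128) | Maxout(128)) >> Maxout(128) ** 2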

View File

@@ -64,7 +64,12 @@ from .. import about
         str,
     ),
     noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
-    orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
+    orth_variant_level=(
+        "Amount of orthography variation for data augmentation",
+        "option",
+        "ovl",
+        float,
+    ),
     eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
@@ -245,7 +250,11 @@ def train(
     best_score = 0.0
     for i in range(n_iter):
         train_docs = corpus.train_docs(
-            nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0
+            nlp,
+            noise_level=noise_level,
+            orth_variant_level=orth_variant_level,
+            gold_preproc=gold_preproc,
+            max_length=0,
         )
         if raw_text:
             random.shuffle(raw_text)
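
The long tuples above are plac-style argument annotations in the form `(description, kind, abbreviation, type)`, which spaCy's CLI uses to declare command-line options. A minimal sketch of the pattern (hypothetical standalone script, assuming the plac package):

    import plac

    @plac.annotations(
        noise_level=("Amount of corruption for data augmentation", "option", "nl", float)
    )
    def train(noise_level=0.0):
        print(noise_level)

    if __name__ == "__main__":
        plac.call(train)  # e.g. `python train.py -nl 0.25`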

View File

@@ -456,6 +456,7 @@ class Errors(object):
     E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
     E160 = ("Can't find language data file: {path}")
+
 
 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")

View File

@@ -27,10 +27,20 @@ class GermanDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     resources = {"lemma_lookup": "lemma_lookup.json"}
-    single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]},
-                            {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "'"), ("‹", "›"), ("›", "‹")]},
-                            {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")]}]
+    single_orth_variants = [
+        {"tags": ["$("], "variants": ["…", "..."]},
+        {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {
+            "tags": ["$("],
+            "variants": [("'", "'"), (",", "'"), ("‚", "'"), ("‹", "›"), ("›", "‹")],
+        },
+        {
+            "tags": ["$("],
+            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
+        },
+    ]
 
 
 class German(Language):

View File

@@ -38,10 +38,14 @@ class EnglishDefaults(Language.Defaults):
         "lemma_index": "lemmatizer/lemma_index.json",
         "lemma_exc": "lemmatizer/lemma_exc.json",
     }
-    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]},
-                            {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
-                            {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]
+    single_orth_variants = [
+        {"tags": ["NFP"], "variants": ["…", "..."]},
+        {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
+        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
+    ]
 
 
 class English(Language):
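
As in the German defaults above, `single_orth_variants` lists interchangeable spellings of a single token (ellipses, dashes) keyed by fine-grained tag, while `paired_orth_variants` lists opening/closing quote pairs that must be swapped together. These tables feed the `orth_variant_level` data augmentation wired into `train` earlier in this commit. A toy sketch of the single-token case (hypothetical helper, not spaCy's actual implementation):

    import random

    def augment_single(words, tags, single_orth_variants, level=0.1):
        # Swap a token for an equivalent spelling with probability `level`,
        # but only when its fine-grained tag matches the variant entry.
        for i, (word, tag) in enumerate(zip(words, tags)):
            for entry in single_orth_variants:
                if tag in entry["tags"] and word in entry["variants"]:
                    if random.random() < level:
                        words[i] = random.choice(entry["variants"])
        return words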

View File

@@ -12,50 +12,50 @@ _subordinating_conjunctions = [
     "if",
     "as",
     "because",
-    #"of",
-    #"for",
-    #"before",
-    #"in",
+    # "of",
+    # "for",
+    # "before",
+    # "in",
     "while",
-    #"after",
+    # "after",
     "since",
     "like",
-    #"with",
+    # "with",
     "so",
-    #"to",
-    #"by",
-    #"on",
-    #"about",
+    # "to",
+    # "by",
+    # "on",
+    # "about",
     "than",
     "whether",
     "although",
-    #"from",
+    # "from",
     "though",
-    #"until",
+    # "until",
     "unless",
     "once",
-    #"without",
-    #"at",
-    #"into",
+    # "without",
+    # "at",
+    # "into",
     "cause",
-    #"over",
+    # "over",
     "upon",
     "till",
     "whereas",
-    #"beyond",
+    # "beyond",
     "whilst",
     "except",
     "despite",
     "wether",
-    #"then",
+    # "then",
     "but",
     "becuse",
     "whie",
-    #"below",
-    #"against",
+    # "below",
+    # "against",
     "it",
     "w/out",
-    #"toward",
+    # "toward",
     "albeit",
     "save",
     "besides",
@@ -67,17 +67,17 @@ _subordinating_conjunctions = [
     "out",
     "near",
     "seince",
-    #"towards",
+    # "towards",
     "tho",
     "sice",
     "will",
 ]
 
 # This seems kind of wrong too?
-#_relative_pronouns = ["this", "that", "those", "these"]
+# _relative_pronouns = ["this", "that", "those", "these"]
 
 MORPH_RULES = {
-    #"DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+    # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
     "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
     "NN": {
         "something": {"POS": "PRON"},

View File

@@ -30,12 +30,7 @@ for pron in ["i"]:
     for orth in [pron, pron.title()]:
         _exc[orth + "'m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
-            {
-                ORTH: "'m",
-                LEMMA: "be",
-                NORM: "am",
-                TAG: "VBP",
-            },
+            {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
         ]
 
         _exc[orth + "m"] = [

View File

@ -2,8 +2,7 @@
from __future__ import unicode_literals
from collections import OrderedDict
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
class Lemmatizer(object):
@@ -71,13 +70,13 @@ class Lemmatizer(object):
             return True
         elif univ_pos == "adj" and morphology.get("Degree") == "pos":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('VerbForm') == 'none':
+        elif morphology.get("VerbForm") == "none":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('Degree') == 'pos':
+        elif morphology.get("Degree") == "pos":
             return True
         else:
             return False
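
These branches appear to belong to `Lemmatizer.is_base_form`, which returns True when the universal POS plus morphological features already identify the dictionary form, so the surface form can be used as the lemma without a table lookup. (The duplicated `VerbForm == "inf"` branch is preserved from the original code; this commit only changes quoting.) A hedged usage sketch:

    lemmatizer.is_base_form("verb", {"VerbForm": "inf"})  # True: form is the lemma
    lemmatizer.is_base_form("adj", {"Degree": "pos"})     # True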

View File

@@ -137,6 +137,7 @@ class Table(OrderedDict):
     """A table in the lookups. Subclass of builtin dict that implements a
     slightly more consistent and unified API.
     """
+
     @classmethod
     def from_dict(cls, data, name=None):
         self = cls(name=name)
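
Given the signature shown, `Table.from_dict` is a small alternate constructor; usage would be a one-liner (sketch):

    table = Table.from_dict({"dogs": "dog", "geese": "goose"}, name="lemma_lookup")
    table["dogs"]  # -> "dog", with the usual dict API on top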

View File

@@ -119,28 +119,8 @@ def test_oracle_moves_missing_B(en_vocab):
 def test_oracle_moves_whitespace(en_vocab):
-    words = [
-        "production",
-        "\n",
-        "of",
-        "Northrop",
-        "\n",
-        "Corp.",
-        "\n",
-        "'s",
-        "radar",
-    ]
-    biluo_tags = [
-        "O",
-        "O",
-        "O",
-        "B-ORG",
-        None,
-        "I-ORG",
-        "L-ORG",
-        "O",
-        "O",
-    ]
+    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
+    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
     doc = Doc(en_vocab, words=words)
     gold = GoldParse(doc, words=words, entities=biluo_tags)
@@ -156,4 +136,4 @@ def test_oracle_moves_whitespace(en_vocab):
         action, label = tag.split("-")
         moves.add_action(move_types.index(action), label)
     moves.preprocess_gold(gold)
-    seq = moves.get_oracle_sequence(doc, gold)
+    moves.get_oracle_sequence(doc, gold)
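
The compacted test data keeps the point of the test visible: BILUO tags (B=begin, I=inside, L=last, U=unit, O=outside) over a sequence containing newline tokens, with None marking a token whose gold annotation is missing: here the whitespace inside the ORG entity, which the oracle has to handle. A toy tag generator (illustrative only, not spaCy's API):

    def biluo_for_span(n, label="ORG"):
        # BILUO tags for an n-token entity: U- for a single token,
        # otherwise B-, then I-*, then L-.
        if n == 1:
            return ["U-" + label]
        return ["B-" + label] + ["I-" + label] * (n - 2) + ["L-" + label]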