Tidy up and auto-format

Ines Montani 2019-09-11 14:00:36 +02:00
parent 178d010b25
commit af25323653
10 changed files with 90 additions and 97 deletions

View File

@@ -348,7 +348,7 @@ def Tok2Vec(width, embed_size, **kwargs):
         if pretrained_vectors is not None:
             glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
             if subword_features:
                 embed = uniqued(
                     (glove | norm | prefix | suffix | shape)
                     >> LN(Maxout(width, width * 5, pieces=3)),
@@ -363,14 +363,16 @@ def Tok2Vec(width, embed_size, **kwargs):
             embed = uniqued(
                 (norm | prefix | suffix | shape)
                 >> LN(Maxout(width, width * 4, pieces=3)),
-                column=cols.index(ORTH)
+                column=cols.index(ORTH),
             )
         elif char_embed:
             embed = concatenate_lists(
                 CharacterEmbed(nM=64, nC=8),
-                FeatureExtracter(cols) >> with_flatten(norm)
+                FeatureExtracter(cols) >> with_flatten(norm),
+            )
+            reduce_dimensions = LN(
+                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
             )
-            reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces))
         else:
             embed = norm
@@ -379,22 +381,14 @@ def Tok2Vec(width, embed_size, **kwargs):
             >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
         )
         if char_embed:
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    reduce_dimensions
-                    >> convolution ** conv_depth, pad=conv_depth
-                )
+            tok2vec = embed >> with_flatten(
+                reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
             )
         else:
-            tok2vec = (
-                FeatureExtracter(cols)
-                >> with_flatten(
-                    embed
-                    >> convolution ** conv_depth, pad=conv_depth
-                )
+            tok2vec = FeatureExtracter(cols) >> with_flatten(
+                embed >> convolution ** conv_depth, pad=conv_depth
             )
         if bilstm_depth >= 1:
             tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
@@ -611,9 +605,7 @@ def build_morphologizer_model(class_nums, **cfg):
         char_embed=char_embed,
         pretrained_vectors=pretrained_vectors,
     )
-    softmax = with_flatten(
-        MultiSoftmax(class_nums, token_vector_width)
-    )
+    softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
    softmax.out_sizes = class_nums
    model = tok2vec >> softmax
    model.nI = None
@@ -906,16 +898,17 @@ def _replace_word(word, random_words, mask="[MASK]"):
 def _uniform_init(lo, hi):
     def wrapped(W, ops):
         copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
+
     return wrapped


 @describe.attributes(
     nM=Dimension("Vector dimensions"),
     nC=Dimension("Number of characters per word"),
-    vectors=Synapses("Embed matrix",
-        lambda obj: (obj.nC, obj.nV, obj.nM),
-        _uniform_init(-0.1, 0.1)),
-    d_vectors=Gradient("vectors")
+    vectors=Synapses(
+        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
+    ),
+    d_vectors=Gradient("vectors"),
 )
 class CharacterEmbed(Model):
     def __init__(self, nM=None, nC=None, **kwargs):
@@ -926,12 +919,12 @@ class CharacterEmbed(Model):
     @property
     def nO(self):
         return self.nM * self.nC

     @property
     def nV(self):
         return 256

-    def begin_update(self, docs, drop=0.):
+    def begin_update(self, docs, drop=0.0):
         if not docs:
             return []
         ids = []
@@ -959,6 +952,7 @@ class CharacterEmbed(Model):
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
             return None
+
         return output, backprop_character_embed
@@ -974,4 +968,4 @@ def get_cossim_loss(yh, y):
     cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
     d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
     loss = xp.abs(cosine - 1).sum()
     return loss, -d_yh

View File

@@ -64,7 +64,12 @@ from .. import about
         str,
     ),
     noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
-    orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
+    orth_variant_level=(
+        "Amount of orthography variation for data augmentation",
+        "option",
+        "ovl",
+        float,
+    ),
     eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
@@ -245,7 +250,11 @@ def train(
     best_score = 0.0
     for i in range(n_iter):
         train_docs = corpus.train_docs(
-            nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0
+            nlp,
+            noise_level=noise_level,
+            orth_variant_level=orth_variant_level,
+            gold_preproc=gold_preproc,
+            max_length=0,
         )
         if raw_text:
             random.shuffle(raw_text)

View File

@@ -456,6 +456,7 @@ class Errors(object):
     E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
     E160 = ("Can't find language data file: {path}")

+
 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")

View File

@@ -27,10 +27,20 @@ class GermanDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     resources = {"lemma_lookup": "lemma_lookup.json"}
-    single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]},
-                            {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("", ""), ("", ""), ("", "")]},
-                            {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("", ""), ("»", "«"), ("«", "»")]}]
+    single_orth_variants = [
+        {"tags": ["$("], "variants": ["…", "..."]},
+        {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {
+            "tags": ["$("],
+            "variants": [("'", "'"), (",", "'"), ("", ""), ("", ""), ("", "")],
+        },
+        {
+            "tags": ["$("],
+            "variants": [("``", "''"), ('"', '"'), ("", ""), ("»", "«"), ("«", "»")],
+        },
+    ]


 class German(Language):

View File

@@ -38,10 +38,14 @@ class EnglishDefaults(Language.Defaults):
         "lemma_index": "lemmatizer/lemma_index.json",
         "lemma_exc": "lemmatizer/lemma_exc.json",
     }
-    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]},
-                            {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
-                            {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]
+    single_orth_variants = [
+        {"tags": ["NFP"], "variants": ["…", "..."]},
+        {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
+        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
+    ]


 class English(Language):

View File

@@ -12,50 +12,50 @@ _subordinating_conjunctions = [
     "if",
     "as",
     "because",
-    #"of",
-    #"for",
-    #"before",
-    #"in",
+    # "of",
+    # "for",
+    # "before",
+    # "in",
     "while",
-    #"after",
+    # "after",
     "since",
     "like",
-    #"with",
+    # "with",
     "so",
-    #"to",
-    #"by",
-    #"on",
-    #"about",
+    # "to",
+    # "by",
+    # "on",
+    # "about",
     "than",
     "whether",
     "although",
-    #"from",
+    # "from",
     "though",
-    #"until",
+    # "until",
     "unless",
     "once",
-    #"without",
-    #"at",
-    #"into",
+    # "without",
+    # "at",
+    # "into",
     "cause",
-    #"over",
+    # "over",
     "upon",
     "till",
     "whereas",
-    #"beyond",
+    # "beyond",
     "whilst",
     "except",
     "despite",
     "wether",
-    #"then",
+    # "then",
     "but",
     "becuse",
     "whie",
-    #"below",
-    #"against",
+    # "below",
+    # "against",
     "it",
     "w/out",
-    #"toward",
+    # "toward",
     "albeit",
     "save",
     "besides",
@@ -67,17 +67,17 @@ _subordinating_conjunctions = [
     "out",
     "near",
     "seince",
-    #"towards",
+    # "towards",
     "tho",
     "sice",
     "will",
 ]

 # This seems kind of wrong too?
-#_relative_pronouns = ["this", "that", "those", "these"]
+# _relative_pronouns = ["this", "that", "those", "these"]

 MORPH_RULES = {
-    #"DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+    # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
     "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
     "NN": {
         "something": {"POS": "PRON"},

View File

@@ -30,12 +30,7 @@ for pron in ["i"]:
     for orth in [pron, pron.title()]:
         _exc[orth + "'m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
-            {
-                ORTH: "'m",
-                LEMMA: "be",
-                NORM: "am",
-                TAG: "VBP",
-            },
+            {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
         ]

         _exc[orth + "m"] = [

View File

@@ -2,8 +2,7 @@
 from __future__ import unicode_literals

 from collections import OrderedDict
-from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
-from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
+from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN


 class Lemmatizer(object):
@@ -71,13 +70,13 @@ class Lemmatizer(object):
             return True
         elif univ_pos == "adj" and morphology.get("Degree") == "pos":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('VerbForm') == 'none':
+        elif morphology.get("VerbForm") == "none":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('Degree') == 'pos':
+        elif morphology.get("Degree") == "pos":
             return True
         else:
             return False

View File

@@ -137,6 +137,7 @@ class Table(OrderedDict):
     """A table in the lookups. Subclass of builtin dict that implements a
     slightly more consistent and unified API.
     """
+
     @classmethod
     def from_dict(cls, data, name=None):
         self = cls(name=name)

View File

@@ -119,28 +119,8 @@ def test_oracle_moves_missing_B(en_vocab):
 def test_oracle_moves_whitespace(en_vocab):
-    words = [
-        "production",
-        "\n",
-        "of",
-        "Northrop",
-        "\n",
-        "Corp.",
-        "\n",
-        "'s",
-        "radar",
-    ]
-    biluo_tags = [
-        "O",
-        "O",
-        "O",
-        "B-ORG",
-        None,
-        "I-ORG",
-        "L-ORG",
-        "O",
-        "O",
-    ]
+    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
+    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
     doc = Doc(en_vocab, words=words)
     gold = GoldParse(doc, words=words, entities=biluo_tags)
@@ -156,4 +136,4 @@ def test_oracle_moves_whitespace(en_vocab):
             action, label = tag.split("-")
             moves.add_action(move_types.index(action), label)
     moves.preprocess_gold(gold)
-    seq = moves.get_oracle_sequence(doc, gold)
+    moves.get_oracle_sequence(doc, gold)