Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00
Tidy up and auto-format
This commit is contained in:
parent 178d010b25
commit af25323653
spacy/_ml.py: 50 lines changed
@@ -348,7 +348,7 @@ def Tok2Vec(width, embed_size, **kwargs):
     if pretrained_vectors is not None:
         glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

-        if subword_features:
+        if subword_features:
             embed = uniqued(
                 (glove | norm | prefix | suffix | shape)
                 >> LN(Maxout(width, width * 5, pieces=3)),
@@ -363,14 +363,16 @@ def Tok2Vec(width, embed_size, **kwargs):
         embed = uniqued(
             (norm | prefix | suffix | shape)
             >> LN(Maxout(width, width * 4, pieces=3)),
-            column=cols.index(ORTH)
+            column=cols.index(ORTH),
         )
-    elif char_embed:
+    elif char_embed:
         embed = concatenate_lists(
             CharacterEmbed(nM=64, nC=8),
-            FeatureExtracter(cols) >> with_flatten(norm)
+            FeatureExtracter(cols) >> with_flatten(norm),
         )
-        reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces))
+        reduce_dimensions = LN(
+            Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
+        )
     else:
         embed = norm
@@ -379,22 +381,14 @@ def Tok2Vec(width, embed_size, **kwargs):
         >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
     )
     if char_embed:
-        tok2vec = (
-            embed
-            >> with_flatten(
-                reduce_dimensions
-                >> convolution ** conv_depth, pad=conv_depth
-            )
+        tok2vec = embed >> with_flatten(
+            reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
+        )
     else:
-        tok2vec = (
-            FeatureExtracter(cols)
-            >> with_flatten(
-                embed
-                >> convolution ** conv_depth, pad=conv_depth
-            )
+        tok2vec = FeatureExtracter(cols) >> with_flatten(
+            embed >> convolution ** conv_depth, pad=conv_depth
+        )

     if bilstm_depth >= 1:
         tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
     # Work around thinc API limitations :(. TODO: Revise in Thinc 7
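The `>>` and `|` operators in the Tok2Vec hunks above are thinc combinators: `>>` feeds one layer's output into the next, `|` runs layers on the same input and concatenates their outputs, and `convolution ** conv_depth` stacks clones of the convolution. As a rough, self-contained illustration of why these expressions read as pipelines (a toy sketch of operator overloading, not thinc's implementation), consider:

# Toy sketch only: mimics the combinator style used in the diff above.
class Layer:
    def __init__(self, fn, name="layer"):
        self.fn, self.name = fn, name

    def __call__(self, x):
        return self.fn(x)

    def __rshift__(self, other):  # chain: run self, then other on the result
        return Layer(lambda x: other(self(x)), "(%s >> %s)" % (self.name, other.name))

    def __or__(self, other):  # parallel: run both on the same input, concatenate
        return Layer(lambda x: self(x) + other(x), "(%s | %s)" % (self.name, other.name))


double = Layer(lambda xs: [v * 2 for v in xs], "double")
inc = Layer(lambda xs: [v + 1 for v in xs], "inc")
total = Layer(lambda xs: sum(xs), "sum")
model = (double | inc) >> total
print(model.name, model([1, 2, 3]))  # ((double | inc) >> sum) 21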
@@ -611,9 +605,7 @@ def build_morphologizer_model(class_nums, **cfg):
         char_embed=char_embed,
         pretrained_vectors=pretrained_vectors,
     )
-    softmax = with_flatten(
-        MultiSoftmax(class_nums, token_vector_width)
-    )
+    softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
     softmax.out_sizes = class_nums
     model = tok2vec >> softmax
     model.nI = None
@@ -906,16 +898,17 @@ def _replace_word(word, random_words, mask="[MASK]"):
 def _uniform_init(lo, hi):
     def wrapped(W, ops):
         copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))

     return wrapped


 @describe.attributes(
     nM=Dimension("Vector dimensions"),
     nC=Dimension("Number of characters per word"),
-    vectors=Synapses("Embed matrix",
-        lambda obj: (obj.nC, obj.nV, obj.nM),
-        _uniform_init(-0.1, 0.1)),
-    d_vectors=Gradient("vectors")
+    vectors=Synapses(
+        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
+    ),
+    d_vectors=Gradient("vectors"),
 )
 class CharacterEmbed(Model):
     def __init__(self, nM=None, nC=None, **kwargs):
@@ -926,12 +919,12 @@ class CharacterEmbed(Model):
     @property
     def nO(self):
         return self.nM * self.nC

     @property
     def nV(self):
         return 256

-    def begin_update(self, docs, drop=0.):
+    def begin_update(self, docs, drop=0.0):
         if not docs:
             return []
         ids = []
@@ -959,6 +952,7 @@ class CharacterEmbed(Model):
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
             return None
+
         return output, backprop_character_embed
@@ -974,4 +968,4 @@ def get_cossim_loss(yh, y):
     cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
     d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
     loss = xp.abs(cosine - 1).sum()
-    return loss, -d_yh
+    return loss, -d_yh
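For reference, the cosine loss in the hunk above can be exercised stand-alone. The sketch below mirrors the formulas shown in the diff; the definitions of norm_yh, norm_y and mul_norms are not visible in the hunk, so row-wise L2 norms are assumed.

import numpy as np

def cossim_loss(yh, y):
    # Assumed definitions: the hunk only shows their use, not where they are set.
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    # d(cosine)/d(yh); the loss sums (1 - cosine), hence the sign flip on return.
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    loss = np.abs(cosine - 1).sum()
    return loss, -d_yh

yh = np.array([[1.0, 2.0], [0.5, -1.0]])
print(cossim_loss(yh, yh.copy())[0])  # ~0.0 for identical rows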
@@ -64,7 +64,12 @@ from .. import about
         str,
     ),
     noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
-    orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
+    orth_variant_level=(
+        "Amount of orthography variation for data augmentation",
+        "option",
+        "ovl",
+        float,
+    ),
     eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
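The tuples being reflowed here follow plac's annotation format of (help text, kind, abbreviation, type), which is how spaCy's command line arguments were declared at the time. A minimal stand-alone sketch of that mechanism (a hypothetical script, not spaCy's actual train command):

import plac

@plac.annotations(
    noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
    orth_variant_level=(
        "Amount of orthography variation for data augmentation",
        "option",
        "ovl",
        float,
    ),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
)
def train(noise_level=0.0, orth_variant_level=0.0, gold_preproc=False):
    # Stand-in body: a real command would load data and update the model here.
    print(noise_level, orth_variant_level, gold_preproc)

if __name__ == "__main__":
    plac.call(train)  # e.g. python train.py -nl 0.1 -ovl 0.25 -G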
@@ -245,7 +250,11 @@ def train(
     best_score = 0.0
     for i in range(n_iter):
         train_docs = corpus.train_docs(
-            nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0
+            nlp,
+            noise_level=noise_level,
+            orth_variant_level=orth_variant_level,
+            gold_preproc=gold_preproc,
+            max_length=0,
         )
         if raw_text:
             random.shuffle(raw_text)
@@ -456,6 +456,7 @@ class Errors(object):
     E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
     E160 = ("Can't find language data file: {path}")


 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")
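These error constants are plain format strings whose placeholders are filled at raise time with str.format. A minimal illustration, not the library's exact call sites:

E159 = "Can't find table '{name}' in lookups. Available tables: {tables}"
E160 = "Can't find language data file: {path}"

print(E160.format(path="/some/missing/path.json"))
# In library code this would typically appear as, for example:
#     raise KeyError(Errors.E159.format(name=name, tables=tables))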
@@ -27,10 +27,20 @@ class GermanDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     resources = {"lemma_lookup": "lemma_lookup.json"}
-    single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]},
-                            {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")]},
-                            {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")]}]
+    single_orth_variants = [
+        {"tags": ["$("], "variants": ["…", "..."]},
+        {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {
+            "tags": ["$("],
+            "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
+        },
+        {
+            "tags": ["$("],
+            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
+        },
+    ]


 class German(Language):
@@ -38,10 +38,14 @@ class EnglishDefaults(Language.Defaults):
         "lemma_index": "lemmatizer/lemma_index.json",
         "lemma_exc": "lemmatizer/lemma_exc.json",
     }
-    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]},
-                            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
-                            {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]
+    single_orth_variants = [
+        {"tags": ["NFP"], "variants": ["…", "..."]},
+        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
+        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
+    ]


 class English(Language):
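The single_orth_variants and paired_orth_variants tables above pair tag constraints with interchangeable surface forms, which the orth_variant_level augmentation in the train command draws from. Purely to illustrate the data layout (this is not spaCy's augmentation code), a naive quote-style swap over the English table might look like:

import random

paired_orth_variants = [
    {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
    {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
]

def swap_quote_style(words, tags):
    # Pick one table entry and one of its variant pairs at random, then rewrite
    # tokens whose tag marks them as an opening (``) or closing ('') quote.
    entry = random.choice(paired_orth_variants)
    open_tag, close_tag = entry["tags"]
    new_open, new_close = random.choice(entry["variants"])
    return [
        new_open if tag == open_tag else new_close if tag == close_tag else word
        for word, tag in zip(words, tags)
    ]

print(swap_quote_style(['"', "hello", '"'], ["``", "UH", "''"]))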
@@ -12,50 +12,50 @@ _subordinating_conjunctions = [
     "if",
     "as",
     "because",
-    #"of",
-    #"for",
-    #"before",
-    #"in",
+    # "of",
+    # "for",
+    # "before",
+    # "in",
     "while",
-    #"after",
+    # "after",
     "since",
     "like",
-    #"with",
+    # "with",
     "so",
-    #"to",
-    #"by",
-    #"on",
-    #"about",
+    # "to",
+    # "by",
+    # "on",
+    # "about",
     "than",
     "whether",
     "although",
-    #"from",
+    # "from",
     "though",
-    #"until",
+    # "until",
     "unless",
     "once",
-    #"without",
-    #"at",
-    #"into",
+    # "without",
+    # "at",
+    # "into",
     "cause",
-    #"over",
+    # "over",
     "upon",
     "till",
     "whereas",
-    #"beyond",
+    # "beyond",
     "whilst",
     "except",
     "despite",
     "wether",
-    #"then",
+    # "then",
     "but",
     "becuse",
     "whie",
-    #"below",
-    #"against",
+    # "below",
+    # "against",
     "it",
     "w/out",
-    #"toward",
+    # "toward",
     "albeit",
     "save",
     "besides",
@@ -67,17 +67,17 @@ _subordinating_conjunctions = [
     "out",
     "near",
     "seince",
-    #"towards",
+    # "towards",
     "tho",
     "sice",
     "will",
 ]

 # This seems kind of wrong too?
-#_relative_pronouns = ["this", "that", "those", "these"]
+# _relative_pronouns = ["this", "that", "those", "these"]

 MORPH_RULES = {
-    #"DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+    # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
     "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
     "NN": {
         "something": {"POS": "PRON"},
@@ -30,12 +30,7 @@ for pron in ["i"]:
     for orth in [pron, pron.title()]:
         _exc[orth + "'m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
-            {
-                ORTH: "'m",
-                LEMMA: "be",
-                NORM: "am",
-                TAG: "VBP",
-            },
+            {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
         ]

         _exc[orth + "m"] = [
@@ -2,8 +2,7 @@
 from __future__ import unicode_literals
-from collections import OrderedDict

-from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
 from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
+from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN


 class Lemmatizer(object):
@@ -71,13 +70,13 @@ class Lemmatizer(object):
             return True
         elif univ_pos == "adj" and morphology.get("Degree") == "pos":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('VerbForm') == 'none':
+        elif morphology.get("VerbForm") == "none":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('Degree') == 'pos':
+        elif morphology.get("Degree") == "pos":
             return True
         else:
             return False
@@ -137,6 +137,7 @@ class Table(OrderedDict):
     """A table in the lookups. Subclass of builtin dict that implements a
     slightly more consistent and unified API.
     """

     @classmethod
     def from_dict(cls, data, name=None):
         self = cls(name=name)
@@ -119,28 +119,8 @@ def test_oracle_moves_missing_B(en_vocab):


 def test_oracle_moves_whitespace(en_vocab):
-    words = [
-        "production",
-        "\n",
-        "of",
-        "Northrop",
-        "\n",
-        "Corp.",
-        "\n",
-        "'s",
-        "radar",
-    ]
-    biluo_tags = [
-        "O",
-        "O",
-        "O",
-        "B-ORG",
-        None,
-        "I-ORG",
-        "L-ORG",
-        "O",
-        "O",
-    ]
+    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
+    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]

     doc = Doc(en_vocab, words=words)
     gold = GoldParse(doc, words=words, entities=biluo_tags)
@@ -156,4 +136,4 @@ def test_oracle_moves_whitespace(en_vocab):
         action, label = tag.split("-")
         moves.add_action(move_types.index(action), label)
     moves.preprocess_gold(gold)
-    seq = moves.get_oracle_sequence(doc, gold)
+    moves.get_oracle_sequence(doc, gold)
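The biluo_tags in this test use the BILUO scheme (Begin, In, Last, Unit, Out), with None marking tokens whose gold annotation is unknown or misaligned, here a whitespace token inside the entity. A small stand-alone decoder, just to illustrate the scheme rather than spaCy's own alignment logic:

def decode_biluo(tags):
    # Collect (start, end, label) token spans; None tags (unknown annotation)
    # are simply skipped here, whereas spaCy treats them more carefully.
    spans, start = [], None
    for i, tag in enumerate(tags):
        if tag is None or tag == "O":
            continue
        action, label = tag.split("-")
        if action in ("B", "U"):
            start = i
        if action in ("U", "L"):
            spans.append((start, i + 1, label))
            start = None
    return spans

words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
print([(words[s:e], label) for s, e, label in decode_biluo(biluo_tags)])
# [(['Northrop', '\n', 'Corp.'], 'ORG')]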