Mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format

parent 178d010b25
commit af25323653

spacy/_ml.py | 40

@@ -363,14 +363,16 @@ def Tok2Vec(width, embed_size, **kwargs):
             embed = uniqued(
                 (norm | prefix | suffix | shape)
                 >> LN(Maxout(width, width * 4, pieces=3)),
-                column=cols.index(ORTH)
+                column=cols.index(ORTH),
             )
         elif char_embed:
             embed = concatenate_lists(
                 CharacterEmbed(nM=64, nC=8),
-                FeatureExtracter(cols) >> with_flatten(norm)
+                FeatureExtracter(cols) >> with_flatten(norm),
             )
-            reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces))
+            reduce_dimensions = LN(
+                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
+            )
         else:
             embed = norm
 
@@ -379,20 +381,12 @@ def Tok2Vec(width, embed_size, **kwargs):
             >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
         )
         if char_embed:
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    reduce_dimensions
-                    >> convolution ** conv_depth, pad=conv_depth
-                )
-            )
+            tok2vec = embed >> with_flatten(
+                reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
+            )
         else:
-            tok2vec = (
-                FeatureExtracter(cols)
-                >> with_flatten(
-                    embed
-                    >> convolution ** conv_depth, pad=conv_depth
-                )
-            )
+            tok2vec = FeatureExtracter(cols) >> with_flatten(
+                embed >> convolution ** conv_depth, pad=conv_depth
+            )
 
         if bilstm_depth >= 1:
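
Aside: the `>>`, `|` and `**` operators in the Tok2Vec hunks above are Thinc 7.x combinators that are only bound inside `Model.define_operators`. A minimal, self-contained sketch of that style (assuming Thinc 7.x import paths; this is an illustration, not spaCy's actual model) looks like this:

# Minimal sketch of the combinator style used in Tok2Vec (assumes thinc 7.x).
# ">>" is bound to chain and "**" to clone only inside define_operators.
from thinc.api import chain, clone, with_flatten
from thinc.v2v import Model, Maxout

width = 96
with Model.define_operators({">>": chain, "**": clone}):
    layer = Maxout(width, width, pieces=3)  # one Maxout block, width -> width
    stack = layer ** 4                      # four cloned copies chained in sequence
    model = with_flatten(stack)             # run the stack over a flattened list of arrays
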
@@ -611,9 +605,7 @@ def build_morphologizer_model(class_nums, **cfg):
         char_embed=char_embed,
         pretrained_vectors=pretrained_vectors,
     )
-    softmax = with_flatten(
-        MultiSoftmax(class_nums, token_vector_width)
-    )
+    softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
     softmax.out_sizes = class_nums
     model = tok2vec >> softmax
     model.nI = None
@@ -906,16 +898,17 @@ def _replace_word(word, random_words, mask="[MASK]"):
 def _uniform_init(lo, hi):
     def wrapped(W, ops):
         copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
 
     return wrapped
 
 
 @describe.attributes(
     nM=Dimension("Vector dimensions"),
     nC=Dimension("Number of characters per word"),
-    vectors=Synapses("Embed matrix",
-        lambda obj: (obj.nC, obj.nV, obj.nM),
-        _uniform_init(-0.1, 0.1)),
-    d_vectors=Gradient("vectors")
+    vectors=Synapses(
+        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
+    ),
+    d_vectors=Gradient("vectors"),
 )
 class CharacterEmbed(Model):
     def __init__(self, nM=None, nC=None, **kwargs):
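
The `@describe.attributes(...)` block being reformatted here is Thinc's declarative way of registering a model's dimensions and weight arrays. A rough, self-contained sketch of the pattern (assuming Thinc 7.x; `TinyLayer` is a made-up example, not part of spaCy):

from thinc import describe
from thinc.describe import Dimension, Gradient, Synapses
from thinc.v2v import Model


@describe.attributes(
    nI=Dimension("Input size"),
    nO=Dimension("Output size"),
    W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI)),
    d_W=Gradient("W"),
)
class TinyLayer(Model):
    # The decorator allocates W and d_W in self._mem once nO and nI are known.
    def __init__(self, nO, nI):
        Model.__init__(self)
        self.nO = nO
        self.nI = nI
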
@@ -931,7 +924,7 @@ class CharacterEmbed(Model):
     def nV(self):
         return 256
 
-    def begin_update(self, docs, drop=0.):
+    def begin_update(self, docs, drop=0.0):
         if not docs:
             return []
         ids = []
@@ -959,6 +952,7 @@ class CharacterEmbed(Model):
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
             return None
 
         return output, backprop_character_embed
 
 
@@ -64,7 +64,12 @@ from .. import about
         str,
     ),
     noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
-    orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
+    orth_variant_level=(
+        "Amount of orthography variation for data augmentation",
+        "option",
+        "ovl",
+        float,
+    ),
     eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
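
The tuples in the hunk above are plac option annotations in the form (help text, kind, abbreviation, type). A tiny standalone example of the same mechanism (hypothetical script, not the spacy train CLI):

import plac


@plac.annotations(
    noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
)
def demo(noise_level=0.0, gold_preproc=False):
    # e.g. `python demo.py -nl 0.25 -G`
    print(noise_level, gold_preproc)


if __name__ == "__main__":
    plac.call(demo)
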
@@ -245,7 +250,11 @@ def train(
     best_score = 0.0
     for i in range(n_iter):
         train_docs = corpus.train_docs(
-            nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0
+            nlp,
+            noise_level=noise_level,
+            orth_variant_level=orth_variant_level,
+            gold_preproc=gold_preproc,
+            max_length=0,
         )
         if raw_text:
             random.shuffle(raw_text)
@@ -456,6 +456,7 @@ class Errors(object):
     E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
     E160 = ("Can't find language data file: {path}")
 
 
 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")
@@ -27,10 +27,20 @@ class GermanDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     resources = {"lemma_lookup": "lemma_lookup.json"}
-    single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]},
-                            {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")]},
-                            {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")]}]
+    single_orth_variants = [
+        {"tags": ["$("], "variants": ["…", "..."]},
+        {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {
+            "tags": ["$("],
+            "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
+        },
+        {
+            "tags": ["$("],
+            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
+        },
+    ]
 
 
 class German(Language):
@@ -38,10 +38,14 @@ class EnglishDefaults(Language.Defaults):
         "lemma_index": "lemmatizer/lemma_index.json",
         "lemma_exc": "lemmatizer/lemma_exc.json",
     }
-    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]},
-                            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}]
-    paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
-                            {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]
+    single_orth_variants = [
+        {"tags": ["NFP"], "variants": ["…", "..."]},
+        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
+    ]
+    paired_orth_variants = [
+        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
+        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
+    ]
 
 
 class English(Language):
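
The single_orth_variants and paired_orth_variants tables above declare interchangeable surface forms per tag, which the training code can draw on for orthographic data augmentation. A toy illustration of what such a table encodes (a sketch only, not spaCy's actual augmentation code):

import random

# Toy illustration: swap a token for one of its listed orthographic variants
# when its tag matches one of the groups.
single_orth_variants = [
    {"tags": ["NFP"], "variants": ["…", "..."]},
    {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
]


def perturb(word, tag):
    for group in single_orth_variants:
        if tag in group["tags"] and word in group["variants"]:
            return random.choice(group["variants"])
    return word


print(perturb("...", "NFP"))  # randomly "…" or "..."
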
@@ -12,50 +12,50 @@ _subordinating_conjunctions = [
     "if",
     "as",
     "because",
-    #"of",
-    #"for",
-    #"before",
-    #"in",
+    # "of",
+    # "for",
+    # "before",
+    # "in",
     "while",
-    #"after",
+    # "after",
     "since",
     "like",
-    #"with",
+    # "with",
     "so",
-    #"to",
-    #"by",
-    #"on",
-    #"about",
+    # "to",
+    # "by",
+    # "on",
+    # "about",
     "than",
     "whether",
     "although",
-    #"from",
+    # "from",
     "though",
-    #"until",
+    # "until",
     "unless",
     "once",
-    #"without",
-    #"at",
-    #"into",
+    # "without",
+    # "at",
+    # "into",
     "cause",
-    #"over",
+    # "over",
     "upon",
     "till",
     "whereas",
-    #"beyond",
+    # "beyond",
     "whilst",
     "except",
     "despite",
     "wether",
-    #"then",
+    # "then",
     "but",
     "becuse",
     "whie",
-    #"below",
-    #"against",
+    # "below",
+    # "against",
     "it",
     "w/out",
-    #"toward",
+    # "toward",
     "albeit",
     "save",
     "besides",
@@ -67,17 +67,17 @@ _subordinating_conjunctions = [
     "out",
     "near",
     "seince",
-    #"towards",
+    # "towards",
     "tho",
     "sice",
     "will",
 ]
 
 # This seems kind of wrong too?
-#_relative_pronouns = ["this", "that", "those", "these"]
+# _relative_pronouns = ["this", "that", "those", "these"]
 
 MORPH_RULES = {
-    #"DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+    # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
     "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
     "NN": {
         "something": {"POS": "PRON"},
@@ -30,12 +30,7 @@ for pron in ["i"]:
     for orth in [pron, pron.title()]:
         _exc[orth + "'m"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
-            {
-                ORTH: "'m",
-                LEMMA: "be",
-                NORM: "am",
-                TAG: "VBP",
-            },
+            {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
         ]
 
         _exc[orth + "m"] = [
@@ -2,8 +2,7 @@
 from __future__ import unicode_literals
 from collections import OrderedDict
 
-from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
-from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
+from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
 
 
 class Lemmatizer(object):
@@ -71,13 +70,13 @@ class Lemmatizer(object):
             return True
         elif univ_pos == "adj" and morphology.get("Degree") == "pos":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('VerbForm') == 'none':
+        elif morphology.get("VerbForm") == "none":
             return True
-        elif morphology.get('VerbForm') == 'inf':
+        elif morphology.get("VerbForm") == "inf":
             return True
-        elif morphology.get('Degree') == 'pos':
+        elif morphology.get("Degree") == "pos":
             return True
         else:
             return False
@@ -137,6 +137,7 @@ class Table(OrderedDict):
     """A table in the lookups. Subclass of builtin dict that implements a
     slightly more consistent and unified API.
     """
 
     @classmethod
     def from_dict(cls, data, name=None):
         self = cls(name=name)
@@ -119,28 +119,8 @@ def test_oracle_moves_missing_B(en_vocab):
 
 
 def test_oracle_moves_whitespace(en_vocab):
-    words = [
-        "production",
-        "\n",
-        "of",
-        "Northrop",
-        "\n",
-        "Corp.",
-        "\n",
-        "'s",
-        "radar",
-    ]
-    biluo_tags = [
-        "O",
-        "O",
-        "O",
-        "B-ORG",
-        None,
-        "I-ORG",
-        "L-ORG",
-        "O",
-        "O",
-    ]
+    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
+    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
 
     doc = Doc(en_vocab, words=words)
     gold = GoldParse(doc, words=words, entities=biluo_tags)
@@ -156,4 +136,4 @@ def test_oracle_moves_whitespace(en_vocab):
             action, label = tag.split("-")
             moves.add_action(move_types.index(action), label)
     moves.preprocess_gold(gold)
-    seq = moves.get_oracle_sequence(doc, gold)
+    moves.get_oracle_sequence(doc, gold)
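
For context on the test above: the biluo_tags list uses the BILUO scheme (B=begin, I=inside, L=last, U=unit, O=outside; None marks a missing annotation, here the stray whitespace tokens). A small pure-Python sketch of how such tags decode into entity spans (illustrative only, not spaCy's implementation):

def biluo_to_spans(tags):
    # Collect (start, end, label) spans from BILUO tags, skipping None/"O".
    spans, start = [], None
    for i, tag in enumerate(tags):
        if tag is None or tag == "O":
            continue
        action, label = tag.split("-")
        if action == "U":
            spans.append((i, i + 1, label))
        elif action == "B":
            start = i
        elif action == "L" and start is not None:
            spans.append((start, i + 1, label))
            start = None
    return spans


biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
print(biluo_to_spans(biluo_tags))  # [(3, 7, 'ORG')]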