Mirror of https://github.com/explosion/spaCy.git, synced 2025-06-01 19:53:06 +03:00
Auto-format [ci skip]

commit 5e9849b60f
parent afe4a428f7
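The diff below is pure reformatting: trailing commas added to multi-line literals, hand-wrapped calls re-exploded, and a long list literal split one element per line. This matches the output of the `black` formatter (an assumption; the commit message doesn't name the tool), which could be reproduced with something like `pip install black && black spacy`. The `[ci skip]` marker tells the CI service not to run builds for this no-op change.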
@@ -360,7 +360,7 @@ def Tok2Vec(width, embed_size, **kwargs):
             "config": {
                 "vectors_name": pretrained_vectors,
                 "width": width,
-                "column": cols.index("ID")
+                "column": cols.index("ID"),
             },
         }
     if cnn_maxout_pieces >= 2:
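The only change in this hunk is the trailing comma: `black`-style formatting (assumed above) adds a trailing comma to the last element of any literal or call it keeps split across lines, so a future entry added at the end touches only one diff line. Illustrative snippet, not spaCy code:

# An exploded literal gets the trailing comma...
cfg = {
    "width": 96,
    "column": 0,
}
# ...while one that fits on a single line stays collapsed, without it:
cfg = {"width": 96}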
@@ -45,12 +45,15 @@ def MultiHashEmbed(config):
 
     norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")
     if config["use_subwords"]:
-        prefix = HashEmbed(width, rows // 2,
-                           column=cols.index("PREFIX"), name="embed_prefix")
-        suffix = HashEmbed(width, rows // 2,
-                           column=cols.index("SUFFIX"), name="embed_suffix")
-        shape = HashEmbed(width, rows // 2,
-                          column=cols.index("SHAPE"), name="embed_shape")
+        prefix = HashEmbed(
+            width, rows // 2, column=cols.index("PREFIX"), name="embed_prefix"
+        )
+        suffix = HashEmbed(
+            width, rows // 2, column=cols.index("SUFFIX"), name="embed_suffix"
+        )
+        shape = HashEmbed(
+            width, rows // 2, column=cols.index("SHAPE"), name="embed_shape"
+        )
     if config.get("@pretrained_vectors"):
         glove = make_layer(config["@pretrained_vectors"])
         mix = make_layer(config["@mix"])
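This hunk shows the call-splitting style: a call that doesn't fit the line-length limit (88 columns under `black`'s defaults, still an assumption) gets its arguments moved onto one indented line between the parentheses, rather than being aligned under the opening parenthesis. Illustrative before/after:

# Before: hand-wrapped, aligned under the open paren
prefix = HashEmbed(width, rows // 2,
                   column=cols.index("PREFIX"), name="embed_prefix")

# After: arguments on their own indented line, closing paren on its own line
prefix = HashEmbed(
    width, rows // 2, column=cols.index("PREFIX"), name="embed_prefix"
)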
@@ -60,20 +63,16 @@ def MultiHashEmbed(config):
         mix._layers[0].nI = width * 5
         layer = uniqued(
             (glove | norm | prefix | suffix | shape) >> mix,
-            column=cols.index("ORTH")
+            column=cols.index("ORTH"),
         )
     elif config["use_subwords"]:
         mix._layers[0].nI = width * 4
         layer = uniqued(
-            (norm | prefix | suffix | shape) >> mix,
-            column=cols.index("ORTH")
+            (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
         )
     elif config["@pretrained_vectors"]:
         mix._layers[0].nI = width * 2
-        embed = uniqued(
-            (glove | norm) >> mix,
-            column=cols.index("ORTH"),
-        )
+        embed = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
     else:
         embed = norm
     layer.cfg = config
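For readers unfamiliar with the operators in this hunk: spaCy 2.x is built on Thinc 7, where `|` concatenates the outputs of layers, `>>` chains layers, and `uniqued` wraps a layer so that each unique input row (keyed by the ORTH column here) is computed only once per batch. A minimal self-contained sketch of the pattern, assuming Thinc 7.x-era imports; the width, rows, and string column names are illustrative:

# Sketch of the combinator pattern from this hunk (Thinc 7.x-era API).
from thinc.api import chain, concatenate, uniqued
from thinc.i2v import HashEmbed
from thinc.misc import LayerNorm
from thinc.v2v import Model, Maxout

width, rows = 96, 2000
cols = ["ID", "NORM", "PREFIX", "ORTH"]  # illustrative column layout

with Model.define_operators({">>": chain, "|": concatenate}):
    norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")
    prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), name="embed_prefix")
    # `|` concatenates the two embeddings (2 * width features); `>>` pipes the
    # result into a mixing layer that projects back down to `width`.
    mix = LayerNorm(Maxout(width, width * 2, pieces=3))
    # `uniqued` caches results per unique ORTH id, so repeated tokens in a
    # batch are embedded only once.
    layer = uniqued((norm | prefix) >> mix, column=cols.index("ORTH"))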
@@ -21,4 +21,14 @@ def doc(en_tokenizer):
 def test_merge_subtokens(doc):
     doc = merge_subtokens(doc)
     # get_doc() doesn't set spaces, so the result is "And a third ."
-    assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]
+    assert [t.text for t in doc] == [
+        "This",
+        "is",
+        "a sentence",
+        ".",
+        "This",
+        "is",
+        "another sentence",
+        ".",
+        "And a third .",
+    ]
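For context on the reformatted test: `merge_subtokens` is a spaCy pipeline function that retokenizes a `Doc`, merging consecutive tokens linked by a given dependency label (`"subtok"` by default) into single tokens; that's why multi-word strings like `"a sentence"` appear as single token texts in the assertion. A minimal usage sketch (spaCy 2.x API; the model name is an assumption):

# Usage sketch for merge_subtokens (spaCy 2.x).
import spacy
from spacy.pipeline import merge_subtokens

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
doc = nlp("This is a sentence.")
# Merge any spans of tokens connected by the "subtok" dependency label.
# Standard English models rarely emit "subtok", so this may be a no-op here;
# the test above instead constructs a Doc carrying those labels by hand.
doc = merge_subtokens(doc, label="subtok")
print([t.text for t in doc])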