Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-28 02:04:07 +03:00
Update requirements, fixing windows crashes (#13727)

* Re-enable pretraining test
* Require thinc 8.3.4
* Reformat
* Re-enable test

parent 311f7cc9fb
commit ba7468e32e
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.3.0,<8.4.0",
+    "thinc>=8.3.4,<8.4.0",
     "numpy>=2.0.0,<3.0.0"
 ]
 build-backend = "setuptools.build_meta"
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.3.0,<8.4.0
+thinc>=8.3.4,<8.4.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
@@ -41,7 +41,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.3.0,<8.4.0
+    thinc>=8.3.4,<8.4.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
@@ -49,7 +49,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.3.0,<8.4.0
+    thinc>=8.3.4,<8.4.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
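All four requirement hunks above make the same functional change: the minimum thinc version rises from 8.3.0 to 8.3.4 while staying below the 8.4 series, in the build requirements (pyproject.toml), the development requirements (requirements.txt), and both setup_requires and install_requires (setup.cfg). A minimal sketch of what the new specifier accepts, using the packaging library (the snippet is illustrative and not part of the commit):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

# The new pin introduced by this commit.
pin = SpecifierSet(">=8.3.4,<8.4.0")

assert Version("8.3.4") in pin      # the new floor is accepted
assert Version("8.3.9") in pin      # later 8.3.x patch releases are accepted
assert Version("8.3.0") not in pin  # the old floor no longer satisfies the pin
assert Version("8.4.0") not in pin  # the next minor series is excluded
```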
@@ -1,4 +1,5 @@
 """Test that longer and mixed texts are tokenized correctly."""
+
 import pytest
 
 
@@ -3,7 +3,13 @@ import pytest
 
 @pytest.mark.parametrize(
     "word,lemma",
-    [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")],
+    [
+        ("新しく", "新しい"),
+        ("赤く", "赤い"),
+        ("すごく", "すごい"),
+        ("いただきました", "いただく"),
+        ("なった", "なる"),
+    ],
 )
 def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
     test_lemma = ja_tokenizer(word)[0].lemma_
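The test-file hunks are pure reformatting: each single-line parametrize list is exploded so every (word, lemma) case sits on its own line, and the trailing comma after the last case keeps formatters like black from collapsing the list again. A minimal self-contained sketch of the same pattern with hypothetical data, runnable under pytest:

```python
import pytest


@pytest.mark.parametrize(
    "word,lemma",
    [
        ("running", "run"),
        ("better", "good"),
    ],
)
def test_lemma_pairs(word, lemma):
    # Each tuple in the list becomes one independent test case.
    assert isinstance(word, str) and isinstance(lemma, str)
```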
@@ -143,7 +143,12 @@ def test_ja_tokenizer_sub_tokens(
     [
         (
             "取ってつけた",
-            (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
+            (
+                ["五段-ラ行;連用形-促音便"],
+                [],
+                ["下一段-カ行;連用形-一般"],
+                ["助動詞-タ;終止形-一般"],
+            ),
             (["トッ"], ["テ"], ["ツケ"], ["タ"]),
         ),
         ("2=3", ([], [], []), (["ニ"], ["_"], ["サン"])),
@@ -2,7 +2,14 @@ import pytest
 
 
 @pytest.mark.parametrize(
-    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+    "word,lemma",
+    [
+        ("새로운", "새롭"),
+        ("빨간", "빨갛"),
+        ("클수록", "크"),
+        ("뭡니까", "뭣"),
+        ("됐다", "되"),
+    ],
 )
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
@@ -1,4 +1,5 @@
 """Words like numbers are recognized correctly."""
+
 import pytest
 
 
@@ -265,50 +265,50 @@ def test_pretraining_tagger():
 
 
 # Try to debug segfault on windows
-#def test_pretraining_training():
-#    """Test that training can use a pretrained Tok2Vec model"""
-#    config = Config().from_str(pretrain_string_internal)
-#    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
-#    filled = nlp.config
-#    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
-#    filled = pretrain_config.merge(filled)
-#    train_config = util.load_config(DEFAULT_CONFIG_PATH)
-#    filled = train_config.merge(filled)
-#    with make_tempdir() as tmp_dir:
-#        pretrain_dir = tmp_dir / "pretrain"
-#        pretrain_dir.mkdir()
-#        file_path = write_sample_jsonl(pretrain_dir)
-#        filled["paths"]["raw_text"] = file_path
-#        filled["pretraining"]["component"] = "tagger"
-#        filled["pretraining"]["layer"] = "tok2vec"
-#        train_dir = tmp_dir / "train"
-#        train_dir.mkdir()
-#        train_path, dev_path = write_sample_training(train_dir)
-#        filled["paths"]["train"] = train_path
-#        filled["paths"]["dev"] = dev_path
-#        filled = filled.interpolate()
-#        P = filled["pretraining"]
-#        nlp_base = init_nlp(filled)
-#        model_base = (
-#            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-#        )
-#        embed_base = None
-#        for node in model_base.walk():
-#            if node.name == "hashembed":
-#                embed_base = node
-#        pretrain(filled, pretrain_dir)
-#        pretrained_model = Path(pretrain_dir / "model3.bin")
-#        assert pretrained_model.exists()
-#        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
-#        nlp = init_nlp(filled)
-#        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-#        embed = None
-#        for node in model.walk():
-#            if node.name == "hashembed":
-#                embed = node
-#        # ensure that the tok2vec weights are actually changed by the pretraining
-#        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
-#        train(nlp, train_dir)
+def test_pretraining_training():
+    """Test that training can use a pretrained Tok2Vec model"""
+    config = Config().from_str(pretrain_string_internal)
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    train_config = util.load_config(DEFAULT_CONFIG_PATH)
+    filled = train_config.merge(filled)
+    with make_tempdir() as tmp_dir:
+        pretrain_dir = tmp_dir / "pretrain"
+        pretrain_dir.mkdir()
+        file_path = write_sample_jsonl(pretrain_dir)
+        filled["paths"]["raw_text"] = file_path
+        filled["pretraining"]["component"] = "tagger"
+        filled["pretraining"]["layer"] = "tok2vec"
+        train_dir = tmp_dir / "train"
+        train_dir.mkdir()
+        train_path, dev_path = write_sample_training(train_dir)
+        filled["paths"]["train"] = train_path
+        filled["paths"]["dev"] = dev_path
+        filled = filled.interpolate()
+        P = filled["pretraining"]
+        nlp_base = init_nlp(filled)
+        model_base = (
+            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+        )
+        embed_base = None
+        for node in model_base.walk():
+            if node.name == "hashembed":
+                embed_base = node
+        pretrain(filled, pretrain_dir)
+        pretrained_model = Path(pretrain_dir / "model3.bin")
+        assert pretrained_model.exists()
+        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
+        nlp = init_nlp(filled)
+        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+        embed = None
+        for node in model.walk():
+            if node.name == "hashembed":
+                embed = node
+        # ensure that the tok2vec weights are actually changed by the pretraining
+        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
+        train(nlp, train_dir)
 
 
 def write_sample_jsonl(tmp_dir):
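The re-enabled test verifies that pretraining actually changes the tok2vec embedding weights: it initializes a baseline pipeline, runs pretraining, re-initializes with init_tok2vec pointing at the pretrained weights, and asserts that the hashembed "E" parameters of the two pipelines differ. A minimal illustration of that numpy comparison pattern, using stand-in arrays rather than real model weights:

```python
import numpy as np

# Stand-ins for embed_base.get_param("E") and embed.get_param("E").
weights_before = np.zeros((4, 8), dtype="float32")
weights_after = weights_before.copy()
weights_after[0, 0] = 0.5  # pretraining is expected to perturb at least one weight

# The test's assertion: some element changed after pretraining.
assert np.any(np.not_equal(weights_before, weights_after))
```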