Mirror of https://github.com/explosion/spaCy.git
Auto-format [ci skip]
This commit is contained in:
parent 1a554bdcb1
commit 126268ce50
@@ -26,7 +26,9 @@ class Ukrainian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False,):
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
+):
     return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)


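For context, a minimal usage sketch (not part of this commit) of the factory reformatted above; it assumes spaCy v3 with the pymorphy2 package and its Ukrainian dictionaries installed:

# Hypothetical sketch: on a Ukrainian pipeline, nlp.add_pipe("lemmatizer")
# resolves to the make_lemmatizer factory shown in the hunk above.
import spacy

nlp = spacy.blank("uk")
# mode="pymorphy2" mirrors the default_config in the hunk above
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2"})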
@@ -54,9 +54,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
 
 
 class ChineseTokenizer(DummyTokenizer):
-    def __init__(
-        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
-    ):
+    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
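A short sketch (not from this commit) of how this constructor is reached with its default segmenter; a blank Chinese pipeline needs no extra packages because Segmenter.char is the default, assuming spaCy v3:

# Hypothetical sketch: spacy.blank("zh") builds ChineseTokenizer with
# Segmenter.char, so each character comes out as its own token.
import spacy

nlp = spacy.blank("zh")
doc = nlp("这是一个测试")
print([token.text for token in doc])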
@@ -87,7 +85,7 @@ class ChineseTokenizer(DummyTokenizer):
         if pkuseg_user_dict is None:
             pkuseg_user_dict = pkuseg_model
         self.pkuseg_seg = try_pkuseg_import(
-            pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+            pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict
         )
 
     def __call__(self, text: str) -> Doc:
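The try_pkuseg_import call above is only reached when the tokenizer is configured with segmenter="pkuseg". A hedged sketch, assuming spaCy v3 with the spacy-pkuseg package installed and using the parameter names from the v3 documentation:

# Hypothetical sketch: select the pkuseg segmenter via the tokenizer config,
# then load a pretrained pkuseg model through the tokenizer.
from spacy.lang.zh import Chinese

cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg"}}}
nlp = Chinese.from_config(cfg)
nlp.tokenizer.initialize(pkuseg_model="mixed")  # "mixed" is a pretrained pkuseg model name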
@@ -209,9 +209,13 @@ def test_doc_retokenizer_split_norm(en_vocab):
     # Retokenize to split out the words in the token at doc[2].
     token = doc[2]
     with doc.retokenize() as retokenizer:
-        retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
+        retokenizer.split(
+            token,
+            ["brown", "fox", "jumps", "over", "the"],
+            heads=[(token, idx) for idx in range(5)],
+        )
 
     assert doc[9].text == "w/"
     assert doc[9].norm_ == "with"
     assert doc[5].text == "over"
     assert doc[5].norm_ == "over"
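For readers unfamiliar with the call being rewrapped above, a small self-contained sketch (not from this commit) of retokenizer.split: the subtoken strings must concatenate to the original token text, and every subtoken needs a head:

# Hypothetical sketch of the Doc.retokenize / retokenizer.split API used in
# the test above.
import spacy

nlp = spacy.blank("en")
doc = nlp("I live in NewYork")
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]  # "New" attaches to "York", "York" to "in"
    retokenizer.split(doc[3], ["New", "York"], heads=heads)
print([t.text for t in doc])  # ['I', 'live', 'in', 'New', 'York']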
@@ -350,7 +350,7 @@ def test_pipe_methods_frozen():
 
 
 @pytest.mark.parametrize(
-    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"],
+    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"]
 )
 def test_pipe_label_data_exports_labels(pipe):
     nlp = Language()
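Judging by its name, the parametrized test above exercises the label_data property that trainable components expose. A hedged sketch, assuming spaCy v3:

# Hypothetical sketch: label_data is the JSON-serializable label set a
# component exports, e.g. so labels can be provided at initialization time.
from spacy.language import Language

nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("NOUN")
assert "NOUN" in tagger.label_data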
@@ -64,7 +64,7 @@ def get_tok2vec_kwargs():
             width=32,
             rows=[500, 500, 500],
             attrs=["NORM", "PREFIX", "SHAPE"],
-            include_static_vectors=False
+            include_static_vectors=False,
         ),
         "encode": MaxoutWindowEncoder(
             width=32, depth=2, maxout_pieces=2, window_size=1
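The embed/encode pair in get_tok2vec_kwargs above is meant to compose into a full tok2vec layer. A hedged sketch (not from this commit), assuming spaCy v3's built-in model functions:

# Hypothetical sketch: combine MultiHashEmbed and MaxoutWindowEncoder into a
# Tok2Vec model, mirroring the keyword arguments shown in the hunk above.
from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed, build_Tok2Vec_model

tok2vec = build_Tok2Vec_model(
    embed=MultiHashEmbed(
        width=32,
        rows=[500, 500, 500],
        attrs=["NORM", "PREFIX", "SHAPE"],
        include_static_vectors=False,
    ),
    encode=MaxoutWindowEncoder(width=32, depth=2, maxout_pieces=2, window_size=1),
)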
@@ -81,7 +81,7 @@ def test_multi_hash_embed():
         width=32,
         rows=[500, 500, 500],
         attrs=["NORM", "PREFIX", "SHAPE"],
-        include_static_vectors=False
+        include_static_vectors=False,
     )
     hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
     assert len(hash_embeds) == 3
@@ -96,11 +96,11 @@ def test_multi_hash_embed():
         width=32,
         rows=[1000, 50, 250],
         attrs=["NORM", "PREFIX", "SHAPE"],
-        include_static_vectors=False
+        include_static_vectors=False,
     )
     hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
     assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]
 
 
 @pytest.mark.parametrize(
     "seed,model_func,kwargs",
@@ -64,7 +64,7 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
 
 
 def lower_casing_augmenter(
-    nlp: "Language", example: Example, *, level: float,
+    nlp: "Language", example: Example, *, level: float
 ) -> Iterator[Example]:
     if random.random() >= level:
         yield example
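The augmenter whose signature is reformatted above is normally obtained from the augmenters registry rather than called directly. A hedged sketch, assuming spaCy v3 where it is registered as "spacy.lower_case.v1":

# Hypothetical sketch: build the lower-casing augmenter from the registry and
# apply it to one training Example; with level=1.0 it always lower-cases.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
augmenter = spacy.registry.augmenters.get("spacy.lower_case.v1")(level=1.0)
example = Example.from_dict(nlp.make_doc("Hello World"), {"words": ["Hello", "World"]})
for augmented in augmenter(nlp, example):
    print([t.text for t in augmented.reference])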