mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Tidy up and auto-format
This commit is contained in:
parent
7c4ab7e82c
commit
3bc3c05fcc
|
@ -171,7 +171,7 @@ def debug_data(
|
||||||
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"{} words in training data without vectors ({:0.2f}%)".format(
|
"{} words in training data without vectors ({:0.2f}%)".format(
|
||||||
n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
|
n_missing_vectors, n_missing_vectors / gold_train_data["n_words"]
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
msg.text(
|
msg.text(
|
||||||
|
|
|
@ -8,7 +8,6 @@ from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .lemmatizer import PolishLemmatizer
|
from .lemmatizer import PolishLemmatizer
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...lookups import Lookups
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -47,7 +47,7 @@ class Segmenter(str, Enum):
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
||||||
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
|
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
|
||||||
def chinese_tokenizer_factory(nlp):
|
def chinese_tokenizer_factory(nlp):
|
||||||
return ChineseTokenizer(nlp, segmenter=segmenter)
|
return ChineseTokenizer(nlp, segmenter=segmenter)
|
||||||
|
|
||||||
|
|
|
@ -165,8 +165,12 @@ def MultiHashEmbed(
|
||||||
|
|
||||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||||
def CharacterEmbed(
|
def CharacterEmbed(
|
||||||
width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
|
width: int,
|
||||||
feature: Union[int, str]="LOWER"
|
rows: int,
|
||||||
|
nM: int,
|
||||||
|
nC: int,
|
||||||
|
also_use_static_vectors: bool,
|
||||||
|
feature: Union[int, str] = "LOWER",
|
||||||
) -> Model[List[Doc], List[Floats2d]]:
|
) -> Model[List[Doc], List[Floats2d]]:
|
||||||
"""Construct an embedded representation based on character embeddings, using
|
"""Construct an embedded representation based on character embeddings, using
|
||||||
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
||||||
|
|
|
@ -70,7 +70,7 @@ subword_features = true
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
def make_textcat(
|
def make_textcat(
|
||||||
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float,
|
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
|
||||||
) -> "TextCategorizer":
|
) -> "TextCategorizer":
|
||||||
"""Create a TextCategorizer compoment. The text categorizer predicts categories
|
"""Create a TextCategorizer compoment. The text categorizer predicts categories
|
||||||
over a whole document. It can learn one or more labels, and the labels can
|
over a whole document. It can learn one or more labels, and the labels can
|
||||||
|
|
|
@ -294,7 +294,8 @@ def zh_tokenizer_pkuseg():
|
||||||
"segmenter": "pkuseg",
|
"segmenter": "pkuseg",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"initialize": {"tokenizer": {
|
"initialize": {
|
||||||
|
"tokenizer": {
|
||||||
"pkuseg_model": "default",
|
"pkuseg_model": "default",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -5,12 +5,14 @@ import pytest
|
||||||
def i_has(en_tokenizer):
|
def i_has(en_tokenizer):
|
||||||
doc = en_tokenizer("I has")
|
doc = en_tokenizer("I has")
|
||||||
doc[0].set_morph({"PronType": "prs"})
|
doc[0].set_morph({"PronType": "prs"})
|
||||||
doc[1].set_morph({
|
doc[1].set_morph(
|
||||||
|
{
|
||||||
"VerbForm": "fin",
|
"VerbForm": "fin",
|
||||||
"Tense": "pres",
|
"Tense": "pres",
|
||||||
"Number": "sing",
|
"Number": "sing",
|
||||||
"Person": "three",
|
"Person": "three",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
|
@ -34,7 +34,8 @@ def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||||
"segmenter": "pkuseg",
|
"segmenter": "pkuseg",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"initialize": {"tokenizer": {
|
"initialize": {
|
||||||
|
"tokenizer": {
|
||||||
"pkuseg_model": "medicine",
|
"pkuseg_model": "medicine",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -139,7 +139,8 @@ def test_overfitting_IO():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
||||||
# Set exclusive labels
|
# Set exclusive labels
|
||||||
textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}},)
|
config = {"model": {"exclusive_classes": True}}
|
||||||
|
textcat = nlp.add_pipe("textcat", config=config)
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for text, annotations in TRAIN_DATA:
|
for text, annotations in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
|
@ -226,7 +227,9 @@ def test_positive_class_not_binary():
|
||||||
textcat = nlp.add_pipe("textcat")
|
textcat = nlp.add_pipe("textcat")
|
||||||
get_examples = make_get_examples(nlp)
|
get_examples = make_get_examples(nlp)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS")
|
textcat.initialize(
|
||||||
|
get_examples, labels=["SOME", "THING", "POS"], positive_label="POS"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_textcat_evaluation():
|
def test_textcat_evaluation():
|
||||||
|
|
|
@ -92,7 +92,13 @@ def test_serialize_doc_bin_unknown_spaces(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"writer_flag,reader_flag,reader_value", [(True, True, "bar"), (True, False, "bar"), (False, True, "nothing"), (False, False, "nothing")]
|
"writer_flag,reader_flag,reader_value",
|
||||||
|
[
|
||||||
|
(True, True, "bar"),
|
||||||
|
(True, False, "bar"),
|
||||||
|
(False, True, "nothing"),
|
||||||
|
(False, False, "nothing"),
|
||||||
|
],
|
||||||
)
|
)
|
||||||
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
|
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
|
||||||
"""Test that custom extensions are correctly serialized in DocBin."""
|
"""Test that custom extensions are correctly serialized in DocBin."""
|
||||||
|
|
|
@ -158,7 +158,7 @@ def test_las_per_type(en_vocab):
|
||||||
examples = []
|
examples = []
|
||||||
for input_, annot in test_las_apple:
|
for input_, annot in test_las_apple:
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
|
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
|
||||||
)
|
)
|
||||||
gold = {"heads": annot["heads"], "deps": annot["deps"]}
|
gold = {"heads": annot["heads"], "deps": annot["deps"]}
|
||||||
doc[0].dep_ = "compound"
|
doc[0].dep_ = "compound"
|
||||||
|
@ -182,9 +182,7 @@ def test_ner_per_type(en_vocab):
|
||||||
examples = []
|
examples = []
|
||||||
for input_, annot in test_ner_cardinal:
|
for input_, annot in test_ner_cardinal:
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
en_vocab,
|
en_vocab, words=input_.split(" "), ents=["B-CARDINAL", "O", "B-CARDINAL"]
|
||||||
words=input_.split(" "),
|
|
||||||
ents=["B-CARDINAL", "O", "B-CARDINAL"],
|
|
||||||
)
|
)
|
||||||
entities = offsets_to_biluo_tags(doc, annot["entities"])
|
entities = offsets_to_biluo_tags(doc, annot["entities"])
|
||||||
example = Example.from_dict(doc, {"entities": entities})
|
example = Example.from_dict(doc, {"entities": entities})
|
||||||
|
|
|
@ -30,7 +30,7 @@ class OrthVariants(BaseModel):
|
||||||
|
|
||||||
@registry.augmenters("spacy.orth_variants.v1")
|
@registry.augmenters("spacy.orth_variants.v1")
|
||||||
def create_orth_variants_augmenter(
|
def create_orth_variants_augmenter(
|
||||||
level: float, lower: float, orth_variants: OrthVariants,
|
level: float, lower: float, orth_variants: OrthVariants
|
||||||
) -> Callable[["Language", Example], Iterator[Example]]:
|
) -> Callable[["Language", Example], Iterator[Example]]:
|
||||||
"""Create a data augmentation callback that uses orth-variant replacement.
|
"""Create a data augmentation callback that uses orth-variant replacement.
|
||||||
The callback can be added to a corpus or other data iterator during training.
|
The callback can be added to a corpus or other data iterator during training.
|
||||||
|
|
|
@ -22,7 +22,7 @@ def train(
|
||||||
*,
|
*,
|
||||||
use_gpu: int = -1,
|
use_gpu: int = -1,
|
||||||
stdout: IO = sys.stdout,
|
stdout: IO = sys.stdout,
|
||||||
stderr: IO=sys.stderr
|
stderr: IO = sys.stderr,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Train a pipeline.
|
"""Train a pipeline.
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@ from .errors import Errors
|
||||||
from .attrs import intify_attrs, NORM, IS_STOP
|
from .attrs import intify_attrs, NORM, IS_STOP
|
||||||
from .vectors import Vectors
|
from .vectors import Vectors
|
||||||
from .util import registry
|
from .util import registry
|
||||||
from .lookups import Lookups, load_lookups
|
from .lookups import Lookups
|
||||||
from . import util
|
from . import util
|
||||||
from .lang.norm_exceptions import BASE_NORMS
|
from .lang.norm_exceptions import BASE_NORMS
|
||||||
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
|
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
|
||||||
|
|
Loading…
Reference in New Issue
Block a user