Format (#9630)
This commit is contained in:
parent f0e8c9fe58
commit e6f91b6f27
@@ -203,7 +203,11 @@ class Japanese(Language):
         "extend": True,
         "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
     },
-    default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
+    default_score_weights={
+        "pos_acc": 0.5,
+        "morph_micro_f": 0.5,
+        "morph_per_feat": None,
+    },
 )
 def make_morphologizer(
     nlp: Language,
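For context, a rough illustration of how the default_score_weights in this factory are meant to combine: each weighted metric contributes proportionally to the overall score, and a weight of None means the metric is reported but not counted toward the combined score. This is a simplified sketch, not spaCy's internal scoring code, and the numbers are invented.

scores = {"pos_acc": 0.90, "morph_micro_f": 0.80}  # invented component scores
weights = {"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None}
# None-weighted metrics are skipped; the rest form a weighted sum.
total = sum(scores[key] * weight for key, weight in weights.items() if weight is not None)
print(total)  # 0.85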
@@ -33,7 +33,9 @@ class RussianLemmatizer(Lemmatizer):
             ) from None
         if getattr(self, "_morph", None) is None:
             self._morph = MorphAnalyzer()
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
+        super().__init__(
+            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        )

     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
         string = token.text
@@ -27,7 +27,7 @@ _num_words = [
     "ትሪልዮን",
     "ኳድሪልዮን",
     "ጋዚልዮን",
-    "ባዚልዮን"
+    "ባዚልዮን",
 ]

 # Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
@@ -41,7 +41,7 @@ _ordinal_words = [
     "ሻውዓይ",
     "ሻምናይ",
     "ታሽዓይ",
-    "ዓስራይ"
+    "ዓስራይ",
 ]

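As an aside, word lists like _num_words normally feed a like_num lexical attribute. The sketch below follows the pattern other spaCy languages use and is not copied from the Tigrinya module; only the list entries are taken from the hunk above.

_num_words = ["ትሪልዮን", "ኳድሪልዮን", "ጋዚልዮን", "ባዚልዮን"]  # subset from the hunk above

def like_num(text: str) -> bool:
    # Strip common digit separators, then accept plain digits or known number words.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    return text in _num_words

print(like_num("ኳድሪልዮን"), like_num("42"), like_num("hello"))  # True True False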
@@ -29,4 +29,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
             ) from None
         if getattr(self, "_morph", None) is None:
             self._morph = MorphAnalyzer(lang="uk")
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
+        super().__init__(
+            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        )
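A minimal usage sketch for the pymorphy2-backed lemmatizers touched above, assuming spaCy v3 and the pymorphy2 package are installed; the sample word and the explicit mode setting are only illustrative.

import spacy

nlp = spacy.blank("ru")
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2"})  # uses the RussianLemmatizer factory
nlp.initialize()
doc = nlp("кошками")
print([token.lemma_ for token in doc])

The Ukrainian variant works the same way with spacy.blank("uk"), but additionally needs the pymorphy2-dicts-uk dictionaries installed.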
@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.vi.examples import sentences
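A quick sketch of how an examples module like this is typically exercised; spacy.blank("vi") needs the pyvi package for Vietnamese word segmentation, and only the sentences import comes from the file above.

import spacy
from spacy.lang.vi.examples import sentences

nlp = spacy.blank("vi")
for doc in nlp.pipe(sentences[:2]):
    print([token.text for token in doc])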
@@ -303,7 +303,9 @@ class Scorer:
                                         pred_per_feat[field] = set()
                                     pred_per_feat[field].add((gold_i, feat))
             for field in per_feat:
-                micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
+                micro_score.score_set(
+                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
+                )
                 per_feat[field].score_set(
                     pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
                 )
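For reference, a hedged sketch of the PRFScore API used here: score_set compares a candidate set against a gold set and updates the precision/recall/F counts. The (token index, feature) pairs below are invented.

from spacy.scorer import PRFScore

micro_score = PRFScore()
pred = {(0, "Case=Nom"), (1, "Number=Sing")}  # invented predictions
gold = {(0, "Case=Nom"), (1, "Number=Plur")}  # invented gold annotations
micro_score.score_set(pred, gold)
print(micro_score.precision, micro_score.recall, micro_score.fscore)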
@@ -133,11 +133,7 @@ def test_ja_tokenizer_sub_tokens(
             (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
             (["トッ"], ["テ"], ["ツケ"], ["タ"]),
         ),
-        (
-            "2=3",
-            ([], [], []),
-            (["ニ"], ["_"], ["サン"])
-        ),
+        ("2=3", ([], [], []), (["ニ"], ["_"], ["サン"])),
     ],
 )
 def test_ja_tokenizer_inflections_reading_forms(
@@ -216,8 +216,8 @@ def test_tokenizer_flush_specials(en_vocab):

 def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
     # the prefix and suffix matches overlap in the suffix lookbehind
-    prefixes = ['a(?=.)']
-    suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.']
+    prefixes = ["a(?=.)"]
+    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
     prefix_re = compile_prefix_regex(prefixes)
     suffix_re = compile_suffix_regex(suffixes)
     tokenizer = Tokenizer(
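A self-contained sketch of the API this test exercises: compiling custom prefix and suffix patterns into a Tokenizer. The patterns mirror the test above; the input string is made up.

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex

nlp = English()
prefix_re = compile_prefix_regex(["a(?=.)"])
suffix_re = compile_suffix_regex([r"(?<=\w)\.", r"(?<=a)\d+\."])
tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
)
print([t.text for t in tokenizer("a10.")])  # inspect how the overlapping lookbehind case splits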
@@ -524,6 +524,7 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]

+
 def test_docbin_user_data_serialized(doc):
     doc.user_data["check"] = True
     nlp = English()
@@ -536,6 +537,7 @@ def test_docbin_user_data_serialized(doc):

     assert reloaded_doc.user_data["check"] == True

+
 def test_docbin_user_data_not_serialized(doc):
     # this isn't serializable, but that shouldn't cause an error
     doc.user_data["check"] = set()
@@ -549,6 +551,7 @@ def test_docbin_user_data_not_serialized(doc):

     assert "check" not in reloaded_doc.user_data

+
 @pytest.mark.parametrize(
     "tokens_a,tokens_b,expected",
     [
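A sketch of the DocBin round trip these tests cover: user_data survives serialization only when it is serializable and store_user_data=True is set. The text and key below are invented for illustration.

from spacy.lang.en import English
from spacy.tokens import DocBin

nlp = English()
doc = nlp("This is a test.")
doc.user_data["check"] = True  # serializable, so it should survive the round trip

doc_bin = DocBin(docs=[doc], store_user_data=True)
data = doc_bin.to_bytes()
reloaded = list(DocBin(store_user_data=True).from_bytes(data).get_docs(nlp.vocab))[0]
print(reloaded.user_data.get("check"))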
@@ -50,7 +50,9 @@ def pretrain(
     # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
     if P["n_save_epoch"]:
-        msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch")
+        msg.divider(
+            f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch"
+        )
     else:
         msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
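The msg helper here is wasabi's default printer; a tiny standalone reproduction of the divider call, with placeholder values standing in for the pretraining config.

from wasabi import msg

epoch_resume, n_save_epoch = 0, 5  # placeholders for epoch_resume and P["n_save_epoch"]
msg.divider(
    f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {n_save_epoch} epoch"
)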
@@ -288,16 +288,17 @@ def find_matching_language(lang: str) -> Optional[str]:
     None
     """
     import spacy.lang  # noqa: F401
-    if lang == 'xx':
-        return 'xx'
+
+    if lang == "xx":
+        return "xx"

     # Find out which language modules we have
     possible_languages = []
     for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore
         code = modinfo.name
-        if code == 'xx':
+        if code == "xx":
             # Temporarily make 'xx' into a valid language code
-            possible_languages.append('mul')
+            possible_languages.append("mul")
         elif langcodes.tag_is_valid(code):
             possible_languages.append(code)

@@ -306,12 +307,10 @@ def find_matching_language(lang: str) -> Optional[str]:
     # more possibilities, like variants of Chinese like 'wuu', but text that
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
-    match = langcodes.closest_supported_match(
-        lang, possible_languages, max_distance=9
-    )
-    if match == 'mul':
+    match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
+    if match == "mul":
         # Convert 'mul' back to spaCy's 'xx'
-        return 'xx'
+        return "xx"
     else:
         return match

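To round this off, a hedged illustration of the two langcodes calls the function relies on, using a hand-picked candidate list instead of spaCy's real module scan.

import langcodes

candidates = ["en", "de", "zh", "mul"]  # stand-in for the scanned spacy.lang modules
print(langcodes.tag_is_valid("xx"))  # 'xx' is not a valid tag, hence the special-casing above
print(langcodes.closest_supported_match("en-GB", candidates, max_distance=9))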