Adriane Boyd 2021-11-05 09:56:26 +01:00 committed by GitHub
parent f0e8c9fe58
commit e6f91b6f27
11 changed files with 33 additions and 24 deletions

View File

@@ -203,7 +203,11 @@ class Japanese(Language):
         "extend": True,
         "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
     },
-    default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
+    default_score_weights={
+        "pos_acc": 0.5,
+        "morph_micro_f": 0.5,
+        "morph_per_feat": None,
+    },
 )
 def make_morphologizer(
     nlp: Language,
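
Note: the default_score_weights above determine how this component's metrics are folded into the combined training score; a weight of None leaves morph_per_feat out of the weighted total. A minimal illustrative sketch of that weighted combination, not spaCy's actual implementation:

# Illustrative only: how score weights like the ones above combine metric values
# into a single score. Metrics with a None weight are skipped here.
from typing import Dict, Optional

def combined_score(scores: Dict[str, float], weights: Dict[str, Optional[float]]) -> float:
    return sum(w * scores.get(name, 0.0) for name, w in weights.items() if w is not None)

weights = {"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None}
print(combined_score({"pos_acc": 0.9, "morph_micro_f": 0.8}, weights))  # 0.85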

View File

@@ -33,7 +33,9 @@ class RussianLemmatizer(Lemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer()
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
+        super().__init__(
+            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        )
 
     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
         string = token.text
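
The MorphAnalyzer initialized here comes from the pymorphy2 package. A minimal sketch of the analysis it provides, which the pymorphy2 lemmatization mode builds on (assumes pymorphy2 and its Russian dictionaries are installed):

# Minimal sketch of the pymorphy2 analyzer this lemmatizer wraps; requires
# `pip install pymorphy2`. Ambiguous surface forms yield several lemma candidates.
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
parses = morph.parse("стали")               # all morphological analyses of the surface form
print({p.normal_form for p in parses})      # e.g. {'стать', 'сталь'}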

View File

@@ -27,7 +27,7 @@ _num_words = [
     "ትሪልዮን",
     "ኳድሪልዮን",
     "ጋዚልዮን",
-    "ባዚልዮን"
+    "ባዚልዮን",
 ]
 
 # Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
@@ -41,7 +41,7 @@ _ordinal_words = [
     "ሻውዓይ",
     "ሻምናይ",
     "ታሽዓይ",
-    "ዓስራይ"
+    "ዓስራይ",
 ]
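
As the comment above notes, ordinals above 10 reuse the cardinal number words with the prefix "መበል ". A small illustrative sketch of that rule, using a few of the cardinal words visible in this diff (not the module's actual code):

# Illustrative only: deriving higher ordinals from cardinal words per the comment above.
cardinals = ["ትሪልዮን", "ኳድሪልዮን", "ጋዚልዮን", "ባዚልዮን"]
derived_ordinals = ["መበል " + word for word in cardinals]
print(derived_ordinals[0])  # "መበል ትሪልዮን"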

View File

@@ -29,4 +29,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer(lang="uk")
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
+        super().__init__(
+            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        )

View File

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.vi.examples import sentences

View File

@@ -303,7 +303,9 @@ class Scorer:
                         pred_per_feat[field] = set()
                     pred_per_feat[field].add((gold_i, feat))
             for field in per_feat:
-                micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
+                micro_score.score_set(
+                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
+                )
                 per_feat[field].score_set(
                     pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
                 )
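
micro_score accumulates every (token index, feature) pair into a single precision/recall tally, while per_feat keeps a separate tally per field. A minimal sketch of that set-based bookkeeping, using a simple stand-in rather than spaCy's internal PRF scorer:

# Minimal stand-in for the set-based PRF scoring used above.
class PRF:
    def __init__(self) -> None:
        self.tp = self.fp = self.fn = 0

    def score_set(self, cand: set, gold: set) -> None:
        # Pairs in both sets are true positives; extras are fp, misses are fn.
        self.tp += len(cand & gold)
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def fscore(self) -> float:
        p = self.tp / (self.tp + self.fp) if self.tp + self.fp else 0.0
        r = self.tp / (self.tp + self.fn) if self.tp + self.fn else 0.0
        return 2 * p * r / (p + r) if p + r else 0.0

micro = PRF()
micro.score_set({(0, "Number=Sing"), (1, "Tense=Past")}, {(0, "Number=Sing")})
print(round(micro.fscore, 2))  # 0.67: one true positive, one false positive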

View File

@@ -133,11 +133,7 @@ def test_ja_tokenizer_sub_tokens(
             (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
             (["トッ"], [""], ["ツケ"], [""]),
         ),
-        (
-            "2=3",
-            ([], [], []),
-            ([""], ["_"], ["サン"])
-        ),
+        ("2=3", ([], [], []), ([""], ["_"], ["サン"])),
     ],
 )
 def test_ja_tokenizer_inflections_reading_forms(

View File

@@ -216,8 +216,8 @@ def test_tokenizer_flush_specials(en_vocab):
 
 def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
     # the prefix and suffix matches overlap in the suffix lookbehind
-    prefixes = ['a(?=.)']
-    suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.']
+    prefixes = ["a(?=.)"]
+    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
     prefix_re = compile_prefix_regex(prefixes)
     suffix_re = compile_suffix_regex(suffixes)
     tokenizer = Tokenizer(
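
compile_prefix_regex and compile_suffix_regex are spacy.util helpers that combine affix patterns into one compiled regex whose .search method the Tokenizer uses. A rough sketch of the setup this test exercises; the sample text "a10." is chosen here for illustration and is not taken from the test:

# Rough sketch of a Tokenizer with custom prefix/suffix rules, as in the test above.
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex

nlp = spacy.blank("en")
prefix_re = compile_prefix_regex(["a(?=.)"])
suffix_re = compile_suffix_regex([r"(?<=\w)\.", r"(?<=a)\d+\."])
tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
)
# The suffix lookbehind (?<=a) refers back into text consumed by the prefix match.
print([t.text for t in tokenizer("a10.")])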

View File

@@ -524,6 +524,7 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
 
+
 def test_docbin_user_data_serialized(doc):
     doc.user_data["check"] = True
     nlp = English()
@@ -536,6 +537,7 @@ def test_docbin_user_data_serialized(doc):
     assert reloaded_doc.user_data["check"] == True
 
+
 def test_docbin_user_data_not_serialized(doc):
     # this isn't serializable, but that shouldn't cause an error
     doc.user_data["check"] = set()
@@ -549,6 +551,7 @@ def test_docbin_user_data_not_serialized(doc):
     assert "check" not in reloaded_doc.user_data
 
+
 @pytest.mark.parametrize(
     "tokens_a,tokens_b,expected",
     [
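
These tests check whether Doc.user_data survives a DocBin round trip. A short sketch of that round trip with store_user_data enabled; only picklable values (like the boolean here) can be stored:

# Sketch of the DocBin user_data round trip these tests exercise.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp("This is a sentence.")
doc.user_data["check"] = True

data = DocBin(docs=[doc], store_user_data=True).to_bytes()
reloaded = list(DocBin(store_user_data=True).from_bytes(data).get_docs(nlp.vocab))
print(reloaded[0].user_data["check"])  # True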

View File

@@ -50,7 +50,9 @@ def pretrain(
     # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
     if P["n_save_epoch"]:
-        msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch")
+        msg.divider(
+            f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch"
+        )
     else:
         msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}

View File

@@ -288,16 +288,17 @@ def find_matching_language(lang: str) -> Optional[str]:
     None
     """
     import spacy.lang  # noqa: F401
-    if lang == 'xx':
-        return 'xx'
+
+    if lang == "xx":
+        return "xx"
 
     # Find out which language modules we have
     possible_languages = []
     for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore
         code = modinfo.name
-        if code == 'xx':
+        if code == "xx":
             # Temporarily make 'xx' into a valid language code
-            possible_languages.append('mul')
+            possible_languages.append("mul")
         elif langcodes.tag_is_valid(code):
             possible_languages.append(code)
@@ -306,12 +307,10 @@ def find_matching_language(lang: str) -> Optional[str]:
     # more possibilities, like variants of Chinese like 'wuu', but text that
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
-    match = langcodes.closest_supported_match(
-        lang, possible_languages, max_distance=9
-    )
-    if match == 'mul':
+    match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
+    if match == "mul":
         # Convert 'mul' back to spaCy's 'xx'
-        return 'xx'
+        return "xx"
     else:
         return match
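
For reference, a rough usage sketch of the helper shown above, imported from spacy.util; the concrete return values depend on the installed spaCy and langcodes versions:

# Rough usage sketch; exact results depend on the spaCy/langcodes versions installed.
from spacy.util import find_matching_language

print(find_matching_language("en"))     # a language with its own module matches itself
print(find_matching_language("xx"))     # the multi-language code is special-cased to "xx"
print(find_matching_language("en-GB"))  # close variants fall back to the nearest supported code, e.g. "en"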