diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 81ff5b5b8..bf86305fb 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -203,7 +203,11 @@ class Japanese(Language):
         "extend": True,
         "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
     },
-    default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
+    default_score_weights={
+        "pos_acc": 0.5,
+        "morph_micro_f": 0.5,
+        "morph_per_feat": None,
+    },
 )
 def make_morphologizer(
     nlp: Language,
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 2fc3a471b..85180b1e4 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -33,7 +33,9 @@ class RussianLemmatizer(Lemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer()
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
+        super().__init__(
+            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        )
 
     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
         string = token.text
diff --git a/spacy/lang/ti/lex_attrs.py b/spacy/lang/ti/lex_attrs.py
index b29bd8c96..da56af6c0 100644
--- a/spacy/lang/ti/lex_attrs.py
+++ b/spacy/lang/ti/lex_attrs.py
@@ -27,7 +27,7 @@ _num_words = [
     "ትሪልዮን",
     "ኳድሪልዮን",
     "ጋዚልዮን",
-    "ባዚልዮን"
+    "ባዚልዮን",
 ]
 
 # Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
@@ -41,7 +41,7 @@ _ordinal_words = [
     "ሻውዓይ",
     "ሻምናይ",
     "ታሽዓይ",
-    "ዓስራይ"
+    "ዓስራይ",
 ]
 
 
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index fd566a3a8..a8bc56057 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -29,4 +29,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer(lang="uk")
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
+        super().__init__(
+            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        )
diff --git a/spacy/lang/vi/examples.py b/spacy/lang/vi/examples.py
index 86d0b50b8..36575f67c 100644
--- a/spacy/lang/vi/examples.py
+++ b/spacy/lang/vi/examples.py
@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.vi.examples import sentences
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 75e5b3317..cfdf34e62 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -303,7 +303,9 @@ class Scorer:
                                     pred_per_feat[field] = set()
                                 pred_per_feat[field].add((gold_i, feat))
         for field in per_feat:
-            micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
+            micro_score.score_set(
+                pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
+            )
             per_feat[field].score_set(
                 pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
             )
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index 098884cf0..3437ea283 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -133,11 +133,7 @@ def test_ja_tokenizer_sub_tokens(
             (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
             (["トッ"], ["テ"], ["ツケ"], ["タ"]),
         ),
-        (
-            "2=3",
-            ([], [], []),
-            (["ニ"], ["_"], ["サン"])
-        ),
+        ("2=3", ([], [], []), (["ニ"], ["_"], ["サン"])),
     ],
 )
 def test_ja_tokenizer_inflections_reading_forms(
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 192faa67b..452bcc079 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -216,8 +216,8 @@ def test_tokenizer_flush_specials(en_vocab):
 
 def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
     # the prefix and suffix matches overlap in the suffix lookbehind
-    prefixes = ['a(?=.)']
-    suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.']
+    prefixes = ["a(?=.)"]
+    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
     prefix_re = compile_prefix_regex(prefixes)
     suffix_re = compile_suffix_regex(suffixes)
     tokenizer = Tokenizer(
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 48636a4eb..68f86190b 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -524,6 +524,7 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
 
+
 def test_docbin_user_data_serialized(doc):
     doc.user_data["check"] = True
     nlp = English()
@@ -536,6 +537,7 @@ def test_docbin_user_data_serialized(doc):
 
     assert reloaded_doc.user_data["check"] == True
 
+
 def test_docbin_user_data_not_serialized(doc):
     # this isn't serializable, but that shouldn't cause an error
     doc.user_data["check"] = set()
@@ -549,6 +551,7 @@ def test_docbin_user_data_not_serialized(doc):
 
     assert "check" not in reloaded_doc.user_data
 
+
 @pytest.mark.parametrize(
     "tokens_a,tokens_b,expected",
     [
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index 7830196bc..465406a49 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -50,7 +50,9 @@ def pretrain(
     # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
    if P["n_save_epoch"]:
-        msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch")
+        msg.divider(
+            f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch"
+        )
     else:
         msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
diff --git a/spacy/util.py b/spacy/util.py
index e14f6030f..4424f6897 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -288,16 +288,17 @@ def find_matching_language(lang: str) -> Optional[str]:
     None
     """
     import spacy.lang  # noqa: F401
-    if lang == 'xx':
-        return 'xx'
+
+    if lang == "xx":
+        return "xx"
 
     # Find out which language modules we have
     possible_languages = []
     for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore
         code = modinfo.name
-        if code == 'xx':
+        if code == "xx":
             # Temporarily make 'xx' into a valid language code
-            possible_languages.append('mul')
+            possible_languages.append("mul")
         elif langcodes.tag_is_valid(code):
             possible_languages.append(code)
 
@@ -306,12 +307,10 @@ def find_matching_language(lang: str) -> Optional[str]:
     # more possibilities, like variants of Chinese like 'wuu', but text that
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
-    match = langcodes.closest_supported_match(
-        lang, possible_languages, max_distance=9
-    )
-    if match == 'mul':
+    match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
+    if match == "mul":
         # Convert 'mul' back to spaCy's 'xx'
-        return 'xx'
+        return "xx"
     else:
         return match