diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 1b42b5254..309b6b1e7 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -225,13 +225,11 @@ def get_git_version(
 
 
 @overload
-def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
-    ...
+def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: ...
 
 
 @overload
-def string_to_list(value: str, intify: Literal[True]) -> List[int]:
-    ...
+def string_to_list(value: str, intify: Literal[True]) -> List[int]: ...
 
 
 def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index af3c24f3b..1c9c0e0ea 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -968,16 +968,14 @@ def _compile_gold(
 
 
 @overload
-def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str:
-    ...
+def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str: ...
 
 
 @overload
 def _format_labels(
     labels: Iterable[Tuple[str, int]],
     counts: Literal[True],
-) -> str:
-    ...
+) -> str: ...
 
 
 def _format_labels(
diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index 3e86495e7..ff7af32e6 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -157,9 +157,11 @@ def find_threshold(
                 exits=1,
             )
         return {
-            keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
-            if len(keys) > 1
-            else config[keys[0]]
+            keys[0]: (
+                filter_config(config[keys[0]], keys[1:], full_key)
+                if len(keys) > 1
+                else config[keys[0]]
+            )
         }
 
     # Evaluate with varying threshold values.
@@ -216,12 +218,14 @@ def find_threshold(
     if len(set(scores.values())) == 1:
         wasabi.msg.warn(
             title="All scores are identical. Verify that all settings are correct.",
-            text=""
-            if (
-                not isinstance(pipe, MultiLabel_TextCategorizer)
-                or scores_key in ("cats_macro_f", "cats_micro_f")
-            )
-            else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
+            text=(
+                ""
+                if (
+                    not isinstance(pipe, MultiLabel_TextCategorizer)
+                    or scores_key in ("cats_macro_f", "cats_micro_f")
+                )
+                else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`."
+            ),
         )
 
     else:
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index a7c03d00f..a7fb2b5b8 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -195,9 +195,11 @@ def init_config(
         "Pipeline": ", ".join(pipeline),
         "Optimize for": optimize,
         "Hardware": variables["hardware"].upper(),
-        "Transformer": template_vars.transformer.get("name")  # type: ignore[attr-defined]
-        if template_vars.use_transformer  # type: ignore[attr-defined]
-        else None,
+        "Transformer": (
+            template_vars.transformer.get("name")  # type: ignore[attr-defined]
+            if template_vars.use_transformer  # type: ignore[attr-defined]
+            else None
+        ),
     }
     msg.info("Generated config template specific for your use case")
     for label, value in use_case.items():
diff --git a/spacy/compat.py b/spacy/compat.py
index 522fa30dd..a9e7d5a20 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -1,4 +1,5 @@
 """Helpers for Python and platform compatibility."""
+
 import sys
 
 from thinc.util import copy_array
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index 4651e5212..55474734a 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -4,6 +4,7 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
+
 import warnings
 from typing import Any, Callable, Dict, Iterable, Optional, Union
 
diff --git a/spacy/lang/am/examples.py b/spacy/lang/am/examples.py
index 253d32d1d..b156cb84f 100644
--- a/spacy/lang/am/examples.py
+++ b/spacy/lang/am/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "አፕል የዩኬን ጅምር ድርጅት በ 1 ቢሊዮን ዶላር ለመግዛት አስቧል።",
     "የራስ ገዝ መኪኖች የኢንሹራንስ ኃላፊነትን ወደ አምራቾች ያዛውራሉ",
diff --git a/spacy/lang/az/examples.py b/spacy/lang/az/examples.py
index f3331a8cb..df5e3521d 100644
--- a/spacy/lang/az/examples.py
+++ b/spacy/lang/az/examples.py
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Bu bir cümlədir.",
     "Necəsən?",
diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py
index df708b65e..061850da5 100644
--- a/spacy/lang/bg/stop_words.py
+++ b/spacy/lang/bg/stop_words.py
@@ -3,6 +3,7 @@ References:
     https://github.com/Alir3z4/stop-words - Original list, serves as a base.
     https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
 """
+
 STOP_WORDS = set(
     """
 а автентичен аз ако ала
diff --git a/spacy/lang/bn/examples.py b/spacy/lang/bn/examples.py
index c3be4c556..11a65acb1 100644
--- a/spacy/lang/bn/examples.py
+++ b/spacy/lang/bn/examples.py
@@ -5,5 +5,4 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "]
diff --git a/spacy/lang/bo/examples.py b/spacy/lang/bo/examples.py
index 8ed9372ec..8655f2d9d 100644
--- a/spacy/lang/bo/examples.py
+++ b/spacy/lang/bo/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
     "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
diff --git a/spacy/lang/ca/examples.py b/spacy/lang/ca/examples.py
index ae6aa3e24..de54c05ce 100644
--- a/spacy/lang/ca/examples.py
+++ b/spacy/lang/ca/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars",
     "Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants",
diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 37c58c85f..69e752c91 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -277,10 +277,10 @@ _currency = (
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
 # conflicts, spaCy's base tokenizer should handle all of those by default
-_punct = (
-    r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
+_punct = r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
+_quotes = (
+    r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
 )
-_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
 _hyphens = "- – — -- --- —— ~"
 
 # Various symbols like dingbats, but also emoji
diff --git a/spacy/lang/cs/examples.py b/spacy/lang/cs/examples.py
index a30b5ac14..35d86dde7 100644
--- a/spacy/lang/cs/examples.py
+++ b/spacy/lang/cs/examples.py
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Máma mele maso.",
     "Příliš žluťoučký kůň úpěl ďábelské ódy.",
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index 649d12022..15a943ad6 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -2,6 +2,7 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
+
 from ...symbols import NORM, ORTH
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py
index 735d1c316..30b8f195b 100644
--- a/spacy/lang/de/examples.py
+++ b/spacy/lang/de/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
     "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
diff --git a/spacy/lang/dsb/examples.py b/spacy/lang/dsb/examples.py
index 6e9143826..11ecbddb2 100644
--- a/spacy/lang/dsb/examples.py
+++ b/spacy/lang/dsb/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
     "Mi so tu jara derje spodoba.",
diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py
index 2cca9e05f..7ed0ba0c1 100644
--- a/spacy/lang/en/examples.py
+++ b/spacy/lang/en/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple is looking at buying U.K. startup for $1 billion",
     "Autonomous cars shift insurance liability toward manufacturers",
diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
index e4dfbcb6d..653a38bfd 100644
--- a/spacy/lang/es/examples.py
+++ b/spacy/lang/es/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
     "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
diff --git a/spacy/lang/fa/examples.py b/spacy/lang/fa/examples.py
index 9c6fb0345..6810e48d5 100644
--- a/spacy/lang/fa/examples.py
+++ b/spacy/lang/fa/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "این یک جمله نمونه می باشد.",
     "قرار ما، امروز ساعت ۲:۳۰ بعدازظهر هست!",
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index 881d5b91d..0bbd7bd91 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -100,9 +100,9 @@ conj_contraction_negations = [
     ("eivat", "eivät"),
     ("eivät", "eivät"),
 ]
-for (base_lower, base_norm) in conj_contraction_bases:
+for base_lower, base_norm in conj_contraction_bases:
     for base in [base_lower, base_lower.title()]:
-        for (suffix, suffix_norm) in conj_contraction_negations:
+        for suffix, suffix_norm in conj_contraction_negations:
             _exc[base + suffix] = [
                 {ORTH: base, NORM: base_norm},
                 {ORTH: suffix, NORM: suffix_norm},
             ]
diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py
index a74a62204..759de5615 100644
--- a/spacy/lang/fr/examples.py
+++ b/spacy/lang/fr/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars",
     "Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs",
diff --git a/spacy/lang/grc/examples.py b/spacy/lang/grc/examples.py
index 9c0bcb265..51ec8f8cc 100644
--- a/spacy/lang/grc/examples.py
+++ b/spacy/lang/grc/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·",
     "εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.",
diff --git a/spacy/lang/gu/examples.py b/spacy/lang/gu/examples.py
index 1cf75fd32..e67b7ba9d 100644
--- a/spacy/lang/gu/examples.py
+++ b/spacy/lang/gu/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.",
     "તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું",
diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py
index d54d2a145..ee484e07b 100644
--- a/spacy/lang/he/examples.py
+++ b/spacy/lang/he/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל",
     'רה"מ הודיע כי יחרים טקס בחסותו',
diff --git a/spacy/lang/hi/examples.py b/spacy/lang/hi/examples.py
index 1443b4908..f3196c58f 100644
--- a/spacy/lang/hi/examples.py
+++ b/spacy/lang/hi/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।",
     "स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।",
diff --git a/spacy/lang/hsb/examples.py b/spacy/lang/hsb/examples.py
index 21f6f7584..754011c6f 100644
--- a/spacy/lang/hsb/examples.py
+++ b/spacy/lang/hsb/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
     "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py
index e5c1c2770..9fc2df40c 100644
--- a/spacy/lang/ht/__init__.py
+++ b/spacy/lang/ht/__init__.py
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
 
+
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
 
+
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
 
+
 __all__ = ["HaitianCreole"]
diff --git a/spacy/lang/ht/examples.py b/spacy/lang/ht/examples.py
index 456d34a5f..0afeb19c8 100644
--- a/spacy/lang/ht/examples.py
+++ b/spacy/lang/ht/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
     "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py
index 8a3ec1ff9..ab1a39a82 100644
--- a/spacy/lang/ht/lex_attrs.py
+++ b/spacy/lang/ht/lex_attrs.py
@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
 
+
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False
 
+
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
 
+
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py
index 61d88d6e1..0077db1c0 100644
--- a/spacy/lang/ht/punctuation.py
+++ b/spacy/lang/ht/punctuation.py
@@ -16,28 +16,43 @@ ELISION = "'’".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
 
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
+        r"(?:({pe})[{el}])(?=[{a}])".format(
+            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
+        )
+    ]
+)
 
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
-    r"(?<=[0-9])%",  # numbers like 10%
-    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
-    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
-    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
-    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
-    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
-]
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
+        r"(?<=[0-9])%",  # numbers like 10%
+        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
+        r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
+        r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
+        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
+        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
+    ]
+)
 
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
-    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
-    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
-        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
-    ),
-    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
-]
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+    ]
+)
diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py
index 6243887a4..50998e0e5 100644
--- a/spacy/lang/ht/stop_words.py
+++ b/spacy/lang/ht/stop_words.py
@@ -39,8 +39,7 @@
 sa san si swa si
 men mèsi oswa osinon
 
-"""
-.split()
+""".split()
 )
 
 # Add common contractions, with and without apostrophe variants
diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py
index 8c9cdd6d4..261d1aef3 100644
--- a/spacy/lang/ht/tag_map.py
+++ b/spacy/lang/ht/tag_map.py
@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    NOUN,
+    VERB,
+    AUX,
+    ADJ,
+    ADV,
+    PRON,
+    DET,
+    ADP,
+    SCONJ,
+    CCONJ,
+    PART,
+    INTJ,
+    NUM,
+    PROPN,
+    PUNCT,
+    SYM,
+    X,
+)
 
 TAG_MAP = {
     "NOUN": {"pos": NOUN},
diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py
index b44ad7a6f..4d617fd36 100644
--- a/spacy/lang/ht/tokenizer_exceptions.py
+++ b/spacy/lang/ht/tokenizer_exceptions.py
@@ -1,5 +1,6 @@
 from spacy.symbols import ORTH, NORM
 
+
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
         base: [
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }
 
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
 
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 
 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
-    "map": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Map": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lem": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "Lem": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "lew": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "Lew": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "nap": [
-        {ORTH: "n", NORM: "nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Nap": [
-        {ORTH: "N", NORM: "Nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lap": [
-        {ORTH: "l", NORM: "li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Lap": [
-        {ORTH: "L", NORM: "Li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "yap": [
-        {ORTH: "y", NORM: "yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Yap": [
-        {ORTH: "Y", NORM: "Yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "mte": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "Mte": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "mpral": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "Mpral": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "wap": [
-        {ORTH: "w", NORM: "ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Wap": [
-        {ORTH: "W", NORM: "Ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "kap": [
-        {ORTH: "k", NORM: "ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Kap": [
-        {ORTH: "K", NORM: "Ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "tap": [
-        {ORTH: "t", NORM: "te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Tap": [
-        {ORTH: "T", NORM: "Te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-})
+TOKENIZER_EXCEPTIONS.update(
+    {
+        "map": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Map": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "lem": [
+            {ORTH: "le", NORM: "le"},
+            {ORTH: "m", NORM: "mwen"},
+        ],
+        "Lem": [
+            {ORTH: "Le", NORM: "Le"},
+            {ORTH: "m", NORM: "mwen"},
+        ],
+        "lew": [
+            {ORTH: "le", NORM: "le"},
+            {ORTH: "w", NORM: "ou"},
+        ],
+        "Lew": [
+            {ORTH: "Le", NORM: "Le"},
+            {ORTH: "w", NORM: "ou"},
+        ],
+        "nap": [
+            {ORTH: "n", NORM: "nou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Nap": [
+            {ORTH: "N", NORM: "Nou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "lap": [
+            {ORTH: "l", NORM: "li"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Lap": [
+            {ORTH: "L", NORM: "Li"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "yap": [
+            {ORTH: "y", NORM: "yo"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Yap": [
+            {ORTH: "Y", NORM: "Yo"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "mte": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "te", NORM: "te"},
+        ],
+        "Mte": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "te", NORM: "te"},
+        ],
+        "mpral": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "pral", NORM: "pral"},
+        ],
+        "Mpral": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "pral", NORM: "pral"},
+        ],
+        "wap": [
+            {ORTH: "w", NORM: "ou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Wap": [
+            {ORTH: "W", NORM: "Ou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "kap": [
+            {ORTH: "k", NORM: "ki"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Kap": [
+            {ORTH: "K", NORM: "Ki"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "tap": [
+            {ORTH: "t", NORM: "te"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Tap": [
+            {ORTH: "T", NORM: "Te"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+    }
+)
diff --git a/spacy/lang/hu/examples.py b/spacy/lang/hu/examples.py
index 711a438bd..c056c0967 100644
--- a/spacy/lang/hu/examples.py
+++ b/spacy/lang/hu/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.",
     "San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.",
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index dbf93c622..dc9741076 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -11,7 +11,7 @@ from ..char_classes import (
 )
 
 # removing ° from the special icons to keep e.g. 99° as one token
-_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
+_concat_icons = CONCAT_ICONS.replace("\u00b0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py
index 212a2ec86..9455396db 100644
--- a/spacy/lang/hy/examples.py
+++ b/spacy/lang/hy/examples.py
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
     "Ո՞վ է Ֆրանսիայի նախագահը։",
diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py
index d35271551..17d1c5f28 100644
--- a/spacy/lang/id/examples.py
+++ b/spacy/lang/id/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Indonesia merupakan negara kepulauan yang kaya akan budaya.",
    "Berapa banyak warga yang dibutuhkan saat kerja bakti?",
diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py
index 506721276..ae857382a 100644
--- a/spacy/lang/it/examples.py
+++ b/spacy/lang/it/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
     "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index e21e85cd9..492478af3 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -102,9 +102,9 @@ class JapaneseTokenizer(DummyTokenizer):
                     token.dictionary_form(),  # lemma
                     token.normalized_form(),
                     token.reading_form(),
-                    sub_tokens_list[idx]
-                    if sub_tokens_list
-                    else None,  # user_data['sub_tokens']
+                    (
+                        sub_tokens_list[idx] if sub_tokens_list else None
+                    ),  # user_data['sub_tokens']
                 )
                 for idx, token in enumerate(sudachipy_tokens)
                 if len(token.surface()) > 0
diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py
index c3a011862..a07711c53 100644
--- a/spacy/lang/ja/examples.py
+++ b/spacy/lang/ja/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "アップルがイギリスの新興企業を10億ドルで購入を検討",
     "自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める",
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index 5c14f41bf..527c83629 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -25,7 +25,9 @@ TAG_MAP = {
     # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below)
     # http://universaldependencies.org/ja/overview/morphology.html
     # http://universaldependencies.org/ja/pos/all.html
-    "記号-一般": {POS: NOUN},  # this includes characters used to represent sounds like ドレミ
+    "記号-一般": {
+        POS: NOUN
+    },  # this includes characters used to represent sounds like ドレミ
     "記号-文字": {
         POS: NOUN
     },  # this is for Greek and Latin characters having some meanings, or used as symbols, as in math
@@ -72,7 +74,9 @@ TAG_MAP = {
     "名詞-固有名詞-地名-国": {POS: PROPN},  # country name
     "名詞-助動詞語幹": {POS: AUX},
     "名詞-数詞": {POS: NUM},  # includes Chinese numerals
-    "名詞-普通名詞-サ変可能": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
+    "名詞-普通名詞-サ変可能": {
+        POS: NOUN
+    },  # XXX: sometimes VERB in UDv2; suru-verb noun
     "名詞-普通名詞-サ変形状詞可能": {POS: NOUN},
     "名詞-普通名詞-一般": {POS: NOUN},
     "名詞-普通名詞-形状詞可能": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
diff --git a/spacy/lang/kn/examples.py b/spacy/lang/kn/examples.py
index 3e055752e..7cbb7fc07 100644
--- a/spacy/lang/kn/examples.py
+++ b/spacy/lang/kn/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.",
     "ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.",
diff --git a/spacy/lang/lij/examples.py b/spacy/lang/lij/examples.py
index ba7fe43fd..ec336b07f 100644
--- a/spacy/lang/lij/examples.py
+++ b/spacy/lang/lij/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Sciusciâ e sciorbî no se peu.",
     "Graçie di çetroin, che me son arrivæ.",
diff --git a/spacy/lang/lt/examples.py b/spacy/lang/lt/examples.py
index eaf941f1a..57d6eb4d1 100644
--- a/spacy/lang/lt/examples.py
+++ b/spacy/lang/lt/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Jaunikis pirmąją vestuvinę naktį iškeitė į areštinės gultą",
     "Bepiločiai automobiliai išnaikins vairavimo mokyklas, autoservisus ir eismo nelaimes",
diff --git a/spacy/lang/ml/examples.py b/spacy/lang/ml/examples.py
index 9794eab29..d067b8b66 100644
--- a/spacy/lang/ml/examples.py
+++ b/spacy/lang/ml/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക",
     "പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി",
diff --git a/spacy/lang/ms/examples.py b/spacy/lang/ms/examples.py
index 97ab19b6e..1af439d4a 100644
--- a/spacy/lang/ms/examples.py
+++ b/spacy/lang/ms/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
     "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py
index b1a63ad74..242dab7c5 100644
--- a/spacy/lang/nb/examples.py
+++ b/spacy/lang/nb/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.",
     "Selvkjørende biler flytter forsikringsansvaret over på produsentene.",
diff --git a/spacy/lang/ne/examples.py b/spacy/lang/ne/examples.py
index a29b77c2f..cc3b382df 100644
--- a/spacy/lang/ne/examples.py
+++ b/spacy/lang/ne/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ",
     "स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्",
diff --git a/spacy/lang/nl/examples.py b/spacy/lang/nl/examples.py
index 8c8c50c60..3440f01db 100644
--- a/spacy/lang/nl/examples.py
+++ b/spacy/lang/nl/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple overweegt om voor 1 miljard een U.K. startup te kopen",
     "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
diff --git a/spacy/lang/nn/examples.py b/spacy/lang/nn/examples.py
index 95ec0aadd..ee03bf95e 100644
--- a/spacy/lang/nn/examples.py
+++ b/spacy/lang/nn/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
 sentences = [
     "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py
index b1ea5880f..cb55ed07d 100644
--- a/spacy/lang/pl/examples.py
+++ b/spacy/lang/pl/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Poczuł przyjemną woń mocnej kawy.",
     "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py
index 13f3512cf..42ae602c1 100644
--- a/spacy/lang/pt/examples.py
+++ b/spacy/lang/pt/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
     "Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
diff --git a/spacy/lang/ro/examples.py b/spacy/lang/ro/examples.py
index bfa258ffc..46b4c9a67 100644
--- a/spacy/lang/ro/examples.py
+++ b/spacy/lang/ro/examples.py
@@ -7,7 +7,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari",
     "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar",
diff --git a/spacy/lang/ru/examples.py b/spacy/lang/ru/examples.py
index adb007625..9595d583a 100644
--- a/spacy/lang/ru/examples.py
+++ b/spacy/lang/ru/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     # Translations from English:
     "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд",
diff --git a/spacy/lang/sa/examples.py b/spacy/lang/sa/examples.py
index 60243c04c..6a0bc4e13 100644
--- a/spacy/lang/sa/examples.py
+++ b/spacy/lang/sa/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "अभ्यावहति कल्याणं विविधं वाक् सुभाषिता ।",
     "मनसि व्याकुले चक्षुः पश्यन्नपि न पश्यति ।",
diff --git a/spacy/lang/si/examples.py b/spacy/lang/si/examples.py
index b34051d00..8e0ffec69 100644
--- a/spacy/lang/si/examples.py
+++ b/spacy/lang/si/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "මෙය වාක්‍යයකි.",
     "ඔබ කවුද?",
diff --git a/spacy/lang/sk/examples.py b/spacy/lang/sk/examples.py
index 736109a7c..079d0d2b1 100644
--- a/spacy/lang/sk/examples.py
+++ b/spacy/lang/sk/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Ardevop, s.r.o. je malá startup firma na území SR.",
     "Samojazdiace autá presúvajú poistnú zodpovednosť na výrobcov automobilov.",
diff --git a/spacy/lang/sl/examples.py b/spacy/lang/sl/examples.py
index bf483c6a4..79846114b 100644
--- a/spacy/lang/sl/examples.py
+++ b/spacy/lang/sl/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev",
     "France Prešeren je umrl 8. februarja 1849 v Kranju",
diff --git a/spacy/lang/sq/examples.py b/spacy/lang/sq/examples.py
index 06ed20fa1..61bf713a6 100644
--- a/spacy/lang/sq/examples.py
+++ b/spacy/lang/sq/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple po shqyrton blerjen e nje shoqërie të U.K. për 1 miliard dollarë",
     "Makinat autonome ndryshojnë përgjegjësinë e sigurimit ndaj prodhuesve",
diff --git a/spacy/lang/sr/examples.py b/spacy/lang/sr/examples.py
index ec7f57ced..2d34d42b4 100644
--- a/spacy/lang/sr/examples.py
+++ b/spacy/lang/sr/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     # Translations from English
     "Apple планира куповину америчког стартапа за $1 милијарду.",
diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py
index bc6cd7a54..ffea6e457 100644
--- a/spacy/lang/sv/examples.py
+++ b/spacy/lang/sv/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
     "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py
index e68dc6237..522cd926d 100644
--- a/spacy/lang/ta/examples.py
+++ b/spacy/lang/ta/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "கிறிஸ்துமஸ் மற்றும் இனிய புத்தாண்டு வாழ்த்துக்கள்",
     "எனக்கு என் குழந்தைப் பருவம் நினைவிருக்கிறது",
diff --git a/spacy/lang/te/examples.py b/spacy/lang/te/examples.py
index cff7d3cb0..4af872828 100644
--- a/spacy/lang/te/examples.py
+++ b/spacy/lang/te/examples.py
@@ -7,7 +7,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "ఆపిల్ 1 బిలియన్ డాలర్స్ కి యూ.కె. స్టార్ట్అప్ ని కొనాలని అనుకుంటుంది.",
     "ఆటోనోమోస్ కార్లు భీమా బాధ్యతను తయారీదారులపైకి మళ్లిస్తాయి.",
diff --git a/spacy/lang/ti/examples.py b/spacy/lang/ti/examples.py
index 167b58d09..146ac349b 100644
--- a/spacy/lang/ti/examples.py
+++ b/spacy/lang/ti/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "አፕል ብዩኬ ትርከብ ንግድ ብ1 ቢሊዮን ዶላር ንምግዛዕ ሐሲባ።",
     "ፈላማይ ክታበት ኮቪድ 19 ተጀሚሩ፤ሓዱሽ ተስፋ ሂቡ ኣሎ",
diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py
index 7b33fae5a..fb6d96f97 100644
--- a/spacy/lang/tn/examples.py
+++ b/spacy/lang/tn/examples.py
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
     "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py
index c912c950d..c96e54032 100644
--- a/spacy/lang/tr/examples.py
+++ b/spacy/lang/tr/examples.py
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Neredesin?",
     "Neredesiniz?",
diff --git a/spacy/lang/uk/examples.py b/spacy/lang/uk/examples.py
index f75d44488..3335c82ac 100644
--- a/spacy/lang/uk/examples.py
+++ b/spacy/lang/uk/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Ніч на середу буде морозною.",
     "Чим кращі книги ти читав, тим гірше спиш.",  # Serhiy Zhadan
diff --git a/spacy/lang/ur/examples.py b/spacy/lang/ur/examples.py
index e55b337be..f612c6b81 100644
--- a/spacy/lang/ur/examples.py
+++ b/spacy/lang/ur/examples.py
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "اردو ہے جس کا نام ہم جانتے ہیں داغ",
     "سارے جہاں میں دھوم ہماری زباں کی ہے",
diff --git a/spacy/lang/vi/examples.py b/spacy/lang/vi/examples.py
index 36575f67c..5f2a9b2ba 100644
--- a/spacy/lang/vi/examples.py
+++ b/spacy/lang/vi/examples.py
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
     "Đây là đâu, tôi là ai?",
     "Căn phòng có nhiều cửa sổ nên nó khá sáng",
diff --git a/spacy/language.py b/spacy/language.py
index 5b9eb8bd2..dcf436c65 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1519,8 +1519,7 @@ class Language:
         disable: Iterable[str] = ...,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
         n_process: int = ...,
-    ) -> Iterator[Doc]:
-        ...
+    ) -> Iterator[Doc]: ...
 
     @overload
     def pipe(  # noqa: F811
@@ -1532,8 +1531,7 @@ class Language:
         disable: Iterable[str] = ...,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
         n_process: int = ...,
-    ) -> Iterator[Tuple[Doc, _AnyContext]]:
-        ...
+    ) -> Iterator[Tuple[Doc, _AnyContext]]: ...
 
     def pipe(  # noqa: F811
         self,
@@ -1641,7 +1639,7 @@ class Language:
         batch_size: int,
     ) -> Iterator[Doc]:
         def prepare_input(
-            texts: Iterable[Union[str, Doc]]
+            texts: Iterable[Union[str, Doc]],
         ) -> Iterable[Tuple[Union[str, bytes], _AnyContext]]:
             # Serialize Doc inputs to bytes to avoid incurring pickling
             # overhead when they are passed to child processes. Also yield
@@ -1943,9 +1941,9 @@ class Language:
                     )
                 if "_sourced_vectors_hashes" not in nlp.meta:
                     nlp.meta["_sourced_vectors_hashes"] = {}
-                nlp.meta["_sourced_vectors_hashes"][
-                    pipe_name
-                ] = source_nlp_vectors_hashes[model]
+                nlp.meta["_sourced_vectors_hashes"][pipe_name] = (
+                    source_nlp_vectors_hashes[model]
+                )
             # Delete from cache if listeners were replaced
             if listeners_replaced:
                 del source_nlps[model]
diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi
index b9fbabda7..d84a30a58 100644
--- a/spacy/matcher/dependencymatcher.pyi
+++ b/spacy/matcher/dependencymatcher.pyi
@@ -51,9 +51,7 @@ class DependencyMatcher:
         ] = ...
     ) -> None: ...
     def has_key(self, key: Union[str, int]) -> bool: ...
-    def get(
-        self, key: Union[str, int], default: Optional[Any] = ...
-    ) -> Tuple[
+    def get(self, key: Union[str, int], default: Optional[Any] = ...) -> Tuple[
         Optional[
             Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
         ],
diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py
index 2f869ad65..fb4e3c39a 100644
--- a/spacy/ml/featureextractor.py
+++ b/spacy/ml/featureextractor.py
@@ -7,7 +7,7 @@ from ..tokens import Doc
 
 
 def FeatureExtractor(
-    columns: Union[List[str], List[int], List[Union[int, str]]]
+    columns: Union[List[str], List[int], List[Union[int, str]]],
 ) -> Model[List[Doc], List[Ints2d]]:
     return Model("extract_features", forward, attrs={"columns": columns})
 
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 752d1c443..8b12720db 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -122,7 +122,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates
 
 
-def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
-]:
+def create_candidates_batch() -> (
+    Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
+):
     return get_candidates_batch
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 6029ed313..0941b43c1 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -93,7 +93,7 @@ class EditTreeLemmatizer(TrainablePipe):
         truths = []
         for eg in examples:
             eg_truths = []
-            for (predicted, gold_lemma) in zip(
+            for predicted, gold_lemma in zip(
                 eg.predicted, eg.get_aligned("LEMMA", as_string=True)
             ):
                 if gold_lemma is None or gold_lemma == "":
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 030572850..805a0538f 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -80,8 +80,7 @@ DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
 
 @runtime_checkable
 class Suggester(Protocol):
-    def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
-        ...
+    def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged: ...
 
 
 def ngram_suggester(
diff --git a/spacy/registrations.py b/spacy/registrations.py
index f742da9d3..7e29486b6 100644
--- a/spacy/registrations.py
+++ b/spacy/registrations.py
@@ -6,6 +6,7 @@ remain in their original locations, but decoration is moved here.
 
 Component definitions and registrations are in spacy/pipeline/factories.py
 """
+
 # Global flag to track if registry has been populated
 REGISTRY_POPULATED = False
 
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 73544c51a..d72c916ef 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -141,7 +141,8 @@ def test_issue3869(sentence):
 @pytest.mark.issue(3962)
 def test_issue3962(en_vocab):
     """Ensure that as_doc does not result in out-of-bound access of tokens.
-    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    This is achieved by setting the head to itself if it would lie out of the span otherwise.
+    """
     # fmt: off
     words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
     heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
@@ -180,7 +181,8 @@ def test_issue3962(en_vocab):
 @pytest.mark.issue(3962)
 def test_issue3962_long(en_vocab):
     """Ensure that as_doc does not result in out-of-bound access of tokens.
-    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    This is achieved by setting the head to itself if it would lie out of the span otherwise.
+    """
     # fmt: off
     words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
     heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]
diff --git a/spacy/tests/lang/ht/test_exceptions.py b/spacy/tests/lang/ht/test_exceptions.py
index 685b72c07..ea2e2b204 100644
--- a/spacy/tests/lang/ht/test_exceptions.py
+++ b/spacy/tests/lang/ht/test_exceptions.py
@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]
diff --git a/spacy/tests/lang/ht/test_prefix_suffix_infix.py b/spacy/tests/lang/ht/test_prefix_suffix_infix.py
index 7dabec17a..5ff409cd9 100644
--- a/spacy/tests/lang/ht/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ht/test_prefix_suffix_infix.py
@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length
diff --git a/spacy/tests/lang/ht/test_text.py b/spacy/tests/lang/ht/test_text.py
index f396e352a..e63299fc0 100644
--- a/spacy/tests/lang/ht/test_text.py
+++ b/spacy/tests/lang/ht/test_text.py
@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84
 
 
-
 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 
 
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected
-
diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py
index fa689c8f3..30f3e9487 100644
--- a/spacy/tests/lang/hu/test_tokenizer.py
+++ b/spacy/tests/lang/hu/test_tokenizer.py
@@ -304,9 +304,11 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0])
 SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0]
 TESTS.extend(
     [
-        pytest.param(x[0], x[1], marks=pytest.mark.slow())
-        if not isinstance(x[0], tuple)
-        else x
+        (
+            pytest.param(x[0], x[1], marks=pytest.mark.slow())
+            if not isinstance(x[0], tuple)
+            else x
+        )
         for x in SLOW_TESTS
     ]
 )
diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py
index 3b65fee23..1109766dc 100644
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@@ -544,7 +544,7 @@ def test_greedy_matching_longest(doc, text, pattern, longest):
     matcher = Matcher(doc.vocab)
     matcher.add("RULE", [pattern], greedy="LONGEST")
     matches = matcher(doc)
-    for (key, s, e) in matches:
+    for key, s, e in matches:
         assert doc[s:e].text == longest
 
 
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 5e50a4d28..1b6f49f4c 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -496,15 +496,15 @@ def test_el_pipe_configuration(nlp):
         return [get_lowercased_candidates(kb, span) for span in spans]
 
     @registry.misc("spacy.LowercaseCandidateGenerator.v1")
-    def create_candidates() -> Callable[
-        [InMemoryLookupKB, "Span"], Iterable[Candidate]
-    ]:
+    def create_candidates() -> (
+        Callable[[InMemoryLookupKB, "Span"], Iterable[Candidate]]
+    ):
         return get_lowercased_candidates
 
     @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
-    def create_candidates_batch() -> Callable[
-        [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
-    ]:
+    def create_candidates_batch() -> (
+        Callable[[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]]
+    ):
        return get_lowercased_candidates_batch
 
     # replace the pipe with a new one with with a different candidate generator
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index c45dccb06..b355379bf 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -279,20 +279,17 @@ def test_pipe_factories_wrong_formats():
     with pytest.raises(ValueError):
         # Decorator is not called
         @Language.component
-        def component(foo: int, bar: str):
-            ...
+        def component(foo: int, bar: str): ...
 
     with pytest.raises(ValueError):
         # Decorator is not called
         @Language.factory
-        def factory1(foo: int, bar: str):
-            ...
+        def factory1(foo: int, bar: str): ...
 
     with pytest.raises(ValueError):
         # Factory function is missing "nlp" and "name" arguments
         @Language.factory("test_pipe_factories_missing_args")
-        def factory2(foo: int, bar: str):
-            ...
+        def factory2(foo: int, bar: str): ...
 
 
 def test_pipe_factory_meta_config_cleanup():
@@ -329,8 +326,7 @@ def test_pipe_factories_empty_dict_default():
     name = "test_pipe_factories_empty_dict_default"
 
     @Language.factory(name, default_config={"foo": {}})
-    def factory(nlp: Language, name: str, foo: dict):
-        ...
+    def factory(nlp: Language, name: str, foo: dict): ...
 
     nlp = Language()
     nlp.create_pipe(name)
@@ -549,11 +545,9 @@ def test_pipe_factories_from_source_config():
 
 
 class PipeFactoriesIdempotent:
-    def __init__(self, nlp, name):
-        ...
+    def __init__(self, nlp, name): ...
 
-    def __call__(self, doc):
-        ...
+    def __call__(self, doc): ...
 
 
 @pytest.mark.parametrize(
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 8e4a5ed7c..4310e41ab 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -874,7 +874,8 @@ def test_textcat_eval_missing(multi_label: bool, spring_p: float):
 def test_textcat_loss(multi_label: bool, expected_loss: float):
     """
     multi-label: the missing 'spring' in gold_doc_2 doesn't incur an increase in loss
-    exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss"""
+    exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss
+    """
     train_examples = []
     nlp = English()
 
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 7b729d78f..43d5f6283 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -890,7 +890,7 @@ def test_cli_find_threshold(capsys):
         return docs
 
     def init_nlp(
-        components: Tuple[Tuple[str, Dict[str, Any]], ...] = ()
+        components: Tuple[Tuple[str, Dict[str, Any]], ...] = (),
     ) -> Tuple[Language, List[Example]]:
         new_nlp = English()
         new_nlp.add_pipe(  # type: ignore
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index f0b68862c..d92f04d05 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -57,9 +57,7 @@ class Doc:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[DocMethod],
         Optional[Callable[[Doc], Any]],
@@ -68,9 +66,7 @@ class Doc:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[DocMethod],
         Optional[Callable[[Doc], Any]],
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index b982eb810..070aaffb3 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -23,9 +23,7 @@ class Span:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[SpanMethod],
         Optional[Callable[[Span], Any]],
@@ -34,9 +32,7 @@ class Span:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[SpanMethod],
         Optional[Callable[[Span], Any]],
diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi
index 435ace527..7e56ae3bc 100644
--- a/spacy/tokens/token.pyi
+++ b/spacy/tokens/token.pyi
@@ -27,9 +27,7 @@ class Token:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[TokenMethod],
         Optional[Callable[[Token], Any]],
@@ -38,9 +36,7 @@ class Token:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[TokenMethod],
         Optional[Callable[[Token], Any]],
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 56df53957..6f5099858 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -354,7 +354,7 @@ def update_meta(
 
 
 def create_before_to_disk_callback(
-    callback: Optional[Callable[["Language"], "Language"]]
+    callback: Optional[Callable[["Language"], "Language"]],
 ) -> Callable[["Language"], "Language"]:
     from ..language import Language  # noqa: F811
 
diff --git a/spacy/ty.py b/spacy/ty.py
index f389456c0..b37f2e18a 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -30,11 +30,9 @@ class TrainableComponent(Protocol):
         drop: float = 0.0,
         sgd: Optional[Optimizer] = None,
         losses: Optional[Dict[str, float]] = None
-    ) -> Dict[str, float]:
-        ...
+    ) -> Dict[str, float]: ...
 
-    def finish_update(self, sgd: Optimizer) -> None:
-        ...
+    def finish_update(self, sgd: Optimizer) -> None: ...
 
 
 @runtime_checkable
@@ -44,8 +42,7 @@ class InitializableComponent(Protocol):
         get_examples: Callable[[], Iterable["Example"]],
         nlp: "Language",
         **kwargs: Any
-    ):
-        ...
+    ): ...
 
 
 @runtime_checkable
@@ -55,11 +52,8 @@ class ListenedToComponent(Protocol):
     listener_map: Dict[str, Sequence[Model]]
     listening_components: List[str]
 
-    def add_listener(self, listener: Model, component_name: str) -> None:
-        ...
+    def add_listener(self, listener: Model, component_name: str) -> None: ...
 
-    def remove_listener(self, listener: Model, component_name: str) -> bool:
-        ...
+    def remove_listener(self, listener: Model, component_name: str) -> bool: ...
 
-    def find_listeners(self, component) -> None:
-        ...
+    def find_listeners(self, component) -> None: ...
diff --git a/spacy/util.py b/spacy/util.py
index 527e6eb3a..ad5a7e0ba 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -657,7 +657,7 @@ def load_model_from_config(
 
 
 def get_sourced_components(
-    config: Union[Dict[str, Any], Config]
+    config: Union[Dict[str, Any], Config],
 ) -> Dict[str, Dict[str, Any]]:
     """RETURNS (List[str]): All sourced components in the original config,
     e.g. {"source": "en_core_web_sm"}. If the config contains a key