From 453732d32d55029ea9787ef737c8cf8d626f45b0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Mar 2026 09:56:06 +0100 Subject: [PATCH] Format (#13929) --- spacy/lang/af/stop_words.py | 6 ++---- spacy/lang/am/stop_words.py | 6 ++---- spacy/lang/ar/lex_attrs.py | 12 ++++-------- spacy/lang/ar/stop_words.py | 6 ++---- spacy/lang/az/stop_words.py | 6 ++---- spacy/lang/bg/stop_words.py | 6 ++---- spacy/lang/bn/stop_words.py | 6 ++---- spacy/lang/bo/stop_words.py | 6 ++---- spacy/lang/ca/stop_words.py | 6 ++---- spacy/lang/cs/stop_words.py | 6 ++---- spacy/lang/da/stop_words.py | 6 ++---- spacy/lang/de/stop_words.py | 6 ++---- spacy/lang/dsb/stop_words.py | 6 ++---- spacy/lang/el/stop_words.py | 6 ++---- spacy/lang/en/stop_words.py | 6 ++---- spacy/lang/es/stop_words.py | 6 ++---- spacy/lang/et/stop_words.py | 6 ++---- spacy/lang/eu/stop_words.py | 6 ++---- spacy/lang/fa/lex_attrs.py | 12 ++++-------- spacy/lang/fa/stop_words.py | 6 ++---- spacy/lang/fi/stop_words.py | 6 ++---- spacy/lang/fr/lex_attrs.py | 12 ++++-------- spacy/lang/fr/stop_words.py | 6 ++---- spacy/lang/ga/stop_words.py | 6 ++---- spacy/lang/gd/stop_words.py | 8 ++------ spacy/lang/gd/tokenizer_exceptions.py | 4 +--- spacy/lang/grc/stop_words.py | 6 ++---- spacy/lang/gu/stop_words.py | 6 ++---- spacy/lang/he/stop_words.py | 6 ++---- spacy/lang/hi/stop_words.py | 6 ++---- spacy/lang/hr/stop_words.py | 6 ++---- spacy/lang/hsb/stop_words.py | 6 ++---- spacy/lang/ht/lex_attrs.py | 12 ++++-------- spacy/lang/ht/stop_words.py | 6 ++---- spacy/lang/hu/stop_words.py | 6 ++---- spacy/lang/hy/stop_words.py | 6 ++---- spacy/lang/id/_tokenizer_exceptions_list.py | 6 ++---- spacy/lang/id/stop_words.py | 6 ++---- spacy/lang/is/stop_words.py | 6 ++---- spacy/lang/it/stop_words.py | 6 ++---- spacy/lang/ja/stop_words.py | 6 ++---- spacy/lang/kmr/stop_words.py | 6 ++---- spacy/lang/kn/stop_words.py | 6 ++---- spacy/lang/ko/stop_words.py | 6 ++---- spacy/lang/ky/stop_words.py | 6 ++---- spacy/lang/la/stop_words.py | 6 ++---- spacy/lang/lb/lex_attrs.py | 12 ++++-------- spacy/lang/lb/stop_words.py | 6 ++---- spacy/lang/lg/stop_words.py | 6 ++---- spacy/lang/lij/stop_words.py | 6 ++---- spacy/lang/lv/stop_words.py | 6 ++---- spacy/lang/mk/stop_words.py | 6 ++---- spacy/lang/ml/stop_words.py | 6 ++---- spacy/lang/mr/stop_words.py | 6 ++---- spacy/lang/ms/_tokenizer_exceptions_list.py | 6 ++---- spacy/lang/ms/stop_words.py | 6 ++---- spacy/lang/nb/stop_words.py | 6 ++---- spacy/lang/ne/stop_words.py | 6 ++---- spacy/lang/nl/lex_attrs.py | 12 ++++-------- spacy/lang/nl/stop_words.py | 6 ++---- spacy/lang/pl/stop_words.py | 6 ++---- spacy/lang/pt/stop_words.py | 6 ++---- spacy/lang/ro/lex_attrs.py | 12 ++++-------- spacy/lang/ro/stop_words.py | 6 ++---- spacy/lang/ru/lex_attrs.py | 8 ++------ spacy/lang/ru/stop_words.py | 6 ++---- spacy/lang/sa/stop_words.py | 6 ++---- spacy/lang/si/stop_words.py | 6 ++---- spacy/lang/sk/stop_words.py | 6 ++---- spacy/lang/sl/lex_attrs.py | 18 ++++++------------ spacy/lang/sl/stop_words.py | 6 ++---- spacy/lang/sq/stop_words.py | 6 ++---- spacy/lang/sr/stop_words.py | 6 ++---- spacy/lang/sv/stop_words.py | 6 ++---- spacy/lang/ta/stop_words.py | 6 ++---- spacy/lang/te/stop_words.py | 6 ++---- spacy/lang/th/stop_words.py | 6 ++---- spacy/lang/ti/stop_words.py | 6 ++---- spacy/lang/tl/stop_words.py | 6 ++---- spacy/lang/tn/stop_words.py | 6 ++---- spacy/lang/tokenizer_exceptions.py | 6 ++---- spacy/lang/tr/stop_words.py | 6 ++---- spacy/lang/tt/stop_words.py | 6 ++---- spacy/lang/uk/stop_words.py | 6 ++---- spacy/lang/ur/lex_attrs.py | 4 +++- spacy/lang/ur/stop_words.py | 6 ++---- spacy/lang/vi/stop_words.py | 8 ++------ spacy/lang/zh/__init__.py | 10 +++++----- spacy/lang/zh/stop_words.py | 6 ++---- spacy/lookups.py | 6 +++--- spacy/tests/doc/test_creation.py | 8 ++++---- spacy/tests/matcher/test_phrase_matcher.py | 2 +- 92 files changed, 207 insertions(+), 403 deletions(-) diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py index 4b5a04a5e..337afb57f 100644 --- a/spacy/lang/af/stop_words.py +++ b/spacy/lang/af/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-af -STOP_WORDS = set( - """ +STOP_WORDS = set(""" 'n aan af @@ -53,5 +52,4 @@ vir was wat ʼn -""".split() -) +""".split()) diff --git a/spacy/lang/am/stop_words.py b/spacy/lang/am/stop_words.py index 5487ada5a..8a04c555f 100644 --- a/spacy/lang/am/stop_words.py +++ b/spacy/lang/am/stop_words.py @@ -1,8 +1,7 @@ # Stop words by Teshome Kassie http://etd.aau.edu.et/bitstream/handle/123456789/3315/Teshome%20Kassie.pdf?sequence=1&isAllowed=y # Stop words by Tihitina Petros http://etd.aau.edu.et/bitstream/handle/123456789/3384/Tihitina%20Petros.pdf?sequence=1&isAllowed=y -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ @@ -29,5 +28,4 @@ STOP_WORDS = set( በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም -""".split() -) +""".split()) diff --git a/spacy/lang/ar/lex_attrs.py b/spacy/lang/ar/lex_attrs.py index 54ad7a8c3..6e943d064 100644 --- a/spacy/lang/ar/lex_attrs.py +++ b/spacy/lang/ar/lex_attrs.py @@ -1,7 +1,6 @@ from ...attrs import LIKE_NUM -_num_words = set( - """ +_num_words = set(""" صفر واحد إثنان @@ -51,11 +50,9 @@ _num_words = set( مليون مليار مليارات -""".split() -) +""".split()) -_ordinal_words = set( - """ +_ordinal_words = set(""" اول أول حاد @@ -70,8 +67,7 @@ _ordinal_words = set( ثامن تاسع عاشر -""".split() -) +""".split()) def like_num(text): diff --git a/spacy/lang/ar/stop_words.py b/spacy/lang/ar/stop_words.py index f4da54dda..65c8992cb 100644 --- a/spacy/lang/ar/stop_words.py +++ b/spacy/lang/ar/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" من نحو لعل @@ -386,5 +385,4 @@ STOP_WORDS = set( وإن ولو يا -""".split() -) +""".split()) diff --git a/spacy/lang/az/stop_words.py b/spacy/lang/az/stop_words.py index 2114939ba..8beffa998 100644 --- a/spacy/lang/az/stop_words.py +++ b/spacy/lang/az/stop_words.py @@ -1,6 +1,5 @@ # Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py -STOP_WORDS = set( - """ +STOP_WORDS = set(""" amma arasında artıq @@ -141,5 +140,4 @@ zaman əlbəttə ən əslində -""".split() -) +""".split()) diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index 061850da5..7d3e75605 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -4,8 +4,7 @@ References: https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it. """ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" а автентичен аз ако ала бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат @@ -76,5 +75,4 @@ STOP_WORDS = set( юмрук я як -""".split() -) +""".split()) diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index bf38e3254..5aec18b7f 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও ইত্যাদি ইহা @@ -38,5 +37,4 @@ STOP_WORDS = set( সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায় -""".split() -) +""".split()) diff --git a/spacy/lang/bo/stop_words.py b/spacy/lang/bo/stop_words.py index 407242c84..158e148b0 100644 --- a/spacy/lang/bo/stop_words.py +++ b/spacy/lang/bo/stop_words.py @@ -1,7 +1,6 @@ # Source: https://zenodo.org/records/10148636 -STOP_WORDS = set( - """ +STOP_WORDS = set(""" འི་ ། དུ་ @@ -194,5 +193,4 @@ STOP_WORDS = set( གིང་ ཚ་ ཀྱང -""".split() -) +""".split()) diff --git a/spacy/lang/ca/stop_words.py b/spacy/lang/ca/stop_words.py index 1a87b2f9d..90cce5de8 100644 --- a/spacy/lang/ca/stop_words.py +++ b/spacy/lang/ca/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò als altra altre altres amb ambdues ambdós anar ans apa aquell aquella aquelles aquells aquest aquesta aquestes aquests aquí @@ -48,5 +47,4 @@ un una unes uns us últim ús va vaig vam van vas veu vosaltres vostra vostre vostres -""".split() -) +""".split()) diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py index f61f424f6..35db9fedc 100644 --- a/spacy/lang/cs/stop_words.py +++ b/spacy/lang/cs/stop_words.py @@ -1,8 +1,7 @@ # Source: https://github.com/Alir3z4/stop-words # Source: https://github.com/stopwords-iso/stopwords-cs/blob/master/stopwords-cs.txt -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a aby ahoj @@ -361,5 +360,4 @@ zač zatímco ze že -""".split() -) +""".split()) diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py index 05b2084dd..0e71dfde7 100644 --- a/spacy/lang/da/stop_words.py +++ b/spacy/lang/da/stop_words.py @@ -1,7 +1,6 @@ # Source: Handpicked by Jens Dahl Møllerhøj. -STOP_WORDS = set( - """ +STOP_WORDS = set(""" af aldrig alene alle allerede alligevel alt altid anden andet andre at bag begge blandt blev blive bliver burde bør @@ -41,5 +40,4 @@ ud uden udover under undtagen var ved vi via vil ville vore vores vær være været øvrigt -""".split() -) +""".split()) diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index f52687eb9..5fbd74287 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" á a ab aber ach acht achte achten achter achtes ag alle allein allem allen aller allerdings alles allgemeinen als also am an andere anderen anderem andern anders auch auf aus ausser außer ausserdem außerdem @@ -74,5 +73,4 @@ wollt wollte wollten worden wurde würde wurden würden zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen -""".split() -) +""".split()) diff --git a/spacy/lang/dsb/stop_words.py b/spacy/lang/dsb/stop_words.py index 376e04aa6..90735a623 100644 --- a/spacy/lang/dsb/stop_words.py +++ b/spacy/lang/dsb/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a abo aby ako ale až daniž dokulaž @@ -11,5 +10,4 @@ jolic pak pótom teke togodla -""".split() -) +""".split()) diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py index 7c436219f..b5c1c36c4 100644 --- a/spacy/lang/el/stop_words.py +++ b/spacy/lang/el/stop_words.py @@ -1,7 +1,6 @@ # Stop words # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0 -STOP_WORDS = set( - """ +STOP_WORDS = set(""" αδιάκοπα αι ακόμα ακόμη ακριβώς άλλα αλλά αλλαχού άλλες άλλη άλλην άλλης αλλιώς αλλιώτικα άλλο άλλοι αλλοιώς αλλοιώτικα άλλον άλλος άλλοτε αλλού άλλους άλλων άμα άμεσα αμέσως αν ανά ανάμεσα αναμεταξύ άνευ αντί αντίπερα αντίς @@ -83,5 +82,4 @@ STOP_WORDS = set( χωρίς χωριστά ω ως ωσάν ωσότου ώσπου ώστε ωστόσο ωχ -""".split() -) +""".split()) diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 1ca5cbc16..cbce281b4 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -1,6 +1,5 @@ # Stop words -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a about above across after afterwards again against all almost alone along already also although always am among amongst amount an and another any anyhow anyone anything anyway anywhere are around as at @@ -62,8 +61,7 @@ whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you your yours yourself yourselves -""".split() -) +""".split()) contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"] STOP_WORDS.update(contractions) diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py index 6d2885481..5099359e8 100644 --- a/spacy/lang/es/stop_words.py +++ b/spacy/lang/es/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna algunas alguno algunos algún alli allí alrededor ambos ante anterior antes apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél @@ -76,5 +75,4 @@ va vais vamos van varias varios vaya veces ver verdad verdadera verdadero vez vosotras vosotros voy vuestra vuestras vuestro vuestros y ya yo -""".split() -) +""".split()) diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py index e1da1f14d..248bcb61f 100644 --- a/spacy/lang/et/stop_words.py +++ b/spacy/lang/et/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-et -STOP_WORDS = set( - """ +STOP_WORDS = set(""" aga ei et @@ -37,5 +36,4 @@ siis ta te ära -""".split() -) +""".split()) diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py index d213b5b81..4a6661e7d 100644 --- a/spacy/lang/eu/stop_words.py +++ b/spacy/lang/eu/stop_words.py @@ -1,8 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-eu # https://www.ranks.nl/stopwords/basque # https://www.mustgo.com/worldlanguages/basque/ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" al anitz arabera @@ -101,5 +100,4 @@ zu zuek zuen zuten -""".split() -) +""".split()) diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py index 065e81bd6..9b0ff546e 100644 --- a/spacy/lang/fa/lex_attrs.py +++ b/spacy/lang/fa/lex_attrs.py @@ -5,8 +5,7 @@ ZWNJ_O_MIM = "‌ام" YE_NUN = "ین" -_num_words = set( - """ +_num_words = set(""" صفر یک دو @@ -63,15 +62,12 @@ _num_words = set( کوادریلیون کادریلیارد کوینتیلیون -""".split() -) +""".split()) -_ordinal_words = set( - """ +_ordinal_words = set(""" اول سوم -سی‌ام""".split() -) +سی‌ام""".split()) _ordinal_words.update({num + MIM for num in _num_words}) _ordinal_words.update({num + ZWNJ_O_MIM for num in _num_words}) diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py index f462f2e7a..93738c892 100644 --- a/spacy/lang/fa/stop_words.py +++ b/spacy/lang/fa/stop_words.py @@ -1,6 +1,5 @@ # Stop words from HAZM package -STOP_WORDS = set( - """ +STOP_WORDS = set(""" و در به @@ -389,5 +388,4 @@ STOP_WORDS = set( لذا زاده گردد -اینجا""".split() -) +اینجا""".split()) diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py index 8e8dcfa56..742cacc26 100644 --- a/spacy/lang/fi/stop_words.py +++ b/spacy/lang/fi/stop_words.py @@ -1,7 +1,6 @@ # Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt # Reformatted with some minor corrections -STOP_WORDS = set( - """ +STOP_WORDS = set(""" aiemmin aika aikaa aikaan aikaisemmin aikaisin aikana aikoina aikoo aikovat aina ainakaan ainakin ainoa ainoat aiomme aion aiotte aivan ajan alas alemmas alkuisin alkuun alla alle aloitamme aloitan aloitat aloitatte aloitattivat @@ -106,5 +105,4 @@ yhtäällä yhtäältä yhtään yhä yksi yksin yksittäin yleensä ylemmäs yl ympäri älköön älä -""".split() -) +""".split()) diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py index 9cf508a07..8a9dfb82a 100644 --- a/spacy/lang/fr/lex_attrs.py +++ b/spacy/lang/fr/lex_attrs.py @@ -1,24 +1,20 @@ from ...attrs import LIKE_NUM -_num_words = set( - """ +_num_words = set(""" zero un une deux trois quatre cinq six sept huit neuf dix onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante cent mille mil million milliard billion quadrillion quintillion sextillion septillion octillion nonillion decillion -""".split() -) +""".split()) -_ordinal_words = set( - """ +_ordinal_words = set(""" premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième centième millième millionnième milliardième billionnième quadrillionnième quintillionnième sextillionnième septillionnième octillionnième nonillionnième decillionnième -""".split() -) +""".split()) def like_num(text): diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index b32ee3d71..85ffe47ba 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a à â abord afin ah ai aie ainsi ait allaient allons alors anterieur anterieure anterieures antérieur antérieure antérieures apres après as assez attendu au @@ -80,5 +79,4 @@ votre votres vous vous-mêmes vu vé vôtre vôtres y -""".split() -) +""".split()) diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py index 4ef052ca5..e32ad6431 100644 --- a/spacy/lang/ga/stop_words.py +++ b/spacy/lang/ga/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a ach ag agus an aon ar arna as ba beirt bhúr @@ -39,5 +38,4 @@ um í ó ón óna ónár -""".split() -) +""".split()) diff --git a/spacy/lang/gd/stop_words.py b/spacy/lang/gd/stop_words.py index 9f5a66cbc..6f2c2856b 100644 --- a/spacy/lang/gd/stop_words.py +++ b/spacy/lang/gd/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" 'ad 'ar 'd # iad @@ -382,7 +381,4 @@ urrainn ì ò ó -""".split( - "\n" - ) -) +""".split("\n")) diff --git a/spacy/lang/gd/tokenizer_exceptions.py b/spacy/lang/gd/tokenizer_exceptions.py index 76e169d90..b65584154 100644 --- a/spacy/lang/gd/tokenizer_exceptions.py +++ b/spacy/lang/gd/tokenizer_exceptions.py @@ -1974,9 +1974,7 @@ Tron an tuilleadh 's a chòir Tuilleadh 's a chòir tuilleadh sa chòir -Tuilleadh sa chòir""".split( - "\n" -): +Tuilleadh sa chòir""".split("\n"): _exc[orth] = [{ORTH: orth}] diff --git a/spacy/lang/grc/stop_words.py b/spacy/lang/grc/stop_words.py index cbb766a8c..51f5e9d9d 100644 --- a/spacy/lang/grc/stop_words.py +++ b/spacy/lang/grc/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν @@ -57,5 +56,4 @@ STOP_WORDS = set( ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ' ὤ ὢ - """.split() -) + """.split()) diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py index 2c859681b..1d11a3ebd 100644 --- a/spacy/lang/gu/stop_words.py +++ b/spacy/lang/gu/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" એમ આ એ @@ -84,5 +83,4 @@ STOP_WORDS = set( દર એટલો પરંતુ -""".split() -) +""".split()) diff --git a/spacy/lang/he/stop_words.py b/spacy/lang/he/stop_words.py index 23bb5176d..ea4867224 100644 --- a/spacy/lang/he/stop_words.py +++ b/spacy/lang/he/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" אני את אתה @@ -218,5 +217,4 @@ STOP_WORDS = set( אחרות אשר או -""".split() -) +""".split()) diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py index 475b07da1..9bc57bd31 100644 --- a/spacy/lang/hi/stop_words.py +++ b/spacy/lang/hi/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6 -STOP_WORDS = set( - """ +STOP_WORDS = set(""" अंदर अत अदि @@ -235,5 +234,4 @@ STOP_WORDS = set( होते होना होने -""".split() -) +""".split()) diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py index dd10f792d..769ebe4db 100644 --- a/spacy/lang/hr/stop_words.py +++ b/spacy/lang/hr/stop_words.py @@ -1,6 +1,5 @@ # Source: https://github.com/stopwords-iso/stopwords-hr -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a ah aha @@ -340,5 +339,4 @@ zbog željeo zimus zum -""".split() -) +""".split()) diff --git a/spacy/lang/hsb/stop_words.py b/spacy/lang/hsb/stop_words.py index e6fedaf4c..86021f555 100644 --- a/spacy/lang/hsb/stop_words.py +++ b/spacy/lang/hsb/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a abo ale ani dokelž @@ -15,5 +14,4 @@ pak potom tež tohodla zo zoby -""".split() -) +""".split()) diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py index ab1a39a82..27a535dd7 100644 --- a/spacy/lang/ht/lex_attrs.py +++ b/spacy/lang/ht/lex_attrs.py @@ -1,24 +1,20 @@ from ...attrs import LIKE_NUM, NORM # Cardinal numbers in Creole -_num_words = set( - """ +_num_words = set(""" zewo youn en de twa kat senk sis sèt uit nèf dis onz douz trèz katoz kenz sèz disèt dizwit diznèf vent trant karant sinkant swasant swasann-dis san mil milyon milya -""".split() -) +""".split()) # Ordinal numbers in Creole (some are French-influenced, some simplified) -_ordinal_words = set( - """ +_ordinal_words = set(""" premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm ventyèm trantyèm karantyèm sinkantyèm swasantyèm swasann-disyèm santyèm milyèm milyonnyèm milyadyèm -""".split() -) +""".split()) NORM_MAP = { "'m": "mwen", diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py index 50998e0e5..fd85c2a19 100644 --- a/spacy/lang/ht/stop_words.py +++ b/spacy/lang/ht/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a ak an ankò ant apre ap atò avan avanlè byen bò byenke @@ -39,8 +38,7 @@ sa san si swa si men mèsi oswa osinon -""".split() -) +""".split()) # Add common contractions, with and without apostrophe variants contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"] diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py index e39a26d35..184155707 100644 --- a/spacy/lang/hu/stop_words.py +++ b/spacy/lang/hu/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben amelyeket amelyet amelynek ami amikor amit amolyan amíg annak arra arról az azok azon azonban azt aztán azután azzal azért @@ -58,5 +57,4 @@ volna volt voltak voltam voltunk úgy új újabb újra ő őket -""".split() -) +""".split()) diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index 46d0f6b51..1bfd09a4b 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" նա ողջը այստեղ @@ -103,5 +102,4 @@ STOP_WORDS = set( այս մեջ թ -""".split() -) +""".split()) diff --git a/spacy/lang/id/_tokenizer_exceptions_list.py b/spacy/lang/id/_tokenizer_exceptions_list.py index a0b35fa1a..11220a61e 100644 --- a/spacy/lang/id/_tokenizer_exceptions_list.py +++ b/spacy/lang/id/_tokenizer_exceptions_list.py @@ -1,5 +1,4 @@ -ID_BASE_EXCEPTIONS = set( - """ +ID_BASE_EXCEPTIONS = set(""" aba-aba abah-abah abal-abal @@ -3898,5 +3897,4 @@ yel-yel yo-yo zam-zam zig-zag -""".split() -) +""".split()) diff --git a/spacy/lang/id/stop_words.py b/spacy/lang/id/stop_words.py index b1bfaea79..fc85f8367 100644 --- a/spacy/lang/id/stop_words.py +++ b/spacy/lang/id/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal @@ -114,5 +113,4 @@ ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai waduh wah wahai waktu waktunya walau walaupun wong yaitu yakin yakni yang -""".split() -) +""".split()) diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py index 917fb6df4..79f84ee60 100644 --- a/spacy/lang/is/stop_words.py +++ b/spacy/lang/is/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/Xangis/extra-stopwords -STOP_WORDS = set( - """ +STOP_WORDS = set(""" afhverju aftan aftur @@ -154,5 +153,4 @@ ykkar því þær ætti -""".split() -) +""".split()) diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 42adc7904..2a37236a9 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai @@ -79,5 +78,4 @@ uguali ulteriore ultimo un un' una uno uomo v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte vostra vostre vostri vostro -""".split() -) +""".split()) diff --git a/spacy/lang/ja/stop_words.py b/spacy/lang/ja/stop_words.py index 98560d7e2..661b51835 100644 --- a/spacy/lang/ja/stop_words.py +++ b/spacy/lang/ja/stop_words.py @@ -2,8 +2,7 @@ # filtering out everything that wasn't hiragana. ー (one) was also added. # Considered keeping some non-hiragana words but too many place names were # present. -STOP_WORDS = set( - """ +STOP_WORDS = set(""" あ あっ あまり あり ある あるいは あれ い いい いう いく いずれ いっ いつ いる いわ うち @@ -44,5 +43,4 @@ STOP_WORDS = set( を ん 一 -""".split() -) +""".split()) diff --git a/spacy/lang/kmr/stop_words.py b/spacy/lang/kmr/stop_words.py index aee33c2b7..93e6ea27f 100644 --- a/spacy/lang/kmr/stop_words.py +++ b/spacy/lang/kmr/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" û li bi @@ -40,5 +39,4 @@ gelek hemû kes tişt -""".split() -) +""".split()) diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py index dba9740af..528e5e3a8 100644 --- a/spacy/lang/kn/stop_words.py +++ b/spacy/lang/kn/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ಹಲವು ಮೂಲಕ ಹಾಗೂ @@ -82,5 +81,4 @@ STOP_WORDS = set( ಎಂದು ನನ್ನ ಮೇಲೆ -""".split() -) +""".split()) diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py index 3eba9fc82..d4cdbc7a1 100644 --- a/spacy/lang/ko/stop_words.py +++ b/spacy/lang/ko/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" 이 있 하 @@ -63,5 +62,4 @@ STOP_WORDS = set( 원 잘 놓 -""".split() -) +""".split()) diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py index ea40bdfa2..fb8e2c84b 100644 --- a/spacy/lang/ky/stop_words.py +++ b/spacy/lang/ky/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ага адам айтты айтымында айтып ал алар алардын алган алуу алып анда андан аны анын ар @@ -38,5 +37,4 @@ STOP_WORDS = set( үч үчүн өз -""".split() -) +""".split()) diff --git a/spacy/lang/la/stop_words.py b/spacy/lang/la/stop_words.py index 8b590bb67..47abf7384 100644 --- a/spacy/lang/la/stop_words.py +++ b/spacy/lang/la/stop_words.py @@ -1,7 +1,6 @@ # Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem cum cur @@ -33,5 +32,4 @@ tam tamen trans tu tum ubi uel uero vel vero -""".split() -) +""".split()) diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py index 119231374..bbef72b9b 100644 --- a/spacy/lang/lb/lex_attrs.py +++ b/spacy/lang/lb/lex_attrs.py @@ -1,22 +1,18 @@ from ...attrs import LIKE_NUM -_num_words = set( - """ +_num_words = set(""" null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg honnert dausend millioun milliard billioun billiard trillioun triliard -""".split() -) +""".split()) -_ordinal_words = set( - """ +_ordinal_words = set(""" éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten honnertsten dausendsten milliounsten milliardsten billiounsten billiardsten trilliounsten trilliardsten -""".split() -) +""".split()) def like_num(text): diff --git a/spacy/lang/lb/stop_words.py b/spacy/lang/lb/stop_words.py index 8f22ea6e6..386ce1222 100644 --- a/spacy/lang/lb/stop_words.py +++ b/spacy/lang/lb/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a à äis @@ -207,5 +206,4 @@ ze zu zum zwar -""".split() -) +""".split()) diff --git a/spacy/lang/lg/stop_words.py b/spacy/lang/lg/stop_words.py index 7bad59344..a9f99cbf4 100644 --- a/spacy/lang/lg/stop_words.py +++ b/spacy/lang/lg/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu atya awamu aweebwa ayinza ba baali babadde babalina bajja bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye @@ -15,5 +14,4 @@ oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina t tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe ye yenna yennyini yina yonna ziba zijja zonna -""".split() -) +""".split()) diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py index 1d6f09d27..37eb163ff 100644 --- a/spacy/lang/lij/stop_words.py +++ b/spacy/lang/lij/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei bella belle belli bello ben @@ -35,5 +34,4 @@ tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto un uña unn' unna za zu -""".split() -) +""".split()) diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py index 2685c2430..4ed61996a 100644 --- a/spacy/lang/lv/stop_words.py +++ b/spacy/lang/lv/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-lv -STOP_WORDS = set( - """ +STOP_WORDS = set(""" aiz ap apakš @@ -163,5 +162,4 @@ viņpus zem ārpus šaipus -""".split() -) +""".split()) diff --git a/spacy/lang/mk/stop_words.py b/spacy/lang/mk/stop_words.py index 312a456c5..90a271798 100644 --- a/spacy/lang/mk/stop_words.py +++ b/spacy/lang/mk/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" а абре aв @@ -811,5 +810,4 @@ aв џагара-магара џанам џив-џив - """.split() -) + """.split()) diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py index 441e93586..64b9acc10 100644 --- a/spacy/lang/ml/stop_words.py +++ b/spacy/lang/ml/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" അത് ഇത് ആയിരുന്നു @@ -9,5 +8,4 @@ STOP_WORDS = set( അന്ന് ഇന്ന് ആണ് -""".split() -) +""".split()) diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py index 9b0cee951..3c9c62089 100644 --- a/spacy/lang/mr/stop_words.py +++ b/spacy/lang/mr/stop_words.py @@ -1,6 +1,5 @@ # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json -STOP_WORDS = set( - """ +STOP_WORDS = set(""" न अतरी तो @@ -188,5 +187,4 @@ STOP_WORDS = set( होता होती होते -""".split() -) +""".split()) diff --git a/spacy/lang/ms/_tokenizer_exceptions_list.py b/spacy/lang/ms/_tokenizer_exceptions_list.py index fba1dd70f..e579e316a 100644 --- a/spacy/lang/ms/_tokenizer_exceptions_list.py +++ b/spacy/lang/ms/_tokenizer_exceptions_list.py @@ -1,7 +1,6 @@ # from https://prpm.dbp.gov.my/cari1?keyword= # dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka -MS_BASE_EXCEPTIONS = set( - """ +MS_BASE_EXCEPTIONS = set(""" aba-aba abah-abah abar-abar @@ -1939,5 +1938,4 @@ warna-warni water-cooled world-class yang-yang -""".split() -) +""".split()) diff --git a/spacy/lang/ms/stop_words.py b/spacy/lang/ms/stop_words.py index b1bfaea79..fc85f8367 100644 --- a/spacy/lang/ms/stop_words.py +++ b/spacy/lang/ms/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal @@ -114,5 +113,4 @@ ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai waduh wah wahai waktu waktunya walau walaupun wong yaitu yakin yakni yang -""".split() -) +""".split()) diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index d9ed414ef..bc1c54a4a 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både @@ -46,5 +45,4 @@ vant var ved veldig vi videre viktig vil ville viser vår være vært å år ønsker -""".split() -) +""".split()) diff --git a/spacy/lang/ne/stop_words.py b/spacy/lang/ne/stop_words.py index 8470297b9..95d7a3758 100644 --- a/spacy/lang/ne/stop_words.py +++ b/spacy/lang/ne/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt -STOP_WORDS = set( - """ +STOP_WORDS = set(""" अक्सर अगाडि अगाडी @@ -490,5 +489,4 @@ STOP_WORDS = set( होइन होकि होला -""".split() -) +""".split()) diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py index 488224c2f..1b8602831 100644 --- a/spacy/lang/nl/lex_attrs.py +++ b/spacy/lang/nl/lex_attrs.py @@ -1,21 +1,17 @@ from ...attrs import LIKE_NUM -_num_words = set( - """ +_num_words = set(""" nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd duizend miljoen miljard biljoen biljard triljoen triljard -""".split() -) +""".split()) -_ordinal_words = set( - """ +_ordinal_words = set(""" eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste miljardste biljoenste biljardste triljoenste triljardste -""".split() -) +""".split()) def like_num(text): diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py index cd4fdefdf..a88c29051 100644 --- a/spacy/lang/nl/stop_words.py +++ b/spacy/lang/nl/stop_words.py @@ -13,8 +13,7 @@ # should have a Dutch counterpart here. -STOP_WORDS = set( - """ +STOP_WORDS = set(""" aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna afgelopen aldus alhoewel anderzijds @@ -68,5 +67,4 @@ welk welke welken werd werden wiens wier wilde wordt zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zo’n zoals zodra zouden zoveel zowat zulk zulke zulks zullen zult -""".split() -) +""".split()) diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py index 075aec391..4418deedc 100644 --- a/spacy/lang/pl/stop_words.py +++ b/spacy/lang/pl/stop_words.py @@ -1,7 +1,6 @@ # sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a aby ach acz aczkolwiek aj albo ale alez ależ ani az aż @@ -74,5 +73,4 @@ xi xii xiii xiv xv z za zaden zadna zadne zadnych zapewne zawsze zaś ze zeby znow znowu znów zostal został -żaden żadna żadne żadnych że żeby""".split() -) +żaden żadna żadne żadnych że żeby""".split()) diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py index ce3c86ff5..722aef802 100644 --- a/spacy/lang/pt/stop_words.py +++ b/spacy/lang/pt/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes ao aos apenas apoia apoio apontar após aquela aquelas aquele aqueles aqui aquilo as assim através atrás até aí @@ -62,5 +61,4 @@ vai vais valor veja vem vens ver vez vezes vinda vindo vinte você vocês vos vo vossas vosso vossos vários vão vêm vós zero -""".split() -) +""".split()) diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py index 736aa911a..a5880fc2f 100644 --- a/spacy/lang/ro/lex_attrs.py +++ b/spacy/lang/ro/lex_attrs.py @@ -1,16 +1,13 @@ from ...attrs import LIKE_NUM -_num_words = set( - """ +_num_words = set(""" zero unu doi două trei patru cinci șase șapte opt nouă zece unsprezece doisprezece douăsprezece treisprezece patrusprezece cincisprezece șaisprezece șaptesprezece optsprezece nouăsprezece douăzeci treizeci patruzeci cincizeci șaizeci șaptezeci optzeci nouăzeci sută mie milion miliard bilion trilion cvadrilion catralion cvintilion sextilion septilion enșpemii -""".split() -) +""".split()) -_ordinal_words = set( - """ +_ordinal_words = set(""" primul doilea treilea patrulea cincilea șaselea șaptelea optulea nouălea zecelea prima doua treia patra cincia șasea șaptea opta noua zecea unsprezecelea doisprezecelea treisprezecelea patrusprezecelea cincisprezecelea șaisprezecelea șaptesprezecelea optsprezecelea nouăsprezecelea @@ -18,8 +15,7 @@ unsprezecea douăsprezecea treisprezecea patrusprezecea cincisprezecea șaisprez douăzecilea treizecilea patruzecilea cincizecilea șaizecilea șaptezecilea optzecilea nouăzecilea sutălea douăzecea treizecea patruzecea cincizecea șaizecea șaptezecea optzecea nouăzecea suta miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia -""".split() -) +""".split()) def like_num(text): diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py index d68a81c45..c7c0801f1 100644 --- a/spacy/lang/ro/stop_words.py +++ b/spacy/lang/ro/stop_words.py @@ -1,6 +1,5 @@ # Source: https://github.com/stopwords-iso/stopwords-ro -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a abia acea @@ -495,5 +494,4 @@ zice știu ți ție -""".split() -) +""".split()) diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index e0b35bdc0..63b1cead8 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,8 +1,6 @@ from ...attrs import LIKE_NUM -_num_words = list( - set( - """ +_num_words = list(set(""" ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми четверть четверти четвертью четвертей четвертям четвертями четвертях @@ -203,9 +201,7 @@ _num_words = list( квинтиллиону квинтиллионов квинтлн i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix -""".split() - ) -) +""".split())) def like_num(text): diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index d6ea6b42a..3040adb52 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" а авось ага агу аж ай али алло ау ах ая б будем будет будете будешь буду будут будучи будь будьте бы был была были было @@ -107,5 +106,4 @@ STOP_WORDS = set( ю я явно явных яко якобы якоже -""".split() -) +""".split()) diff --git a/spacy/lang/sa/stop_words.py b/spacy/lang/sa/stop_words.py index 30302a14d..eaf0ffaa2 100644 --- a/spacy/lang/sa/stop_words.py +++ b/spacy/lang/sa/stop_words.py @@ -1,7 +1,6 @@ # Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323 -STOP_WORDS = set( - """ +STOP_WORDS = set(""" अहम् आवाम् वयम् @@ -511,5 +510,4 @@ STOP_WORDS = set( ह हन्त हि -""".split() -) +""".split()) diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index 7d29bc1b4..acae5763b 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" සහ සමග සමඟ @@ -191,5 +190,4 @@ STOP_WORDS = set( ලෙස පරිදි එහෙත් -""".split() -) +""".split()) diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py index 017e7beef..6ef4818c3 100644 --- a/spacy/lang/sk/stop_words.py +++ b/spacy/lang/sk/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/Ardevop-sk/stopwords-sk -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a aby aj @@ -420,5 +419,4 @@ zo ňou ňu že -""".split() -) +""".split()) diff --git a/spacy/lang/sl/lex_attrs.py b/spacy/lang/sl/lex_attrs.py index 3c1493050..6d6b40b45 100644 --- a/spacy/lang/sl/lex_attrs.py +++ b/spacy/lang/sl/lex_attrs.py @@ -2,8 +2,7 @@ import unicodedata from ...attrs import IS_CURRENCY, LIKE_NUM -_num_words = set( - """ +_num_words = set(""" nula ničla nič ena dva tri štiri pet šest sedem osem devet deset enajst dvanajst trinajst štirinajst petnajst šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset @@ -18,11 +17,9 @@ _num_words = set( šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi - """.split() -) + """.split()) -_ordinal_words = set( - """ +_ordinal_words = set(""" prvi drugi tretji četrti peti šesti sedmi osmi deveti deseti enajsti dvanajsti trinajsti štirinajsti petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti @@ -92,11 +89,9 @@ _ordinal_words = set( osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi trilijontimi kvadrilijontimi neštetimi - """.split() -) + """.split()) -_currency_words = set( - """ +_currency_words = set(""" evro evra evru evrom evrov evroma evrih evrom evre evri evr eur cent centa centu cenom centov centoma centih centom cente centi dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd @@ -109,8 +104,7 @@ _currency_words = set( jen jena jeni jenu jenom jenov jenoma jenih jene kuna kuni kune kuno kun kunama kunah kunam kunami marka marki marke markama markah markami - """.split() -) + """.split()) def like_num(text): diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 8491efcb5..a81c00db2 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a ali b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo @@ -80,5 +79,4 @@ x z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj ž že -""".split() -) +""".split()) diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py index f2b1a4f4a..bf1c7a703 100644 --- a/spacy/lang/sq/stop_words.py +++ b/spacy/lang/sq/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/andrixh/index-albanian -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a afert ai @@ -225,5 +224,4 @@ vetes vjen yne zakonisht -""".split() -) +""".split()) diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py index 5df5509d2..758964a58 100644 --- a/spacy/lang/sr/stop_words.py +++ b/spacy/lang/sr/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" а авај ако @@ -389,5 +388,4 @@ STOP_WORDS = set( ћете ћеш ћу -""".split() -) +""".split()) diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py index 2422b2a9e..08251bcff 100644 --- a/spacy/lang/sv/stop_words.py +++ b/spacy/lang/sv/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras annan annat ännu artonde arton åtminstone att åtta åttio åttionde åttonde av även @@ -62,5 +61,4 @@ under upp ur ursäkt ut utan utanför ute vad vänster vänstra var vår vara våra varför varifrån varit varken värre varsågod vart vårt vem vems verkligen vi vid vidare viktig viktigare viktigast viktigt vilka vilken vilket vill -""".split() -) +""".split()) diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py index abbff949d..d6ef21f3b 100644 --- a/spacy/lang/ta/stop_words.py +++ b/spacy/lang/ta/stop_words.py @@ -1,7 +1,6 @@ # Stop words -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ஒரு என்று மற்றும் @@ -127,5 +126,4 @@ STOP_WORDS = set( வரையில் சற்று எனக் -""".split() -) +""".split()) diff --git a/spacy/lang/te/stop_words.py b/spacy/lang/te/stop_words.py index b18dab697..d28342608 100644 --- a/spacy/lang/te/stop_words.py +++ b/spacy/lang/te/stop_words.py @@ -1,7 +1,6 @@ # Source: https://github.com/Xangis/extra-stopwords (MIT License) -STOP_WORDS = set( - """ +STOP_WORDS = set(""" అందరూ అందుబాటులో అడగండి @@ -52,5 +51,4 @@ STOP_WORDS = set( వేరుగా వ్యతిరేకంగా సంబంధం -""".split() -) +""".split()) diff --git a/spacy/lang/th/stop_words.py b/spacy/lang/th/stop_words.py index 2823281ce..3dd6e5652 100644 --- a/spacy/lang/th/stop_words.py +++ b/spacy/lang/th/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ทั้งนี้ ดัง ขอ รวม หลังจาก เป็น หลัง หรือ ๆ เกี่ยวกับ ซึ่งได้แก่ ด้วยเพราะ ด้วยว่า ด้วยเหตุเพราะ ด้วยเหตุว่า สุดๆ เสร็จแล้ว เช่น เข้า ถ้า ถูก ถึง ต่างๆ ใคร เปิดเผย ครา รือ ตาม ใน ได้แก่ ได้แต่ ได้ที่ ตลอดถึง นอกจากว่า นอกนั้น จริง อย่างดี ส่วน เพียงเพื่อ เดียว จัด ทั้งที ทั้งคน ทั้งตัว ไกลๆ @@ -71,5 +70,4 @@ STOP_WORDS = set( แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างมาก อย่างยิ่ง อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย อย่างละ อย่างหนึ่ง อย่างๆ อัน อันจะ อันได้แก่ อันที่ อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันๆ อาจจะ อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ 555 กำ ขอโทษ เยี่ยม นี่คือ -""".split() -) +""".split()) diff --git a/spacy/lang/ti/stop_words.py b/spacy/lang/ti/stop_words.py index 9bd712200..e0aaf47d3 100644 --- a/spacy/lang/ti/stop_words.py +++ b/spacy/lang/ti/stop_words.py @@ -1,8 +1,7 @@ # Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt # Stop words -STOP_WORDS = set( - """ +STOP_WORDS = set(""" 'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም @@ -23,5 +22,4 @@ STOP_WORDS = set( ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ -""".split() -) +""".split()) diff --git a/spacy/lang/tl/stop_words.py b/spacy/lang/tl/stop_words.py index 2560cdaed..a7bf54199 100644 --- a/spacy/lang/tl/stop_words.py +++ b/spacy/lang/tl/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """ +STOP_WORDS = set(""" akin aking ako @@ -147,5 +146,4 @@ tulad tungkol una walang -""".split() -) +""".split()) diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py index f614771dd..a63a455f7 100644 --- a/spacy/lang/tn/stop_words.py +++ b/spacy/lang/tn/stop_words.py @@ -1,6 +1,5 @@ # Stop words -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ke gareng ga selekanyo tlhwatlhwa yo mongwe se sengwe fa go le jalo gongwe ba na mo tikologong jaaka kwa morago nna gonne ka sa pele nako teng @@ -16,5 +15,4 @@ tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi bonala e tshwanang bogolo tsenya tsweetswee karolo sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa tlhano lesometlhano botlalo lekgolo -""".split() -) +""".split()) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index dbf9aab49..e80423e51 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -111,8 +111,7 @@ for orth in [ BASE_EXCEPTIONS[orth] = [{ORTH: orth}] -emoticons = set( - r""" +emoticons = set(r""" :) :-) :)) @@ -243,8 +242,7 @@ o.0 ¯\(ツ)/¯ (╯°□°)╯︵┻━┻ ><(((*> -""".split() -) +""".split()) for orth in emoticons: diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py index 85dcff6a5..5323cf32d 100644 --- a/spacy/lang/tr/stop_words.py +++ b/spacy/lang/tr/stop_words.py @@ -1,6 +1,5 @@ # Source: https://github.com/stopwords-iso/stopwords-tr -STOP_WORDS = set( - """ +STOP_WORDS = set(""" acaba acep adamakıllı @@ -553,5 +552,4 @@ zarfında zaten zati zira -""".split() -) +""".split()) diff --git a/spacy/lang/tt/stop_words.py b/spacy/lang/tt/stop_words.py index 44169b757..8f146d915 100644 --- a/spacy/lang/tt/stop_words.py +++ b/spacy/lang/tt/stop_words.py @@ -1,7 +1,6 @@ # Tatar stopwords are from https://github.com/aliiae/stopwords-tt -STOP_WORDS = set( - """алай алайса алар аларга аларда алардан аларны аларның аларча +STOP_WORDS = set("""алай алайса алар аларга аларда алардан аларны аларның аларча алары аларын аларынга аларында аларыннан аларының алтмыш алтмышынчы алтмышынчыга алтмышынчыда алтмышынчыдан алтмышынчылар алтмышынчыларга алтмышынчыларда алтмышынчылардан алтмышынчыларны алтмышынчыларның алтмышынчыны алтмышынчының @@ -169,5 +168,4 @@ STOP_WORDS = set( өстәп өч өчен өченче өченчегә өченчедә өченчедән өченчеләр өченчеләргә өченчеләрдә өченчеләрдән өченчеләрне өченчеләрнең өченчене өченченең өчләп -өчәрләп""".split() -) +өчәрләп""".split()) diff --git a/spacy/lang/uk/stop_words.py b/spacy/lang/uk/stop_words.py index b11d7a044..517c30007 100644 --- a/spacy/lang/uk/stop_words.py +++ b/spacy/lang/uk/stop_words.py @@ -1,5 +1,4 @@ -STOP_WORDS = set( - """а +STOP_WORDS = set("""а або адже аж @@ -465,5 +464,4 @@ STOP_WORDS = set( якій якого якої -якщо""".split() -) +якщо""".split()) diff --git a/spacy/lang/ur/lex_attrs.py b/spacy/lang/ur/lex_attrs.py index e590ed3e3..916a47bfd 100644 --- a/spacy/lang/ur/lex_attrs.py +++ b/spacy/lang/ur/lex_attrs.py @@ -5,7 +5,8 @@ from ...attrs import LIKE_NUM # https://en.wikibooks.org/wiki/Urdu/Vocabulary/Numbers # https://www.urdu-english.com/lessons/beginner/numbers -_num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ +_num_words = ( + """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ اٹهارا انیس بیس اکیس بائیس تئیس چوبیس پچیس چھببیس ستایس اٹھائس انتيس تیس اکتیس بتیس تینتیس چونتیس پینتیس چھتیس سینتیس ارتیس انتالیس چالیس اکتالیس بیالیس تیتالیس @@ -17,6 +18,7 @@ _num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس سٹیاسی اٹھیاسی نواسی نوے اکانوے بانوے ترانوے چورانوے پچانوے چھیانوے ستانوے اٹھانوے ننانوے سو """.split() +) # source https://www.google.com/intl/ur/inputtools/try/ diff --git a/spacy/lang/ur/stop_words.py b/spacy/lang/ur/stop_words.py index abfa36497..00f0dd2d6 100644 --- a/spacy/lang/ur/stop_words.py +++ b/spacy/lang/ur/stop_words.py @@ -1,6 +1,5 @@ # Source: collected from different resource on internet -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ثھی خو گی @@ -509,5 +508,4 @@ STOP_WORDS = set( ہورہی ثبعث ضت -""".split() -) +""".split()) diff --git a/spacy/lang/vi/stop_words.py b/spacy/lang/vi/stop_words.py index 1d2ecdf8d..9163e1093 100644 --- a/spacy/lang/vi/stop_words.py +++ b/spacy/lang/vi/stop_words.py @@ -1,6 +1,5 @@ # Source: https://github.com/stopwords/vietnamese-stopwords -STOP_WORDS = set( - """ +STOP_WORDS = set(""" a_lô a_ha ai @@ -1943,7 +1942,4 @@ yêu_cầu ừ_ào ừ_ừ ử -""".split( - "\n" - ) -) +""".split("\n")) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 6ad044c60..748e9ac4e 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -91,13 +91,13 @@ class ChineseTokenizer(DummyTokenizer): def __call__(self, text: str) -> Doc: if self.segmenter == Segmenter.jieba: words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) # type: ignore[union-attr] - (words, spaces) = util.get_words_and_spaces(words, text) + words, spaces = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) elif self.segmenter == Segmenter.pkuseg: if self.pkuseg_seg is None: raise ValueError(Errors.E1000) words = self.pkuseg_seg.cut(text) - (words, spaces) = util.get_words_and_spaces(words, text) + words, spaces = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) # warn if segmenter setting is not the only remaining option "char" @@ -112,7 +112,7 @@ class ChineseTokenizer(DummyTokenizer): # split into individual characters words = list(text) - (words, spaces) = util.get_words_and_spaces(words, text) + words, spaces = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) def pkuseg_update_user_dict(self, words: List[str], reset: bool = False): @@ -210,7 +210,7 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir)) if pkuseg_data["processors_data"]: processors_data = pkuseg_data["processors_data"] - (user_dict, do_process, common_words, other_words) = processors_data + user_dict, do_process, common_words, other_words = processors_data self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) @@ -268,7 +268,7 @@ class ChineseTokenizer(DummyTokenizer): raise ImportError(self._pkuseg_install_msg) from None if self.segmenter == Segmenter.pkuseg: data = srsly.read_msgpack(path) - (user_dict, do_process, common_words, other_words) = data + user_dict, do_process, common_words, other_words = data self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) diff --git a/spacy/lang/zh/stop_words.py b/spacy/lang/zh/stop_words.py index 42ae4a1de..d54fe6895 100644 --- a/spacy/lang/zh/stop_words.py +++ b/spacy/lang/zh/stop_words.py @@ -1,7 +1,6 @@ # stop words as whitespace-separated list # Chinese stop words,maybe not enough -STOP_WORDS = set( - """ +STOP_WORDS = set(""" ! " # @@ -1895,5 +1894,4 @@ sup ~± ~+ ¥ -""".split() -) +""".split()) diff --git a/spacy/lookups.py b/spacy/lookups.py index 1a2c44bfa..fd404edee 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -85,7 +85,7 @@ class Table(OrderedDict): value: The value to set. """ key = get_string_id(key) - OrderedDict.__setitem__(self, key, value) # type:ignore[assignment] + OrderedDict.__setitem__(self, key, value) # type: ignore[assignment] self.bloom.add(key) def set(self, key: Union[str, int], value: Any) -> None: @@ -104,7 +104,7 @@ class Table(OrderedDict): RETURNS: The value. """ key = get_string_id(key) - return OrderedDict.__getitem__(self, key) # type:ignore[index] + return OrderedDict.__getitem__(self, key) # type: ignore[index] def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any: """Get the value for a given key. String keys will be hashed. @@ -114,7 +114,7 @@ class Table(OrderedDict): RETURNS: The value. """ key = get_string_id(key) - return OrderedDict.get(self, key, default) # type:ignore[arg-type] + return OrderedDict.get(self, key, default) # type: ignore[arg-type] def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override] """Check whether a key is in the table. String keys will be hashed. diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 4bc1de3e0..1fb5c73af 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -26,7 +26,7 @@ def test_create_from_words_and_text(vocab): # no whitespace in words words = ["'", "dogs", "'", "run"] text = " 'dogs'\n\nrun " - (words, spaces) = util.get_words_and_spaces(words, text) + words, spaces = util.get_words_and_spaces(words, text) doc = Doc(vocab, words=words, spaces=spaces) assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] @@ -38,7 +38,7 @@ def test_create_from_words_and_text(vocab): # partial whitespace in words words = [" ", "'", "dogs", "'", "\n\n", "run", " "] text = " 'dogs'\n\nrun " - (words, spaces) = util.get_words_and_spaces(words, text) + words, spaces = util.get_words_and_spaces(words, text) doc = Doc(vocab, words=words, spaces=spaces) assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] @@ -50,7 +50,7 @@ def test_create_from_words_and_text(vocab): # non-standard whitespace tokens words = [" ", " ", "'", "dogs", "'", "\n\n", "run"] text = " 'dogs'\n\nrun " - (words, spaces) = util.get_words_and_spaces(words, text) + words, spaces = util.get_words_and_spaces(words, text) doc = Doc(vocab, words=words, spaces=spaces) assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] @@ -63,7 +63,7 @@ def test_create_from_words_and_text(vocab): with pytest.raises(ValueError): words = [" ", " ", "'", "dogs", "'", "\n\n", "run"] text = " 'dogs'\n\nrun " - (words, spaces) = util.get_words_and_spaces(words + ["away"], text) + words, spaces = util.get_words_and_spaces(words + ["away"], text) def test_create_with_heads_and_no_deps(vocab): diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7335bbdf1..f09f51a85 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -437,7 +437,7 @@ def test_phrase_matcher_pickle(en_vocab): assert matches == matches_unpickled # clunky way to vaguely check that callback is unpickled - (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] + vocab, docs, callbacks, attr = matcher_unpickled.__reduce__()[1] assert isinstance(callbacks.get("TEST2"), Mock)