mirror of
https://github.com/explosion/spaCy.git
synced 2026-03-04 03:41:29 +03:00
Format (#13929)
This commit is contained in:
parent
c1e7cb2ebf
commit
453732d32d
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-af
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
'n
|
||||
aan
|
||||
af
|
||||
|
|
@ -53,5 +52,4 @@ vir
|
|||
was
|
||||
wat
|
||||
ʼn
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
# Stop words by Teshome Kassie http://etd.aau.edu.et/bitstream/handle/123456789/3315/Teshome%20Kassie.pdf?sequence=1&isAllowed=y
|
||||
# Stop words by Tihitina Petros http://etd.aau.edu.et/bitstream/handle/123456789/3384/Tihitina%20Petros.pdf?sequence=1&isAllowed=y
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን
|
||||
ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ
|
||||
አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ
|
||||
|
|
@ -29,5 +28,4 @@ STOP_WORDS = set(
|
|||
በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም
|
||||
ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ
|
||||
ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
_num_words = set("""
|
||||
صفر
|
||||
واحد
|
||||
إثنان
|
||||
|
|
@ -51,11 +50,9 @@ _num_words = set(
|
|||
مليون
|
||||
مليار
|
||||
مليارات
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
_ordinal_words = set("""
|
||||
اول
|
||||
أول
|
||||
حاد
|
||||
|
|
@ -70,8 +67,7 @@ _ordinal_words = set(
|
|||
ثامن
|
||||
تاسع
|
||||
عاشر
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
من
|
||||
نحو
|
||||
لعل
|
||||
|
|
@ -386,5 +385,4 @@ STOP_WORDS = set(
|
|||
وإن
|
||||
ولو
|
||||
يا
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
amma
|
||||
arasında
|
||||
artıq
|
||||
|
|
@ -141,5 +140,4 @@ zaman
|
|||
əlbəttə
|
||||
ən
|
||||
əslində
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -4,8 +4,7 @@ References:
|
|||
https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
|
||||
"""
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
а автентичен аз ако ала
|
||||
|
||||
бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
|
||||
|
|
@ -76,5 +75,4 @@ STOP_WORDS = set(
|
|||
юмрук
|
||||
|
||||
я як
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
|
||||
আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও
|
||||
ইত্যাদি ইহা
|
||||
|
|
@ -38,5 +37,4 @@ STOP_WORDS = set(
|
|||
সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং
|
||||
হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার
|
||||
হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায়
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://zenodo.org/records/10148636
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
འི་
|
||||
།
|
||||
དུ་
|
||||
|
|
@ -194,5 +193,4 @@ STOP_WORDS = set(
|
|||
གིང་
|
||||
ཚ་
|
||||
ཀྱང
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò
|
||||
als altra altre altres amb ambdues ambdós anar ans apa aquell aquella aquelles aquells
|
||||
aquest aquesta aquestes aquests aquí
|
||||
|
|
@ -48,5 +47,4 @@ un una unes uns us últim ús
|
|||
|
||||
va vaig vam van vas veu vosaltres vostra vostre vostres
|
||||
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
# Source: https://github.com/Alir3z4/stop-words
|
||||
# Source: https://github.com/stopwords-iso/stopwords-cs/blob/master/stopwords-cs.txt
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a
|
||||
aby
|
||||
ahoj
|
||||
|
|
@ -361,5 +360,4 @@ zač
|
|||
zatímco
|
||||
ze
|
||||
že
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: Handpicked by Jens Dahl Møllerhøj.
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
af aldrig alene alle allerede alligevel alt altid anden andet andre at
|
||||
|
||||
bag begge blandt blev blive bliver burde bør
|
||||
|
|
@ -41,5 +40,4 @@ ud uden udover under undtagen
|
|||
var ved vi via vil ville vore vores vær være været
|
||||
|
||||
øvrigt
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
|
||||
aller allerdings alles allgemeinen als also am an andere anderen anderem andern
|
||||
anders auch auf aus ausser außer ausserdem außerdem
|
||||
|
|
@ -74,5 +73,4 @@ wollt wollte wollten worden wurde würde wurden würden
|
|||
|
||||
zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur
|
||||
zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a abo aby ako ale až
|
||||
|
||||
daniž dokulaž
|
||||
|
|
@ -11,5 +10,4 @@ jolic
|
|||
pak pótom
|
||||
|
||||
teke togodla
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Stop words
|
||||
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
αδιάκοπα αι ακόμα ακόμη ακριβώς άλλα αλλά αλλαχού άλλες άλλη άλλην
|
||||
άλλης αλλιώς αλλιώτικα άλλο άλλοι αλλοιώς αλλοιώτικα άλλον άλλος άλλοτε αλλού
|
||||
άλλους άλλων άμα άμεσα αμέσως αν ανά ανάμεσα αναμεταξύ άνευ αντί αντίπερα αντίς
|
||||
|
|
@ -83,5 +82,4 @@ STOP_WORDS = set(
|
|||
χωρίς χωριστά
|
||||
|
||||
ω ως ωσάν ωσότου ώσπου ώστε ωστόσο ωχ
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Stop words
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a about above across after afterwards again against all almost alone along
|
||||
already also although always am among amongst amount an and another any anyhow
|
||||
anyone anything anyway anywhere are around as at
|
||||
|
|
@ -62,8 +61,7 @@ whereafter whereas whereby wherein whereupon wherever whether which while
|
|||
whither who whoever whole whom whose why will with within without would
|
||||
|
||||
yet you your yours yourself yourselves
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
|
||||
STOP_WORDS.update(contractions)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna
|
||||
algunas alguno algunos algún alli allí alrededor ambos ante anterior antes
|
||||
apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél
|
||||
|
|
@ -76,5 +75,4 @@ va vais vamos van varias varios vaya veces ver verdad verdadera verdadero vez
|
|||
vosotras vosotros voy vuestra vuestras vuestro vuestros
|
||||
|
||||
y ya yo
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-et
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
aga
|
||||
ei
|
||||
et
|
||||
|
|
@ -37,5 +36,4 @@ siis
|
|||
ta
|
||||
te
|
||||
ära
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-eu
|
||||
# https://www.ranks.nl/stopwords/basque
|
||||
# https://www.mustgo.com/worldlanguages/basque/
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
al
|
||||
anitz
|
||||
arabera
|
||||
|
|
@ -101,5 +100,4 @@ zu
|
|||
zuek
|
||||
zuen
|
||||
zuten
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -5,8 +5,7 @@ ZWNJ_O_MIM = "ام"
|
|||
YE_NUN = "ین"
|
||||
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
_num_words = set("""
|
||||
صفر
|
||||
یک
|
||||
دو
|
||||
|
|
@ -63,15 +62,12 @@ _num_words = set(
|
|||
کوادریلیون
|
||||
کادریلیارد
|
||||
کوینتیلیون
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
_ordinal_words = set("""
|
||||
اول
|
||||
سوم
|
||||
سیام""".split()
|
||||
)
|
||||
سیام""".split())
|
||||
|
||||
_ordinal_words.update({num + MIM for num in _num_words})
|
||||
_ordinal_words.update({num + ZWNJ_O_MIM for num in _num_words})
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Stop words from HAZM package
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
و
|
||||
در
|
||||
به
|
||||
|
|
@ -389,5 +388,4 @@ STOP_WORDS = set(
|
|||
لذا
|
||||
زاده
|
||||
گردد
|
||||
اینجا""".split()
|
||||
)
|
||||
اینجا""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
|
||||
# Reformatted with some minor corrections
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
aiemmin aika aikaa aikaan aikaisemmin aikaisin aikana aikoina aikoo aikovat
|
||||
aina ainakaan ainakin ainoa ainoat aiomme aion aiotte aivan ajan alas alemmas
|
||||
alkuisin alkuun alla alle aloitamme aloitan aloitat aloitatte aloitattivat
|
||||
|
|
@ -106,5 +105,4 @@ yhtäällä yhtäältä yhtään yhä yksi yksin yksittäin yleensä ylemmäs yl
|
|||
ympäri
|
||||
|
||||
älköön älä
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,24 +1,20 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
_num_words = set("""
|
||||
zero un une deux trois quatre cinq six sept huit neuf dix
|
||||
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
|
||||
vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante
|
||||
cent mille mil million milliard billion quadrillion quintillion
|
||||
sextillion septillion octillion nonillion decillion
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
_ordinal_words = set("""
|
||||
premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième
|
||||
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
|
||||
vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième
|
||||
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
|
||||
sextillionnième septillionnième octillionnième nonillionnième decillionnième
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a à â abord afin ah ai aie ainsi ait allaient allons
|
||||
alors anterieur anterieure anterieures antérieur antérieure antérieures
|
||||
apres après as assez attendu au
|
||||
|
|
@ -80,5 +79,4 @@ votre votres vous vous-mêmes vu vé vôtre vôtres
|
|||
|
||||
y
|
||||
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a ach ag agus an aon ar arna as
|
||||
|
||||
ba beirt bhúr
|
||||
|
|
@ -39,5 +38,4 @@ um
|
|||
í
|
||||
|
||||
ó ón óna ónár
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
'ad
|
||||
'ar
|
||||
'd # iad
|
||||
|
|
@ -382,7 +381,4 @@ urrainn
|
|||
ì
|
||||
ò
|
||||
ó
|
||||
""".split(
|
||||
"\n"
|
||||
)
|
||||
)
|
||||
""".split("\n"))
|
||||
|
|
|
|||
|
|
@ -1974,9 +1974,7 @@ Tron an
|
|||
tuilleadh 's a chòir
|
||||
Tuilleadh 's a chòir
|
||||
tuilleadh sa chòir
|
||||
Tuilleadh sa chòir""".split(
|
||||
"\n"
|
||||
):
|
||||
Tuilleadh sa chòir""".split("\n"):
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ
|
||||
αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς
|
||||
αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν
|
||||
|
|
@ -57,5 +56,4 @@ STOP_WORDS = set(
|
|||
|
||||
ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ' ὤ ὢ
|
||||
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
એમ
|
||||
આ
|
||||
એ
|
||||
|
|
@ -84,5 +83,4 @@ STOP_WORDS = set(
|
|||
દર
|
||||
એટલો
|
||||
પરંતુ
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
אני
|
||||
את
|
||||
אתה
|
||||
|
|
@ -218,5 +217,4 @@ STOP_WORDS = set(
|
|||
אחרות
|
||||
אשר
|
||||
או
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
अंदर
|
||||
अत
|
||||
अदि
|
||||
|
|
@ -235,5 +234,4 @@ STOP_WORDS = set(
|
|||
होते
|
||||
होना
|
||||
होने
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-hr
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a
|
||||
ah
|
||||
aha
|
||||
|
|
@ -340,5 +339,4 @@ zbog
|
|||
željeo
|
||||
zimus
|
||||
zum
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a abo ale ani
|
||||
|
||||
dokelž
|
||||
|
|
@ -15,5 +14,4 @@ pak potom
|
|||
tež tohodla
|
||||
|
||||
zo zoby
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,24 +1,20 @@
|
|||
from ...attrs import LIKE_NUM, NORM
|
||||
|
||||
# Cardinal numbers in Creole
|
||||
_num_words = set(
|
||||
"""
|
||||
_num_words = set("""
|
||||
zewo youn en de twa kat senk sis sèt uit nèf dis
|
||||
onz douz trèz katoz kenz sèz disèt dizwit diznèf
|
||||
vent trant karant sinkant swasant swasann-dis
|
||||
san mil milyon milya
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
# Ordinal numbers in Creole (some are French-influenced, some simplified)
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
_ordinal_words = set("""
|
||||
premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
|
||||
onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
|
||||
ventyèm trantyèm karantyèm sinkantyèm swasantyèm
|
||||
swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
NORM_MAP = {
|
||||
"'m": "mwen",
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a ak an ankò ant apre ap atò avan avanlè
|
||||
byen bò byenke
|
||||
|
||||
|
|
@ -39,8 +38,7 @@ sa san si swa si
|
|||
|
||||
men mèsi oswa osinon
|
||||
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
# Add common contractions, with and without apostrophe variants
|
||||
contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben
|
||||
amelyeket amelyet amelynek ami amikor amit amolyan amíg annak arra arról az
|
||||
azok azon azonban azt aztán azután azzal azért
|
||||
|
|
@ -58,5 +57,4 @@ volna volt voltak voltam voltunk
|
|||
úgy új újabb újra
|
||||
|
||||
ő őket
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
նա
|
||||
ողջը
|
||||
այստեղ
|
||||
|
|
@ -103,5 +102,4 @@ STOP_WORDS = set(
|
|||
այս
|
||||
մեջ
|
||||
թ
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
ID_BASE_EXCEPTIONS = set(
|
||||
"""
|
||||
ID_BASE_EXCEPTIONS = set("""
|
||||
aba-aba
|
||||
abah-abah
|
||||
abal-abal
|
||||
|
|
@ -3898,5 +3897,4 @@ yel-yel
|
|||
yo-yo
|
||||
zam-zam
|
||||
zig-zag
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
|
||||
aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
|
||||
apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
|
||||
|
|
@ -114,5 +113,4 @@ ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
|
|||
waduh wah wahai waktu waktunya walau walaupun wong
|
||||
|
||||
yaitu yakin yakni yang
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/Xangis/extra-stopwords
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
afhverju
|
||||
aftan
|
||||
aftur
|
||||
|
|
@ -154,5 +153,4 @@ ykkar
|
|||
því
|
||||
þær
|
||||
ætti
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
|
||||
agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri
|
||||
altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai
|
||||
|
|
@ -79,5 +78,4 @@ uguali ulteriore ultimo un un' una uno uomo
|
|||
|
||||
v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
|
||||
vostra vostre vostri vostro
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@
|
|||
# filtering out everything that wasn't hiragana. ー (one) was also added.
|
||||
# Considered keeping some non-hiragana words but too many place names were
|
||||
# present.
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
あ あっ あまり あり ある あるいは あれ
|
||||
い いい いう いく いずれ いっ いつ いる いわ
|
||||
うち
|
||||
|
|
@ -44,5 +43,4 @@ STOP_WORDS = set(
|
|||
を
|
||||
ん
|
||||
一
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
û
|
||||
li
|
||||
bi
|
||||
|
|
@ -40,5 +39,4 @@ gelek
|
|||
hemû
|
||||
kes
|
||||
tişt
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ಹಲವು
|
||||
ಮೂಲಕ
|
||||
ಹಾಗೂ
|
||||
|
|
@ -82,5 +81,4 @@ STOP_WORDS = set(
|
|||
ಎಂದು
|
||||
ನನ್ನ
|
||||
ಮೇಲೆ
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
이
|
||||
있
|
||||
하
|
||||
|
|
@ -63,5 +62,4 @@ STOP_WORDS = set(
|
|||
원
|
||||
잘
|
||||
놓
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ага адам айтты айтымында айтып ал алар
|
||||
алардын алган алуу алып анда андан аны
|
||||
анын ар
|
||||
|
|
@ -38,5 +37,4 @@ STOP_WORDS = set(
|
|||
үч үчүн
|
||||
|
||||
өз
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
|
||||
|
||||
cum cur
|
||||
|
|
@ -33,5 +32,4 @@ tam tamen trans tu tum
|
|||
ubi uel uero
|
||||
|
||||
vel vero
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,22 +1,18 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
_num_words = set("""
|
||||
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
|
||||
véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
|
||||
honnert dausend millioun milliard billioun billiard trillioun triliard
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
_ordinal_words = set("""
|
||||
éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften
|
||||
zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten
|
||||
drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten
|
||||
honnertsten dausendsten milliounsten
|
||||
milliardsten billiounsten billiardsten trilliounsten trilliardsten
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a
|
||||
à
|
||||
äis
|
||||
|
|
@ -207,5 +206,4 @@ ze
|
|||
zu
|
||||
zum
|
||||
zwar
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
|
||||
atya awamu aweebwa ayinza ba baali babadde babalina bajja
|
||||
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
|
||||
|
|
@ -15,5 +14,4 @@ oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina t
|
|||
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
|
||||
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
|
||||
ye yenna yennyini yina yonna ziba zijja zonna
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
|
||||
|
||||
bella belle belli bello ben
|
||||
|
|
@ -35,5 +34,4 @@ tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
|
|||
un uña unn' unna
|
||||
|
||||
za zu
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-lv
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
aiz
|
||||
ap
|
||||
apakš
|
||||
|
|
@ -163,5 +162,4 @@ viņpus
|
|||
zem
|
||||
ārpus
|
||||
šaipus
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
а
|
||||
абре
|
||||
aв
|
||||
|
|
@ -811,5 +810,4 @@ aв
|
|||
џагара-магара
|
||||
џанам
|
||||
џив-џив
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
അത്
|
||||
ഇത്
|
||||
ആയിരുന്നു
|
||||
|
|
@ -9,5 +8,4 @@ STOP_WORDS = set(
|
|||
അന്ന്
|
||||
ഇന്ന്
|
||||
ആണ്
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
न
|
||||
अतरी
|
||||
तो
|
||||
|
|
@ -188,5 +187,4 @@ STOP_WORDS = set(
|
|||
होता
|
||||
होती
|
||||
होते
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# from https://prpm.dbp.gov.my/cari1?keyword=
|
||||
# dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka
|
||||
MS_BASE_EXCEPTIONS = set(
|
||||
"""
|
||||
MS_BASE_EXCEPTIONS = set("""
|
||||
aba-aba
|
||||
abah-abah
|
||||
abar-abar
|
||||
|
|
@ -1939,5 +1938,4 @@ warna-warni
|
|||
water-cooled
|
||||
world-class
|
||||
yang-yang
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
|
||||
aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
|
||||
apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
|
||||
|
|
@ -114,5 +113,4 @@ ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
|
|||
waduh wah wahai waktu waktunya walau walaupun wong
|
||||
|
||||
yaitu yakin yakni yang
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
alle allerede alt and andre annen annet at av
|
||||
|
||||
bak bare bedre beste blant ble bli blir blitt bris by både
|
||||
|
|
@ -46,5 +45,4 @@ vant var ved veldig vi videre viktig vil ville viser vår være vært
|
|||
å år
|
||||
|
||||
ønsker
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
अक्सर
|
||||
अगाडि
|
||||
अगाडी
|
||||
|
|
@ -490,5 +489,4 @@ STOP_WORDS = set(
|
|||
होइन
|
||||
होकि
|
||||
होला
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,21 +1,17 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
_num_words = set("""
|
||||
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
|
||||
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
|
||||
duizend miljoen miljard biljoen biljard triljoen triljard
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
_ordinal_words = set("""
|
||||
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
|
||||
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
|
||||
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
|
||||
miljardste biljoenste biljardste triljoenste triljardste
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
|
|||
|
|
@ -13,8 +13,7 @@
|
|||
# should have a Dutch counterpart here.
|
||||
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
|
||||
afgelopen aldus alhoewel anderzijds
|
||||
|
||||
|
|
@ -68,5 +67,4 @@ welk welke welken werd werden wiens wier wilde wordt
|
|||
|
||||
zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zo’n zoals zodra zouden
|
||||
zoveel zowat zulk zulke zulks zullen zult
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a aby ach acz aczkolwiek aj albo ale alez
|
||||
ależ ani az aż
|
||||
|
||||
|
|
@ -74,5 +73,4 @@ xi xii xiii xiv xv
|
|||
z za zaden zadna zadne zadnych zapewne zawsze zaś
|
||||
ze zeby znow znowu znów zostal został
|
||||
|
||||
żaden żadna żadne żadnych że żeby""".split()
|
||||
)
|
||||
żaden żadna żadne żadnych że żeby""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes
|
||||
ao aos apenas apoia apoio apontar após aquela aquelas aquele aqueles aqui aquilo
|
||||
as assim através atrás até aí
|
||||
|
|
@ -62,5 +61,4 @@ vai vais valor veja vem vens ver vez vezes vinda vindo vinte você vocês vos vo
|
|||
vossas vosso vossos vários vão vêm vós
|
||||
|
||||
zero
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,16 +1,13 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
_num_words = set("""
|
||||
zero unu doi două trei patru cinci șase șapte opt nouă zece
|
||||
unsprezece doisprezece douăsprezece treisprezece patrusprezece cincisprezece șaisprezece șaptesprezece optsprezece nouăsprezece
|
||||
douăzeci treizeci patruzeci cincizeci șaizeci șaptezeci optzeci nouăzeci
|
||||
sută mie milion miliard bilion trilion cvadrilion catralion cvintilion sextilion septilion enșpemii
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
_ordinal_words = set("""
|
||||
primul doilea treilea patrulea cincilea șaselea șaptelea optulea nouălea zecelea
|
||||
prima doua treia patra cincia șasea șaptea opta noua zecea
|
||||
unsprezecelea doisprezecelea treisprezecelea patrusprezecelea cincisprezecelea șaisprezecelea șaptesprezecelea optsprezecelea nouăsprezecelea
|
||||
|
|
@ -18,8 +15,7 @@ unsprezecea douăsprezecea treisprezecea patrusprezecea cincisprezecea șaisprez
|
|||
douăzecilea treizecilea patruzecilea cincizecilea șaizecilea șaptezecilea optzecilea nouăzecilea sutălea
|
||||
douăzecea treizecea patruzecea cincizecea șaizecea șaptezecea optzecea nouăzecea suta
|
||||
miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-ro
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a
|
||||
abia
|
||||
acea
|
||||
|
|
@ -495,5 +494,4 @@ zice
|
|||
știu
|
||||
ți
|
||||
ție
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = list(
|
||||
set(
|
||||
"""
|
||||
_num_words = list(set("""
|
||||
ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми
|
||||
|
||||
четверть четверти четвертью четвертей четвертям четвертями четвертях
|
||||
|
|
@ -203,9 +201,7 @@ _num_words = list(
|
|||
квинтиллиону квинтиллионов квинтлн
|
||||
|
||||
i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix
|
||||
""".split()
|
||||
)
|
||||
)
|
||||
""".split()))
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
а авось ага агу аж ай али алло ау ах ая
|
||||
|
||||
б будем будет будете будешь буду будут будучи будь будьте бы был была были было
|
||||
|
|
@ -107,5 +106,4 @@ STOP_WORDS = set(
|
|||
ю
|
||||
|
||||
я явно явных яко якобы якоже
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
अहम्
|
||||
आवाम्
|
||||
वयम्
|
||||
|
|
@ -511,5 +510,4 @@ STOP_WORDS = set(
|
|||
ह
|
||||
हन्त
|
||||
हि
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
සහ
|
||||
සමග
|
||||
සමඟ
|
||||
|
|
@ -191,5 +190,4 @@ STOP_WORDS = set(
|
|||
ලෙස
|
||||
පරිදි
|
||||
එහෙත්
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/Ardevop-sk/stopwords-sk
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a
|
||||
aby
|
||||
aj
|
||||
|
|
@ -420,5 +419,4 @@ zo
|
|||
ňou
|
||||
ňu
|
||||
že
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@ import unicodedata
|
|||
|
||||
from ...attrs import IS_CURRENCY, LIKE_NUM
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
_num_words = set("""
|
||||
nula ničla nič ena dva tri štiri pet šest sedem osem
|
||||
devet deset enajst dvanajst trinajst štirinajst petnajst
|
||||
šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset
|
||||
|
|
@ -18,11 +17,9 @@ _num_words = set(
|
|||
šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi
|
||||
sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi
|
||||
devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
_ordinal_words = set("""
|
||||
prvi drugi tretji četrti peti šesti sedmi osmi
|
||||
deveti deseti enajsti dvanajsti trinajsti štirinajsti
|
||||
petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti
|
||||
|
|
@ -92,11 +89,9 @@ _ordinal_words = set(
|
|||
osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi
|
||||
sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi
|
||||
trilijontimi kvadrilijontimi neštetimi
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
_currency_words = set(
|
||||
"""
|
||||
_currency_words = set("""
|
||||
evro evra evru evrom evrov evroma evrih evrom evre evri evr eur
|
||||
cent centa centu cenom centov centoma centih centom cente centi
|
||||
dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd
|
||||
|
|
@ -109,8 +104,7 @@ _currency_words = set(
|
|||
jen jena jeni jenu jenom jenov jenoma jenih jene
|
||||
kuna kuni kune kuno kun kunama kunah kunam kunami
|
||||
marka marki marke markama markah markami
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-sl
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a ali
|
||||
|
||||
b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo
|
||||
|
|
@ -80,5 +79,4 @@ x
|
|||
z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj
|
||||
|
||||
ž že
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/andrixh/index-albanian
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a
|
||||
afert
|
||||
ai
|
||||
|
|
@ -225,5 +224,4 @@ vetes
|
|||
vjen
|
||||
yne
|
||||
zakonisht
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
а
|
||||
авај
|
||||
ако
|
||||
|
|
@ -389,5 +388,4 @@ STOP_WORDS = set(
|
|||
ћете
|
||||
ћеш
|
||||
ћу
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras
|
||||
annan annat ännu artonde arton åtminstone att åtta åttio åttionde åttonde av
|
||||
även
|
||||
|
|
@ -62,5 +61,4 @@ under upp ur ursäkt ut utan utanför ute
|
|||
vad vänster vänstra var vår vara våra varför varifrån varit varken värre
|
||||
varsågod vart vårt vem vems verkligen vi vid vidare viktig viktigare viktigast
|
||||
viktigt vilka vilken vilket vill
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Stop words
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ஒரு
|
||||
என்று
|
||||
மற்றும்
|
||||
|
|
@ -127,5 +126,4 @@ STOP_WORDS = set(
|
|||
வரையில்
|
||||
சற்று
|
||||
எனக்
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Source: https://github.com/Xangis/extra-stopwords (MIT License)
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
అందరూ
|
||||
అందుబాటులో
|
||||
అడగండి
|
||||
|
|
@ -52,5 +51,4 @@ STOP_WORDS = set(
|
|||
వేరుగా
|
||||
వ్యతిరేకంగా
|
||||
సంబంధం
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ทั้งนี้ ดัง ขอ รวม หลังจาก เป็น หลัง หรือ ๆ เกี่ยวกับ ซึ่งได้แก่ ด้วยเพราะ ด้วยว่า ด้วยเหตุเพราะ
|
||||
ด้วยเหตุว่า สุดๆ เสร็จแล้ว เช่น เข้า ถ้า ถูก ถึง ต่างๆ ใคร เปิดเผย ครา รือ ตาม ใน ได้แก่ ได้แต่
|
||||
ได้ที่ ตลอดถึง นอกจากว่า นอกนั้น จริง อย่างดี ส่วน เพียงเพื่อ เดียว จัด ทั้งที ทั้งคน ทั้งตัว ไกลๆ
|
||||
|
|
@ -71,5 +70,4 @@ STOP_WORDS = set(
|
|||
แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างมาก อย่างยิ่ง อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย
|
||||
อย่างละ อย่างหนึ่ง อย่างๆ อัน อันจะ อันได้แก่ อันที่ อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันๆ อาจจะ
|
||||
อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ 555 กำ ขอโทษ เยี่ยม นี่คือ
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt
|
||||
|
||||
# Stop words
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን
|
||||
ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል
|
||||
ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም
|
||||
|
|
@ -23,5 +22,4 @@ STOP_WORDS = set(
|
|||
ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ
|
||||
የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ
|
||||
ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
akin
|
||||
aking
|
||||
ako
|
||||
|
|
@ -147,5 +146,4 @@ tulad
|
|||
tungkol
|
||||
una
|
||||
walang
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Stop words
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
|
||||
sengwe fa go le jalo gongwe ba na mo tikologong
|
||||
jaaka kwa morago nna gonne ka sa pele nako teng
|
||||
|
|
@ -16,5 +15,4 @@ tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
|
|||
bonala e tshwanang bogolo tsenya tsweetswee karolo
|
||||
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
|
||||
tlhano lesometlhano botlalo lekgolo
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -111,8 +111,7 @@ for orth in [
|
|||
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
emoticons = set(
|
||||
r"""
|
||||
emoticons = set(r"""
|
||||
:)
|
||||
:-)
|
||||
:))
|
||||
|
|
@ -243,8 +242,7 @@ o.0
|
|||
¯\(ツ)/¯
|
||||
(╯°□°)╯︵┻━┻
|
||||
><(((*>
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
||||
|
||||
for orth in emoticons:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-tr
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
acaba
|
||||
acep
|
||||
adamakıllı
|
||||
|
|
@ -553,5 +552,4 @@ zarfında
|
|||
zaten
|
||||
zati
|
||||
zira
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Tatar stopwords are from https://github.com/aliiae/stopwords-tt
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""алай алайса алар аларга аларда алардан аларны аларның аларча
|
||||
STOP_WORDS = set("""алай алайса алар аларга аларда алардан аларны аларның аларча
|
||||
алары аларын аларынга аларында аларыннан аларының алтмыш алтмышынчы алтмышынчыга
|
||||
алтмышынчыда алтмышынчыдан алтмышынчылар алтмышынчыларга алтмышынчыларда
|
||||
алтмышынчылардан алтмышынчыларны алтмышынчыларның алтмышынчыны алтмышынчының
|
||||
|
|
@ -169,5 +168,4 @@ STOP_WORDS = set(
|
|||
|
||||
өстәп өч өчен өченче өченчегә өченчедә өченчедән өченчеләр өченчеләргә
|
||||
өченчеләрдә өченчеләрдән өченчеләрне өченчеләрнең өченчене өченченең өчләп
|
||||
өчәрләп""".split()
|
||||
)
|
||||
өчәрләп""".split())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
STOP_WORDS = set(
|
||||
"""а
|
||||
STOP_WORDS = set("""а
|
||||
або
|
||||
адже
|
||||
аж
|
||||
|
|
@ -465,5 +464,4 @@ STOP_WORDS = set(
|
|||
якій
|
||||
якого
|
||||
якої
|
||||
якщо""".split()
|
||||
)
|
||||
якщо""".split())
|
||||
|
|
|
|||
|
|
@ -5,7 +5,8 @@ from ...attrs import LIKE_NUM
|
|||
# https://en.wikibooks.org/wiki/Urdu/Vocabulary/Numbers
|
||||
# https://www.urdu-english.com/lessons/beginner/numbers
|
||||
|
||||
_num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ
|
||||
_num_words = (
|
||||
"""ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ
|
||||
اٹهارا انیس بیس اکیس بائیس تئیس چوبیس پچیس چھببیس
|
||||
ستایس اٹھائس انتيس تیس اکتیس بتیس تینتیس چونتیس پینتیس
|
||||
چھتیس سینتیس ارتیس انتالیس چالیس اکتالیس بیالیس تیتالیس
|
||||
|
|
@ -17,6 +18,7 @@ _num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس
|
|||
سٹیاسی اٹھیاسی نواسی نوے اکانوے بانوے ترانوے
|
||||
چورانوے پچانوے چھیانوے ستانوے اٹھانوے ننانوے سو
|
||||
""".split()
|
||||
)
|
||||
|
||||
# source https://www.google.com/intl/ur/inputtools/try/
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Source: collected from different resources on the internet
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
ثھی
|
||||
خو
|
||||
گی
|
||||
|
|
@ -509,5 +508,4 @@ STOP_WORDS = set(
|
|||
ہورہی
|
||||
ثبعث
|
||||
ضت
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# Source: https://github.com/stopwords/vietnamese-stopwords
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
a_lô
|
||||
a_ha
|
||||
ai
|
||||
|
|
@ -1943,7 +1942,4 @@ yêu_cầu
|
|||
ừ_ào
|
||||
ừ_ừ
|
||||
ử
|
||||
""".split(
|
||||
"\n"
|
||||
)
|
||||
)
|
||||
""".split("\n"))
|
||||
|
|
|
|||
|
|
@ -91,13 +91,13 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
def __call__(self, text: str) -> Doc:
|
||||
if self.segmenter == Segmenter.jieba:
|
||||
words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) # type: ignore[union-attr]
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
words, spaces = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
elif self.segmenter == Segmenter.pkuseg:
|
||||
if self.pkuseg_seg is None:
|
||||
raise ValueError(Errors.E1000)
|
||||
words = self.pkuseg_seg.cut(text)
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
words, spaces = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
# warn if segmenter setting is not the only remaining option "char"
|
||||
|
|
@ -112,7 +112,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
|
||||
# split into individual characters
|
||||
words = list(text)
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
words, spaces = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
|
||||
|
|
@ -210,7 +210,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
|
||||
if pkuseg_data["processors_data"]:
|
||||
processors_data = pkuseg_data["processors_data"]
|
||||
(user_dict, do_process, common_words, other_words) = processors_data
|
||||
user_dict, do_process, common_words, other_words = processors_data
|
||||
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
|
||||
self.pkuseg_seg.postprocesser.do_process = do_process
|
||||
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
||||
|
|
@ -268,7 +268,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
raise ImportError(self._pkuseg_install_msg) from None
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
data = srsly.read_msgpack(path)
|
||||
(user_dict, do_process, common_words, other_words) = data
|
||||
user_dict, do_process, common_words, other_words = data
|
||||
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
|
||||
self.pkuseg_seg.postprocesser.do_process = do_process
|
||||
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# stop words as whitespace-separated list
|
||||
# Chinese stop words, maybe not enough
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
STOP_WORDS = set("""
|
||||
!
|
||||
"
|
||||
#
|
||||
|
|
@ -1895,5 +1894,4 @@ sup
|
|||
~±
|
||||
~+
|
||||
¥
|
||||
""".split()
|
||||
)
|
||||
""".split())
|
||||
|
|
|
|||
|
|
@ -85,7 +85,7 @@ class Table(OrderedDict):
|
|||
value: The value to set.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
OrderedDict.__setitem__(self, key, value) # type:ignore[assignment]
|
||||
OrderedDict.__setitem__(self, key, value) # type: ignore[assignment]
|
||||
self.bloom.add(key)
|
||||
|
||||
def set(self, key: Union[str, int], value: Any) -> None:
|
||||
|
|
@ -104,7 +104,7 @@ class Table(OrderedDict):
|
|||
RETURNS: The value.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
return OrderedDict.__getitem__(self, key) # type:ignore[index]
|
||||
return OrderedDict.__getitem__(self, key) # type: ignore[index]
|
||||
|
||||
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
|
||||
"""Get the value for a given key. String keys will be hashed.
|
||||
|
|
@ -114,7 +114,7 @@ class Table(OrderedDict):
|
|||
RETURNS: The value.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
return OrderedDict.get(self, key, default) # type:ignore[arg-type]
|
||||
return OrderedDict.get(self, key, default) # type: ignore[arg-type]
|
||||
|
||||
def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override]
|
||||
"""Check whether a key is in the table. String keys will be hashed.
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ def test_create_from_words_and_text(vocab):
|
|||
# no whitespace in words
|
||||
words = ["'", "dogs", "'", "run"]
|
||||
text = " 'dogs'\n\nrun "
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
words, spaces = util.get_words_and_spaces(words, text)
|
||||
doc = Doc(vocab, words=words, spaces=spaces)
|
||||
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
||||
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
||||
|
|
@ -38,7 +38,7 @@ def test_create_from_words_and_text(vocab):
|
|||
# partial whitespace in words
|
||||
words = [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
||||
text = " 'dogs'\n\nrun "
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
words, spaces = util.get_words_and_spaces(words, text)
|
||||
doc = Doc(vocab, words=words, spaces=spaces)
|
||||
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
||||
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
||||
|
|
@ -50,7 +50,7 @@ def test_create_from_words_and_text(vocab):
|
|||
# non-standard whitespace tokens
|
||||
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
|
||||
text = " 'dogs'\n\nrun "
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
words, spaces = util.get_words_and_spaces(words, text)
|
||||
doc = Doc(vocab, words=words, spaces=spaces)
|
||||
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
||||
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
||||
|
|
@ -63,7 +63,7 @@ def test_create_from_words_and_text(vocab):
|
|||
with pytest.raises(ValueError):
|
||||
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
|
||||
text = " 'dogs'\n\nrun "
|
||||
(words, spaces) = util.get_words_and_spaces(words + ["away"], text)
|
||||
words, spaces = util.get_words_and_spaces(words + ["away"], text)
|
||||
|
||||
|
||||
def test_create_with_heads_and_no_deps(vocab):
|
||||
|
|
|
|||
|
|
@ -437,7 +437,7 @@ def test_phrase_matcher_pickle(en_vocab):
|
|||
assert matches == matches_unpickled
|
||||
|
||||
# clunky way to vaguely check that callback is unpickled
|
||||
(vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
|
||||
vocab, docs, callbacks, attr = matcher_unpickled.__reduce__()[1]
|
||||
assert isinstance(callbacks.get("TEST2"), Mock)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user