This commit is contained in:
Matthew Honnibal 2026-03-03 09:56:06 +01:00 committed by GitHub
parent c1e7cb2ebf
commit 453732d32d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
92 changed files with 207 additions and 403 deletions

View File

@ -1,7 +1,6 @@
# Source: https://github.com/stopwords-iso/stopwords-af
STOP_WORDS = set(
"""
STOP_WORDS = set("""
'n
aan
af
@ -53,5 +52,4 @@ vir
was
wat
ʼn
""".split()
)
""".split())

View File

@ -1,8 +1,7 @@
# Stop words by Teshome Kassie http://etd.aau.edu.et/bitstream/handle/123456789/3315/Teshome%20Kassie.pdf?sequence=1&isAllowed=y
# Stop words by Tihitina Petros http://etd.aau.edu.et/bitstream/handle/123456789/3384/Tihitina%20Petros.pdf?sequence=1&isAllowed=y
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን
ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ
አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ
@ -29,5 +28,4 @@ STOP_WORDS = set(
በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም
ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ
ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
from ...attrs import LIKE_NUM
_num_words = set(
"""
_num_words = set("""
صفر
واحد
إثنان
@ -51,11 +50,9 @@ _num_words = set(
مليون
مليار
مليارات
""".split()
)
""".split())
_ordinal_words = set(
"""
_ordinal_words = set("""
اول
أول
حاد
@ -70,8 +67,7 @@ _ordinal_words = set(
ثامن
تاسع
عاشر
""".split()
)
""".split())
def like_num(text):

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
من
نحو
لعل
@ -386,5 +385,4 @@ STOP_WORDS = set(
وإن
ولو
يا
""".split()
)
""".split())

View File

@ -1,6 +1,5 @@
# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py
STOP_WORDS = set(
"""
STOP_WORDS = set("""
amma
arasında
artıq
@ -141,5 +140,4 @@ zaman
əlbəttə
ən
əslində
""".split()
)
""".split())

View File

@ -4,8 +4,7 @@ References:
https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
"""
STOP_WORDS = set(
"""
STOP_WORDS = set("""
а автентичен аз ако ала
бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
@ -76,5 +75,4 @@ STOP_WORDS = set(
юмрук
я як
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
অতএব অথচ অথব অন অন অন অন অনতত অবধি অবশ অর অন অন অরধভ
আগ আগ আগ আছ আজ আদযভ আপন আপনি আব আমর আম আম আম আমি আর আরও
ইতি ইহ
@ -38,5 +37,4 @@ STOP_WORDS = set(
রণ মন সঙ সঙ সব সব সমস সমরতি সময় সহ সহি তর ি পষ বয
হইত হইব হইয হওয হওয হওয হচ হত হত হত হন হব হব হয হয হযি হয হয হযি হয
হয় হল হল হল হল হল ি ি হয় হয় হয় হইয় হয়ি হয় হয়নি হয় হয়ত হওয় হওয় হওয়
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://zenodo.org/records/10148636
STOP_WORDS = set(
"""
STOP_WORDS = set("""
@ -194,5 +193,4 @@ STOP_WORDS = set(
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò
als altra altre altres amb ambdues ambdós anar ans apa aquell aquella aquelles aquells
aquest aquesta aquestes aquests aquí
@ -48,5 +47,4 @@ un una unes uns us últim ús
va vaig vam van vas veu vosaltres vostra vostre vostres
""".split()
)
""".split())

View File

@ -1,8 +1,7 @@
# Source: https://github.com/Alir3z4/stop-words
# Source: https://github.com/stopwords-iso/stopwords-cs/blob/master/stopwords-cs.txt
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a
aby
ahoj
@ -361,5 +360,4 @@ zač
zatímco
ze
že
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: Handpicked by Jens Dahl Møllerhøj.
STOP_WORDS = set(
"""
STOP_WORDS = set("""
af aldrig alene alle allerede alligevel alt altid anden andet andre at
bag begge blandt blev blive bliver burde bør
@ -41,5 +40,4 @@ ud uden udover under undtagen
var ved vi via vil ville vore vores vær være været
øvrigt
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
aller allerdings alles allgemeinen als also am an andere anderen anderem andern
anders auch auf aus ausser außer ausserdem außerdem
@ -74,5 +73,4 @@ wollt wollte wollten worden wurde würde wurden würden
zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur
zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a abo aby ako ale
daniž dokulaž
@ -11,5 +10,4 @@ jolic
pak pótom
teke togodla
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Stop words
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
STOP_WORDS = set(
"""
STOP_WORDS = set("""
αδιάκοπα αι ακόμα ακόμη ακριβώς άλλα αλλά αλλαχού άλλες άλλη άλλην
άλλης αλλιώς αλλιώτικα άλλο άλλοι αλλοιώς αλλοιώτικα άλλον άλλος άλλοτε αλλού
άλλους άλλων άμα άμεσα αμέσως αν ανά ανάμεσα αναμεταξύ άνευ αντί αντίπερα αντίς
@ -83,5 +82,4 @@ STOP_WORDS = set(
χωρίς χωριστά
ω ως ωσάν ωσότου ώσπου ώστε ωστόσο ωχ
""".split()
)
""".split())

View File

@ -1,6 +1,5 @@
# Stop words
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at
@ -62,8 +61,7 @@ whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split()
)
""".split())
contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
STOP_WORDS.update(contractions)

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna
algunas alguno algunos algún alli allí alrededor ambos ante anterior antes
apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél
@ -76,5 +75,4 @@ va vais vamos van varias varios vaya veces ver verdad verdadera verdadero vez
vosotras vosotros voy vuestra vuestras vuestro vuestros
y ya yo
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://github.com/stopwords-iso/stopwords-et
STOP_WORDS = set(
"""
STOP_WORDS = set("""
aga
ei
et
@ -37,5 +36,4 @@ siis
ta
te
ära
""".split()
)
""".split())

View File

@ -1,8 +1,7 @@
# Source: https://github.com/stopwords-iso/stopwords-eu
# https://www.ranks.nl/stopwords/basque
# https://www.mustgo.com/worldlanguages/basque/
STOP_WORDS = set(
"""
STOP_WORDS = set("""
al
anitz
arabera
@ -101,5 +100,4 @@ zu
zuek
zuen
zuten
""".split()
)
""".split())

View File

@ -5,8 +5,7 @@ ZWNJ_O_MIM = "‌ام"
YE_NUN = "ین"
_num_words = set(
"""
_num_words = set("""
صفر
یک
دو
@ -63,15 +62,12 @@ _num_words = set(
کوادریلیون
کادریلیارد
کوینتیلیون
""".split()
)
""".split())
_ordinal_words = set(
"""
_ordinal_words = set("""
اول
سوم
سیام""".split()
)
سیام""".split())
_ordinal_words.update({num + MIM for num in _num_words})
_ordinal_words.update({num + ZWNJ_O_MIM for num in _num_words})

View File

@ -1,6 +1,5 @@
# Stop words from HAZM package
STOP_WORDS = set(
"""
STOP_WORDS = set("""
و
در
به
@ -389,5 +388,4 @@ STOP_WORDS = set(
لذا
زاده
گردد
اینجا""".split()
)
اینجا""".split())

View File

@ -1,7 +1,6 @@
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
# Reformatted with some minor corrections
STOP_WORDS = set(
"""
STOP_WORDS = set("""
aiemmin aika aikaa aikaan aikaisemmin aikaisin aikana aikoina aikoo aikovat
aina ainakaan ainakin ainoa ainoat aiomme aion aiotte aivan ajan alas alemmas
alkuisin alkuun alla alle aloitamme aloitan aloitat aloitatte aloitattivat
@ -106,5 +105,4 @@ yhtäällä yhtäältä yhtään yhä yksi yksin yksittäin yleensä ylemmäs yl
ympäri
älköön älä
""".split()
)
""".split())

View File

@ -1,24 +1,20 @@
from ...attrs import LIKE_NUM
_num_words = set(
"""
_num_words = set("""
zero un une deux trois quatre cinq six sept huit neuf dix
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante
cent mille mil million milliard billion quadrillion quintillion
sextillion septillion octillion nonillion decillion
""".split()
)
""".split())
_ordinal_words = set(
"""
_ordinal_words = set("""
premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
sextillionnième septillionnième octillionnième nonillionnième decillionnième
""".split()
)
""".split())
def like_num(text):

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a à â abord afin ah ai aie ainsi ait allaient allons
alors anterieur anterieure anterieures antérieur antérieure antérieures
apres après as assez attendu au
@ -80,5 +79,4 @@ votre votres vous vous-mêmes vu vé vôtre vôtres
y
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a ach ag agus an aon ar arna as
ba beirt bhúr
@ -39,5 +38,4 @@ um
í
ó ón óna ónár
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
'ad
'ar
'd # iad
@ -382,7 +381,4 @@ urrainn
ì
ò
ó
""".split(
"\n"
)
)
""".split("\n"))

View File

@ -1974,9 +1974,7 @@ Tron an
tuilleadh 's a chòir
Tuilleadh 's a chòir
tuilleadh sa chòir
Tuilleadh sa chòir""".split(
"\n"
):
Tuilleadh sa chòir""".split("\n"):
_exc[orth] = [{ORTH: orth}]

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ
αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς
αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν
@ -57,5 +56,4 @@ STOP_WORDS = set(
ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ'
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
એમ
@ -84,5 +83,4 @@ STOP_WORDS = set(
દર
એટલ
પર
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
אני
את
אתה
@ -218,5 +217,4 @@ STOP_WORDS = set(
אחרות
אשר
או
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
STOP_WORDS = set(
"""
STOP_WORDS = set("""
दर
अत
अदि
@ -235,5 +234,4 @@ STOP_WORDS = set(
""".split()
)
""".split())

View File

@ -1,6 +1,5 @@
# Source: https://github.com/stopwords-iso/stopwords-hr
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a
ah
aha
@ -340,5 +339,4 @@ zbog
željeo
zimus
zum
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a abo ale ani
dokelž
@ -15,5 +14,4 @@ pak potom
tež tohodla
zo zoby
""".split()
)
""".split())

View File

@ -1,24 +1,20 @@
from ...attrs import LIKE_NUM, NORM
# Cardinal numbers in Creole
_num_words = set(
"""
_num_words = set("""
zewo youn en de twa kat senk sis sèt uit nèf dis
onz douz trèz katoz kenz sèz disèt dizwit diznèf
vent trant karant sinkant swasant swasann-dis
san mil milyon milya
""".split()
)
""".split())
# Ordinal numbers in Creole (some are French-influenced, some simplified)
_ordinal_words = set(
"""
_ordinal_words = set("""
premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
ventyèm trantyèm karantyèm sinkantyèm swasantyèm
swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
""".split()
)
""".split())
NORM_MAP = {
"'m": "mwen",

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a ak an ankò ant apre ap atò avan avanlè
byen byenke
@ -39,8 +38,7 @@ sa san si swa si
men mèsi oswa osinon
""".split()
)
""".split())
# Add common contractions, with and without apostrophe variants
contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben
amelyeket amelyet amelynek ami amikor amit amolyan amíg annak arra arról az
azok azon azonban azt aztán azután azzal azért
@ -58,5 +57,4 @@ volna volt voltak voltam voltunk
úgy új újabb újra
ő őket
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
նա
ողջը
այստեղ
@ -103,5 +102,4 @@ STOP_WORDS = set(
այս
մեջ
թ
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
ID_BASE_EXCEPTIONS = set(
"""
ID_BASE_EXCEPTIONS = set("""
aba-aba
abah-abah
abal-abal
@ -3898,5 +3897,4 @@ yel-yel
yo-yo
zam-zam
zig-zag
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
@ -114,5 +113,4 @@ ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
waduh wah wahai waktu waktunya walau walaupun wong
yaitu yakin yakni yang
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://github.com/Xangis/extra-stopwords
STOP_WORDS = set(
"""
STOP_WORDS = set("""
afhverju
aftan
aftur
@ -154,5 +153,4 @@ ykkar
því
þær
ætti
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri
altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai
@ -79,5 +78,4 @@ uguali ulteriore ultimo un un' una uno uomo
v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
vostra vostre vostri vostro
""".split()
)
""".split())

View File

@ -2,8 +2,7 @@
# filtering out everything that wasn't hiragana. ー (one) was also added.
# Considered keeping some non-hiragana words but too many place names were
# present.
STOP_WORDS = set(
"""
STOP_WORDS = set("""
あっ あまり あり ある あるいは あれ
いい いう いく いずれ いっ いつ いる いわ
うち
@ -44,5 +43,4 @@ STOP_WORDS = set(
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
û
li
bi
@ -40,5 +39,4 @@ gelek
hemû
kes
tişt
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ಹಲವ
ಲಕ
@ -82,5 +81,4 @@ STOP_WORDS = set(
ನನ
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
@ -63,5 +62,4 @@ STOP_WORDS = set(
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ага адам айтты айтымында айтып ал алар
алардын алган алуу алып анда андан аны
анын ар
@ -38,5 +37,4 @@ STOP_WORDS = set(
үч үчүн
өз
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
cum cur
@ -33,5 +32,4 @@ tam tamen trans tu tum
ubi uel uero
vel vero
""".split()
)
""".split())

View File

@ -1,22 +1,18 @@
from ...attrs import LIKE_NUM
_num_words = set(
"""
_num_words = set("""
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
honnert dausend millioun milliard billioun billiard trillioun triliard
""".split()
)
""".split())
_ordinal_words = set(
"""
_ordinal_words = set("""
éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften
zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten
drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten
honnertsten dausendsten milliounsten
milliardsten billiounsten billiardsten trilliounsten trilliardsten
""".split()
)
""".split())
def like_num(text):

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a
à
äis
@ -207,5 +206,4 @@ ze
zu
zum
zwar
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
atya awamu aweebwa ayinza ba baali babadde babalina bajja
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
@ -15,5 +14,4 @@ oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina t
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
ye yenna yennyini yina yonna ziba zijja zonna
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
bella belle belli bello ben
@ -35,5 +34,4 @@ tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
un uña unn' unna
za zu
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://github.com/stopwords-iso/stopwords-lv
STOP_WORDS = set(
"""
STOP_WORDS = set("""
aiz
ap
apakš
@ -163,5 +162,4 @@ viņpus
zem
ārpus
šaipus
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
а
абре
@ -811,5 +810,4 @@ aв
џагара-магара
џанам
џив-џив
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
അത
ഇത
ആയി
@ -9,5 +8,4 @@ STOP_WORDS = set(
അന
ഇന
ആണ
""".split()
)
""".split())

View File

@ -1,6 +1,5 @@
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
STOP_WORDS = set(
"""
STOP_WORDS = set("""
अतर
@ -188,5 +187,4 @@ STOP_WORDS = set(
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# from https://prpm.dbp.gov.my/cari1?keyword=
# dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka
MS_BASE_EXCEPTIONS = set(
"""
MS_BASE_EXCEPTIONS = set("""
aba-aba
abah-abah
abar-abar
@ -1939,5 +1938,4 @@ warna-warni
water-cooled
world-class
yang-yang
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
@ -114,5 +113,4 @@ ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
waduh wah wahai waktu waktunya walau walaupun wong
yaitu yakin yakni yang
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
alle allerede alt and andre annen annet at av
bak bare bedre beste blant ble bli blir blitt bris by både
@ -46,5 +45,4 @@ vant var ved veldig vi videre viktig vil ville viser vår være vært
å år
ønsker
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt
STOP_WORDS = set(
"""
STOP_WORDS = set("""
अकसर
अगि
अग
@ -490,5 +489,4 @@ STOP_WORDS = set(
इन
ि
""".split()
)
""".split())

View File

@ -1,21 +1,17 @@
from ...attrs import LIKE_NUM
_num_words = set(
"""
_num_words = set("""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
duizend miljoen miljard biljoen biljard triljoen triljard
""".split()
)
""".split())
_ordinal_words = set(
"""
_ordinal_words = set("""
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
miljardste biljoenste biljardste triljoenste triljardste
""".split()
)
""".split())
def like_num(text):

View File

@ -13,8 +13,7 @@
# should have a Dutch counterpart here.
STOP_WORDS = set(
"""
STOP_WORDS = set("""
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds
@ -68,5 +67,4 @@ welk welke welken werd werden wiens wier wilde wordt
zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zon zoals zodra zouden
zoveel zowat zulk zulke zulks zullen zult
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a aby ach acz aczkolwiek aj albo ale alez
ależ ani az
@ -74,5 +73,4 @@ xi xii xiii xiv xv
z za zaden zadna zadne zadnych zapewne zawsze zaś
ze zeby znow znowu znów zostal został
żaden żadna żadne żadnych że żeby""".split()
)
żaden żadna żadne żadnych że żeby""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes
ao aos apenas apoia apoio apontar após aquela aquelas aquele aqueles aqui aquilo
as assim através atrás até
@ -62,5 +61,4 @@ vai vais valor veja vem vens ver vez vezes vinda vindo vinte você vocês vos vo
vossas vosso vossos vários vão vêm vós
zero
""".split()
)
""".split())

View File

@ -1,16 +1,13 @@
from ...attrs import LIKE_NUM
_num_words = set(
"""
_num_words = set("""
zero unu doi două trei patru cinci șase șapte opt nouă zece
unsprezece doisprezece douăsprezece treisprezece patrusprezece cincisprezece șaisprezece șaptesprezece optsprezece nouăsprezece
douăzeci treizeci patruzeci cincizeci șaizeci șaptezeci optzeci nouăzeci
sută mie milion miliard bilion trilion cvadrilion catralion cvintilion sextilion septilion enșpemii
""".split()
)
""".split())
_ordinal_words = set(
"""
_ordinal_words = set("""
primul doilea treilea patrulea cincilea șaselea șaptelea optulea nouălea zecelea
prima doua treia patra cincia șasea șaptea opta noua zecea
unsprezecelea doisprezecelea treisprezecelea patrusprezecelea cincisprezecelea șaisprezecelea șaptesprezecelea optsprezecelea nouăsprezecelea
@ -18,8 +15,7 @@ unsprezecea douăsprezecea treisprezecea patrusprezecea cincisprezecea șaisprez
douăzecilea treizecilea patruzecilea cincizecilea șaizecilea șaptezecilea optzecilea nouăzecilea sutălea
douăzecea treizecea patruzecea cincizecea șaizecea șaptezecea optzecea nouăzecea suta
miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia
""".split()
)
""".split())
def like_num(text):

View File

@ -1,6 +1,5 @@
# Source: https://github.com/stopwords-iso/stopwords-ro
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a
abia
acea
@ -495,5 +494,4 @@ zice
știu
ți
ție
""".split()
)
""".split())

View File

@ -1,8 +1,6 @@
from ...attrs import LIKE_NUM
_num_words = list(
set(
"""
_num_words = list(set("""
ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми
четверть четверти четвертью четвертей четвертям четвертями четвертях
@ -203,9 +201,7 @@ _num_words = list(
квинтиллиону квинтиллионов квинтлн
i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix
""".split()
)
)
""".split()))
def like_num(text):

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
а авось ага агу аж ай али алло ау ах ая
б будем будет будете будешь буду будут будучи будь будьте бы был была были было
@ -107,5 +106,4 @@ STOP_WORDS = set(
ю
я явно явных яко якобы якоже
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323
STOP_WORDS = set(
"""
STOP_WORDS = set("""
अहम
आव
वयम
@ -511,5 +510,4 @@ STOP_WORDS = set(
हन
ि
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
සහ
සමග
සමඟ
@ -191,5 +190,4 @@ STOP_WORDS = set(
පර
එහ
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://github.com/Ardevop-sk/stopwords-sk
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a
aby
aj
@ -420,5 +419,4 @@ zo
ňou
ňu
že
""".split()
)
""".split())

View File

@ -2,8 +2,7 @@ import unicodedata
from ...attrs import IS_CURRENCY, LIKE_NUM
_num_words = set(
"""
_num_words = set("""
nula ničla nič ena dva tri štiri pet šest sedem osem
devet deset enajst dvanajst trinajst štirinajst petnajst
šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset
@ -18,11 +17,9 @@ _num_words = set(
šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi
sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi
devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi
""".split()
)
""".split())
_ordinal_words = set(
"""
_ordinal_words = set("""
prvi drugi tretji četrti peti šesti sedmi osmi
deveti deseti enajsti dvanajsti trinajsti štirinajsti
petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti
@ -92,11 +89,9 @@ _ordinal_words = set(
osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi
sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi
trilijontimi kvadrilijontimi neštetimi
""".split()
)
""".split())
_currency_words = set(
"""
_currency_words = set("""
evro evra evru evrom evrov evroma evrih evrom evre evri evr eur
cent centa centu cenom centov centoma centih centom cente centi
dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd
@ -109,8 +104,7 @@ _currency_words = set(
jen jena jeni jenu jenom jenov jenoma jenih jene
kuna kuni kune kuno kun kunama kunah kunam kunami
marka marki marke markama markah markami
""".split()
)
""".split())
def like_num(text):

View File

@ -1,7 +1,6 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a ali
b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo
@ -80,5 +79,4 @@ x
z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj
ž že
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://github.com/andrixh/index-albanian
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a
afert
ai
@ -225,5 +224,4 @@ vetes
vjen
yne
zakonisht
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
а
авај
ако
@ -389,5 +388,4 @@ STOP_WORDS = set(
ћете
ћеш
ћу
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras
annan annat ännu artonde arton åtminstone att åtta åttio åttionde åttonde av
även
@ -62,5 +61,4 @@ under upp ur ursäkt ut utan utanför ute
vad vänster vänstra var vår vara våra varför varifrån varit varken värre
varsågod vart vårt vem vems verkligen vi vid vidare viktig viktigare viktigast
viktigt vilka vilken vilket vill
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Stop words
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ஒர
என
மற
@ -127,5 +126,4 @@ STOP_WORDS = set(
வரி
சற
எனக
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Source: https://github.com/Xangis/extra-stopwords (MIT License)
STOP_WORDS = set(
"""
STOP_WORDS = set("""
దర
అడగి
@ -52,5 +51,4 @@ STOP_WORDS = set(
యతి
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
งน ขอ รวม หลงจาก เป หล หร เกยวก งไดแก วยเพราะ วยว วยเหตเพราะ
วยเหต ดๆ เสรจแล เช เข างๆ ใคร เปดเผย ครา ตาม ใน ไดแก ไดแต
ได ตลอดถ นอกจากว นอกน จร อยางด วน เพยงเพ เดยว งท งคน งต ไกลๆ
@ -71,5 +70,4 @@ STOP_WORDS = set(
แหงน แหงโน แหงไหน แหละ ใหแก ใหญ ใหญโต อยางมาก อยางย อยางไรก อยางไรกได อยางไรเส
อยางละ อยางหน อยางๆ นจะ นไดแก นท นทจร นทจะ นเนองมาจาก นละ นๆ อาจจะ
อาจเป อาจเปนดวย นๆ เอ เอา ฯล ฯลฯ 555 กำ ขอโทษ เยยม
""".split()
)
""".split())

View File

@ -1,8 +1,7 @@
# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt
# Stop words
STOP_WORDS = set(
"""
STOP_WORDS = set("""
'ምበር ' '' ''ውን '' '' 'ዮም 'ዮን
ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል
ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም
@ -23,5 +22,4 @@ STOP_WORDS = set(
ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ
የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ
ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ
""".split()
)
""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""
STOP_WORDS = set("""
akin
aking
ako
@ -147,5 +146,4 @@ tulad
tungkol
una
walang
""".split()
)
""".split())

View File

@ -1,6 +1,5 @@
# Stop words
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
sengwe fa go le jalo gongwe ba na mo tikologong
jaaka kwa morago nna gonne ka sa pele nako teng
@ -16,5 +15,4 @@ tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
bonala e tshwanang bogolo tsenya tsweetswee karolo
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
tlhano lesometlhano botlalo lekgolo
""".split()
)
""".split())

View File

@ -111,8 +111,7 @@ for orth in [
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
emoticons = set(
r"""
emoticons = set(r"""
:)
:-)
:))
@ -243,8 +242,7 @@ o.0
¯\()/¯
(°°
><(((*>
""".split()
)
""".split())
for orth in emoticons:

View File

@ -1,6 +1,5 @@
# Source: https://github.com/stopwords-iso/stopwords-tr
STOP_WORDS = set(
"""
STOP_WORDS = set("""
acaba
acep
adamakıllı
@ -553,5 +552,4 @@ zarfında
zaten
zati
zira
""".split()
)
""".split())

View File

@ -1,7 +1,6 @@
# Tatar stopwords are from https://github.com/aliiae/stopwords-tt
STOP_WORDS = set(
"""алай алайса алар аларга аларда алардан аларны аларның аларча
STOP_WORDS = set("""алай алайса алар аларга аларда алардан аларны аларның аларча
алары аларын аларынга аларында аларыннан аларының алтмыш алтмышынчы алтмышынчыга
алтмышынчыда алтмышынчыдан алтмышынчылар алтмышынчыларга алтмышынчыларда
алтмышынчылардан алтмышынчыларны алтмышынчыларның алтмышынчыны алтмышынчының
@ -169,5 +168,4 @@ STOP_WORDS = set(
өстәп өч өчен өченче өченчегә өченчедә өченчедән өченчеләр өченчеләргә
өченчеләрдә өченчеләрдән өченчеләрне өченчеләрнең өченчене өченченең өчләп
өчәрләп""".split()
)
өчәрләп""".split())

View File

@ -1,5 +1,4 @@
STOP_WORDS = set(
"""а
STOP_WORDS = set("""а
або
адже
аж
@ -465,5 +464,4 @@ STOP_WORDS = set(
якій
якого
якої
якщо""".split()
)
якщо""".split())

View File

@ -5,7 +5,8 @@ from ...attrs import LIKE_NUM
# https://en.wikibooks.org/wiki/Urdu/Vocabulary/Numbers
# https://www.urdu-english.com/lessons/beginner/numbers
_num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ
_num_words = (
"""ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ
اٹهارا انیس بیس اکیس بائیس تئیس چوبیس پچیس چھببیس
ستایس اٹھائس انتيس تیس اکتیس بتیس تینتیس چونتیس پینتیس
چھتیس سینتیس ارتیس انتالیس چالیس اکتالیس بیالیس تیتالیس
@ -17,6 +18,7 @@ _num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس
سٹیاسی اٹھیاسی نواسی نوے اکانوے بانوے ترانوے
چورانوے پچانوے چھیانوے ستانوے اٹھانوے ننانوے سو
""".split()
)
# source https://www.google.com/intl/ur/inputtools/try/

View File

@ -1,6 +1,5 @@
# Source: collected from different resource on internet
STOP_WORDS = set(
"""
STOP_WORDS = set("""
ثھی
خو
گی
@ -509,5 +508,4 @@ STOP_WORDS = set(
ہورہی
ثبعث
ضت
""".split()
)
""".split())

View File

@ -1,6 +1,5 @@
# Source: https://github.com/stopwords/vietnamese-stopwords
STOP_WORDS = set(
"""
STOP_WORDS = set("""
a_lô
a_ha
ai
@ -1943,7 +1942,4 @@ yêu_cầu
ừ_ào
ừ_ừ
""".split(
"\n"
)
)
""".split("\n"))

View File

@ -91,13 +91,13 @@ class ChineseTokenizer(DummyTokenizer):
def __call__(self, text: str) -> Doc:
if self.segmenter == Segmenter.jieba:
words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) # type: ignore[union-attr]
(words, spaces) = util.get_words_and_spaces(words, text)
words, spaces = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
elif self.segmenter == Segmenter.pkuseg:
if self.pkuseg_seg is None:
raise ValueError(Errors.E1000)
words = self.pkuseg_seg.cut(text)
(words, spaces) = util.get_words_and_spaces(words, text)
words, spaces = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
# warn if segmenter setting is not the only remaining option "char"
@ -112,7 +112,7 @@ class ChineseTokenizer(DummyTokenizer):
# split into individual characters
words = list(text)
(words, spaces) = util.get_words_and_spaces(words, text)
words, spaces = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
@ -210,7 +210,7 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
if pkuseg_data["processors_data"]:
processors_data = pkuseg_data["processors_data"]
(user_dict, do_process, common_words, other_words) = processors_data
user_dict, do_process, common_words, other_words = processors_data
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
@ -268,7 +268,7 @@ class ChineseTokenizer(DummyTokenizer):
raise ImportError(self._pkuseg_install_msg) from None
if self.segmenter == Segmenter.pkuseg:
data = srsly.read_msgpack(path)
(user_dict, do_process, common_words, other_words) = data
user_dict, do_process, common_words, other_words = data
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)

View File

@ -1,7 +1,6 @@
# stop words as whitespace-separated list
# Chinese stop words; the list may not be exhaustive
STOP_WORDS = set(
"""
STOP_WORDS = set("""
!
"
#
@ -1895,5 +1894,4 @@ sup
±
""".split()
)
""".split())

View File

@ -85,7 +85,7 @@ class Table(OrderedDict):
value: The value to set.
"""
key = get_string_id(key)
OrderedDict.__setitem__(self, key, value) # type:ignore[assignment]
OrderedDict.__setitem__(self, key, value) # type: ignore[assignment]
self.bloom.add(key)
def set(self, key: Union[str, int], value: Any) -> None:
@ -104,7 +104,7 @@ class Table(OrderedDict):
RETURNS: The value.
"""
key = get_string_id(key)
return OrderedDict.__getitem__(self, key) # type:ignore[index]
return OrderedDict.__getitem__(self, key) # type: ignore[index]
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
"""Get the value for a given key. String keys will be hashed.
@ -114,7 +114,7 @@ class Table(OrderedDict):
RETURNS: The value.
"""
key = get_string_id(key)
return OrderedDict.get(self, key, default) # type:ignore[arg-type]
return OrderedDict.get(self, key, default) # type: ignore[arg-type]
def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override]
"""Check whether a key is in the table. String keys will be hashed.

View File

@ -26,7 +26,7 @@ def test_create_from_words_and_text(vocab):
# no whitespace in words
words = ["'", "dogs", "'", "run"]
text = " 'dogs'\n\nrun "
(words, spaces) = util.get_words_and_spaces(words, text)
words, spaces = util.get_words_and_spaces(words, text)
doc = Doc(vocab, words=words, spaces=spaces)
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
@ -38,7 +38,7 @@ def test_create_from_words_and_text(vocab):
# partial whitespace in words
words = [" ", "'", "dogs", "'", "\n\n", "run", " "]
text = " 'dogs'\n\nrun "
(words, spaces) = util.get_words_and_spaces(words, text)
words, spaces = util.get_words_and_spaces(words, text)
doc = Doc(vocab, words=words, spaces=spaces)
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
@ -50,7 +50,7 @@ def test_create_from_words_and_text(vocab):
# non-standard whitespace tokens
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
text = " 'dogs'\n\nrun "
(words, spaces) = util.get_words_and_spaces(words, text)
words, spaces = util.get_words_and_spaces(words, text)
doc = Doc(vocab, words=words, spaces=spaces)
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
@ -63,7 +63,7 @@ def test_create_from_words_and_text(vocab):
with pytest.raises(ValueError):
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
text = " 'dogs'\n\nrun "
(words, spaces) = util.get_words_and_spaces(words + ["away"], text)
words, spaces = util.get_words_and_spaces(words + ["away"], text)
def test_create_with_heads_and_no_deps(vocab):

View File

@ -437,7 +437,7 @@ def test_phrase_matcher_pickle(en_vocab):
assert matches == matches_unpickled
# clunky way to vaguely check that callback is unpickled
(vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
vocab, docs, callbacks, attr = matcher_unpickled.__reduce__()[1]
assert isinstance(callbacks.get("TEST2"), Mock)