diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py new file mode 100644 index 000000000..57e39e151 --- /dev/null +++ b/spacy/tests/tokenizer/test_naughty_strings.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +# Examples taken from the "Big List of Naughty Strings" +# https://github.com/minimaxir/big-list-of-naughty-strings + + +NAUGHTY_STRINGS = [ + # ASCII punctuation + ",./;'[]\-=", + '<>?:"{}|_+', + '!@#$%^&*()`~"', + + # Unicode additional control characters, byte order marks + "­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪", + "￾", + + # Unicode Symbols + "Ω≈ç√∫˜µ≤≥÷", + "åß∂ƒ©˙∆˚¬…æ", + "œ∑´®†¥¨ˆøπ“‘", + "¡™£¢∞§¶•ªº–≠", + "¸˛Ç◊ı˜Â¯˘¿", + "ÅÍÎÏ˝ÓÔÒÚÆ☃", + "Œ„´‰ˇÁ¨ˆØ∏”’", + "`⁄€‹›fifl‡°·‚—±", + "⅛⅜⅝⅞", + "ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", + "٠١٢٣٤٥٦٧٨٩", + + # Unicode Subscript/Superscript/Accents + "⁰⁴⁵", + "₀₁₂", + "⁰⁴⁵₀₁₂", + "ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็", + + # Two-Byte Characters + "田中さんにあげて下さい", + "パーティーへ行かないか", + "和製漢語", + "部落格", + "사회과학원 어학연구소", + "찦차를 타고 온 펲시맨과 쑛다리 똠방각하", + "社會科學院語學研究所", + "울란바토르", + "𠜎𠜱𠝹𠱓𠱸𠲖𠳏", + + # Japanese Emoticons + "ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ", + "(。◕ ∀ ◕。)", + "`ィ(´∀`∩", + "__ロ(,_,*)", + "・( ̄∀ ̄)・:*:", + "゚・✿ヾ╲(。◕‿◕。)╱✿・゚", + ",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’", + "(╯°□°)╯︵ ┻━┻)" + "(ノಥ益ಥ)ノ ┻━┻", + "┬─┬ノ( º _ ºノ)", + "( ͡° ͜ʖ ͡°)", + + # Emoji + "😍", + "👩🏽", + "👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", + "🐵 🙈 🙉 🙊", + "❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙", + "✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿", + "🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧", + "0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟", + + # Regional Indicator Symbols + "🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸", + "🇺🇸🇷🇺🇸🇦🇫🇦🇲", + "🇺🇸🇷🇺🇸🇦", + + # Unicode Numbers + "123", + "١٢٣", + + # Right-To-Left Strings + + "ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.", + "إيو.", + "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ", + "הָיְתָהtestالصفحات التّحول", + "﷽", + "ﷺ", + "مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،", + + # Trick Unicode + "‪‪test‪", + "‫test", + "
test
", + "test⁠test", + "⁦test⁧", + + # Zalgo Text + "Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣", + + + "̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰", + + + "̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", + + + "̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕", + + + "Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮", + + + # Unicode Upsidedown + "˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥", + "00˙Ɩ$-", + + # Unicode font + "The quick brown fox jumps over the lazy dog", + "𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠", + "𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌", + "𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈", + "𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰", + "𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘", + "𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐", + "⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢", + + # File paths + "../../../../../../../../../../../etc/passwd%00", + "../../../../../../../../../../../etc/hosts", + + # iOS Vulnerabilities + "Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗", + "🏳0🌈️" +] + + +@pytest.mark.slow +@pytest.mark.parametrize('text', NAUGHTY_STRINGS) +def test_tokenizer_naughty_strings(tokenizer, text): + tokens = tokenizer(text) + assert tokens.text_with_ws == text