2017-06-06 18:43:51 +03:00
# coding: utf-8
from __future__ import unicode_literals
import pytest
# Examples taken from the "Big List of Naughty Strings"
# https://github.com/minimaxir/big-list-of-naughty-strings
NAUGHTY_STRINGS = [
# ASCII punctuation
2019-02-21 13:56:47 +03:00
r " ,./; ' [] \ -= " ,
r ' <>?: " {} |_+ ' ,
r ' !@#$ % ^&*()`~ " ' ,
2017-06-06 18:43:51 +03:00
# Unicode additional control characters, byte order marks
2019-02-21 13:56:47 +03:00
r " " ,
r " " ,
2017-06-06 18:43:51 +03:00
# Unicode Symbols
2019-02-21 13:56:47 +03:00
r " Ω≈ç√∫˜µ≤≥÷ " ,
r " åß∂ƒ©˙∆˚¬…æ " ,
2017-06-06 18:43:51 +03:00
" œ∑´®†¥¨ˆøπ“‘ " ,
2019-02-21 13:56:47 +03:00
r " ¡™£¢∞§¶•ªº–≠ " ,
r " ¸˛Ç◊ı˜Â¯˘¿ " ,
r " ÅÍÎÏ˝ÓÔÒÚÆ☃ " ,
r " Œ„´‰ˇÁ¨ˆØ∏”’ " ,
r " `⁄€‹›fifl‡°·‚—± " ,
r " ⅛⅜⅝⅞ " ,
r " ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя " ,
r " ٠١٢٣٤٥٦٧٨٩ " ,
2017-06-06 18:43:51 +03:00
# Unicode Subscript/Superscript/Accents
2019-02-21 13:56:47 +03:00
r " ⁰⁴⁵ " ,
r " ₀₁₂ " ,
r " ⁰⁴⁵₀₁₂ " ,
r " ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ " ,
2017-06-06 18:43:51 +03:00
# Two-Byte Characters
2019-02-21 13:56:47 +03:00
r " 田中さんにあげて下さい " ,
r " パーティーへ行かないか " ,
r " 和製漢語 " ,
r " 部落格 " ,
r " 사회과학원 어학연구소 " ,
r " 찦차를 타고 온 펲시맨과 쑛다리 똠방각하 " ,
r " 社會科學院語學研究所 " ,
r " 울란바토르 " ,
r " 𠜎𠜱𠝹𠱓𠱸𠲖𠳏 " ,
2017-06-06 18:43:51 +03:00
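# Japanese Emoticons
# NOTE(review): the two table-flip emoticons in this section sit on one line as
# adjacent string literals with no comma between them, so Python joins them
# into a single list entry. Add a comma if they are meant to be separate cases.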
2019-02-21 13:56:47 +03:00
r " ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ " ,
r " (。◕ ∀ ◕。) " ,
r " `ィ(´∀`∩ " ,
r " __ロ(,_,*) " ,
r " ・( ̄∀ ̄)・:*: " ,
r " ゚・✿ヾ╲(。◕‿◕。)╱✿・゚ " ,
r " ,。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’ " ,
r " (╯°□°)╯︵ ┻━┻) " " (ノಥ益ಥ)ノ ┻━┻ " ,
r " ┬─┬ノ( º _ ºノ) " ,
r " ( ͡° ͜ʖ ͡°) " ,
2017-06-06 18:43:51 +03:00
# Emoji
2019-02-21 13:56:47 +03:00
r " 😍 " ,
r " 👩🏽 " ,
r " 👾 🙇 💁 🙅 🙆 🙋 🙎 🙍 " ,
r " 🐵 🙈 🙉 🙊 " ,
r " ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙 " ,
r " ✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿 " ,
r " 🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧 " ,
r " 0️ ⃣ 1️ ⃣ 2️ ⃣ 3️ ⃣ 4️ ⃣ 5️ ⃣ 6️ ⃣ 7️ ⃣ 8️ ⃣ 9️ ⃣ 🔟 " ,
2017-06-06 18:43:51 +03:00
# Regional Indicator Symbols
2019-02-21 13:56:47 +03:00
r " 🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸 " ,
r " 🇺🇸🇷🇺🇸🇦🇫🇦🇲 " ,
r " 🇺🇸🇷🇺🇸🇦 " ,
2017-06-06 18:43:51 +03:00
# Unicode Numbers
2019-02-21 13:56:47 +03:00
r " 1 2 3 " ,
r " ١٢٣ " ,
2017-06-06 18:43:51 +03:00
# Right-To-Left Strings
2019-02-21 13:56:47 +03:00
r " ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو. " ,
r " إيو. " ,
r " בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ " ,
r " הָי ְתָהtestا لصفحا ت التّحول " ,
r " ﷽ " ,
r " ﷺ " ,
r " مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ، " ,
2017-06-06 18:43:51 +03:00
# Trick Unicode
2019-02-21 13:56:47 +03:00
r " test " ,
r " test" ,
r "
test
" ,
r " test test " ,
r " test " ,
2017-06-06 18:43:51 +03:00
# Zalgo Text
2019-02-21 13:56:47 +03:00
r " Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏ ͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣ " ,
r " ̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰ " ,
r " ̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏ ͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟ " ,
r " ̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏ ͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏ ͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕ " ,
r " Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮ " ,
2017-06-06 18:43:51 +03:00
# Unicode upside-down text
2019-02-21 13:56:47 +03:00
r " ˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯ ǝʇ poɯ snᴉǝ op pǝs ' ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ ' ʇǝɯɐ ʇᴉs ɹolop ɯ nsdᴉ ɯ ǝɹo˥ " ,
r " 00˙Ɩ$- " ,
2017-06-06 18:43:51 +03:00
# Unicode font
2019-02-21 13:56:47 +03:00
r " T h e q u i c k b r o w n f o x j u m p s o v e r t h e l a z y d o g " ,
r " 𝐓 𝐡 𝐞 𝐪 𝐮 𝐢 𝐜 𝐤 𝐛 𝐫 𝐨 𝐰 𝐧 𝐟 𝐨 𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨 𝐯 𝐞 𝐫 𝐭 𝐡 𝐞 𝐥 𝐚 𝐳 𝐲 𝐝 𝐨 𝐠 " ,
r " 𝕿 𝖍 𝖊 𝖖 𝖚 𝖎 𝖈 𝖐 𝖇 𝖗 𝖔 𝖜 𝖓 𝖋 𝖔 𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔 𝖛 𝖊 𝖗 𝖙 𝖍 𝖊 𝖑 𝖆 𝖟 𝖞 𝖉 𝖔 𝖌 " ,
r " 𝑻 𝒉 𝒆 𝒒 𝒖 𝒊 𝒄 𝒌 𝒃 𝒓 𝒐 𝒘 𝒏 𝒇 𝒐 𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐 𝒗 𝒆 𝒓 𝒕 𝒉 𝒆 𝒍 𝒂 𝒛 𝒚 𝒅 𝒐 𝒈 " ,
r " 𝓣 𝓱 𝓮 𝓺 𝓾 𝓲 𝓬 𝓴 𝓫 𝓻 𝓸 𝔀 𝓷 𝓯 𝓸 𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸 𝓿 𝓮 𝓻 𝓽 𝓱 𝓮 𝓵 𝓪 𝔃 𝔂 𝓭 𝓸 𝓰 " ,
r " 𝕋 𝕙 𝕖 𝕢 𝕦 𝕚 𝕔 𝕜 𝕓 𝕣 𝕠 𝕨 𝕟 𝕗 𝕠 𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠 𝕧 𝕖 𝕣 𝕥 𝕙 𝕖 𝕝 𝕒 𝕫 𝕪 𝕕 𝕠 𝕘 " ,
r " 𝚃 𝚑 𝚎 𝚚 𝚞 𝚒 𝚌 𝚔 𝚋 𝚛 𝚘 𝚠 𝚗 𝚏 𝚘 𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘 𝚟 𝚎 𝚛 𝚝 𝚑 𝚎 𝚕 𝚊 𝚣 𝚢 𝚍 𝚘 𝚐 " ,
r " ⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢ " ,
2017-06-06 18:43:51 +03:00
# File paths
2019-02-21 13:56:47 +03:00
r " ../../../../../../../../../../../etc/passwd % 00 " ,
r " ../../../../../../../../../../../etc/hosts " ,
2017-06-06 18:43:51 +03:00
# iOS Vulnerabilities
2019-02-21 13:56:47 +03:00
r " Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗 " ,
r " 🏳0🌈️ " ,
2017-06-06 18:43:51 +03:00
]
@pytest.mark.slow
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_tokenizer_naughty_strings(tokenizer, text):
    """Check that tokenizing a naughty string preserves the text exactly.

    The test runs once per entry of ``NAUGHTY_STRINGS`` and is marked
    ``slow``. It calls ``tokenizer`` on the entry and asserts that the
    result's ``text_with_ws`` equals the original input.

    Args:
        tokenizer: Callable applied to ``text``; presumably a pytest fixture
            defined outside this file (confirm in the project's conftest).
        text: One entry of ``NAUGHTY_STRINGS``.
    """
    tokens = tokenizer(text)
    # Any difference here means the call dropped or altered characters.
    assert tokens.text_with_ws == text