2017-06-06 18:43:51 +03:00
# coding: utf-8
from __future__ import unicode_literals
import pytest
# Examples taken from the "Big List of Naughty Strings"
# https://github.com/minimaxir/big-list-of-naughty-strings
NAUGHTY_STRINGS = [
# ASCII punctuation
" ,./; ' [] \ -= " ,
' <>?: " {} |_+ ' ,
' !@#$ % ^&*()`~ " ' ,
# Unicode additional control characters, byte order marks
" " ,
" " ,
# Unicode Symbols
" Ω≈ç√∫˜µ≤≥÷ " ,
" åß∂ƒ©˙∆˚¬…æ " ,
" œ∑´®†¥¨ˆøπ“‘ " ,
" ¡™£¢∞§¶•ªº–≠ " ,
" ¸˛Ç◊ı˜Â¯˘¿ " ,
" ÅÍÎÏ˝ÓÔÒÚÆ☃ " ,
" Œ„´‰ˇÁ¨ˆØ∏”’ " ,
" `⁄€‹›fifl‡°·‚—± " ,
" ⅛⅜⅝⅞ " ,
" ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя " ,
" ٠١٢٣٤٥٦٧٨٩ " ,
# Unicode Subscript/Superscript/Accents
" ⁰⁴⁵ " ,
" ₀₁₂ " ,
" ⁰⁴⁵₀₁₂ " ,
" ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ " ,
# Two-Byte Characters
" 田中さんにあげて下さい " ,
" パーティーへ行かないか " ,
" 和製漢語 " ,
" 部落格 " ,
" 사회과학원 어학연구소 " ,
" 찦차를 타고 온 펲시맨과 쑛다리 똠방각하 " ,
" 社會科學院語學研究所 " ,
" 울란바토르 " ,
" 𠜎𠜱𠝹𠱓𠱸𠲖𠳏 " ,
# Japanese Emoticons
" ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ " ,
" (。◕ ∀ ◕。) " ,
" `ィ(´∀`∩ " ,
" __ロ(,_,*) " ,
" ・( ̄∀ ̄)・:*: " ,
" ゚・✿ヾ╲(。◕‿◕。)╱✿・゚ " ,
" ,。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’ " ,
2018-11-27 03:09:36 +03:00
" (╯°□°)╯︵ ┻━┻) " " (ノಥ益ಥ)ノ ┻━┻ " ,
2017-06-06 18:43:51 +03:00
" ┬─┬ノ( º _ ºノ) " ,
" ( ͡° ͜ʖ ͡°) " ,
# Emoji
" 😍 " ,
" 👩🏽 " ,
" 👾 🙇 💁 🙅 🙆 🙋 🙎 🙍 " ,
" 🐵 🙈 🙉 🙊 " ,
" ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙 " ,
" ✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿 " ,
" 🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧 " ,
" 0️ ⃣ 1️ ⃣ 2️ ⃣ 3️ ⃣ 4️ ⃣ 5️ ⃣ 6️ ⃣ 7️ ⃣ 8️ ⃣ 9️ ⃣ 🔟 " ,
# Regional Indicator Symbols
" 🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸 " ,
" 🇺🇸🇷🇺🇸🇦🇫🇦🇲 " ,
" 🇺🇸🇷🇺🇸🇦 " ,
# Unicode Numbers
" 1 2 3 " ,
" ١٢٣ " ,
# Right-To-Left Strings
" ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو. " ,
" إيو. " ,
" בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ " ,
" הָי ְתָהtestا لصفحا ت التّحول " ,
" ﷽ " ,
" ﷺ " ,
" مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ، " ,
# Trick Unicode
" test " ,
" test" ,
"
test
" ,
" test test " ,
" test " ,
# Zalgo Text
" Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏ ͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣ " ,
" ̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰ " ,
" ̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏ ͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟ " ,
" ̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏ ͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏ ͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕ " ,
" Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮ " ,
# Unicode Upsidedown
" ˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯ ǝʇ poɯ snᴉǝ op pǝs ' ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ ' ʇǝɯɐ ʇᴉs ɹolop ɯ nsdᴉ ɯ ǝɹo˥ " ,
" 00˙Ɩ$- " ,
# Unicode font
" T h e q u i c k b r o w n f o x j u m p s o v e r t h e l a z y d o g " ,
" 𝐓 𝐡 𝐞 𝐪 𝐮 𝐢 𝐜 𝐤 𝐛 𝐫 𝐨 𝐰 𝐧 𝐟 𝐨 𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨 𝐯 𝐞 𝐫 𝐭 𝐡 𝐞 𝐥 𝐚 𝐳 𝐲 𝐝 𝐨 𝐠 " ,
" 𝕿 𝖍 𝖊 𝖖 𝖚 𝖎 𝖈 𝖐 𝖇 𝖗 𝖔 𝖜 𝖓 𝖋 𝖔 𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔 𝖛 𝖊 𝖗 𝖙 𝖍 𝖊 𝖑 𝖆 𝖟 𝖞 𝖉 𝖔 𝖌 " ,
" 𝑻 𝒉 𝒆 𝒒 𝒖 𝒊 𝒄 𝒌 𝒃 𝒓 𝒐 𝒘 𝒏 𝒇 𝒐 𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐 𝒗 𝒆 𝒓 𝒕 𝒉 𝒆 𝒍 𝒂 𝒛 𝒚 𝒅 𝒐 𝒈 " ,
" 𝓣 𝓱 𝓮 𝓺 𝓾 𝓲 𝓬 𝓴 𝓫 𝓻 𝓸 𝔀 𝓷 𝓯 𝓸 𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸 𝓿 𝓮 𝓻 𝓽 𝓱 𝓮 𝓵 𝓪 𝔃 𝔂 𝓭 𝓸 𝓰 " ,
" 𝕋 𝕙 𝕖 𝕢 𝕦 𝕚 𝕔 𝕜 𝕓 𝕣 𝕠 𝕨 𝕟 𝕗 𝕠 𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠 𝕧 𝕖 𝕣 𝕥 𝕙 𝕖 𝕝 𝕒 𝕫 𝕪 𝕕 𝕠 𝕘 " ,
" 𝚃 𝚑 𝚎 𝚚 𝚞 𝚒 𝚌 𝚔 𝚋 𝚛 𝚘 𝚠 𝚗 𝚏 𝚘 𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘 𝚟 𝚎 𝚛 𝚝 𝚑 𝚎 𝚕 𝚊 𝚣 𝚢 𝚍 𝚘 𝚐 " ,
" ⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢ " ,
# File paths
" ../../../../../../../../../../../etc/passwd % 00 " ,
" ../../../../../../../../../../../etc/hosts " ,
# iOS Vulnerabilities
" Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗 " ,
2018-11-27 03:09:36 +03:00
" 🏳0🌈️ " ,
2017-06-06 18:43:51 +03:00
]
@pytest.mark.slow
2018-11-27 03:09:36 +03:00
@pytest.mark.parametrize ( " text " , NAUGHTY_STRINGS )
2017-06-06 18:43:51 +03:00
def test_tokenizer_naughty_strings ( tokenizer , text ) :
tokens = tokenizer ( text )
assert tokens . text_with_ws == text