Add all symbols in Unicode Currency Symbols block (#8212)

* Add all symbols in Unicode Currency Symbols block

  In #8102 it came up that the rupee symbol was treated differently from dollar / euro / yen symbols. This adds many symbols not already included.

* Fix test
* Fix training test
2025-08-07 13:44:55 +03:00 · 2021-05-31 17:03:40 +09:00 · 2021-05-31 17:03:40 +09:00 · d1a221a374
commit d1a221a374
parent fc37715cfb
2 changed files with 6 additions and 3 deletions
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -260,7 +260,10 @@ _units = (
    "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
    "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
 )
-_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴"
+_currency = (
+    r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ "
+    r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿"
+)

 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -336,8 +336,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):


 def test_gold_biluo_4791(en_vocab, en_tokenizer):
-    doc = en_tokenizer("I'll return the ₹54 amount")
-    gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
+    doc = en_tokenizer("I'll return the A54 amount")
+    gold_words = ["I", "'ll", "return", "the", "A", "54", "amount"]
    gold_spaces = [False, True, True, True, False, True, False]
    entities = [(16, 19, "MONEY")]
    example = Example.from_dict(