Add all symbols in Unicode Currency Symbols block (#8212)

* Add all symbols in Unicode Currency Symbols block

In #8102 it came up that the rupee symbol was treated differently from
dollar / euro / yen symbols. This adds many symbols not already
included.

* Fix test

* Fix training test
This commit is contained in:
Paul O'Leary McCann 2021-05-31 17:03:40 +09:00 committed by GitHub
parent fc37715cfb
commit d1a221a374
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 6 additions and 3 deletions

View File

@ -260,7 +260,10 @@ _units = (
"кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
"كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
)
_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴"
# Space-separated list of currency symbols and prefixed forms (US$, C$, A$).
# After the entries carried over from the previous one-line definition, the
# list runs through the Unicode Currency Symbols block from U+20A0 (₠) to
# U+20BF (₿); as a result €, ₴ and ₽ each appear twice.
# The "$" is written as "\$" in raw strings, presumably because this value is
# later turned into a regular expression -- confirm against the consumer.
_currency = (
    # The trailing space is required: adjacent string literals are
    # concatenated, and without it "₰" and "₱" would fuse into one entry.
    r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ "
    r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿"
)
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351) unless there are cross-language

View File

@ -336,8 +336,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
def test_gold_biluo_4791(en_vocab, en_tokenizer):
doc = en_tokenizer("I'll return the 54 amount")
gold_words = ["I", "'ll", "return", "the", "", "54", "amount"]
doc = en_tokenizer("I'll return the A54 amount")
gold_words = ["I", "'ll", "return", "the", "A", "54", "amount"]
gold_spaces = [False, True, True, True, False, True, False]
entities = [(16, 19, "MONEY")]
example = Example.from_dict(