From f0a577f7a5c67f55807fdbda9e9a936464723931 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 6 Nov 2019 12:16:56 +0100
Subject: [PATCH] Update Hungarian punctuation to remove empty string

Update Hungarian punctuation definitions so that `_units` does not match
an empty string.
---
 spacy/lang/hu/punctuation.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index bc043486f..dc7ca3bb5 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -2,7 +2,8 @@
 from __future__ import unicode_literals
 
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
-from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import CONCAT_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import LIST_UNITS, merge_chars
 
 
 # removing ° from the special icons to keep e.g. 99° as one token
@@ -10,7 +11,8 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
-_units = UNITS.replace("%", "")
+_list_units = [s for s in LIST_UNITS if s != "%"]
+_units = merge_chars(" ".join(_list_units))
 
 _prefixes = (
     LIST_PUNCT