From d0bd3f5ee43e976990fc562ce412537ef4f3b23f Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 24 Mar 2023 16:26:40 +0100
Subject: [PATCH] Update Serbian tokenization for UD Serbian SET (#12442)

---
 spacy/lang/sr/__init__.py    |  3 +++
 spacy/lang/sr/punctuation.py | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 spacy/lang/sr/punctuation.py

diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index fd0c8c832..b99ce96ec 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -1,11 +1,14 @@
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
 
 
 class SerbianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
diff --git a/spacy/lang/sr/punctuation.py b/spacy/lang/sr/punctuation.py
new file mode 100644
index 000000000..793a20ec2
--- /dev/null
+++ b/spacy/lang/sr/punctuation.py
@@ -0,0 +1,36 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
+from ..char_classes import CURRENCY, UNITS, PUNCT
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[{a}{e}{p}(?:{q})])\.".format(
+            a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+    ]
+)
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes