Update Serbian tokenization for UD Serbian SET (#12442)

This commit is contained in:
Adriane Boyd 2023-03-24 16:26:40 +01:00 committed by GitHub
parent 28de85737f
commit d0bd3f5ee4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 39 additions and 0 deletions

View File

@ -1,11 +1,14 @@
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
class SerbianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@ -0,0 +1,36 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
from ..char_classes import CURRENCY, UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
_suffixes = (
LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{a}{e}{p}(?:{q})])\.".format(
a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
),
]
)
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes