mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Update Serbian tokenization for UD Serbian SET (#12442)
This commit is contained in:
parent
28de85737f
commit
d0bd3f5ee4
|
@ -1,11 +1,14 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class SerbianDefaults(BaseDefaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
36
spacy/lang/sr/punctuation.py
Normal file
36
spacy/lang/sr/punctuation.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
||||
from ..char_classes import CURRENCY, UNITS, PUNCT
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||
),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
]
|
||||
)
|
||||
|
||||
_suffixes = (
|
||||
LIST_PUNCT
|
||||
+ LIST_ELLIPSES
|
||||
+ LIST_QUOTES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[0-9])\+",
|
||||
r"(?<=°[FfCcKk])\.",
|
||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[{a}{e}{p}(?:{q})])\.".format(
|
||||
a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
Loading…
Reference in New Issue
Block a user