spaCy/spacy/lang/sv/punctuation.py
lise-brinck e2de188cf1
Bugfix/swedish tokenizer (#12315)
* add unittest for explosion#12311

* create punctuation.py for swedish

* removed : from infixes in swedish punctuation.py

* allow : as infix if succeeding char is uppercase
2023-02-27 10:53:45 +01:00

34 lines
1023 B
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
]
)
_suffixes = [
suffix
for suffix in TOKENIZER_SUFFIXES
if suffix not in ["'s", "'S", "s", "S", r"\'"]
]
_suffixes += [r"(?<=[^sSxXzZ])\'"]
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes