spaCy/spacy/lang/ko/punctuation.py
Adriane Boyd 30030176ee
Update Korean defaults for Tokenizer (#10322)
Update Korean defaults for `Tokenizer` for tokenization following UD
Korean Kaist.
2022-02-21 10:26:19 +01:00

13 lines
266 B
Python

from ..char_classes import LIST_QUOTES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
# Infix patterns for Korean. Every entry is a regular-expression fragment:
# the Korean-specific separators come first, then the quote patterns and the
# base infixes imported at the top of this module.
_infixes = (
    [
        "·",
        # U+318D HANGUL LETTER ARAEA, used in Korean text as a middle-dot
        # separator. NOTE(review): this entry was an empty string, which is
        # not a usable pattern; restored as the dot variant that pairs with
        # the entry above -- confirm against the upstream file.
        "ㆍ",
        # Raw strings: "\(" in a normal literal is an invalid escape sequence
        # (SyntaxWarning on recent Python). The runtime value is unchanged.
        r"\(",
        r"\)",
    ]
    # A tilde that directly follows a digit and directly precedes a digit or
    # a hyphen (e.g. the "~" in "10~20").
    + [r"(?<=[0-9])~(?=[0-9-])"]
    + LIST_QUOTES
    + BASE_TOKENIZER_INFIXES
)

TOKENIZER_INFIXES = _infixes