Mirror of https://github.com/explosion/spaCy.git
Commit 30030176ee: Update Korean defaults for `Tokenizer` for tokenization following UD Korean Kaist.
from ..char_classes import LIST_QUOTES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES

# Infix patterns for the Korean tokenizer, following UD Korean Kaist
# tokenization: the middle dots "·" (U+00B7) and "ㆍ" (U+318D) and the
# parentheses always split a token, while a tilde splits only after a
# digit and before a digit or minus sign (ranges such as 10~20).
# Quote characters and the shared base infixes are appended. The raw
# strings avoid invalid escape-sequence warnings for the regexes.
_infixes = (
    ["·", "ㆍ", r"\(", r"\)"]
    + [r"(?<=[0-9])~(?=[0-9-])"]
    + LIST_QUOTES
    + BASE_TOKENIZER_INFIXES
)

TOKENIZER_INFIXES = _infixes
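
For context, a minimal usage sketch of how these defaults take effect. It assumes spaCy v3.4+, where the rule-based `Tokenizer` with these infixes is the Korean default, so `spacy.blank("ko")` picks them up without the optional mecab-ko backend; the printed result is an expectation for illustration, not a verified run.

import spacy
from spacy.util import compile_infix_regex

# Blank Korean pipeline; its Tokenizer uses TOKENIZER_INFIXES from above.
nlp = spacy.blank("ko")

# The language defaults are compiled into a single infix regex that the
# tokenizer runs over each whitespace-delimited chunk.
infix_re = compile_infix_regex(nlp.Defaults.infixes)

print([t.text for t in nlp("영국·미국 10~20")])
# expected: ['영국', '·', '미국', '10', '~', '20']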