mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
a7ee4b6f17
- added some tests for tokenization issues - fixed some issues with tokenization of words with hyphen infix - rewrote the "tokenizer_exceptions.py" file (stemming from the German version)
17 lines
459 B
Python
17 lines
459 B
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
# TODO
|
|
# norm exceptions: find a way to deal with the zillions of spelling
|
|
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
|
|
# here one could include the most common spelling mistakes
|
|
|
|
# Hand-maintained base table: each key is a spelling as it may appear in
# text, each value is the norm that should be recorded for it.
_exc = {"dass": "datt", "viläicht": "vläicht"}


# Public lookup table, populated from ``_exc`` by the loop below.
NORM_EXCEPTIONS = {}


# Register every entry twice: once exactly as written in ``_exc`` and once
# under its ``str.title()`` form, both pointing at the same norm.
# NOTE(review): the title-cased key is presumably there to catch capitalised
# occurrences of the same word -- confirm against the code that consumes
# NORM_EXCEPTIONS.
for string, norm in _exc.items():
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm
|