Apply emoticon exceptions to tokenizer

This commit is contained in:
Ines Montani 2016-12-07 21:11:59 +01:00
parent 71f0f34cb3
commit 0d07d7fc80
2 changed files with 6 additions and 3 deletions

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import re
from ..symbols import *
from ..language_data import EMOTICONS
PRON_LEMMA = "-PRON-"
@ -2017,9 +2018,10 @@ self_map = [
"z."
]
overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(self_map))
assert not overlap, overlap
TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in self_map})
# Register every string in `self_map` and in `EMOTICONS` as a tokenizer
# exception that maps to a single token whose ORTH is the string itself.
# NOTE(review): TOKENIZER_EXCEPTIONS is presumably a dict keyed by the
# surface string (it is used via .update() with a dict) -- confirm.
for orths in [self_map, EMOTICONS]:
    # `update` below would silently replace an existing entry, so check
    # first that none of the new strings is already registered. The
    # assertion message is the set of clashing strings.
    overlap = set(TOKENIZER_EXCEPTIONS).intersection(orths)
    assert not overlap, overlap
    TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths})
TOKENIZER_PREFIXES = r'''

View File

@ -0,0 +1 @@
from .emoticons import *