Add support for elision in French

This commit is contained in:
Raphaël Bournhonesque 2017-01-24 09:47:13 +01:00
parent 199ae10690
commit 902f136f18
2 changed files with 18 additions and 0 deletions

View File

@ -7,6 +7,7 @@ from ..language import Language
from ..attrs import LANG
from .language_data import *
from .punctuation import TOKENIZER_INFIXES
class French(Language):
@ -18,3 +19,4 @@ class French(Language):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
infixes = tuple(TOKENIZER_INFIXES)

16
spacy/fr/punctuation.py Normal file
View File

@ -0,0 +1,16 @@
# encoding: utf8
from __future__ import unicode_literals
from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES
# Characters that mark an elision. Kept as a whitespace-separated list so that
# further marks can be added on their own; the whitespace is removed below.
_ELISION = " ' "

# Compact form (no spaces or newlines) for embedding in a regex character class.
ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')

# Build a NEW list rather than using `+=`: augmented assignment on a list
# extends it in place, which would modify the list object imported from
# ..language_data.punctuation and thereby change it for every other importer.
TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
    # Zero-width match between an elision mark (itself preceded by an ALPHA
    # character) and a following ALPHA character.
    # NOTE(review): ALPHA is presumably a character-class body of alphabetic
    # characters -- confirm against language_data.punctuation.
    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
]

# Export only names this module actually binds; listing an undefined name
# makes `from .punctuation import *` fail with AttributeError.
__all__ = ["TOKENIZER_INFIXES"]