mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Add support for elision in French
This commit is contained in:
parent
199ae10690
commit
902f136f18
|
@ -7,6 +7,7 @@ from ..language import Language
|
|||
from ..attrs import LANG
|
||||
|
||||
from .language_data import *
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class French(Language):
|
||||
|
@ -18,3 +19,4 @@ class French(Language):
|
|||
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
|
|
16
spacy/fr/punctuation.py
Normal file
16
spacy/fr/punctuation.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
# encoding: utf8
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES
|
||||
|
||||
|
||||
# Elision characters (straight and typographic apostrophe), written
# space-separated for readability.
_ELISION = " ' ’ "
# Collapse to a bare character sequence usable inside a regex character
# class: trim the surrounding whitespace, then drop every remaining space
# and newline.
ELISION = ''.join(char for char in _ELISION.strip() if char not in (' ', '\n'))
|
||||
|
||||
# French infix rules: the shared infixes plus a zero-width split point
# located right after "<ALPHA char><elision char>" and right before another
# ALPHA char (e.g. between "l'" and "avion").
#
# A new list is built on purpose. `TOKENIZER_INFIXES += [...]` would call
# list.__iadd__ and extend the list object imported from
# ..language_data.punctuation in place, so the French-only elision rule
# would leak into every other language importing that same list.
TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
]
|
||||
|
||||
|
||||
# Export only names that exist in this module. Listing a name that is not
# defined here (previously "TOKENIZER_SUFFIXES") makes
# `from .punctuation import *` fail with AttributeError.
__all__ = ["TOKENIZER_INFIXES"]
|
Loading…
Reference in New Issue
Block a user