mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Add support for elision in French
This commit is contained in:
parent
199ae10690
commit
902f136f18
|
@ -7,6 +7,7 @@ from ..language import Language
|
||||||
from ..attrs import LANG
|
from ..attrs import LANG
|
||||||
|
|
||||||
from .language_data import *
|
from .language_data import *
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
class French(Language):
|
class French(Language):
|
||||||
|
@ -18,3 +19,4 @@ class French(Language):
|
||||||
|
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
infixes = tuple(TOKENIZER_INFIXES)
|
||||||
|
|
16
spacy/fr/punctuation.py
Normal file
16
spacy/fr/punctuation.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
# encoding: utf8
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
|
# Apostrophe characters treated as elision marks: the ASCII apostrophe and the
# typographic right single quotation mark. Written space-separated so the set
# is easy to read and extend.
_ELISION = " ' ’ "
# Collapse the readable list above into a bare run of characters that can be
# embedded inside a regex character class.
ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')


# Build a NEW list instead of using ``+=``. Augmented assignment on the
# imported name would extend the list object owned by
# ``language_data.punctuation`` in place, so every other module importing that
# same list would silently pick up the French-only elision rule.
TOKENIZER_INFIXES = TOKENIZER_INFIXES + [
    # Zero-width split point directly after "<alpha><elision mark>" when
    # another alpha character follows.
    # NOTE(review): assumes ALPHA is a string that is valid inside a regex
    # character class -- confirm against language_data.punctuation.
    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
]


# Only export names that actually exist in this module. The previous list
# included "TOKENIZER_SUFFIXES", which is neither imported nor defined here and
# made ``from .punctuation import *`` raise AttributeError.
__all__ = ["TOKENIZER_INFIXES"]
|
Loading…
Reference in New Issue
Block a user