spaCy/spacy/fr/language_data.py

# encoding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import strings_to_exc, update_exc

from .punctuation import ELISION

from ..symbols import *
from .stop_words import STOP_WORDS


STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))

ABBREVIATIONS = {
    "janv.": [
        {LEMMA: "janvier", ORTH: "janv."}
    ],
    "févr.": [
        {LEMMA: "février", ORTH: "févr."}
    ],
    "avr.": [
        {LEMMA: "avril", ORTH: "avr."}
    ],
    "juill.": [
        {LEMMA: "juillet", ORTH: "juill."}
    ],
    "sept.": [
        {LEMMA: "septembre", ORTH: "sept."}
    ],
    "oct.": [
        {LEMMA: "octobre", ORTH: "oct."}
    ],
    "nov.": [
        {LEMMA: "novembre", ORTH: "nov."}
    ],
    "déc.": [
        {LEMMA: "décembre", ORTH: "déc."}
    ],
}


INFIXES_EXCEPTIONS_BASE = ["aujourd'hui",
                           "prud'homme", "prud'hommes",
                           "prud'homal", "prud'homaux", "prud'homale",
                           "prud'homales",
                           "prud'hommal", "prud'hommaux", "prud'hommale",
                           "prud'hommales",
                           "prud'homie", "prud'homies",
                           "prud'hommesque", "prud'hommesques",
                           "prud'hommesquement"]

INFIXES_EXCEPTIONS = []
for elision_char in ELISION:
    INFIXES_EXCEPTIONS += [infix.replace("'", elision_char)
                           for infix in INFIXES_EXCEPTIONS_BASE]

INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]

update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS))
update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS)


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]