Reorganise French language data

ines 2017-05-08 15:49:05 +02:00
parent 0207ffdd52
commit 7f05e977fa
5 changed files with 147 additions and 225 deletions

View File

@@ -1,37 +1,34 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals

-from ..language import Language, BaseDefaults
-from ..attrs import LANG
-
-from .language_data import *
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
+from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
+
+from ..language_data import BASE_EXCEPTIONS
+from ..language import Language
 from ..lemmatizerlookup import Lemmatizer
-from .lemmatization import LOOK_UP
+from ..attrs import LANG
+from ..util import update_exc

-
-class FrenchDefaults(BaseDefaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: 'fr'
-
-    stop_words = STOP_WORDS
-    infixes = tuple(TOKENIZER_INFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    token_match = TOKEN_MATCH
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None):
-        cls.tokenizer_exceptions = get_tokenizer_exceptions()
-        return super(FrenchDefaults, cls).create_tokenizer(nlp)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOK_UP)
-

 class French(Language):
     lang = 'fr'
-    Defaults = FrenchDefaults
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'fr'
+
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        stop_words = set(STOP_WORDS)
+
+        infixes = tuple(TOKENIZER_INFIXES)
+        suffixes = tuple(TOKENIZER_SUFFIXES)
+        token_match = TOKEN_MATCH
+
+        @classmethod
+        def create_lemmatizer(cls, nlp=None):
+            return Lemmatizer(LOOKUP)


-EXPORT = French
+__all__ = ['French']
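Note: the new `Defaults` builds its `tokenizer_exceptions` by merging the shared `BASE_EXCEPTIONS` with the French-specific `TOKENIZER_EXCEPTIONS` via `update_exc`. A minimal standalone sketch of that merge pattern (not spaCy's actual `util.update_exc`; the sample entries are invented for illustration):

# Merge exception dicts left to right; later sources overwrite earlier ones, key by key.
def update_exc(*exc_dicts):
    merged = {}
    for exc in exc_dicts:
        merged.update(exc)
    return merged

BASE_EXCEPTIONS = {":)": [{"ORTH": ":)"}]}                                 # shared across languages
TOKENIZER_EXCEPTIONS = {"janv.": [{"ORTH": "janv.", "LEMMA": "janvier"}]}  # French-specific

tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
print(sorted(tokenizer_exceptions))  # [':)', 'janv.']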

View File

@@ -3,7 +3,7 @@
 from __future__ import unicode_literals


-BASE_EXCEPTIONS = [
+FR_BASE_EXCEPTIONS = [
     "0-day",
     "0-days",
     "1000Base-T",

View File

@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-LOOK_UP = {
+
+LOOKUP = {
     "Ap.": "après",
     "Apr.": "après",
     "Auxerroises": "Auxerrois",

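Note: `LOOKUP` is a plain orth-to-lemma table, and the `create_lemmatizer` classmethod in the first file of this commit returns `Lemmatizer(LOOKUP)`. A minimal standalone sketch of how such a lookup table is typically consumed (the fall-back to the surface form is an assumption here, not verified behaviour of spaCy's `Lemmatizer`):

LOOKUP = {
    "Ap.": "après",
    "Apr.": "après",
    "Auxerroises": "Auxerrois",
}

def lemmatize(word, lookup=LOOKUP):
    # Return the table entry if present, otherwise the word unchanged.
    return lookup.get(word, word)

print(lemmatize("Auxerroises"))  # Auxerrois
print(lemmatize("chat"))         # chat (no entry, returned unchanged)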
View File

@@ -1,14 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES, LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY,\
-    UNITS, ALPHA_LOWER, QUOTES, ALPHA_UPPER
-
-
-_ELISION = " ' "
-ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')
-
+from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES, LIST_PUNCT
+from ..language_data.punctuation import LIST_ELLIPSES, LIST_QUOTES, CURRENCY
+from ..language_data.punctuation import UNITS, ALPHA_LOWER, QUOTES, ALPHA_UPPER
+
+
+ELISION = " ' ".strip().replace(' ', '').replace('\n', '')
 HYPHENS = r"""- """.strip().replace(' ', '').replace('\n', '')

@@ -24,14 +22,8 @@ TOKENIZER_SUFFIXES = (
     r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
     r'(?<=[0-9])(?:{u})'.format(u=UNITS),
     r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
-    r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
-    ]
-)
+    r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)])


 TOKENIZER_INFIXES += [
-    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
-]
-
-
-__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
+    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION)]
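Note: the infix rule added here splits between a letter-plus-elision character and a following letter, so a form like "l'avion" separates into "l'" and "avion". A standalone sketch using plain `re` and simplified character classes (the real rule is built from spaCy's `ALPHA` and `ELISION` constants):

import re

ELISION = "'’"                       # simplified stand-in for ELISION
ALPHA = "a-zA-Zàâéèêëîïôöùûüç"       # simplified stand-in for ALPHA

# Zero-width split point: after letter+apostrophe, before a letter.
infix_re = re.compile(r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION))

print(infix_re.split("l'avion"))     # ["l'", 'avion'] (Python 3.7+)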

View File

@@ -1,219 +1,151 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .. import language_data as base
-from ..language_data import strings_to_exc, update_exc
-from ..language_data.tokenizer_exceptions import _URL_PATTERN
-from ..language_data.punctuation import ALPHA_LOWER
-
-from .punctuation import ELISION, HYPHENS
-
-from ..symbols import *
-
-import os
-import io
 import regex as re
-
-
-def get_exceptions():
-    from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
-    return BASE_EXCEPTIONS
+
+from .punctuation import ELISION, HYPHENS
+from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
+from ..symbols import ORTH, LEMMA, TAG, NORM
+from ..deprecated import PRON_LEMMA
+from ..language_data.tokenizer_exceptions import _URL_PATTERN
+from ..language_data.punctuation import ALPHA_LOWER


 def upper_first_letter(text):
     if len(text) == 0:
         return text
     if len(text) == 1:
         return text.upper()
     return text[0].upper() + text[1:]


 def lower_first_letter(text):
     if len(text) == 0:
         return text
     if len(text) == 1:
         return text.lower()
     return text[0].lower() + text[1:]

-def get_tokenizer_exceptions():
-    tokenizer_exceptions = strings_to_exc(base.EMOTICONS)
-    update_exc(tokenizer_exceptions, strings_to_exc(base.ABBREVIATIONS))
-
-    ABBREVIATIONS_1 = {
-        "av.": [
-            {LEMMA: "avant", ORTH: "av."}
-        ],
-        "janv.": [
-            {LEMMA: "janvier", ORTH: "janv."}
-        ],
-        "févr.": [
-            {LEMMA: "février", ORTH: "févr."}
-        ],
-        "avr.": [
-            {LEMMA: "avril", ORTH: "avr."}
-        ],
-        "juill.": [
-            {LEMMA: "juillet", ORTH: "juill."}
-        ],
-        "sept.": [
-            {LEMMA: "septembre", ORTH: "sept."}
-        ],
-        "oct.": [
-            {LEMMA: "octobre", ORTH: "oct."}
-        ],
-        "nov.": [
-            {LEMMA: "novembre", ORTH: "nov."}
-        ],
-        "déc.": [
-            {LEMMA: "décembre", ORTH: "déc."}
-        ],
-        "apr.": [
-            {LEMMA: "après", ORTH: "apr."}
-        ],
-        "J.-C.": [
-            {LEMMA: "Jésus", ORTH: "J."},
-            {LEMMA: "Christ", ORTH: "-C."}
-        ],
-        "Dr.": [
-            {LEMMA: "docteur", ORTH: "Dr."}
-        ],
-        "M.": [
-            {LEMMA: "monsieur", ORTH: "M."}
-        ],
-        "Mr.": [
-            {LEMMA: "monsieur", ORTH: "Mr."}
-        ],
-        "Mme.": [
-            {LEMMA: "madame", ORTH: "Mme."}
-        ],
-        "Mlle.": [
-            {LEMMA: "mademoiselle", ORTH: "Mlle."}
-        ],
-        "n°": [
-            {LEMMA: "numéro", ORTH: "n°"}
-        ],
-        "d°": [
-            {LEMMA: "degrés", ORTH: "d°"}
-        ],
-        "St.": [
-            {LEMMA: "saint", ORTH: "St."}
-        ],
-        "Ste.": [
-            {LEMMA: "sainte", ORTH: "Ste."}
-        ]
-    }
-
-    ABBREVIATIONS_2 = [
-        "etc.",
-    ]
+_exc = {
+    "J.-C.": [
+        {LEMMA: "Jésus", ORTH: "J."},
+        {LEMMA: "Christ", ORTH: "-C."}]
+}
+
+
+for exc_data in [
+    {LEMMA: "avant", ORTH: "av."},
+    {LEMMA: "janvier", ORTH: "janv."},
+    {LEMMA: "février", ORTH: "févr."},
+    {LEMMA: "avril", ORTH: "avr."},
+    {LEMMA: "juillet", ORTH: "juill."},
+    {LEMMA: "septembre", ORTH: "sept."},
+    {LEMMA: "octobre", ORTH: "oct."},
+    {LEMMA: "novembre", ORTH: "nov."},
+    {LEMMA: "décembre", ORTH: "déc."},
+    {LEMMA: "après", ORTH: "apr."},
+    {LEMMA: "docteur", ORTH: "Dr."},
+    {LEMMA: "monsieur", ORTH: "M."},
+    {LEMMA: "monsieur", ORTH: "Mr."},
+    {LEMMA: "madame", ORTH: "Mme."},
+    {LEMMA: "mademoiselle", ORTH: "Mlle."},
+    {LEMMA: "numéro", ORTH: "n°"},
+    {LEMMA: "degrés", ORTH: "d°"},
+    {LEMMA: "saint", ORTH: "St."},
+    {LEMMA: "sainte", ORTH: "Ste."}]:
+    _exc[exc_data[ORTH]] = [dict(exc_data)]
+
+
+for orth in FR_BASE_EXCEPTIONS + ["etc."]:
+    _exc[orth] = [{ORTH: orth}]
-    VERBS = {}
-    for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
-                            ("semble", "sembler"), ("indique", "indiquer"),
-                            ("moque", "moquer"), ("passe", "passer")):
-        for orth in [verb, verb.title()]:
-            for pronoun in ("elle", "il", "on"):
-                token = "{}-t-{}".format(orth, pronoun)
-                VERBS[token] = [
-                    {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
-                    {LEMMA: "t", ORTH: "-t"},
-                    {LEMMA: pronoun, ORTH: "-" + pronoun}
-                ]
+for verb, verb_lemma in [
+    ("a", "avoir"),
+    ("est", "être"),
+    ("semble", "sembler"),
+    ("indique", "indiquer"),
+    ("moque", "moquer"),
+    ("passe", "passer")]:
+    for orth in [verb, verb.title()]:
+        for pronoun in ["elle", "il", "on"]:
+            token = "{}-t-{}".format(orth, pronoun)
+            _exc[token] = [
+                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+                {LEMMA: "t", ORTH: "-t"},
+                {LEMMA: pronoun, ORTH: "-" + pronoun}]

-    for verb, verb_lemma in [("est","être")]:
-        for orth in [verb, verb.title()]:
-            token = "{}-ce".format(orth)
-            VERBS[token] = [
-                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
-                {LEMMA: 'ce', ORTH: '-ce'}
-            ]
+
+for verb, verb_lemma in [
+    ("est","être")]:
+    for orth in [verb, verb.title()]:
+        token = "{}-ce".format(orth)
+        _exc[token] = [
+            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+            {LEMMA: 'ce', ORTH: '-ce'}]

-    for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")):
-        for orth in [pre,pre.title()]:
-            VERBS['{}est-ce'.format(orth)] = [
-                {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
-                {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
-                {LEMMA: 'ce', ORTH: '-ce'}
-            ]
+
+for pre, pre_lemma in [
+    ("qu'", "que"),
+    ("n'", "ne")]:
+    for orth in [pre,pre.title()]:
+        _exc['%sest-ce' % orth] = [
+            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
+            {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
+            {LEMMA: 'ce', ORTH: '-ce'}]

-    HYPHEN = ['-', '']
-
-    base_exceptions = get_exceptions()
-    infixes_exceptions = []
-
-    for elision_char in ELISION:
-        for hyphen_char in HYPHEN:
-            infixes_exceptions += [infix.replace("'", elision_char).replace('-', hyphen_char)
-                                   for infix in base_exceptions]
-
-    infixes_exceptions += [upper_first_letter(word) for word in infixes_exceptions]
-
-    infixes_exceptions = list(set(infixes_exceptions))
-
-    update_exc(tokenizer_exceptions, strings_to_exc(infixes_exceptions))
-    update_exc(tokenizer_exceptions, ABBREVIATIONS_1)
-    update_exc(tokenizer_exceptions, strings_to_exc(ABBREVIATIONS_2))
-    update_exc(tokenizer_exceptions, VERBS)
-
-    return tokenizer_exceptions
+
+_infixes_exc = []
+for elision_char in ELISION:
+    for hyphen_char in ['-', '']:
+        _infixes_exc += [infix.replace("'", elision_char).replace('-', hyphen_char)
+                         for infix in FR_BASE_EXCEPTIONS]
+_infixes_exc += [upper_first_letter(word) for word in _infixes_exc]
+_infixes_exc = list(set(_infixes_exc))
+
+for orth in _infixes_exc:
+    _exc[orth] = [{ORTH: orth}]

-HYPHEN_PREFIX = [
-    'a[ée]ro', 'abat', 'a[fg]ro', 'after', 'am[ée]ricano', 'anglo', 'anti', 'apr[èe]s', 'arabo', 'arcs?', 'archi',
-    'arrières?', 'avant', 'auto',
-    'banc', 'bas(?:ses?)?', 'bec?', 'best', 'bio?', 'bien', 'blanc', 'bo[îi]te', 'bois', 'bou(?:c|rg)', 'b[êe]ta',
-    'cache', 'cap(?:ello)?', 'champ', 'chapelle', 'ch[âa]teau', 'cha(?:ud|t)e?s?', 'chou', 'chromo', 'claire?s?',
-    'co(?:de|ca)?', 'compte', 'contre', 'cordon', 'coupe?', 'court', 'crash', 'crise', 'croche', 'cross', 'cyber',
-    'côte',
-    'demi', 'di(?:sney)?', 'd[ée]s?', 'double', 'dys',
-    'entre', 'est', 'ethno', 'extra', 'extrême', '[ée]co',
-    'fil', 'fort', 'franco?s?',
-    'gallo', 'gardes?', 'gastro', 'grande?', 'gratte', 'gr[ée]co', 'gros', 'g[ée]o',
-    'haute?s?', 'hyper',
-    'indo', 'infra', 'inter', 'intra', 'islamo', 'italo',
-    'jean',
-    'labio', 'latino', 'live', 'lot', 'louis',
-    'm[ai]cro', 'mesnil', 'mi(?:ni)?', 'mono', 'mont?s?', 'moyen', 'multi', 'm[ée]cano', 'm[ée]dico', 'm[ée]do', 'm[ée]ta',
-    'mots?',
-    'noix', 'non', 'nord', 'notre', 'n[ée]o',
-    'ouest', 'outre', 'ouvre',
-    'passe', 'perce', 'pharmaco', 'ph[oy]to', 'pique', 'poissons?', 'ponce', 'pont', 'po[rs]t',
-    'primo', 'pro(?:cès|to)?', 'pare', 'petite?', 'porte', 'pré', 'prêchi', 'pseudo', 'pêle', 'péri', 'puy',
-    'quasi',
-    'recourt', 'rythmo', 'r[ée]', 'r[ée]tro',
-    'sans', 'sainte?s?', 'semi', 'social', 'sous', 'su[bdr]', 'super',
-    'tire', 'thermo', 'tiers', 'trans', 'tr(?:i|ou)', 't[ée]l[ée]',
-    'vi[cd]e', 'vid[ée]o', 'vie(?:ux|illes?)', 'vill(?:e|eneuve|ers|ette|iers|y)',
-    'ultra',
-    'à',
-    '[ée]lectro', '[ée]qui'
-]
-
-ELISION_PREFIX = ['entr', 'grande?s?']
-
-REGULAR_EXP = [
-    '^droits?[{hyphen}]de[{hyphen}]l\'homm[{alpha}]+$'.format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
-    '^zig[{hyphen}]zag[{alpha}]*$'.format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
-    '^prud[{elision}]homm[{alpha}]*$'.format(elision=ELISION, alpha=ALPHA_LOWER),
-]
-
-other_hyphens = ''.join([h for h in HYPHENS if h != '-'])
-
-REGULAR_EXP += ["^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
-    prefix=p, hyphen=HYPHENS, other_hyphen=other_hyphens, elision=ELISION, alpha=ALPHA_LOWER)
-    for p in HYPHEN_PREFIX]
-
-REGULAR_EXP += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
-    prefix=p, elision=HYPHENS, hyphen=other_hyphens, alpha=ALPHA_LOWER)
-    for p in ELISION_PREFIX]
-
-REGULAR_EXP.append(_URL_PATTERN)
-
-TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in REGULAR_EXP), re.IGNORECASE).match
-
-__all__ = ["get_tokenizer_exceptions", "TOKEN_MATCH"]
+_hyphen_prefix = [
+    'a[ée]ro', 'abat', 'a[fg]ro', 'after', 'am[ée]ricano', 'anglo', 'anti',
+    'apr[èe]s', 'arabo', 'arcs?', 'archi', 'arrières?', 'avant', 'auto',
+    'banc', 'bas(?:ses?)?', 'bec?', 'best', 'bio?', 'bien', 'blanc', 'bo[îi]te',
+    'bois', 'bou(?:c|rg)', 'b[êe]ta', 'cache', 'cap(?:ello)?', 'champ',
+    'chapelle', 'ch[âa]teau', 'cha(?:ud|t)e?s?', 'chou', 'chromo', 'claire?s?',
+    'co(?:de|ca)?', 'compte', 'contre', 'cordon', 'coupe?', 'court', 'crash',
+    'crise', 'croche', 'cross', 'cyber', 'côte', 'demi', 'di(?:sney)?',
+    'd[ée]s?', 'double', 'dys', 'entre', 'est', 'ethno', 'extra', 'extrême',
+    '[ée]co', 'fil', 'fort', 'franco?s?', 'gallo', 'gardes?', 'gastro',
+    'grande?', 'gratte', 'gr[ée]co', 'gros', 'g[ée]o', 'haute?s?', 'hyper',
+    'indo', 'infra', 'inter', 'intra', 'islamo', 'italo', 'jean', 'labio',
+    'latino', 'live', 'lot', 'louis', 'm[ai]cro', 'mesnil', 'mi(?:ni)?', 'mono',
+    'mont?s?', 'moyen', 'multi', 'm[ée]cano', 'm[ée]dico', 'm[ée]do', 'm[ée]ta',
+    'mots?', 'noix', 'non', 'nord', 'notre', 'n[ée]o', 'ouest', 'outre', 'ouvre',
+    'passe', 'perce', 'pharmaco', 'ph[oy]to', 'pique', 'poissons?', 'ponce',
+    'pont', 'po[rs]t', 'primo', 'pro(?:cès|to)?', 'pare', 'petite?', 'porte',
+    'pré', 'prêchi', 'pseudo', 'pêle', 'péri', 'puy', 'quasi', 'recourt',
+    'rythmo', 'r[ée]', 'r[ée]tro', 'sans', 'sainte?s?', 'semi', 'social',
+    'sous', 'su[bdr]', 'super', 'tire', 'thermo', 'tiers', 'trans',
+    'tr(?:i|ou)', 't[ée]l[ée]', 'vi[cd]e', 'vid[ée]o', 'vie(?:ux|illes?)',
+    'vill(?:e|eneuve|ers|ette|iers|y)', 'ultra', 'à', '[ée]lectro', '[ée]qui']
+
+_elision_prefix = ['entr', 'grande?s?']
+_other_hyphens = ''.join([h for h in HYPHENS if h != '-'])
+
+_regular_exp = [
+    '^droits?[{hyphen}]de[{hyphen}]l\'homm[{alpha}]+$'.format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    '^zig[{hyphen}]zag[{alpha}]*$'.format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    '^prud[{elision}]homm[{alpha}]*$'.format(elision=ELISION, alpha=ALPHA_LOWER)]
+_regular_exp += ["^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
+                     prefix=p, hyphen=HYPHENS, other_hyphen=_other_hyphens,
+                     elision=ELISION, alpha=ALPHA_LOWER)
+                 for p in _hyphen_prefix]
+_regular_exp += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
+                     prefix=p, elision=HYPHENS, hyphen=_other_hyphens, alpha=ALPHA_LOWER)
+                 for p in _elision_prefix]
+_regular_exp.append(_URL_PATTERN)
+
+
+TOKENIZER_EXCEPTIONS = dict(_exc)
+
+TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match
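Note: `TOKEN_MATCH` exposes the bound `.match` of a single alternation over `_regular_exp`, which the tokenizer consults to keep hyphenated and elided forms as one token. A standalone sketch of the same construction, using simplified stand-in patterns rather than the real `_regular_exp` list:

import re

_regular_exp = [
    r"^anti-[a-zàâéèêëîïôöùûüç]+$",      # hyphen-prefix form, cf. _hyphen_prefix
    r"^prud'homm[a-zàâéèêëîïôöùûüç]*$",  # elided form, cf. ELISION
]

# Wrap each pattern in a non-capturing group and join into one regex.
TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp),
                         re.IGNORECASE).match

print(bool(TOKEN_MATCH("anti-avion")))   # True: kept as a single token
print(bool(TOKEN_MATCH("prud'hommes")))  # True
print(bool(TOKEN_MATCH("avion")))        # False: no whole-token match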