Add tokenizer exceptions for French

This commit is contained in:
Raphaël Bournhonesque 2017-02-02 08:36:16 +01:00
parent 3ea0df6ba7
commit 85f951ca99
7 changed files with 26531 additions and 70 deletions

View File

@ -1,20 +1,30 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from ..language import Language from ..language import Language, BaseDefaults
from ..attrs import LANG from ..attrs import LANG
from .language_data import * from .language_data import *
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import get_tokenizer_exceptions, TOKEN_MATCH
class FrenchDefaults(BaseDefaults):
    """Defaults for French: lexical attribute getters, stop words and tokenizer settings."""

    # Tokenizer settings taken from the French punctuation and exception modules.
    token_match = TOKEN_MATCH
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)

    stop_words = STOP_WORDS

    # Work on a copy so the LANG override does not touch the shared getters.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'fr'

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Store a freshly built exception table on the class, then defer to the parent."""
        french_exceptions = get_tokenizer_exceptions()
        cls.tokenizer_exceptions = french_exceptions
        parent_defaults = super(FrenchDefaults, cls)
        return parent_defaults.create_tokenizer(nlp)
class French(Language): class French(Language):
lang = 'fr' lang = 'fr'
class Defaults(Language.Defaults): Defaults = FrenchDefaults
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
infixes = tuple(TOKENIZER_INFIXES)

View File

@ -1,68 +1,10 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import strings_to_exc, update_exc
from .punctuation import ELISION
from ..symbols import *
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) __all__ = ["STOP_WORDS"]
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
ABBREVIATIONS = {
"janv.": [
{LEMMA: "janvier", ORTH: "janv."}
],
"févr.": [
{LEMMA: "février", ORTH: "févr."}
],
"avr.": [
{LEMMA: "avril", ORTH: "avr."}
],
"juill.": [
{LEMMA: "juillet", ORTH: "juill."}
],
"sept.": [
{LEMMA: "septembre", ORTH: "sept."}
],
"oct.": [
{LEMMA: "octobre", ORTH: "oct."}
],
"nov.": [
{LEMMA: "novembre", ORTH: "nov."}
],
"déc.": [
{LEMMA: "décembre", ORTH: "déc."}
],
}
INFIXES_EXCEPTIONS_BASE = ["aujourd'hui",
"prud'homme", "prud'hommes",
"prud'homal", "prud'homaux", "prud'homale",
"prud'homales",
"prud'hommal", "prud'hommaux", "prud'hommale",
"prud'hommales",
"prud'homie", "prud'homies",
"prud'hommesque", "prud'hommesques",
"prud'hommesquement"]
INFIXES_EXCEPTIONS = []
for elision_char in ELISION:
INFIXES_EXCEPTIONS += [infix.replace("'", elision_char)
for infix in INFIXES_EXCEPTIONS_BASE]
INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS))
update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS)
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@ -2,12 +2,33 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES, LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY,\
UNITS, ALPHA_LOWER, QUOTES, ALPHA_UPPER
_ELISION = " ' " _ELISION = " ' "
ELISION = _ELISION.strip().replace(' ', '').replace('\n', '') ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')
# Hyphen characters as one string: the literal below with surrounding
# whitespace, inner spaces and newlines removed.
HYPHENS = r"""- """.strip().replace(' ', '').replace('\n', '')
# Suffix patterns: the shared punctuation, ellipsis and quote lists, followed
# by lookbehind rules that only apply after specific preceding characters.
TOKENIZER_SUFFIXES = (
LIST_PUNCT +
LIST_ELLIPSES +
LIST_QUOTES +
[
r'(?<=[0-9])\+',  # "+" directly after a digit
r'(?<=°[FfCcKk])\.', # 4°C. -> ["4°C", "."]
r'(?<=[0-9])°[FfCcKk]', # 4°C -> ["4", "°C"]
r'(?<=[0-9])%', # 4% -> ["4", "%"]
r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),  # a CURRENCY alternative directly after a digit
r'(?<=[0-9])(?:{u})'.format(u=UNITS),  # a UNITS alternative directly after a digit
# "." after a digit, an ALPHA_LOWER character or one of the characters in `p`.
# NOTE(review): `(?:{q})` sits inside the character class, so "(", "?", ":" and
# ")" are matched literally rather than forming a group -- confirm this is intended.
r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),  # "." after two ALPHA_UPPER characters
]
)
TOKENIZER_INFIXES += [ TOKENIZER_INFIXES += [
r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION), r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
] ]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,182 @@
# encoding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import strings_to_exc, update_exc
from ..language_data.tokenizer_exceptions import _URL_PATTERN
from ..language_data.punctuation import ALPHA_LOWER
from .punctuation import ELISION, HYPHENS
from ..symbols import *
import os
import io
import re
def iter_exceptions():
    """Yield every line of the ``resources/tokenizer_exceptions`` file next to this module.

    The file is read as UTF-8 text; newline characters are stripped from both
    ends of each line and nothing else is altered.
    """
    module_dir = os.path.dirname(__file__)
    resource_path = os.path.join(module_dir, 'resources/tokenizer_exceptions')
    with io.open(resource_path, 'rt', encoding='utf8') as resource:
        for raw_line in resource:
            yield raw_line.strip('\n')
def upper_first_letter(text):
    """Return ``text`` with its first character upper-cased and the rest unchanged.

    An empty string is returned as is.
    """
    if not text:
        return text
    head, tail = text[0], text[1:]
    return head.upper() + tail
def lower_first_letter(text):
    """Return ``text`` with its first character lower-cased and the rest unchanged.

    An empty string is returned as is.
    """
    if not text:
        return text
    head, tail = text[0], text[1:]
    return head.lower() + tail
def get_tokenizer_exceptions():
    """Assemble and return the French tokenizer exception table.

    The table starts from the shared emoticons and abbreviations. It is then
    extended, in this order, with the file-based entries (in every
    elision/hyphen spelling plus first-letter-capitalised copies), the
    abbreviations that carry a lemma, the plain abbreviations and the
    inverted verb forms.
    """
    exceptions = strings_to_exc(base.EMOTICONS)
    update_exc(exceptions, strings_to_exc(base.ABBREVIATIONS))

    # Abbreviations stored together with their lemma.
    lemma_abbreviations = {
        abbreviation: [{LEMMA: lemma, ORTH: abbreviation}]
        for abbreviation, lemma in (
            ("av.", "avant"),
            ("janv.", "janvier"),
            ("févr.", "février"),
            ("avr.", "avril"),
            ("juill.", "juillet"),
            ("sept.", "septembre"),
            ("oct.", "octobre"),
            ("nov.", "novembre"),
            ("déc.", "décembre"),
        )
    }

    # Abbreviations kept as single tokens without any extra attributes.
    plain_abbreviations = ["Dr.", "etc."]

    # Inverted forms such as "a-t-il": the verb, the linking "-t", then the pronoun.
    inversions = {}
    verb_lemmas = (
        ("a", "avoir"),
        ("est", "être"),
        ("semble", "sembler"),
        ("indique", "indiquer"),
        ("moque", "moquer"),
        ("passe", "passer"),
    )
    for verb, verb_lemma in verb_lemmas:
        for pronoun in ("elle", "il", "on"):
            inversions["{}-t-{}".format(verb, pronoun)] = [
                {LEMMA: verb_lemma, ORTH: verb},
                {LEMMA: "t", ORTH: "-t"},
                {LEMMA: pronoun, ORTH: "-" + pronoun},
            ]

    # "est-ce" on its own, then preceded by an elided "que" / "ne".
    inversions['est-ce'] = [
        {LEMMA: 'être', ORTH: "est"},
        {LEMMA: 'ce', ORTH: '-ce'},
    ]
    elided_openers = (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"), ("N'", "Ne"))
    for pre, pre_lemma in elided_openers:
        inversions[pre + 'est-ce'] = [
            {LEMMA: pre_lemma, ORTH: pre},
            {LEMMA: 'être', ORTH: "est"},
            {LEMMA: 'ce', ORTH: '-ce'},
        ]

    # Every file entry is spelled with each elision character, both with its
    # hyphens kept and with them removed.
    file_entries = list(iter_exceptions())
    spellings = [
        entry.replace("'", elision_char).replace('-', hyphen_char)
        for elision_char in ELISION
        for hyphen_char in ('-', '')
        for entry in file_entries
    ]
    capitalised = [upper_first_letter(word) for word in spellings]
    unique_spellings = list(set(spellings + capitalised))

    update_exc(exceptions, strings_to_exc(unique_spellings))
    update_exc(exceptions, lemma_abbreviations)
    update_exc(exceptions, strings_to_exc(plain_abbreviations))
    update_exc(exceptions, inversions)
    return exceptions
# Regex fragments for the first element of hyphenated words. Each entry is
# substituted for {prefix} in a pattern generated further down in this module
# (prefix, then one hyphen character, then letters), so entries may use regex
# syntax such as character classes and optional groups.
HYPHEN_PREFIX = [
'a[ée]ro', 'abat', 'a[fg]ro', 'after', 'am[ée]ricano', 'anglo', 'anti', 'apr[èe]s', 'arabo', 'arcs?', 'archi',
'arrières?', 'avant', 'auto',
'banc', 'bas(ses?)?', 'bec?', 'best', 'bio?', 'bien', 'blanc', 'bo[îi]te', 'bois', 'bou(c|rg)', 'b[êe]ta',
'cache', 'cap(ello)?', 'champ', 'chapelle', 'ch[âa]teau', 'cha(ud|t)e?s?', 'chou', 'chromo', 'claire?s?',
'co(de|ca)?', 'compte', 'contre', 'cordon', 'coupe?', 'court', 'crash', 'crise', 'croche', 'cross', 'cyber',
'côte',
'demi', 'di(sney)?', 'd[ée]s?', 'double', 'dys',
'entre', 'est', 'ethno', 'extra', 'extrême', '[ée]co',
'fil', 'fort', 'franco?s?',
'gallo', 'gardes?', 'gastro', 'grande?', 'gratte', 'gr[ée]co', 'gros', 'g[ée]o',
'haute?s?', 'hyper',
'indo', 'infra', 'inter', 'intra', 'islamo', 'italo',
'jean',
'labio', 'latino', 'live', 'lot', 'louis',
'm[ai]cro', 'mesnil', 'mi(ni)?', 'mono', 'mont?s?', 'moyen', 'multi', 'm[ée]cano', 'm[ée]dico', 'm[ée]do', 'm[ée]ta',
'mots?',
'noix', 'non', 'nord', 'notre', 'n[ée]o',
'ouest', 'outre', 'ouvre',
'passe', 'perce', 'pharmaco', 'ph[oy]to', 'pique', 'poissons?', 'ponce', 'pont', 'po[rs]t',
'primo', 'pro(cès|to)?', 'pare', 'petite?', 'porte', 'pré', 'prêchi', 'pseudo', 'pêle', 'péri', 'puy',
'quasi',
'recourt', 'rythmo', 'r[ée]', 'r[ée]tro',
'sans', 'sainte?s?', 'semi', 'social', 'sous', 'su[bdr]', 'super',
'tire', 'thermo', 'tiers', 'trans', 'tr(i|ou)', 't[ée]l[ée]',
'vi[cd]e', 'vid[ée]o', 'vie(ux|illes?)', 'vill(e|eneuve|ers|ette|iers|y)',
'ultra',
'à',
'[ée]lectro', '[ée]qui'
]
# Regex fragments substituted for {prefix} in the second family of generated
# patterns further down in this module.
ELISION_PREFIX = ['entr', 'grande?s?']
# Patterns describing whole strings that must be kept as a single token.
# Raw string literals are used throughout so that regex escapes such as "\-"
# reach the regex engine unchanged instead of relying on Python passing an
# unrecognised string escape through (which newer interpreters warn about).
REGULAR_EXP = [
    r"^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
    r"^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
    r"^prud[{elision}]homm[{alpha}]*$".format(elision=ELISION, alpha=ALPHA_LOWER),
]

# Hyphen characters other than the ASCII "-". The "-" itself is added to the
# character classes below in escaped form so it can never start a range.
other_hyphens = ''.join([h for h in HYPHENS if h != '-'])

# <prefix><hyphen><letter> followed by letters, elision marks or hyphens.
REGULAR_EXP += [
    r"^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
        prefix=p, hyphen=HYPHENS, other_hyphen=other_hyphens, elision=ELISION, alpha=ALPHA_LOWER)
    for p in HYPHEN_PREFIX]

# <prefix><separator><letter> followed by letters, elision marks or hyphens.
# The separator used to be built from HYPHENS only, so an elided form (prefix
# followed by an elision mark, e.g. "entr'acte") could never match. It now
# accepts an elision mark, and still accepts a hyphen so that every string the
# previous pattern matched keeps matching.
REGULAR_EXP += [
    r"^{prefix}[{elision}{other_hyphen}\-][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
        prefix=p, elision=ELISION, other_hyphen=other_hyphens, alpha=ALPHA_LOWER)
    for p in ELISION_PREFIX]

REGULAR_EXP.append(_URL_PATTERN)

# Each alternative is wrapped in a NON-capturing group: only the bound `match`
# method is exported, and one capturing group per pattern would add well over
# a hundred groups, which Python versions before 3.5 refuse to compile.
TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in REGULAR_EXP), re.IGNORECASE).match

__all__ = ("get_tokenizer_exceptions", "TOKEN_MATCH")

View File

@ -52,7 +52,7 @@ def de_tokenizer():
return German.Defaults.create_tokenizer() return German.Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture(scope='module')
def fr_tokenizer(): def fr_tokenizer():
return French.Defaults.create_tokenizer() return French.Defaults.create_tokenizer()

View File

@ -28,3 +28,11 @@ def test_tokenizer_handles_exc_in_text(fr_tokenizer):
assert tokens[6].text == "janv." assert tokens[6].text == "janv."
assert tokens[6].lemma_ == "janvier" assert tokens[6].lemma_ == "janvier"
assert tokens[8].text == "prudhommes" assert tokens[8].text == "prudhommes"
def test_tokenizer_handles_exc_in_text_2(fr_tokenizer):
    """Hyphenated words in the sentence are expected to come out as single tokens."""
    sentence = "Cette après-midi, je suis allé dans un restaurant italo-mexicain."
    doc = fr_tokenizer(sentence)
    assert len(doc) == 11
    second_token, tenth_token = doc[1], doc[9]
    assert second_token.text == "après-midi"
    assert tenth_token.text == "italo-mexicain"