Merge pull request #818 from raphael0202/tokenizer_exceptions

Add tokenizer exceptions for French
2025-10-18 09:44:16 +03:00 · 2017-02-09 16:41:21 +01:00 · 2017-02-09 16:41:21 +01:00 · b95afdf39c
commit b95afdf39c
parent b0ccf32378 309da78bf0
7 changed files with 26567 additions and 108 deletions
--- a/spacy/fr/__init__.py
+++ b/spacy/fr/__init__.py
@@ -1,20 +1,30 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function

-from ..language import Language
+from ..language import Language, BaseDefaults
 from ..attrs import LANG

 from .language_data import *
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .tokenizer_exceptions import get_tokenizer_exceptions, TOKEN_MATCH
+
+
+class FrenchDefaults(BaseDefaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)  # copy, so the next line does not mutate the base mapping
+    lex_attr_getters[LANG] = lambda text: 'fr'  # LANG getter returns 'fr' for any text
+    # French-specific tokenizer settings; the exception table is set in create_tokenizer.
+    stop_words = STOP_WORDS
+    infixes = tuple(TOKENIZER_INFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    token_match = TOKEN_MATCH
+    # Exceptions are built on each call here rather than once when the module is imported.
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        cls.tokenizer_exceptions = get_tokenizer_exceptions()
+        return super(FrenchDefaults, cls).create_tokenizer(nlp)


 class French(Language):
    lang = 'fr'

-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'fr'
-
-        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-        stop_words = STOP_WORDS
-        infixes = tuple(TOKENIZER_INFIXES)
+    Defaults = FrenchDefaults  # settings are defined on the module-level FrenchDefaults class
--- a/spacy/fr/language_data.py
+++ b/spacy/fr/language_data.py
@@ -1,106 +1,10 @@
 # encoding: utf8
 from __future__ import unicode_literals

-from .. import language_data as base
-from ..language_data import strings_to_exc, update_exc
-
-from .punctuation import ELISION
-
-from ..symbols import *
 from .stop_words import STOP_WORDS


 STOP_WORDS = set(STOP_WORDS)


-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-ABBREVIATIONS = {
-    "janv.": [
-        {LEMMA: "janvier", ORTH: "janv."}
-    ],
-    "févr.": [
-        {LEMMA: "février", ORTH: "févr."}
-    ],
-    "avr.": [
-        {LEMMA: "avril", ORTH: "avr."}
-    ],
-    "juill.": [
-        {LEMMA: "juillet", ORTH: "juill."}
-    ],
-    "sept.": [
-        {LEMMA: "septembre", ORTH: "sept."}
-    ],
-    "oct.": [
-        {LEMMA: "octobre", ORTH: "oct."}
-    ],
-    "nov.": [
-        {LEMMA: "novembre", ORTH: "nov."}
-    ],
-    "déc.": [
-        {LEMMA: "décembre", ORTH: "déc."}
-    ],
-    "av.": [
-        {LEMMA: "avant", ORTH: "av."}
-    ],
-    "apr.": [
-        {LEMMA: "après", ORTH: "apr."}
-    ],
-    "J.-C.": [
-        {LEMMA: "jésus", ORTH: "J."},
-        {LEMMA: "christ", ORTH: "-C."}
-    ],
-    "Dr.": [
-        {LEMMA: "docteur", ORTH: "Dr."}
-    ],
-    "M.": [
-        {LEMMA: "monsieur", ORTH: "M."}
-    ],
-    "Mr.": [
-        {LEMMA: "monsieur", ORTH: "Mr."}
-    ],
-    "Mme.": [
-        {LEMMA: "madame", ORTH: "Mme."}
-    ],
-    "Mlle.": [
-        {LEMMA: "mademoiselle", ORTH: "Mlle."}
-    ],
-    "n°": [
-        {LEMMA: "numéro", ORTH: "n°"}
-    ],
-    "d°": [
-        {LEMMA: "degrés", ORTH: "d°"}
-    ],
-    "St.": [
-        {LEMMA: "saint", ORTH: "St."}
-    ],
-    "Ste.": [
-        {LEMMA: "sainte", ORTH: "Ste."}
-    ]
-}
-
-
-INFIXES_EXCEPTIONS_BASE = ["aujourd'hui",
-                           "prud'homme", "prud'hommes",
-                           "prud'homal", "prud'homaux", "prud'homale",
-                           "prud'homales",
-                           "prud'hommal", "prud'hommaux", "prud'hommale",
-                           "prud'hommales",
-                           "prud'homie", "prud'homies",
-                           "prud'hommesque", "prud'hommesques",
-                           "prud'hommesquement",
-                           "c'est-à-dire", "quelqu'un", "rendez-vous"]
-
-INFIXES_EXCEPTIONS = []
-for elision_char in ELISION:
-    INFIXES_EXCEPTIONS += [infix.replace("'", elision_char)
-                           for infix in INFIXES_EXCEPTIONS_BASE]
-
-INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]
-
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS))
-update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS)
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
+__all__ = ["STOP_WORDS"]
--- a/spacy/fr/punctuation.py
+++ b/spacy/fr/punctuation.py
@@ -2,12 +2,33 @@

 from __future__ import unicode_literals

-from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES
+from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES, LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY,\
+    UNITS, ALPHA_LOWER, QUOTES, ALPHA_UPPER


 _ELISION = " ' ’ "
 ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')

+HYPHENS = r"""- – — ‐ ‑""".strip().replace(' ', '').replace('\n', '')  # spaces removed: one string of hyphen characters
+
+# Suffix rules: the punctuation, ellipsis and quote lists plus the patterns below.
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT +
+    LIST_ELLIPSES +
+    LIST_QUOTES +
+    [
+        r'(?<=[0-9])\+',  # "+" directly after a digit
+        r'(?<=°[FfCcKk])\.',  # 4°C. -> ["4°C", "."]
+        r'(?<=[0-9])°[FfCcKk]',  # 4°C -> ["4", "°C"]
+        r'(?<=[0-9])%',  # 4% -> ["4", "%"]
+        r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
+        r'(?<=[0-9])(?:{u})'.format(u=UNITS),
+        r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
+        r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),  # "." after two characters from ALPHA_UPPER
+    ]
+)
+
+
 TOKENIZER_INFIXES += [
    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
 ]
--- a/spacy/fr/resources/tokenizer_exceptions
+++ b/spacy/fr/resources/tokenizer_exceptions
--- a/spacy/fr/tokenizer_exceptions.py
+++ b/spacy/fr/tokenizer_exceptions.py
@@ -0,0 +1,217 @@
+# encoding: utf8
+
+from __future__ import unicode_literals
+
+from .. import language_data as base
+from ..language_data import strings_to_exc, update_exc
+from ..language_data.tokenizer_exceptions import _URL_PATTERN
+from ..language_data.punctuation import ALPHA_LOWER
+
+from .punctuation import ELISION, HYPHENS
+
+from ..symbols import *
+
+import os
+import io
+import re
+
+
+def iter_exceptions():
+    path = os.path.join(os.path.dirname(__file__), 'resources/tokenizer_exceptions')
+    with io.open(path, 'rt', encoding='utf8') as exc_file:  # yields one entry per line
+        for raw_line in exc_file:
+            yield raw_line.strip('\n')
+
+
+def upper_first_letter(text):
+    """Return ``text`` with only its first character upper-cased."""
+    if len(text) > 0:
+        # Slicing covers the one-character case too: text[1:] is then empty.
+        head, tail = text[0], text[1:]
+        return head.upper() + tail
+
+    return text
+
+
+def lower_first_letter(text):
+    """Return ``text`` with only its first character lower-cased."""
+    if len(text) > 0:
+        # Slicing covers the one-character case too: text[1:] is then empty.
+        head, tail = text[0], text[1:]
+        return head.lower() + tail
+
+    return text
+
+
+def get_tokenizer_exceptions():
+    tokenizer_exceptions = strings_to_exc(base.EMOTICONS)
+    update_exc(tokenizer_exceptions, strings_to_exc(base.ABBREVIATIONS))
+    # Abbreviations that carry an explicit lemma for the token.
+    ABBREVIATIONS_1 = {
+        "av.": [
+            {LEMMA: "avant", ORTH: "av."}
+        ],
+        "janv.": [
+            {LEMMA: "janvier", ORTH: "janv."}
+        ],
+        "févr.": [
+            {LEMMA: "février", ORTH: "févr."}
+        ],
+        "avr.": [
+            {LEMMA: "avril", ORTH: "avr."}
+        ],
+        "juill.": [
+            {LEMMA: "juillet", ORTH: "juill."}
+        ],
+        "sept.": [
+            {LEMMA: "septembre", ORTH: "sept."}
+        ],
+        "oct.": [
+            {LEMMA: "octobre", ORTH: "oct."}
+        ],
+        "nov.": [
+            {LEMMA: "novembre", ORTH: "nov."}
+        ],
+        "déc.": [
+            {LEMMA: "décembre", ORTH: "déc."}
+        ],
+        "apr.": [
+            {LEMMA: "après", ORTH: "apr."}
+        ],
+        "J.-C.": [
+            {LEMMA: "Jésus", ORTH: "J."},
+            {LEMMA: "Christ", ORTH: "-C."}
+        ],
+        "Dr.": [
+            {LEMMA: "docteur", ORTH: "Dr."}
+        ],
+        "M.": [
+            {LEMMA: "monsieur", ORTH: "M."}
+        ],
+        "Mr.": [
+            {LEMMA: "monsieur", ORTH: "Mr."}
+        ],
+        "Mme.": [
+            {LEMMA: "madame", ORTH: "Mme."}
+        ],
+        "Mlle.": [
+            {LEMMA: "mademoiselle", ORTH: "Mlle."}
+        ],
+        "n°": [
+            {LEMMA: "numéro", ORTH: "n°"}
+        ],
+        "d°": [
+            {LEMMA: "degrés", ORTH: "d°"}
+        ],
+        "St.": [
+            {LEMMA: "saint", ORTH: "St."}
+        ],
+        "Ste.": [
+            {LEMMA: "sainte", ORTH: "Ste."}
+        ]
+    }
+    # Abbreviations given as plain strings; converted with strings_to_exc below.
+    ABBREVIATIONS_2 = [
+        "etc.",
+    ]
+    # Forms such as "a-t-il" are split into three tokens: verb, "-t" and "-pronoun".
+    VERBS = {}
+    for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
+                             ("semble", "sembler"), ("indique", "indiquer"),
+                             ("moque", "moquer"), ("passe", "passer")):
+        for pronoun in ("elle", "il", "on"):
+            token = "{}-t-{}".format(verb, pronoun)
+            VERBS[token] = [
+                {LEMMA: verb_lemma, ORTH: verb},
+                {LEMMA: "t", ORTH: "-t"},
+                {LEMMA: pronoun, ORTH: "-" + pronoun}
+            ]
+    # "est-ce", alone and after an elided "qu'"/"n'", is split before "-ce".
+    VERBS['est-ce'] = [
+        {LEMMA: 'être', ORTH: "est"},
+        {LEMMA: 'ce', ORTH: '-ce'}
+    ]
+
+    for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"),
+                           ("N'", "Ne")):
+        VERBS['{}est-ce'.format(pre)] = [
+            {LEMMA: pre_lemma, ORTH: pre},
+            {LEMMA: 'être', ORTH: "est"},
+            {LEMMA: 'ce', ORTH: '-ce'}
+        ]
+    # Hyphen characters substituted for '-' when expanding the resource entries.
+    HYPHEN = ['-', '‐']
+
+    base_exceptions = list(iter_exceptions())
+    infixes_exceptions = []
+    # One variant of every entry per (elision character, hyphen character) pair.
+    for elision_char in ELISION:
+        for hyphen_char in HYPHEN:
+            infixes_exceptions += [infix.replace("'", elision_char).replace('-', hyphen_char)
+                                   for infix in base_exceptions]
+    # Also add each variant with its first letter upper-cased.
+    infixes_exceptions += [upper_first_letter(word) for word in infixes_exceptions]
+    # Drop duplicate variants.
+    infixes_exceptions = list(set(infixes_exceptions))
+
+    update_exc(tokenizer_exceptions, strings_to_exc(infixes_exceptions))
+    update_exc(tokenizer_exceptions, ABBREVIATIONS_1)
+    update_exc(tokenizer_exceptions, strings_to_exc(ABBREVIATIONS_2))
+    update_exc(tokenizer_exceptions, VERBS)
+    return tokenizer_exceptions
+
+
+HYPHEN_PREFIX = [  # regex fragments; each becomes the {prefix} of a "prefix + hyphen + word" pattern
+    'a[ée]ro', 'abat', 'a[fg]ro', 'after', 'am[ée]ricano', 'anglo', 'anti', 'apr[èe]s', 'arabo', 'arcs?', 'archi',
+    'arrières?', 'avant', 'auto',
+    'banc', 'bas(ses?)?', 'bec?', 'best', 'bio?', 'bien', 'blanc', 'bo[îi]te', 'bois', 'bou(c|rg)', 'b[êe]ta',
+    'cache', 'cap(ello)?', 'champ', 'chapelle', 'ch[âa]teau', 'cha(ud|t)e?s?', 'chou', 'chromo', 'claire?s?',
+    'co(de|ca)?', 'compte', 'contre', 'cordon', 'coupe?', 'court', 'crash', 'crise', 'croche', 'cross', 'cyber',
+    'côte',
+    'demi', 'di(sney)?', 'd[ée]s?', 'double', 'dys',
+    'entre', 'est', 'ethno', 'extra', 'extrême', '[ée]co',
+    'fil', 'fort', 'franco?s?',
+    'gallo', 'gardes?', 'gastro', 'grande?', 'gratte', 'gr[ée]co', 'gros', 'g[ée]o',
+    'haute?s?', 'hyper',
+    'indo', 'infra', 'inter', 'intra', 'islamo', 'italo',
+    'jean',
+    'labio', 'latino', 'live', 'lot', 'louis',
+    'm[ai]cro', 'mesnil', 'mi(ni)?', 'mono', 'mont?s?', 'moyen', 'multi', 'm[ée]cano', 'm[ée]dico', 'm[ée]do', 'm[ée]ta',
+    'mots?',
+    'noix', 'non', 'nord', 'notre', 'n[ée]o',
+    'ouest', 'outre', 'ouvre',
+    'passe', 'perce', 'pharmaco', 'ph[oy]to', 'pique', 'poissons?', 'ponce', 'pont', 'po[rs]t',
+    'primo', 'pro(cès|to)?', 'pare', 'petite?', 'porte', 'pré', 'prêchi', 'pseudo', 'pêle', 'péri', 'puy',
+    'quasi',
+    'recourt', 'rythmo', 'r[ée]', 'r[ée]tro',
+    'sans', 'sainte?s?', 'semi', 'social', 'sous', 'su[bdr]', 'super',
+    'tire', 'thermo', 'tiers', 'trans', 'tr(i|ou)', 't[ée]l[ée]',
+    'vi[cd]e', 'vid[ée]o', 'vie(ux|illes?)', 'vill(e|eneuve|ers|ette|iers|y)',
+    'ultra',
+    'à',
+    '[ée]lectro', '[ée]qui'
+    ]
+
+ELISION_PREFIX = ['entr', 'grande?s?']  # regex fragments used as {prefix} in the elision patterns
+
+REGULAR_EXP = [
+    '^droits?[{hyphen}]de[{hyphen}]l\'homm[{alpha}]+$'.format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    '^zig[{hyphen}]zag[{alpha}]*$'.format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    '^prud[{elision}]homm[{alpha}]*$'.format(elision=ELISION, alpha=ALPHA_LOWER),
+]
+# ASCII '-' is excluded here; the patterns below add it themselves as an escaped '\-'.
+other_hyphens = ''.join([h for h in HYPHENS if h != '-'])
+# Hyphenated compounds: <prefix><hyphen><word>. Raw strings keep '\-' a valid escape.
+REGULAR_EXP += [r"^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
+    prefix=p, hyphen=HYPHENS, other_hyphen=other_hyphens, elision=ELISION, alpha=ALPHA_LOWER)
+                for p in HYPHEN_PREFIX]
+# Elided compounds: <prefix><apostrophe><word>, so {elision} must be ELISION, not HYPHENS.
+REGULAR_EXP += [r"^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
+    prefix=p, elision=ELISION, hyphen=other_hyphens, alpha=ALPHA_LOWER)
+                for p in ELISION_PREFIX]
+# The URL pattern imported from the shared tokenizer exceptions is one more alternative.
+REGULAR_EXP.append(_URL_PATTERN)
+# One alternation of all patterns; IGNORECASE lets the lower-case prefixes match capitalised text.
+TOKEN_MATCH = re.compile('|'.join('({})'.format(m) for m in REGULAR_EXP), re.IGNORECASE).match
+
+__all__ = ("get_tokenizer_exceptions", "TOKEN_MATCH")
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -53,7 +53,7 @@ def de_tokenizer():
    return German.Defaults.create_tokenizer()


-@pytest.fixture
+@pytest.fixture(scope='module')
 def fr_tokenizer():
    return French.Defaults.create_tokenizer()

--- a/spacy/tests/fr/test_exceptions.py
+++ b/spacy/tests/fr/test_exceptions.py
@@ -14,6 +14,8 @@ def test_tokenizer_infix_exceptions(fr_tokenizer, text):

@pytest.mark.parametrize('text,lemma', [("janv.", "janvier"),
                                        ("juill.", "juillet"),
+                                        ("Dr.", "docteur"),
+                                        ("av.", "avant"),
                                        ("sept.", "septembre")])
 def test_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
    tokens = fr_tokenizer(text)
@@ -28,3 +30,11 @@ def test_tokenizer_handles_exc_in_text(fr_tokenizer):
    assert tokens[6].text == "janv."
    assert tokens[6].lemma_ == "janvier"
    assert tokens[8].text == "prud’hommes"
+
+
+def test_tokenizer_handles_exc_in_text_2(fr_tokenizer):
+    sentence = "Cette après-midi, je suis allé dans un restaurant italo-mexicain."
+    doc = fr_tokenizer(sentence)
+    assert len(doc) == 11  # nine words plus the comma and the final period
+    assert doc[1].text == "après-midi"  # hyphenated word stays a single token
+    assert doc[9].text == "italo-mexicain"