From 4117a5c7056a65aafb29db137b4f52b264d915fc Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 25 Mar 2020 11:27:42 +0100
Subject: [PATCH] Improve French tokenization (#5202)

Improve French tokenization for UD_French-Sequoia.
---
 spacy/lang/fr/__init__.py             |  4 ++-
 spacy/lang/fr/punctuation.py          | 19 +++++++---
 spacy/lang/fr/tokenizer_exceptions.py | 50 +++++++++++++++++++++++----
 3 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index f56c8688a..7727aff0e 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -2,7 +2,8 @@
 from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -27,6 +28,7 @@ class FrenchDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py
index 1422b4194..e03e91361 100644
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@@ -1,15 +1,24 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..punctuation import TOKENIZER_INFIXES
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
 from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import merge_chars
 
 
-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
-HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "")
+ELISION = "' ’".replace(" ", "")
+HYPHENS = r"- – — ‐ ‑".replace(" ", "")
+_prefixes_elision = "d l n"
+_prefixes_elision += " " + _prefixes_elision.upper()
+_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
+_hyphen_suffixes += " " + _hyphen_suffixes.upper()
 
 
+_prefixes = TOKENIZER_PREFIXES + [
+    r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision))
+]
+
 _suffixes = (
     LIST_PUNCT
     + LIST_ELLIPSES
@@ -17,7 +26,6 @@ _suffixes = (
     + [
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
-        r"(?<=[0-9])°[FfCcKk]",  # 4°C -> ["4", "°C"]
         r"(?<=[0-9])%",  # 4% -> ["4", "%"]
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
@@ -25,14 +33,15 @@ _suffixes = (
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+        r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)),
     ]
 )
 
-
 _infixes = TOKENIZER_INFIXES + [
     r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
 ]
 
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 4b3b2c908..56c5544a5 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -9,7 +9,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA, TAG
 
 # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
-# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
+#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
 
 FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
 
@@ -56,7 +56,28 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [exc_data]
 
 
-for orth in ["etc."]:
+for orth in [
+    "après-midi",
+    "au-delà",
+    "au-dessus",
+    "celle-ci",
+    "celles-ci",
+    "celui-ci",
+    "cf.",
+    "ci-dessous",
+    "elle-même",
+    "en-dessous",
+    "etc.",
+    "jusque-là",
+    "lui-même",
+    "MM.",
+    "No.",
+    "peut-être",
+    "pp.",
+    "quelques-uns",
+    "rendez-vous",
+    "Vol.",
+]:
     _exc[orth] = [{ORTH: orth}]
 
 
@@ -72,7 +93,7 @@ for verb, verb_lemma in [
         for pronoun in ["elle", "il", "on"]:
             token = "{}-t-{}".format(orth, pronoun)
             _exc[token] = [
-                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+                {LEMMA: verb_lemma, ORTH: orth},  #, TAG: "VERB"},
                 {LEMMA: "t", ORTH: "-t"},
                 {LEMMA: pronoun, ORTH: "-" + pronoun},
             ]
@@ -81,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]:
     for orth in [verb, verb.title()]:
         token = "{}-ce".format(orth)
         _exc[token] = [
-            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+            {LEMMA: verb_lemma, ORTH: orth},  #, TAG: "VERB"},
             {LEMMA: "ce", ORTH: "-ce"},
         ]
 
@@ -89,12 +110,29 @@ for verb, verb_lemma in [("est", "être")]:
 for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
     for orth in [pre, pre.title()]:
         _exc["%sest-ce" % orth] = [
-            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
-            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
+            {LEMMA: pre_lemma, ORTH: orth},
+            {LEMMA: "être", ORTH: "est"},
             {LEMMA: "ce", ORTH: "-ce"},
         ]
 
 
+for verb, pronoun in [("est", "il"), ("EST", "IL")]:
+    token = "{}-{}".format(verb, pronoun)
+    _exc[token] = [
+        {LEMMA: "être", ORTH: verb},
+        {LEMMA: pronoun, ORTH: "-" + pronoun},
+    ]
+
+
+for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
+    token = "{}'{}-{}".format(s, verb, pronoun)
+    _exc[token] = [
+        {LEMMA: "se", ORTH: s + "'"},
+        {LEMMA: "être", ORTH: verb},
+        {LEMMA: pronoun, ORTH: "-" + pronoun},
+    ]
+
+
 _infixes_exc = []
 orig_elision = "'"
 orig_hyphen = "-"