Improve French tokenization (#5202)

Improve French tokenization for UD_French-Sequoia.
adrianeboyd authored 2020-03-25 11:27:42 +01:00, committed by GitHub
parent a3d09ffe61
commit 4117a5c705
3 changed files with 61 additions and 12 deletions

File: spacy/lang/fr/__init__.py

@@ -2,7 +2,8 @@
 from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -27,6 +28,7 @@ class FrenchDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH

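The `prefixes`, `suffixes`, and `infixes` sequences on a `Defaults` class are what spaCy compiles into the tokenizer's `prefix_search`, `suffix_search`, and `infix_finditer` regexes, so registering `TOKENIZER_PREFIXES` here is what activates the new elision prefix rule defined in punctuation.py below. A minimal sketch of the intended effect (spaCy 2.x API of this era; the printed outputs are illustrative expectations under this commit, not a captured run):

    # Sketch: tokenize with the French defaults as patched by this commit.
    # Assumes spaCy 2.x with this branch installed; outputs are illustrative.
    import spacy

    nlp = spacy.blank("fr")  # tokenizer is built from FrenchDefaults

    print([t.text for t in nlp("Qu'est-ce que c'est ?")])
    # expected: ["Qu'", "est", "-ce", "que", "c'", "est", "?"]

    print([t.text for t in nlp("L'avion est-il en retard ?")])
    # expected: ["L'", "avion", "est", "-il", "en", "retard", "?"]

Splitting "L'" and "est-il" relies on the prefix rule and the new "est-il" tokenizer exception working together: spaCy only checks exceptions on the substring left after prefix/suffix stripping, which is presumably why the elision prefixes were added alongside the exceptions.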
File: spacy/lang/fr/punctuation.py

@@ -1,15 +1,24 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..punctuation import TOKENIZER_INFIXES
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
 from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import merge_chars
 
 
-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
-HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "")
+ELISION = "' ’".replace(" ", "")
+HYPHENS = r"- – — ‐ ‑".replace(" ", "")
+_prefixes_elision = "d l n"
+_prefixes_elision += " " + _prefixes_elision.upper()
+_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
+_hyphen_suffixes += " " + _hyphen_suffixes.upper()
+
+
+_prefixes = TOKENIZER_PREFIXES + [
+    r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision))
+]
 
 _suffixes = (
     LIST_PUNCT
     + LIST_ELLIPSES
@@ -17,7 +26,6 @@ _suffixes = (
     + [
-        r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
         r"(?<=[0-9])°[FfCcKk]",  # 4°C -> ["4", "°C"]
         r"(?<=[0-9])%",  # 4% -> ["4", "%"]
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
@@ -25,14 +33,15 @@ _suffixes = (
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+        r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)),
     ]
 )
 
 _infixes = TOKENIZER_INFIXES + [
     r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
 ]
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes

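To see what the two new rules actually match, here is a standalone reconstruction with plain `re`. The `ALPHA` class is a simplified stand-in (spaCy's real class covers far more scripts), and `merge_chars` is re-implemented from its behavior in `spacy.lang.char_classes` (join space-separated alternatives with `|`); the trailing `$` mimics how spaCy anchors compiled suffix rules:

    # Standalone sketch of the new elision-prefix and hyphen-suffix rules.
    import re

    ALPHA = "a-zA-Zà-öø-ÿÀ-ÖØ-Þ"  # simplified stand-in, an assumption
    ELISION = "'’"
    HYPHENS = r"\-–—‐‑"


    def merge_chars(char_group):
        # spaCy's helper: join space-separated alternatives with "|"
        return char_group.strip().replace(" ", "|")


    _prefixes_elision = "d l n"
    _prefixes_elision += " " + _prefixes_elision.upper()
    _hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
    _hyphen_suffixes += " " + _hyphen_suffixes.upper()

    prefix_re = re.compile(
        r"(?:({pe})[{el}])(?=[{a}])".format(
            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
        )
    )
    # "$" added to mimic spaCy's end-anchored suffix matching
    suffix_re = re.compile(
        r"(?<=[{a}])[{h}]({hs})$".format(
            a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
        )
    )

    for s in ["l'avion", "d'une", "mange"]:
        m = prefix_re.match(s)
        print(s, "->", m.group(0) if m else None)
    # l'avion -> l'
    # d'une -> d'
    # mange -> None

    for s in ["Donne-moi", "est-il", "rendez-vous"]:
        m = suffix_re.search(s)
        print(s, "->", m.group(0) if m else None)
    # Donne-moi -> -moi
    # est-il -> -il
    # rendez-vous -> -vous

Note that "rendez-vous" matches the hyphen-suffix rule, which is presumably why it appears in the fixed-token exception list added in tokenizer_exceptions.py below: the exception keeps it whole before the suffix rule can split it.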
File: spacy/lang/fr/tokenizer_exceptions.py

@@ -9,7 +9,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA, TAG
 
 
 # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
-# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
+#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
 
 FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
@@ -56,7 +56,28 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [exc_data]
 
 
-for orth in ["etc."]:
+for orth in [
+    "après-midi",
+    "au-delà",
+    "au-dessus",
+    "celle-ci",
+    "celles-ci",
+    "celui-ci",
+    "cf.",
+    "ci-dessous",
+    "elle-même",
+    "en-dessous",
+    "etc.",
+    "jusque-là",
+    "lui-même",
+    "MM.",
+    "No.",
+    "peut-être",
+    "pp.",
+    "quelques-uns",
+    "rendez-vous",
+    "Vol.",
+]:
     _exc[orth] = [{ORTH: orth}]
@@ -72,7 +93,7 @@ for verb, verb_lemma in [
         for pronoun in ["elle", "il", "on"]:
             token = "{}-t-{}".format(orth, pronoun)
             _exc[token] = [
-                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+                {LEMMA: verb_lemma, ORTH: orth},  # , TAG: "VERB"},
                 {LEMMA: "t", ORTH: "-t"},
                 {LEMMA: pronoun, ORTH: "-" + pronoun},
             ]
@@ -81,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]:
     for orth in [verb, verb.title()]:
         token = "{}-ce".format(orth)
         _exc[token] = [
-            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+            {LEMMA: verb_lemma, ORTH: orth},  # , TAG: "VERB"},
             {LEMMA: "ce", ORTH: "-ce"},
         ]
@@ -89,12 +110,29 @@ for verb, verb_lemma in [("est", "être")]:
 for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
     for orth in [pre, pre.title()]:
         _exc["%sest-ce" % orth] = [
-            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
-            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
+            {LEMMA: pre_lemma, ORTH: orth},
+            {LEMMA: "être", ORTH: "est"},
             {LEMMA: "ce", ORTH: "-ce"},
         ]
 
 
+for verb, pronoun in [("est", "il"), ("EST", "IL")]:
+    token = "{}-{}".format(verb, pronoun)
+    _exc[token] = [
+        {LEMMA: "être", ORTH: verb},
+        {LEMMA: pronoun, ORTH: "-" + pronoun},
+    ]
+
+
+for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
+    token = "{}'{}-{}".format(s, verb, pronoun)
+    _exc[token] = [
+        {LEMMA: "se", ORTH: s + "'"},
+        {LEMMA: "être", ORTH: verb},
+        {LEMMA: pronoun, ORTH: "-" + pronoun},
+    ]
+
+
 _infixes_exc = []
 orig_elision = "'"
 orig_hyphen = "-"
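A quick way to sanity-check what the two new exception loops produce is to replicate them with plain dicts ("orth"/"lemma" string keys stand in for spaCy's ORTH/LEMMA symbol constants, an assumption for the sketch) and inspect the sub-tokens:

    # Standalone sketch of the two new exception loops added above.
    _exc = {}

    for verb, pronoun in [("est", "il"), ("EST", "IL")]:
        token = "{}-{}".format(verb, pronoun)
        _exc[token] = [
            {"lemma": "être", "orth": verb},
            {"lemma": pronoun, "orth": "-" + pronoun},
        ]

    for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
        token = "{}'{}-{}".format(s, verb, pronoun)
        _exc[token] = [
            {"lemma": "se", "orth": s + "'"},
            {"lemma": "être", "orth": verb},
            {"lemma": pronoun, "orth": "-" + pronoun},
        ]

    for token, parts in _exc.items():
        print(token, "->", [p["orth"] for p in parts])
    # est-il -> ['est', '-il']
    # EST-IL -> ['EST', '-IL']
    # s'est-il -> ["s'", 'est', '-il']
    # S'EST-IL -> ["S'", 'EST', '-IL']

The sub-token ORTH values concatenate back exactly to each exception's key, which spaCy requires when loading tokenizer exceptions.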