Mirror of https://github.com/explosion/spaCy.git
Improve French tokenization (#5202)
Improve French tokenization for UD_French-Sequoia.
This commit is contained in:
parent a3d09ffe61
commit 4117a5c705
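As a rough illustration of the behaviour this change is after (the sentences and the expected splits are illustrative, not taken from the PR's test suite): elision prefixes such as d' / l' / n' are split off the following word, hyphenated verb-pronoun inversions such as est-il are split at the hyphen, and fixed hyphenated forms such as peut-être or rendez-vous are kept as single tokens through the new exception list.

    from spacy.lang.fr import French

    nlp = French()  # blank French pipeline, tokenizer only

    # Elision prefixes (d', l', n') should come off as separate tokens.
    print([t.text for t in nlp("Je n'ai pas vu l'avion.")])

    # "Est-il" should split into "Est" + "-il" via the new hyphen suffix rule,
    # while "peut-être" is listed as an exception and should stay one token.
    print([t.text for t in nlp("Est-il là ? Oui, peut-être.")])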
spacy/lang/fr/__init__.py:
@@ -2,7 +2,8 @@
 from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -27,6 +28,7 @@ class FrenchDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
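For orientation, and not part of the commit: a sketch of how these Defaults attributes are consumed in spaCy v2. Each list is compiled into a single regex when the tokenizer is created, so wiring up prefixes = TOKENIZER_PREFIXES here is what activates the new elision rule defined in punctuation.py below.

    from spacy.lang.fr import French
    from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

    defaults = French.Defaults
    prefix_re = compile_prefix_regex(defaults.prefixes)  # now includes the elision rule
    suffix_re = compile_suffix_regex(defaults.suffixes)  # now includes the hyphen rule
    infix_re = compile_infix_regex(defaults.infixes)

    # The prefix regex is applied to the start of each remaining substring,
    # so it should now peel "l'" off the front of "l'avion".
    print(prefix_re.search("l'avion"))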
spacy/lang/fr/punctuation.py:
@@ -1,15 +1,24 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..punctuation import TOKENIZER_INFIXES
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
 from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import merge_chars
 
 
-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
-HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "")
+ELISION = "' ’".replace(" ", "")
+HYPHENS = r"- – — ‐ ‑".replace(" ", "")
+_prefixes_elision = "d l n"
+_prefixes_elision += " " + _prefixes_elision.upper()
+_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
+_hyphen_suffixes += " " + _hyphen_suffixes.upper()
 
 
+_prefixes = TOKENIZER_PREFIXES + [
+    r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision))
+]
+
 _suffixes = (
     LIST_PUNCT
     + LIST_ELLIPSES
@@ -17,7 +26,6 @@ _suffixes = (
     + [
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
-        r"(?<=[0-9])°[FfCcKk]",  # 4°C -> ["4", "°C"]
         r"(?<=[0-9])%",  # 4% -> ["4", "%"]
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
@@ -25,14 +33,15 @@
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+        r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)),
     ]
 )
 
-
 _infixes = TOKENIZER_INFIXES + [
     r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
 ]
 
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
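A standalone re-creation of the new elision prefix rule, only to show what the pattern matches: ALPHA is simplified here, and merge_chars is a local stand-in for the helper imported from ..char_classes (it joins the space-separated alternatives with "|").

    import re

    ALPHA = "a-zA-Zàâéèêëîïôöûüçœ"  # simplified; spaCy's ALPHA covers far more
    ELISION = "'’"
    merge_chars = lambda s: s.strip().replace(" ", "|")

    _prefixes_elision = "d l n"
    _prefixes_elision += " " + _prefixes_elision.upper()

    pattern = r"(?:({pe})[{el}])(?=[{a}])".format(
        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
    )

    # As a prefix rule the match is anchored at the start of the string,
    # so "d'" and "l'" are split off while other words are left alone.
    print(re.match(pattern, "d'abord"))  # matches "d'"
    print(re.match(pattern, "l'avion"))  # matches "l'"
    print(re.match(pattern, "avion"))    # None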
spacy/lang/fr/tokenizer_exceptions.py:
@@ -9,7 +9,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA, TAG
 
 # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
-# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
+#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
 FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
 
 
@@ -56,7 +56,28 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [exc_data]
 
 
-for orth in ["etc."]:
+for orth in [
+    "après-midi",
+    "au-delà",
+    "au-dessus",
+    "celle-ci",
+    "celles-ci",
+    "celui-ci",
+    "cf.",
+    "ci-dessous",
+    "elle-même",
+    "en-dessous",
+    "etc.",
+    "jusque-là",
+    "lui-même",
+    "MM.",
+    "No.",
+    "peut-être",
+    "pp.",
+    "quelques-uns",
+    "rendez-vous",
+    "Vol.",
+]:
     _exc[orth] = [{ORTH: orth}]
 
 
@@ -72,7 +93,7 @@ for verb, verb_lemma in [
         for pronoun in ["elle", "il", "on"]:
             token = "{}-t-{}".format(orth, pronoun)
             _exc[token] = [
-                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+                {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"},
                 {LEMMA: "t", ORTH: "-t"},
                 {LEMMA: pronoun, ORTH: "-" + pronoun},
             ]
@@ -81,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]:
     for orth in [verb, verb.title()]:
         token = "{}-ce".format(orth)
         _exc[token] = [
-            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+            {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"},
             {LEMMA: "ce", ORTH: "-ce"},
         ]
 
@@ -89,12 +110,29 @@ for verb, verb_lemma in [("est", "être")]:
 for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
     for orth in [pre, pre.title()]:
         _exc["%sest-ce" % orth] = [
-            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
-            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
+            {LEMMA: pre_lemma, ORTH: orth},
+            {LEMMA: "être", ORTH: "est"},
             {LEMMA: "ce", ORTH: "-ce"},
         ]
 
 
+for verb, pronoun in [("est", "il"), ("EST", "IL")]:
+    token = "{}-{}".format(verb, pronoun)
+    _exc[token] = [
+        {LEMMA: "être", ORTH: verb},
+        {LEMMA: pronoun, ORTH: "-" + pronoun},
+    ]
+
+
+for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
+    token = "{}'{}-{}".format(s, verb, pronoun)
+    _exc[token] = [
+        {LEMMA: "se", ORTH: s + "'"},
+        {LEMMA: "être", ORTH: verb},
+        {LEMMA: pronoun, ORTH: "-" + pronoun},
+    ]
+
+
 _infixes_exc = []
 orig_elision = "'"
 orig_hyphen = "-"
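A minimal illustration of the exception format used throughout this file, shown outside the diff: the dictionary key is the exact surface form, and the ORTH values of its sub-tokens must concatenate back to that key, which is how "est-il" can be emitted as "est" + "-il" without losing the original text. The LEMMA values only annotate the sub-tokens.

    from spacy.symbols import LEMMA, ORTH

    exc = {
        "est-il": [
            {LEMMA: "être", ORTH: "est"},
            {LEMMA: "il", ORTH: "-il"},
        ]
    }

    # The ORTH pieces must add back up to the key for a valid special case.
    for key, subtokens in exc.items():
        assert "".join(t[ORTH] for t in subtokens) == key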