mirror of https://github.com/explosion/spaCy.git
synced 2025-06-05 05:33:15 +03:00

works

modified: __init__.py
modified: punctuation.py
modified: tokenizer_exceptions.py

This commit is contained in:
parent 5a3928fe1e
commit ef4c65598a
spacy/lang/fr/__init__.py

@@ -5,10 +5,14 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import FrenchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .punctuation import (
+    TOKENIZER_INFIXES,
+    TOKENIZER_PREFIXES,
+    TOKENIZER_SUFFIXES,
+)
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class FrenchDefaults(BaseDefaults):
@@ -16,7 +20,6 @@ class FrenchDefaults(BaseDefaults):
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    token_match = TOKEN_MATCH
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
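For context (a note added here, not part of the commit): token_match short-circuits the tokenizer for whole substrings, so while it was set, strings like "a-t-il" were kept in one piece. A minimal sketch of that semantics, assuming spaCy v3's assignable Tokenizer.token_match and an illustrative pattern:

    import re
    from spacy.lang.fr import French

    nlp = French()
    # while token_match hits, the matched string is never split further
    nlp.tokenizer.token_match = re.compile(r"a-t-il").match
    print([t.text for t in nlp("a-t-il")])  # expected: ['a-t-il']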
@@ -47,7 +50,12 @@ def make_lemmatizer(
     scorer: Optional[Callable],
 ):
     return FrenchLemmatizer(
-        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        mode=mode,
+        overwrite=overwrite,
+        scorer=scorer,
     )
 
 
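With token_match gone, elision and inversion are handled by the prefix and infix rules in punctuation.py instead. A quick way to check the result (a usage sketch, assuming this branch is installed) is Tokenizer.explain, which reports the rule behind each split:

    from spacy.lang.fr import French

    nlp = French()
    # "l'" should come off via the elision prefix rule,
    # and "a-t-il" should split on the inversion infix
    print(nlp.tokenizer.explain("l'arbre"))
    print(nlp.tokenizer.explain("a-t-il"))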
spacy/lang/fr/punctuation.py

@@ -7,29 +7,80 @@ from ..char_classes import (
     LIST_ELLIPSES,
     LIST_PUNCT,
     LIST_QUOTES,
     LIST_ICONS,
     UNITS,
     merge_chars,
 )
 from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 
-ELISION = "' ’".replace(" ", "")
+ELISION = " ' ` ´ ’ ".replace(" ", "")
 HYPHENS = r"- – — ‐ ‑".replace(" ", "")
-_prefixes_elision = "d l n"
-_prefixes_elision += " " + _prefixes_elision.upper()
-_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
-_hyphen_suffixes += " " + _hyphen_suffixes.upper()
-
-
-_prefixes = TOKENIZER_PREFIXES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
-
-_suffixes = (
+
+# fmt: off
+_suffix_inversion = [
+    "je", "tu", "on", "il", "elle", "iel",
+    "nous", "vous", "elles", "ils", "iels",
+    "moi", "toi", "lui", "leur", "eux",
+    # to avoid matching: Villar-le-bois
+    fr"la(?![{HYPHENS}])",
+    fr"le(?![{HYPHENS}])",
+    fr"les(?![{HYPHENS}])",
+    fr"en(?![{HYPHENS}])", "y",
+    # a-t-on, a-t'on
+    fr"t[{HYPHENS}]??[{ELISION}]?",
+    fr"m[{ELISION}]?",
+    "là", "ici",
+]
+_prefix_elision = [
+    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
+    # "quelqu'un" is excluded because it is one token (not quelque + un, which would lack the sense of 'one person')
+    fr"quelqu(?![{ELISION}]un[ex]*\b)",
+    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
+]
+# fmt: on
+
+
+def upperandtitle(a):
+    """[alors, on] -> [alors, Alors, ALORS, on, On, ON]"""
+
+    def fn(i):
+        t = i[0].upper() + i[1:]
+        u = i.upper()
+        return [i, t] if t == u else [i, t, u]
+
+    return [x for y in [fn(i) for i in a] for x in y]
+
+
+_suffix_inversion = r"|".join(upperandtitle(_suffix_inversion))
+_prefix_elision = r"|".join(upperandtitle(_prefix_elision))
+
+_elision = rf"(?:\b(?:{_prefix_elision})[{ELISION}])"
+_inversion = (
+    rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{_suffix_inversion})\b)"
+)
+
+TOKENIZER_PREFIXES = [_elision]
+
+TOKENIZER_INFIXES = (
+    # base list without the hyphen regex
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+    # plus the conditional hyphen
+    + [_inversion]
+)
+
+TOKENIZER_SUFFIXES = (
+    # base list, plus hyphens and English things such as "'s"
     LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + LIST_ICONS
     + [
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
@@ -40,17 +91,5 @@ _suffixes = (
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
-        r"(?<=[{a}])[{h}]({hs})".format(
-            a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
-        ),
     ]
 )
-
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
-
-
-TOKENIZER_PREFIXES = _prefixes
-TOKENIZER_SUFFIXES = _suffixes
-TOKENIZER_INFIXES = _infixes
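The case-expansion helper and the elision regex above are easy to exercise on their own. A self-contained sketch (the subset of prefixes and the sample sentence are illustrative, not from the commit):

    import re

    def upperandtitle(a):
        """As in the diff: [alors, on] -> [alors, Alors, ALORS, on, On, ON]"""
        def fn(i):
            t = i[0].upper() + i[1:]
            u = i.upper()
            return [i, t] if t == u else [i, t, u]
        return [x for y in [fn(i) for i in a] for x in y]

    print(upperandtitle(["alors", "on"]))
    # ['alors', 'Alors', 'ALORS', 'on', 'On', 'ON']

    # the elision prefix rule, built from a small subset of _prefix_elision
    ELISION = " ' ` ´ ’ ".replace(" ", "")
    _prefix_elision = r"|".join(upperandtitle(["l", "d", "qu"]))
    _elision = rf"(?:\b(?:{_prefix_elision})[{ELISION}])"
    print(re.findall(_elision, "L'arbre qu'il a vu d'abord"))
    # ["L'", "qu'", "d'"]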
spacy/lang/fr/tokenizer_exceptions.py

@@ -1,36 +1,19 @@
-import re
 from ...util import update_exc
 from ...symbols import NORM, ORTH
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
-_hyphen = "-–—"
-_apostrophe = "'`´’"
-
-# fmt: off
-_suffix_inversion = r"|".join([
-    "je", "tu", "on", "il", "elle", "iel",
-    "nous", "vous", "elles", "ils", "iels",
-    # écoutons-les
-    "moi", "toi", "lui", "leur",
-    "eux",
-    fr"en(?![{_hyphen}])", "y",
-    # écoutons-les
-    fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])",
-    # a-t-il, pourra-t'on, dis-m'en plus
-    fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?",
-    "là", "ici",
-])
-_prefix_elision = r"|".join([
-    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
-    # "quelqu'un" is excluded because it is one token (not quelque + un, which would lack the sense of 'one person')
-    fr"quelqu(?![{_apostrophe}]un[ex]*\b)",  # quelque
-    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
-])
-# fmt: on
+_exc = {
+    "St": [{ORTH: "St", NORM: "Saint"}],
+    "Ste": [{ORTH: "Ste", NORM: "Sainte"}],
+    "Mme": [{ORTH: "Mme", NORM: "Madame"}],
+    "Mr.": [{ORTH: "Mr.", NORM: "Monsieur"}],
+    "M.": [{ORTH: "M.", NORM: "Monsieur"}],
+    "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}],
+    "Dr": [{ORTH: "Dr", NORM: "Docteur"}],
+    "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}],
+    "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}],
+    "etc": [{ORTH: "etc", NORM: "etcaetera"}],
+}
-
-_elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])"
-_inversion = (
-    rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)"
-)
-
-TOKEN_MATCH = re.compile(
-    r"(?iu)" + r"|".join([_inversion, _elision])
-)
+
+# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"]
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
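At runtime, each _exc entry keeps its key as a single token and attaches the given NORM. A usage sketch (assuming this branch is installed; the sentence is illustrative):

    from spacy.lang.fr import French

    nlp = French()
    doc = nlp("Dr Dupont habite chez Mme Martin.")
    print([(t.text, t.norm_) for t in doc])
    # expected to include ("Dr", "Docteur") and ("Mme", "Madame")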