spaCy/spacy/lang/fr/tokenizer_exceptions.py
thjbbvlt ef4c65598a works
modifié :         __init__.py
modifié :         punctuation.py
modifié :         tokenizer_exceptions.py
2024-03-15 11:55:27 +01:00

20 lines
653 B
Python

from ...util import update_exc
from ...symbols import NORM, ORTH
from ..tokenizer_exceptions import BASE_EXCEPTIONS
# French-specific tokenizer special cases, keyed by the exact surface string.
# Each value is the list of token-attribute dicts for that string; every entry
# here is a single token carrying ORTH (the verbatim text) and NORM (the
# expanded form of the abbreviation).
#
# Invariant: the ORTH values of an entry must concatenate to exactly its key,
# including any trailing period. spaCy's `update_exc` rejects a mismatch with
# a ValueError, which would make this module fail at import time.
_exc = {
    "St": [{ORTH: "St", NORM: "Saint"}],
    "Ste": [{ORTH: "Ste", NORM: "Sainte"}],
    "Mme": [{ORTH: "Mme", NORM: "Madame"}],
    # ORTH keeps the trailing period so that it matches the key verbatim.
    "Mr.": [{ORTH: "Mr.", NORM: "Monsieur"}],
    "M.": [{ORTH: "M.", NORM: "Monsieur"}],
    "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}],
    "Dr": [{ORTH: "Dr", NORM: "Docteur"}],
    "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}],
    "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}],
    # NOTE(review): "etcaetera" is not a standard spelling ("et cetera" /
    # "et cætera") — confirm the intended NORM before relying on it.
    "etc": [{ORTH: "etc", NORM: "etcaetera"}],
}
# Module-level exception table for French: the shared BASE_EXCEPTIONS combined
# with the French-specific entries in _exc by update_exc.
# NOTE(review): presumably entries in _exc override base entries that share a
# key — confirm against update_exc.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)