Add tokenizer exceptions for a.m. and p.m. in Spanish

2025-10-29 06:57:49 +03:00 · 2016-12-21 18:19:10 +01:00 · 2016-12-21 18:19:10 +01:00 · 3c87c71d43
commit 3c87c71d43
parent d1a2846750
1 changed files with 33 additions and 0 deletions
--- a/spacy/es/language_data.py
+++ b/spacy/es/language_data.py
@ -3,12 +3,45 @@ from __future__ import unicode_literals
 from .. import language_data as base
 from ..language_data import update_exc, strings_to_exc
 from ..symbols import ORTH, LEMMA
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 def get_time_exc(hours):
    exc = {
        "12m.": [
            {ORTH: "12"},
            {ORTH: "m.", LEMMA: "p.m."}
        ]
    }
    for hour in hours:
        exc["%da.m." % hour] = [
            {ORTH: hour},
            {ORTH: "a.m."}
        ]
        exc["%dp.m." % hour] = [
            {ORTH: hour},
            {ORTH: "p.m."}
        ]
        exc["%dam" % hour] = [
            {ORTH: hour},
            {ORTH: "am", LEMMA: "a.m."}
        ]
        exc["%dpm" % hour] = [
            {ORTH: hour},
            {ORTH: "pm", LEMMA: "p.m."}
        ]
    return exc
 # Rebind to a fresh dict built from the imported mapping, so that the
 # update below is applied to this module's own copy.
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 # Add the time exceptions for hours 1 through 12 (range end is exclusive,
 # hence 12 + 1). NOTE(review): update_exc is defined in ..language_data;
 # presumably it merges the second mapping into the first -- confirm there.
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 # Rebind to a new set built from the imported stop words.
 STOP_WORDS = set(STOP_WORDS)