Move English time exceptions ("1a.m." etc.) and refactor

2025-11-17 16:26:09 +03:00 · 2017-03-12 13:44:10 +01:00 · 2017-03-12 13:44:10 +01:00 · ce9568af84
commit ce9568af84
parent 6b30541774
2 changed files with 17 additions and 27 deletions
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@ -12,39 +12,12 @@ from .lemma_rules import LEMMA_RULES
 from .morph_rules import MORPH_RULES
 def get_time_exc(hours):
    """Build tokenizer exceptions that split an hour from its am/pm suffix.

    For every entry in ``hours`` four keys are produced ("<hour>a.m.",
    "<hour>p.m.", "<hour>am", "<hour>pm"), each mapped to a two-token
    analysis: the hour itself followed by the period marker. Only the
    undotted forms "am" / "pm" carry an explicit LEMMA, pointing at the
    dotted spelling.

    hours: iterable of hour values; each is interpolated into the key with
        ``%s`` and used unchanged as the ORTH of the first token.
    returns: a new dict mapping each combined string to its list of token
        attribute dicts.
    """
    # (key template, period ORTH, explicit lemma or None), in the order the
    # keys are inserted for each hour.
    period_specs = (
        ("%sa.m.", "a.m.", None),
        ("%sp.m.", "p.m.", None),
        ("%sam", "am", "a.m."),
        ("%spm", "pm", "p.m."),
    )
    time_exc = {}
    for hour in hours:
        for key_template, period_orth, period_lemma in period_specs:
            period_token = {ORTH: period_orth}
            if period_lemma is not None:
                period_token[LEMMA] = period_lemma
            time_exc[key_template % hour] = [{ORTH: hour}, period_token]
    return time_exc
 # Rebind each name to a fresh dict/set built from its current value, so that
 # later changes to these containers do not touch the objects the names were
 # originally bound to (the copies are shallow: nested values are shared).
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 # Layer further exception groups onto the table, one group per call.
 # NOTE(review): update_exc, strings_to_exc and expand_exc are defined outside
 # this view; since update_exc's return value is discarded it presumably
 # merges its second argument into the first in place -- confirm.
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 # Time exceptions for the hour strings "1" through "12".
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
    ['%d' % hour for hour in range(1, 12 + 1)]))
 # expand_exc receives the table as it stands at this point, so only entries
 # added above are visible to it; presumably it derives variants that use the
 # typographic apostrophe instead of the ASCII one -- TODO confirm.
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@ -337,6 +337,23 @@ for exc_data in [
        ]
 # Times
 # Register "<hour><period>" for hours 1-12 as two tokens: the hour, then the
 # period marker. Dotted and undotted spellings share the dotted lemma.
 for h in range(1, 13):
    hour = str(h)
    # Underscore-prefixed loop names keep these helpers out of star-imports.
    for _lemma, _spellings in (("a.m.", ["a.m.", "am"]),
                               ("p.m.", ["p.m.", "pm"])):
        for period in _spellings:
            EXC[hour + period] = [
                {ORTH: hour},
                {ORTH: period, LEMMA: _lemma},
            ]
 # Rest
 OTHER = {