diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py
index 54e9c8c32..e8082f308 100644
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@@ -12,39 +12,12 @@
 from .lemma_rules import LEMMA_RULES
 from .morph_rules import MORPH_RULES
 
 
-def get_time_exc(hours):
-    exc = {}
-    for hour in hours:
-        exc["%sa.m." % hour] = [
-            {ORTH: hour},
-            {ORTH: "a.m."}
-        ]
-
-        exc["%sp.m." % hour] = [
-            {ORTH: hour},
-            {ORTH: "p.m."}
-        ]
-
-        exc["%sam" % hour] = [
-            {ORTH: hour},
-            {ORTH: "am", LEMMA: "a.m."}
-        ]
-
-        exc["%spm" % hour] = [
-            {ORTH: hour},
-            {ORTH: "pm", LEMMA: "p.m."}
-        ]
-    return exc
-
-
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
-    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py
index cb0cbe5db..419d29f54 100644
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@@ -337,6 +337,23 @@ for exc_data in [
 ]
 
 
+# Times
+
+for h in range(1, 12 + 1):
+    hour = str(h)
+
+    for period in ["a.m.", "am"]:
+        EXC[hour + period] = [
+            {ORTH: hour},
+            {ORTH: period, LEMMA: "a.m."}
+        ]
+    for period in ["p.m.", "pm"]:
+        EXC[hour + period] = [
+            {ORTH: hour},
+            {ORTH: period, LEMMA: "p.m."}
+        ]
+
+
 # Rest
 
 OTHER = {
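
For reference, a minimal standalone sketch of the exception table the new loop builds. ORTH and LEMMA are plain-string stand-ins here so the snippet runs without spaCy installed; in the module itself they are the attribute symbols imported at the top of the file.

    # Hypothetical stand-ins for spaCy's ORTH/LEMMA attribute symbols.
    ORTH, LEMMA = "orth", "lemma"

    EXC = {}
    for h in range(1, 12 + 1):
        hour = str(h)
        for period in ["a.m.", "am"]:
            EXC[hour + period] = [{ORTH: hour}, {ORTH: period, LEMMA: "a.m."}]
        for period in ["p.m.", "pm"]:
            EXC[hour + period] = [{ORTH: hour}, {ORTH: period, LEMMA: "p.m."}]

    # "3pm" is split into two tokens, with "pm" lemmatised to "p.m.":
    assert EXC["3pm"] == [{ORTH: "3"}, {ORTH: "pm", LEMMA: "p.m."}]
    assert len(EXC) == 12 * 4  # hours 1-12, four surface forms each

Unlike the removed get_time_exc helper, the new loop also assigns a LEMMA to the punctuated forms ("a.m.", "p.m."), so all four surface variants of each time share the same lemma.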