Mirror of https://github.com/explosion/spaCy.git
Fix Issue #736: Times were being tokenized with incorrect string values.
commit fba67fa342 (parent a6790b6694)
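The root cause, shown as a minimal standalone sketch (this is not the real spaCy module; ORTH below is just a stand-in for spacy.symbols.ORTH): formatting the key with "%d" coerced the key to a string, but the token's ORTH attribute kept the raw integer hour, so the exception entries carried ints where the tokenizer expects strings. Passing pre-formatted hour strings, as this commit does at the call sites, fixes the ORTH values as well.

ORTH = "orth"  # stand-in for spacy.symbols.ORTH; sketch only, not the real module

def get_time_exc(hours):
    exc = {}
    for hour in hours:
        # "%s" builds the key for ints and strings alike, but the ORTH
        # value keeps whatever type `hour` already has.
        exc["%sa.m." % hour] = [{ORTH: hour}, {ORTH: "a.m."}]
    return exc

# Old call site: integer hours, so ORTH values end up as ints, not strings.
broken = get_time_exc(range(1, 12 + 1))
print(type(broken["3a.m."][0][ORTH]))   # <class 'int'>

# Fixed call site: stringify the hours first, as in this commit.
fixed = get_time_exc(['%d' % hour for hour in range(1, 12 + 1)])
print(type(fixed["3a.m."][0][ORTH]))    # <class 'str'>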
@@ -15,25 +15,26 @@ from .morph_rules import MORPH_RULES
 
 def get_time_exc(hours):
     exc = {}
     for hour in hours:
-        exc["%da.m." % hour] = [
+        exc["%sa.m." % hour] = [
             {ORTH: hour},
             {ORTH: "a.m."}
         ]
 
-        exc["%dp.m." % hour] = [
+        exc["%sp.m." % hour] = [
             {ORTH: hour},
             {ORTH: "p.m."}
         ]
 
-        exc["%dam" % hour] = [
+        exc["%sam" % hour] = [
             {ORTH: hour},
             {ORTH: "am", LEMMA: "a.m."}
         ]
 
-        exc["%dpm" % hour] = [
+        exc["%spm" % hour] = [
             {ORTH: hour},
             {ORTH: "pm", LEMMA: "p.m."}
         ]
 
+    print(exc)
     return exc
@@ -43,7 +44,8 @@ STOP_WORDS = set(STOP_WORDS)
 
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
+    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
@@ -20,18 +20,18 @@ for pron in ["i"]:
         ]
 
         EXC[orth + "m"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
         ]
 
         EXC[orth + "'ma"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'m", LEMMA: "be", NORM: "am"},
             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
         ]
 
         EXC[orth + "ma"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", NORM: "am"},
             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
         ]
@@ -150,8 +150,8 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
 
         EXC[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
         ]
 
         EXC[orth + "llve"] = [
@@ -237,11 +237,13 @@ for verb_data in [
     ]
 
     EXC[data[ORTH] + "n't've"] = [
         dict(data),
         {ORTH: "n't", LEMMA: "not", TAG: "RB"},
         {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
     ]
 
     EXC[data[ORTH] + "ntve"] = [
         dict(data),
         {ORTH: "nt", LEMMA: "not", TAG: "RB"},
         {ORTH: "ve", LEMMA: "have", TAG: "VB"}
     ]
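All of the contraction fixes above restore the invariant that the new update_exc validation (last hunk of this commit) enforces: the exception key must be the exact concatenation of its tokens' ORTH values. A hedged illustration, with hypothetical values standing in for the loop variables orth, pron and word:

ORTH = "orth"  # stand-in symbol; the values below are hypothetical examples
orth, pron, word = "I", "i", "who"

# The key "Im" was described with the lowercase pron, so the ORTH values
# concatenated to "im" and no longer matched the key.
key = orth + "m"
old_tokens = [{ORTH: pron}, {ORTH: "m"}]
new_tokens = [{ORTH: orth}, {ORTH: "m"}]
print(key == "".join(t[ORTH] for t in old_tokens))  # False
print(key == "".join(t[ORTH] for t in new_tokens))  # True

# Likewise for "'ll've": without the apostrophes the pieces joined
# to "whollve", which never matches the key "who'll've".
key = word + "'ll've"
print(key == word + "ll" + "ve")    # False
print(key == word + "'ll" + "'ve")  # True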
@@ -18,22 +18,22 @@ def get_time_exc(hours):
     }
 
     for hour in hours:
-        exc["%da.m." % hour] = [
+        exc["%sa.m." % hour] = [
             {ORTH: hour},
             {ORTH: "a.m."}
         ]
 
-        exc["%dp.m." % hour] = [
+        exc["%sp.m." % hour] = [
             {ORTH: hour},
             {ORTH: "p.m."}
         ]
 
-        exc["%dam" % hour] = [
+        exc["%sam" % hour] = [
             {ORTH: hour},
             {ORTH: "am", LEMMA: "a.m."}
         ]
 
-        exc["%dpm" % hour] = [
+        exc["%spm" % hour] = [
             {ORTH: hour},
             {ORTH: "pm", LEMMA: "p.m."}
         ]
@@ -45,7 +45,8 @@ STOP_WORDS = set(STOP_WORDS)
 
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
+    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
@@ -3,6 +3,11 @@ from __future__ import unicode_literals
 
 from ..symbols import *
 
+try:
+    unicode
+except:
+    unicode = str
+
 
 PRON_LEMMA = "-PRON-"
 DET_LEMMA = "-DET-"
@@ -10,6 +15,15 @@ ENT_ID = "ent_id"
 
 
 def update_exc(exc, additions):
+    for orth, token_attrs in additions.items():
+        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
+            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
+            raise ValueError(msg % (orth, token_attrs))
+        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
+        if orth != described_orth:
+            # TODO: Better error
+            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
+            raise ValueError(msg % (orth, described_orth))
     overlap = set(exc.keys()).intersection(set(additions))
     assert not overlap, overlap
     exc.update(additions)
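A rough usage sketch of the validation added here (standalone, with ORTH as a stand-in symbol and the Python 3 fallback handled via NameError), showing how it rejects exactly the kind of entry that caused #736:

ORTH = "orth"  # stand-in for spacy.symbols.ORTH; sketch only
try:
    unicode
except NameError:
    unicode = str  # Python 3 fallback, mirroring the commit's try/except

def update_exc(exc, additions):
    for orth, token_attrs in additions.items():
        # Every ORTH value must be a string (the #736 entries were ints).
        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, token_attrs))
        # The key must be the exact concatenation of the ORTH values.
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if orth != described_orth:
            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, described_orth))
    overlap = set(exc.keys()).intersection(set(additions))
    assert not overlap, overlap
    exc.update(additions)

exc = {}
update_exc(exc, {"3a.m.": [{ORTH: "3"}, {ORTH: "a.m."}]})      # accepted
try:
    update_exc(exc, {"3p.m.": [{ORTH: 3}, {ORTH: "p.m."}]})    # int ORTH, as before the fix
except ValueError as err:
    print(err)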