Fix Issue #736: Times were being tokenized with incorrect string values.

Matthew Honnibal 2017-01-12 11:21:01 +01:00
parent a6790b6694
commit fba67fa342
4 changed files with 34 additions and 15 deletions
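
The bug, in short: both language modules called get_time_exc(range(1, 12 + 1)), so each hour arrived as an int and was stored in the {ORTH: hour} attribute dicts as an int, where the tokenizer expects a unicode string. Below is a minimal sketch of the pre-fix behaviour; ORTH is a stand-in string key here (in spaCy it is a symbol constant), and get_time_exc_old is a hypothetical name for the old helper:

ORTH = "orth"  # stand-in for spaCy's symbol constant

def get_time_exc_old(hours):
    # Pre-fix shape: "%d" works because hour is an int...
    exc = {}
    for hour in hours:
        exc["%da.m." % hour] = [
            {ORTH: hour},     # ...but the token text ends up as the int itself
            {ORTH: "a.m."}
        ]
    return exc

exc = get_time_exc_old(range(1, 12 + 1))
print(exc["1a.m."])  # [{'orth': 1}, {'orth': 'a.m.'}] -- ORTH value is an int

The dictionary key "1a.m." is a proper string either way, which is why the mistake was easy to miss: only the token attributes were wrong.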

View File

@@ -15,25 +15,26 @@ from .morph_rules import MORPH_RULES
 def get_time_exc(hours):
     exc = {}
     for hour in hours:
-        exc["%da.m." % hour] = [
+        exc["%sa.m." % hour] = [
             {ORTH: hour},
             {ORTH: "a.m."}
         ]
 
-        exc["%dp.m." % hour] = [
+        exc["%sp.m." % hour] = [
             {ORTH: hour},
             {ORTH: "p.m."}
         ]
 
-        exc["%dam" % hour] = [
+        exc["%sam" % hour] = [
             {ORTH: hour},
             {ORTH: "am", LEMMA: "a.m."}
         ]
 
-        exc["%dpm" % hour] = [
+        exc["%spm" % hour] = [
             {ORTH: hour},
             {ORTH: "pm", LEMMA: "p.m."}
         ]
+    print(exc)
     return exc

@@ -43,7 +44,8 @@ STOP_WORDS = set(STOP_WORDS)
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
+    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", ""))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
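
Two coordinated changes fix this: the call site now passes the hours as strings ('1' through '12'), and the format specifier switches from %d to %s, since "%d" % "1" would raise a TypeError. A quick check of the fixed combination, with the same stand-in ORTH key as above:

ORTH = "orth"  # stand-in, as above

def get_time_exc(hours):
    # Post-fix shape: hours are strings, so the token texts are strings too.
    exc = {}
    for hour in hours:
        exc["%sa.m." % hour] = [
            {ORTH: hour},
            {ORTH: "a.m."}
        ]
    return exc

exc = get_time_exc(['%d' % hour for hour in range(1, 12 + 1)])
print(exc["1a.m."])  # [{'orth': '1'}, {'orth': 'a.m.'}] -- both values are str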

View File

@@ -20,18 +20,18 @@ for pron in ["i"]:
         ]
 
         EXC[orth + "m"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
         ]
 
         EXC[orth + "'ma"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'m", LEMMA: "be", NORM: "am"},
             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
         ]
 
         EXC[orth + "ma"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", NORM: "am"},
             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
         ]

@@ -150,8 +150,8 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
         EXC[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
         ]
 
         EXC[orth + "llve"] = [

@@ -237,11 +237,13 @@ for verb_data in [
        ]
 
        EXC[data[ORTH] + "n't've"] = [
+           dict(data),
            {ORTH: "n't", LEMMA: "not", TAG: "RB"},
            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
        ]
 
        EXC[data[ORTH] + "ntve"] = [
+           dict(data),
            {ORTH: "nt", LEMMA: "not", TAG: "RB"},
            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
        ]
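
The changes in this file all restore the same invariant that the update_exc validation added below enforces: an exception's key must equal the concatenation of its tokens' ORTH values. Previously the pronoun entries used the lowercase pron rather than the (possibly title-cased) orth as the first token's text, the "'ll've" entries dropped the apostrophes from their suffix tokens, and the "n't've"/"ntve" entries lacked the leading verb token (now supplied as dict(data)). A small sketch of that invariant; check_exception is a hypothetical helper and "orth" a stand-in key:

def check_exception(key, token_attrs):
    # Joining the token texts must rebuild the key exactly.
    return key == ''.join(attr["orth"] for attr in token_attrs)

# First token used the lowercase pronoun, so the title-cased key failed:
print(check_exception("Im", [{"orth": "i"}, {"orth": "m"}]))   # False: old entry
print(check_exception("Im", [{"orth": "I"}, {"orth": "m"}]))   # True: fixed entry

# Suffix tokens lost their apostrophes:
print(check_exception("who'll've",
      [{"orth": "who"}, {"orth": "ll"}, {"orth": "ve"}]))      # False: old entry
print(check_exception("who'll've",
      [{"orth": "who"}, {"orth": "'ll"}, {"orth": "'ve"}]))    # True: fixed entry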

View File

@@ -18,22 +18,22 @@ def get_time_exc(hours):
     }
     for hour in hours:
-        exc["%da.m." % hour] = [
+        exc["%sa.m." % hour] = [
             {ORTH: hour},
             {ORTH: "a.m."}
         ]
 
-        exc["%dp.m." % hour] = [
+        exc["%sp.m." % hour] = [
             {ORTH: hour},
             {ORTH: "p.m."}
         ]
 
-        exc["%dam" % hour] = [
+        exc["%sam" % hour] = [
             {ORTH: hour},
             {ORTH: "am", LEMMA: "a.m."}
         ]
 
-        exc["%dpm" % hour] = [
+        exc["%spm" % hour] = [
             {ORTH: hour},
             {ORTH: "pm", LEMMA: "p.m."}
         ]

@@ -45,7 +45,8 @@ STOP_WORDS = set(STOP_WORDS)
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
+    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))

View File

@@ -3,6 +3,11 @@ from __future__ import unicode_literals
 from ..symbols import *
 
+try:
+    unicode
+except:
+    unicode = str
+
 
 PRON_LEMMA = "-PRON-"
 DET_LEMMA = "-DET-"

@@ -10,6 +15,15 @@ ENT_ID = "ent_id"
 def update_exc(exc, additions):
+    for orth, token_attrs in additions.items():
+        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
+            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
+            raise ValueError(msg % (orth, token_attrs))
+        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
+        if orth != described_orth:
+            # TODO: Better error
+            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
+            raise ValueError(msg % (orth, described_orth))
     overlap = set(exc.keys()).intersection(set(additions))
     assert not overlap, overlap
     exc.update(additions)
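
With these checks in place, the original bug fails loudly: an int ORTH value, or a key that doesn't match the joined token texts, now raises ValueError when the language data is loaded instead of silently registering a broken exception. A self-contained usage sketch of the new function, with a stand-in ORTH key in place of the symbol imported from ..symbols:

ORTH = "orth"  # stand-in for the symbol constant

try:
    unicode          # exists on Python 2
except NameError:
    unicode = str    # Python 3

def update_exc(exc, additions):
    for orth, token_attrs in additions.items():
        # Every token's ORTH must be a unicode string...
        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, token_attrs))
        # ...and the token texts must concatenate back to the key.
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if orth != described_orth:
            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, described_orth))
    overlap = set(exc.keys()).intersection(set(additions))
    assert not overlap, overlap
    exc.update(additions)

exc = {}
update_exc(exc, {"1a.m.": [{ORTH: "1"}, {ORTH: "a.m."}]})  # passes both checks
update_exc(exc, {"2a.m.": [{ORTH: 2}, {ORTH: "a.m."}]})    # raises ValueError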