mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Fix Issue #736: Times were being tokenized with incorrect string values.
This commit is contained in:
parent
a6790b6694
commit
fba67fa342
|
@ -15,25 +15,26 @@ from .morph_rules import MORPH_RULES
|
|||
def get_time_exc(hours):
    """Build tokenizer exceptions for clock times such as "3pm" or "11a.m.".

    hours: iterable of hour values; each one is formatted with "%s", so
        strings and integers are both accepted.
    Returns a dict mapping "<hour><suffix>" (suffixes "a.m.", "p.m.", "am",
        "pm") to a list of two token attribute dicts: one for the hour and
        one for the suffix. The "am"/"pm" suffix tokens carry the lemmas
        "a.m."/"p.m.".
    """
    exc = {}
    for hour in hours:
        # Store the hour as text: the ORTH value has to be a string that
        # matches the hour text used in the key, even if a caller passes ints.
        hour = "%s" % hour

        exc["%sa.m." % hour] = [
            {ORTH: hour},
            {ORTH: "a.m."}
        ]

        exc["%sp.m." % hour] = [
            {ORTH: hour},
            {ORTH: "p.m."}
        ]

        exc["%sam" % hour] = [
            {ORTH: hour},
            {ORTH: "am", LEMMA: "a.m."}
        ]

        exc["%spm" % hour] = [
            {ORTH: hour},
            {ORTH: "pm", LEMMA: "p.m."}
        ]
    return exc
|
||||
|
||||
|
||||
|
@ -43,7 +44,8 @@ STOP_WORDS = set(STOP_WORDS)
|
|||
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
|
||||
['%d' % hour for hour in range(1, 12 + 1)]))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
|
|
@ -20,18 +20,18 @@ for pron in ["i"]:
|
|||
]
|
||||
|
||||
EXC[orth + "m"] = [
|
||||
{ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
|
||||
]
|
||||
|
||||
EXC[orth + "'ma"] = [
|
||||
{ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", NORM: "am"},
|
||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}
|
||||
]
|
||||
|
||||
EXC[orth + "ma"] = [
|
||||
{ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "m", LEMMA: "be", NORM: "am"},
|
||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}
|
||||
]
|
||||
|
@ -150,8 +150,8 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
|||
|
||||
EXC[orth + "'ll've"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
||||
]
|
||||
|
||||
EXC[orth + "llve"] = [
|
||||
|
@ -237,11 +237,13 @@ for verb_data in [
|
|||
]
|
||||
|
||||
EXC[data[ORTH] + "n't've"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
||||
]
|
||||
|
||||
EXC[data[ORTH] + "ntve"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
||||
]
|
||||
|
|
|
@ -18,22 +18,22 @@ def get_time_exc(hours):
|
|||
}
|
||||
|
||||
for hour in hours:
|
||||
exc["%da.m." % hour] = [
|
||||
exc["%sa.m." % hour] = [
|
||||
{ORTH: hour},
|
||||
{ORTH: "a.m."}
|
||||
]
|
||||
|
||||
exc["%dp.m." % hour] = [
|
||||
exc["%sp.m." % hour] = [
|
||||
{ORTH: hour},
|
||||
{ORTH: "p.m."}
|
||||
]
|
||||
|
||||
exc["%dam" % hour] = [
|
||||
exc["%sam" % hour] = [
|
||||
{ORTH: hour},
|
||||
{ORTH: "am", LEMMA: "a.m."}
|
||||
]
|
||||
|
||||
exc["%dpm" % hour] = [
|
||||
exc["%spm" % hour] = [
|
||||
{ORTH: hour},
|
||||
{ORTH: "pm", LEMMA: "p.m."}
|
||||
]
|
||||
|
@ -45,7 +45,8 @@ STOP_WORDS = set(STOP_WORDS)
|
|||
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
|
||||
['%d' % hour for hour in range(1, 12 + 1)]))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
||||
|
|
|
@ -3,6 +3,11 @@ from __future__ import unicode_literals
|
|||
|
||||
from ..symbols import *
|
||||
|
||||
# Python 2/3 compatibility: Python 3 has no `unicode` builtin, so alias it to
# `str` there. Only NameError signals the missing name; catching anything
# broader could hide unrelated failures.
try:
    unicode
except NameError:
    unicode = str
|
||||
|
||||
|
||||
PRON_LEMMA = "-PRON-"
|
||||
DET_LEMMA = "-DET-"
|
||||
|
@ -10,6 +15,15 @@ ENT_ID = "ent_id"
|
|||
|
||||
|
||||
def update_exc(exc, additions):
    """Validate `additions` and merge them into `exc` in place.

    Each key of `additions` is an exception string and each value is a
    sequence of token attribute dicts. All entries are validated before
    `exc` is modified.

    exc: dict of existing exceptions; updated in place.
    additions: dict of new exceptions to add.
    Raises ValueError if any ORTH value is not a unicode string, or if the
        ORTH values of an entry do not concatenate to its key.
    Raises AssertionError if a key of `additions` is already present in `exc`.
    """
    for orth, token_attrs in additions.items():
        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, token_attrs))
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if orth != described_orth:
            # TODO: Better error
            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, described_orth))
    overlap = set(exc).intersection(additions)
    if overlap:
        # Raised explicitly rather than via `assert`, so the duplicate-key
        # check still runs when Python is started with -O.
        raise AssertionError(overlap)
    exc.update(additions)
|
||||
|
|
Loading…
Reference in New Issue
Block a user