Mirror of https://github.com/explosion/spaCy.git
Fix Issue #736: Times were being tokenized with incorrect string values.
This commit is contained in:
parent  a6790b6694
commit  fba67fa342
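The root cause: get_time_exc() built its keys with "%d" % hour while the call sites passed integers straight into the ORTH slot, so the hour tokens' ORTH values ended up as integers rather than strings. The fix formats the keys with "%s" and passes the hours in as strings. A minimal standalone sketch of the corrected behaviour, not the spaCy source (plain string keys stand in for the ORTH symbol imported from ..symbols):

# Minimal sketch, not the spaCy source: plain string keys stand in for
# the ORTH symbol.  Hours arrive as strings, so "%s" formatting keeps
# the key equal to the concatenation of the token ORTH values.
def get_time_exc(hours):
    exc = {}
    for hour in hours:                    # hour is a string such as "3"
        exc["%sa.m." % hour] = [
            {"ORTH": hour},               # "3" + "a.m." == "3a.m." (the key)
            {"ORTH": "a.m."},
        ]
    return exc

exc = get_time_exc(["%d" % hour for hour in range(1, 12 + 1)])
print(exc["3a.m."])                       # [{'ORTH': '3'}, {'ORTH': 'a.m.'}]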
@@ -15,25 +15,26 @@ from .morph_rules import MORPH_RULES
 def get_time_exc(hours):
     exc = {}
     for hour in hours:
-        exc["%da.m." % hour] = [
+        exc["%sa.m." % hour] = [
             {ORTH: hour},
             {ORTH: "a.m."}
         ]

-        exc["%dp.m." % hour] = [
+        exc["%sp.m." % hour] = [
             {ORTH: hour},
             {ORTH: "p.m."}
         ]

-        exc["%dam" % hour] = [
+        exc["%sam" % hour] = [
             {ORTH: hour},
             {ORTH: "am", LEMMA: "a.m."}
         ]

-        exc["%dpm" % hour] = [
+        exc["%spm" % hour] = [
             {ORTH: hour},
             {ORTH: "pm", LEMMA: "p.m."}
         ]
+    print(exc)
     return exc


@@ -43,7 +44,8 @@ STOP_WORDS = set(STOP_WORDS)

 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
+    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
@@ -20,18 +20,18 @@ for pron in ["i"]:
         ]

         EXC[orth + "m"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }
         ]

         EXC[orth + "'ma"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "'m", LEMMA: "be", NORM: "am"},
             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
         ]

         EXC[orth + "ma"] = [
-            {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
             {ORTH: "m", LEMMA: "be", NORM: "am"},
             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}
         ]
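In the hunk above, the pronoun token now uses orth (the surface form actually written in the key, e.g. "I") instead of pron (the lowercase base "i"). That matters because the ORTH values of an exception must spell out the key exactly. A rough illustration with plain string keys, assuming the capitalised variant is derived from the pronoun with something like pron.title():

# Illustration only, not the spaCy source.
EXC = {}
for pron in ["i"]:
    for orth in [pron, pron.title()]:   # assumed nesting: "i" and "I"
        EXC[orth + "'ma"] = [
            {"ORTH": orth},             # must be "I" for the key "I'ma"
            {"ORTH": "'m"},
            {"ORTH": "a"},
        ]

for key, tokens in EXC.items():
    # holds with orth; with pron the capitalised keys would fail
    assert "".join(t["ORTH"] for t in tokens) == key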
@@ -150,8 +150,8 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:

         EXC[orth + "'ll've"] = [
             {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}
+            {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
+            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
         ]

         EXC[orth + "llve"] = [
@@ -237,11 +237,13 @@ for verb_data in [
         ]

         EXC[data[ORTH] + "n't've"] = [
+            dict(data),
             {ORTH: "n't", LEMMA: "not", TAG: "RB"},
             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
         ]

         EXC[data[ORTH] + "ntve"] = [
+            dict(data),
             {ORTH: "nt", LEMMA: "not", TAG: "RB"},
             {ORTH: "ve", LEMMA: "have", TAG: "VB"}
         ]
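The two added dict(data), lines put a copy of the verb token itself back at the front of the "n't've"/"ntve" exceptions, and the "ll"/"ve" to "'ll"/"'ve" change a few hunks up serves the same purpose: the concatenated ORTH values have to spell out the exception key. A hedged sketch of that invariant with plain strings (the example verb entry is hypothetical):

# Hypothetical verb entry standing in for one element of verb_data.
data = {"ORTH": "could", "TAG": "MD"}

EXC = {}
EXC[data["ORTH"] + "n't've"] = [
    dict(data),                        # copy, so exceptions don't share one dict
    {"ORTH": "n't", "LEMMA": "not"},
    {"ORTH": "'ve", "LEMMA": "have"},
]

for key, tokens in EXC.items():
    # without the leading verb token, "n't" + "'ve" != "couldn't've"
    assert "".join(t["ORTH"] for t in tokens) == key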
@@ -18,22 +18,22 @@ def get_time_exc(hours):
     }

     for hour in hours:
-        exc["%da.m." % hour] = [
+        exc["%sa.m." % hour] = [
             {ORTH: hour},
             {ORTH: "a.m."}
         ]

-        exc["%dp.m." % hour] = [
+        exc["%sp.m." % hour] = [
             {ORTH: hour},
             {ORTH: "p.m."}
         ]

-        exc["%dam" % hour] = [
+        exc["%sam" % hour] = [
             {ORTH: hour},
             {ORTH: "am", LEMMA: "a.m."}
         ]

-        exc["%dpm" % hour] = [
+        exc["%spm" % hour] = [
             {ORTH: hour},
             {ORTH: "pm", LEMMA: "p.m."}
         ]

@@ -45,7 +45,8 @@ STOP_WORDS = set(STOP_WORDS)

 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
+    ['%d' % hour for hour in range(1, 12 + 1)]))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
@@ -3,6 +3,11 @@ from __future__ import unicode_literals

 from ..symbols import *

+try:
+    unicode
+except:
+    unicode = str
+

 PRON_LEMMA = "-PRON-"
 DET_LEMMA = "-DET-"
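The try/except above simply makes the name unicode resolve on Python 3, where the built-in no longer exists, so that the isinstance check added to update_exc below works under both Python 2 and 3. The idiom in isolation; catching NameError explicitly would be a slightly stricter variant of the bare except used here:

try:
    unicode                 # exists as a built-in on Python 2
except NameError:
    unicode = str           # on Python 3, treat str as the text type

assert isinstance(u"3", unicode)   # passes on both interpreters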
@@ -10,6 +15,15 @@ ENT_ID = "ent_id"


 def update_exc(exc, additions):
+    for orth, token_attrs in additions.items():
+        if not all(isinstance(attr[ORTH], unicode) for attr in token_attrs):
+            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
+            raise ValueError(msg % (orth, token_attrs))
+        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
+        if orth != described_orth:
+            # TODO: Better error
+            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
+            raise ValueError(msg % (orth, described_orth))
     overlap = set(exc.keys()).intersection(set(additions))
     assert not overlap, overlap
     exc.update(additions)
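With these checks, update_exc() now rejects any batch of exceptions whose token ORTH values are not all strings or do not re-spell the key, which is exactly the kind of mismatch the "%d"/"%s" and pron/orth fixes above remove. A rough usage sketch (standalone, Python 3 only, with ORTH as a plain string rather than the spaCy symbol):

ORTH = "ORTH"   # stand-in for the symbol imported from ..symbols

def update_exc(exc, additions):
    for orth, token_attrs in additions.items():
        if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
            msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, token_attrs))
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if orth != described_orth:
            msg = "Invalid tokenizer exception: key='%s', orths='%s'"
            raise ValueError(msg % (orth, described_orth))
    overlap = set(exc.keys()).intersection(set(additions))
    assert not overlap, overlap
    exc.update(additions)

exc = {}
update_exc(exc, {"1a.m.": [{ORTH: "1"}, {ORTH: "a.m."}]})        # accepted
try:
    update_exc(exc, {"2a.m.": [{ORTH: 2}, {ORTH: "a.m."}]})      # int ORTH
except ValueError as err:
    print(err)   # Invalid value for ORTH in exception: key='2a.m.', ...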