Add day of month tokenizer exceptions for Danish.

This commit is contained in:
Søren Lind Kristiansen 2017-11-24 15:03:24 +01:00
parent 0c276ed020
commit 6aa241bcec
2 changed files with 11 additions and 0 deletions

View File

@ -117,6 +117,12 @@ for orth in [
"øv.", "øvr.", "årg.", "årh.", ""]:
_exc[orth] = [{ORTH: orth}]
# Dates: register each ordinal day of the month ("1." through "31.") as a
# single-token exception whose ORTH is the full string, period included.
for day in range(1, 32):
    ordinal = "%d." % day
    _exc[ordinal] = [{ORTH: ordinal}]
_custom_base_exc = {
"i.": [
{ORTH: "i", LEMMA: "i", NORM: "i"},

View File

@ -14,6 +14,11 @@ def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["1.", "10.", "31."])
def test_da_tokenizer_handles_dates(da_tokenizer, text):
    """A day-of-month ordinal such as "31." must come back as exactly one token."""
    result = da_tokenizer(text)
    assert len(result) == 1
def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
text = "Det er bl.a. ikke meningen"
tokens = da_tokenizer(text)