Add day of month tokenizer exceptions for Danish.

2025-11-07 03:17:37 +03:00 · 2017-11-24 15:03:24 +01:00 · 2017-11-24 15:03:24 +01:00 · 6aa241bcec
commit 6aa241bcec
parent 0c276ed020
2 changed files with 11 additions and 0 deletions
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -117,6 +117,12 @@ for orth in [
        "øv.", "øvr.", "årg.", "årh.", ""]:
    _exc[orth] = [{ORTH: orth}]

+# Dates: register "1." through "31." (day number + period) as one token each
+for h in range(1, 31 + 1):
+    for period in ["."]:  # "." is the only suffix listed
+        _exc["%d%s" % (h, period)] = [
+            {ORTH: "%d." % h}]
+
 _custom_base_exc = {
    "i.": [
        {ORTH: "i", LEMMA: "i", NORM: "i"},
--- a/spacy/tests/lang/da/test_exceptions.py
+++ b/spacy/tests/lang/da/test_exceptions.py
@@ -14,6 +14,11 @@ def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2

+@pytest.mark.parametrize('text', ["1.", "10.", "31."])
+def test_da_tokenizer_handles_dates(da_tokenizer, text):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == 1  # the number and its trailing period stay one token
+
 def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
    text = "Det er bl.a. ikke meningen"
    tokens = da_tokenizer(text)