Add day of month tokenizer exceptions for Danish.

Søren Lind Kristiansen 2017-11-24 15:03:24 +01:00
parent 0c276ed020
commit 6aa241bcec
2 changed files with 11 additions and 0 deletions

@@ -117,6 +117,12 @@ for orth in [
"øv.", "øvr.", "årg.", "årh.", ""]: "øv.", "øvr.", "årg.", "årh.", ""]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]

# Dates
for h in range(1, 31 + 1):
    for period in ["."]:
        _exc["%d%s" % (h, period)] = [
            {ORTH: "%d." % h}]

_custom_base_exc = {
    "i.": [
        {ORTH: "i", LEMMA: "i", NORM: "i"},

@@ -14,6 +14,11 @@ def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2

@pytest.mark.parametrize('text', ["1.", "10.", "31."])
def test_da_tokenizer_handles_dates(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1

def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
    text = "Det er bl.a. ikke meningen"
    tokens = da_tokenizer(text)
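The tests rely on a `da_tokenizer` fixture that is provided elsewhere in spaCy's test suite. If you want to try the assertions standalone, a rough equivalent could look like the sketch below; the fixture body here is an assumption for illustration, not part of this commit:

```python
import pytest
from spacy.lang.da import Danish


@pytest.fixture
def da_tokenizer():
    # Assumed stand-in for spaCy's shared fixture: the bare Danish tokenizer,
    # which is all these exception tests exercise.
    return Danish().tokenizer
```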