mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Add day of month tokenizer exceptions for Danish.
This commit is contained in:
parent
0c276ed020
commit
6aa241bcec
|
@ -117,6 +117,12 @@ for orth in [
|
|||
"øv.", "øvr.", "årg.", "årh.", ""]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
# Dates: register the day-of-month ordinals "1." through "31." as tokenizer
# exceptions, each mapped to a single token that keeps its trailing period.
for day in range(1, 32):
    ordinal = "%d." % day
    _exc[ordinal] = [{ORTH: ordinal}]
|
||||
|
||||
_custom_base_exc = {
|
||||
"i.": [
|
||||
{ORTH: "i", LEMMA: "i", NORM: "i"},
|
||||
|
|
|
@ -14,6 +14,11 @@ def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
|
|||
tokens = da_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
@pytest.mark.parametrize('text', ["1.", "10.", "31."])
def test_da_tokenizer_handles_dates(da_tokenizer, text):
    """A day-of-month ordinal such as "31." must tokenize as one token."""
    doc = da_tokenizer(text)
    assert len(doc) == 1
|
||||
|
||||
def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
|
||||
text = "Det er bl.a. ikke meningen"
|
||||
tokens = da_tokenizer(text)
|
||||
|
|
Loading…
Reference in New Issue
Block a user