mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Add day of month tokenizer exceptions for Danish.
This commit is contained in:
parent
0c276ed020
commit
6aa241bcec
|
@ -117,6 +117,12 @@ for orth in [
|
||||||
"øv.", "øvr.", "årg.", "årh.", ""]:
|
"øv.", "øvr.", "årg.", "årh.", ""]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
# Dates: register day-of-month numbers followed by a period ("1." .. "31.")
# as tokenizer exceptions so that each is kept as one token.
for h in range(1, 31 + 1):
    for period in ["."]:
        # Build key and ORTH from the same format so they cannot drift apart
        # if more suffixes are ever added to the list above; the ORTH text
        # must spell out exactly the string being matched.
        _exc["%d%s" % (h, period)] = [
            {ORTH: "%d%s" % (h, period)}]
|
||||||
|
|
||||||
_custom_base_exc = {
|
_custom_base_exc = {
|
||||||
"i.": [
|
"i.": [
|
||||||
{ORTH: "i", LEMMA: "i", NORM: "i"},
|
{ORTH: "i", LEMMA: "i", NORM: "i"},
|
||||||
|
|
|
@ -14,6 +14,11 @@ def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
|
||||||
tokens = da_tokenizer(text)
|
tokens = da_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["1.", "10.", "31."])
def test_da_tokenizer_handles_dates(da_tokenizer, text):
    """A day-of-month number with its trailing period stays a single token."""
    doc = da_tokenizer(text)
    assert len(doc) == 1
|
||||||
|
|
||||||
def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
|
def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
|
||||||
text = "Det er bl.a. ikke meningen"
|
text = "Det er bl.a. ikke meningen"
|
||||||
tokens = da_tokenizer(text)
|
tokens = da_tokenizer(text)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user