mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Add weekday abbreviations and remove ambiguous month abbreviations for Danish.
This commit is contained in:
parent
056547e989
commit
0c276ed020
|
@ -11,20 +11,47 @@ from ...symbols import ORTH, LEMMA, NORM, TAG, ADP, PUNCT
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
# Abbreviations for weekdays "søn." (for "søndag") as well as "Tor." and "Tors."
|
||||||
|
# (for "torsdag") are left out because they are ambiguous. The same is the case
|
||||||
|
# for abbreviations "jul." and "Jul." ("juli").
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
|
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
|
||||||
{ORTH: "Jan.", LEMMA: "januar", NORM: "januar"},
|
{ORTH: "jan.", LEMMA: "januar"},
|
||||||
{ORTH: "Feb.", LEMMA: "februar", NORM: "februar"},
|
{ORTH: "febr.", LEMMA: "februar"},
|
||||||
{ORTH: "Mar.", LEMMA: "marts", NORM: "marts"},
|
{ORTH: "feb.", LEMMA: "februar"},
|
||||||
{ORTH: "Apr.", LEMMA: "april", NORM: "april"},
|
{ORTH: "mar.", LEMMA: "marts"},
|
||||||
{ORTH: "Maj.", LEMMA: "maj", NORM: "maj"},
|
{ORTH: "apr.", LEMMA: "april"},
|
||||||
{ORTH: "Jun.", LEMMA: "juni", NORM: "juni"},
|
{ORTH: "jun.", LEMMA: "juni"},
|
||||||
{ORTH: "Jul.", LEMMA: "juli", NORM: "juli"},
|
{ORTH: "aug.", LEMMA: "august"},
|
||||||
{ORTH: "Aug.", LEMMA: "august", NORM: "august"},
|
{ORTH: "sept.", LEMMA: "september"},
|
||||||
{ORTH: "Sep.", LEMMA: "september", NORM: "september"},
|
{ORTH: "sep.", LEMMA: "september"},
|
||||||
{ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"},
|
{ORTH: "okt.", LEMMA: "oktober"},
|
||||||
{ORTH: "Nov.", LEMMA: "november", NORM: "november"},
|
{ORTH: "nov.", LEMMA: "november"},
|
||||||
{ORTH: "Dec.", LEMMA: "december", NORM: "december"}]:
|
{ORTH: "dec.", LEMMA: "december"},
|
||||||
|
{ORTH: "man.", LEMMA: "mandag"},
|
||||||
|
{ORTH: "tirs.", LEMMA: "tirsdag"},
|
||||||
|
{ORTH: "ons.", LEMMA: "onsdag"},
|
||||||
|
{ORTH: "tor.", LEMMA: "torsdag"},
|
||||||
|
{ORTH: "tors.", LEMMA: "torsdag"},
|
||||||
|
{ORTH: "fre.", LEMMA: "fredag"},
|
||||||
|
{ORTH: "lør.", LEMMA: "lørdag"},
|
||||||
|
{ORTH: "Jan.", LEMMA: "januar"},
|
||||||
|
{ORTH: "Febr.", LEMMA: "februar"},
|
||||||
|
{ORTH: "Feb.", LEMMA: "februar"},
|
||||||
|
{ORTH: "Mar.", LEMMA: "marts"},
|
||||||
|
{ORTH: "Apr.", LEMMA: "april"},
|
||||||
|
{ORTH: "Jun.", LEMMA: "juni"},
|
||||||
|
{ORTH: "Aug.", LEMMA: "august"},
|
||||||
|
{ORTH: "Sept.", LEMMA: "september"},
|
||||||
|
{ORTH: "Sep.", LEMMA: "september"},
|
||||||
|
{ORTH: "Okt.", LEMMA: "oktober"},
|
||||||
|
{ORTH: "Nov.", LEMMA: "november"},
|
||||||
|
{ORTH: "Dec.", LEMMA: "december"},
|
||||||
|
{ORTH: "Man.", LEMMA: "mandag"},
|
||||||
|
{ORTH: "Tirs.", LEMMA: "tirsdag"},
|
||||||
|
{ORTH: "Ons.", LEMMA: "onsdag"},
|
||||||
|
{ORTH: "Fre.", LEMMA: "fredag"},
|
||||||
|
{ORTH: "Lør.", LEMMA: "lørdag"}]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
|
|
|
@ -9,6 +9,11 @@ def test_da_tokenizer_handles_abbr(da_tokenizer, text):
|
||||||
tokens = da_tokenizer(text)
|
tokens = da_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Jul.", "jul.", "Tor.", "Tors."])
|
||||||
|
def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
|
||||||
|
tokens = da_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
|
def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
|
||||||
text = "Det er bl.a. ikke meningen"
|
text = "Det er bl.a. ikke meningen"
|
||||||
tokens = da_tokenizer(text)
|
tokens = da_tokenizer(text)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user