Remove TAG from da/sv tokenizer exceptions (#5428)
Remove `TAG` value from Danish and Swedish tokenizer exceptions because it may not be included in a tag map (and these settings are problematic as tokenizer exceptions anyway).
parent 24e7108f80
commit 07639dd6ac
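After this change the exceptions only set ORTH, LEMMA and NORM. As a rough illustration (not part of the commit), the same kind of TAG-free special case can be registered on a tokenizer at runtime. This is only a sketch, assuming a spaCy installation with the Danish language data; it reuses the "og/eller" entry from the diff below and sets only ORTH and NORM:

# Sketch: register a tokenizer special case with no TAG value, mirroring
# the Danish "og/eller" exception changed in this commit.
import spacy
from spacy.symbols import ORTH, NORM

nlp = spacy.blank("da")
# Keep "og/eller" as a single token; only ORTH and NORM are set, no TAG.
nlp.tokenizer.add_special_case("og/eller", [{ORTH: "og/eller", NORM: "og/eller"}])

doc = nlp("enten og/eller begge")
print([t.text for t in doc])  # expected: ['enten', 'og/eller', 'begge']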
spacy/lang/da/tokenizer_exceptions.py
@@ -6,7 +6,7 @@ Source: https://forkortelse.dk/ and various others.
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
+from ...symbols import ORTH, LEMMA, NORM
 
 
 _exc = {}
 
@@ -52,7 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
-    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
@@ -577,7 +577,7 @@ for h in range(1, 31 + 1):
     for period in ["."]:
         _exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
 
-_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
+_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)
 
 TOKENIZER_EXCEPTIONS = _exc
spacy/lang/sv/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG
+from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
 
 _exc = {}
 
@@ -155,6 +155,6 @@ for orth in ABBREVIATIONS:
 # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
 # should be tokenized as two separate tokens.
 for orth in ["i", "m"]:
-    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: ".", TAG: PUNCT}]
+    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
 
 TOKENIZER_EXCEPTIONS = _exc
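The comment in the Swedish hunk describes the intended behaviour: a trailing "i." or "m." should split into two tokens, and after this commit the "." token no longer carries a TAG. A quick check of that behaviour, only as a sketch assuming a spaCy installation with the Swedish language data:

# Sketch: the Swedish exceptions above should split a trailing "m." into
# two tokens; the "." carries no TAG after this change.
import spacy

nlp = spacy.blank("sv")
doc = nlp("Vi sprang mer än 2000 m.")
# The final "m." should come out as two tokens, "m" and ".".
print([t.text for t in doc])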