mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Fix tokenization of 'i.' for Danish.
This commit is contained in:
parent
726fb2d0b5
commit
ac8116510d
|
@ -1,7 +1,7 @@
|
||||||
# encoding: utf8
|
# encoding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, NORM
|
from ...symbols import ORTH, LEMMA, NORM, TAG, ADP, PUNCT
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
@ -28,5 +28,12 @@ for orth in [
|
||||||
"t.o.m.", "vha.", ""]:
|
"t.o.m.", "vha.", ""]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
_custom_base_exc = {
|
||||||
|
"i.": [
|
||||||
|
{ORTH: "i", LEMMA: "i", NORM: "i"},
|
||||||
|
{ORTH: ".", TAG: PUNCT}]
|
||||||
|
}
|
||||||
|
_exc.update(_custom_base_exc)
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
Loading…
Reference in New Issue
Block a user