mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Fix tokenization of 'i.' for Danish.
This commit is contained in:
parent
726fb2d0b5
commit
ac8116510d
|
@ -1,7 +1,7 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ...symbols import ORTH, LEMMA, NORM, TAG, ADP, PUNCT
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -28,5 +28,12 @@ for orth in [
|
|||
"t.o.m.", "vha.", ""]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
_custom_base_exc = {
|
||||
"i.": [
|
||||
{ORTH: "i", LEMMA: "i", NORM: "i"},
|
||||
{ORTH: ".", TAG: PUNCT}]
|
||||
}
|
||||
_exc.update(_custom_base_exc)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
Loading…
Reference in New Issue
Block a user