mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Swedish: Exceptions for single letter words ending sentence (#2615)
* Exceptions for single letter words ending sentence Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), should be tokenized as two separate tokens. * Add test
This commit is contained in:
parent
860f5bd91f
commit
1914c488d3
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
@ -78,5 +78,11 @@ for orth in [
|
||||||
"s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
|
"s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
# Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
|
||||||
|
# should be tokenized as two separate tokens.
|
||||||
|
for orth in ["i", "m"]:
|
||||||
|
_exc[orth + "."] = [
|
||||||
|
{ORTH: orth, LEMMA: orth, NORM: orth},
|
||||||
|
{ORTH: ".", TAG: PUNCT}]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -6,7 +6,8 @@ import pytest
|
||||||
|
|
||||||
SV_TOKEN_EXCEPTION_TESTS = [
|
SV_TOKEN_EXCEPTION_TESTS = [
|
||||||
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
|
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
|
||||||
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
|
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
|
||||||
|
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user