mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Swedish: Exceptions for single letter words ending sentence (#2615)
* Exceptions for single-letter words ending a sentence: sentences ending in "i." (as in "... peka i.") or "m." (as in "...än 2000 m.") should be tokenized as two separate tokens.
* Add test
This commit is contained in:
parent
860f5bd91f
commit
1914c488d3
|
@ -1,7 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -78,5 +78,11 @@ for orth in [
|
|||
"s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
# A sentence may end in the single-letter word "i" or "m" directly followed
# by a full stop, e.g. "... peka i." or "...än 2000 m.". Register "i." and
# "m." as exceptions that split into two tokens: the letter itself (with its
# lemma and norm set to the same letter) and a "." token tagged PUNCT.
for orth in ("i", "m"):
    _exc.update({
        orth + ".": [
            {ORTH: orth, LEMMA: orth, NORM: orth},
            {ORTH: ".", TAG: PUNCT},
        ],
    })
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
|
@ -6,7 +6,8 @@ import pytest
|
|||
|
||||
# Swedish tokenizer-exception cases. Each entry is a
# (text, expected token texts) pair.
# NOTE(review): presumably consumed by a parametrized pytest test further
# down in this file -- confirm against the test function.
SV_TOKEN_EXCEPTION_TESTS = [
    # Abbreviations containing periods stay single tokens.
    ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
    # "I." after a name stays one token, while a sentence-final "i." is
    # split into "i" and ".".
    ('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."]),
]
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user