Swedish: Exceptions for single letter words ending sentence (#2615)

* Exceptions for single letter words ending sentence

Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), should be tokenized as two separate tokens.

* Add test
This commit is contained in:
Emil Stenström 2018-08-05 14:14:30 +02:00 committed by Matthew Honnibal
parent 860f5bd91f
commit 1914c488d3
2 changed files with 9 additions and 2 deletions

View File

@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT
_exc = {} _exc = {}
@ -78,5 +78,11 @@ for orth in [
"s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]: "s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
# Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
# should be tokenized as two separate tokens.
for orth in ["i", "m"]:
_exc[orth + "."] = [
{ORTH: orth, LEMMA: orth, NORM: orth},
{ORTH: ".", TAG: PUNCT}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = _exc

View File

@ -6,7 +6,8 @@ import pytest
SV_TOKEN_EXCEPTION_TESTS = [ SV_TOKEN_EXCEPTION_TESTS = [
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']) ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
] ]