Hyphen infix (#5770)

* infix split on hyphen when preceded by number

* clean up

* skip Ukrainian test instead of xfail
This commit is contained in:
Sofie Van Landeghem 2020-07-20 14:48:51 +02:00 committed by GitHub
parent ec819fc311
commit 1b2ec94382
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 22 additions and 5 deletions

View File

@ -3,6 +3,7 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
@ -20,6 +21,7 @@ class EnglishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
infixes = TOKENIZER_INFIXES
single_orth_variants = [
{"tags": ["NFP"], "variants": ["", "..."]},
{"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]},

View File

@ -0,0 +1,19 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
# Infix patterns exported as TOKENIZER_INFIXES. The shared ellipsis and icon
# lists come first, followed by the patterns spelled out below. Each of those
# is a regular expression that matches a character (or character sequence)
# only when it sits between the contexts given by its lookbehind/lookahead.
_infixes = LIST_ELLIPSES + LIST_ICONS
_infixes = _infixes + [
    # "+", "-", "*" or "^" with a digit on the left and a digit or "-" on
    # the right.
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # "." with an ALPHA_LOWER or CONCAT_QUOTES character on the left and an
    # ALPHA_UPPER or CONCAT_QUOTES character on the right.
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        q=CONCAT_QUOTES, al=ALPHA_LOWER, au=ALPHA_UPPER
    ),
    # "," with an ALPHA character on both sides.
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # Any HYPHENS alternative with an ALPHA character or a digit on the left
    # and an ALPHA character on the right; the "0-9" in the lookbehind is
    # what allows a split after a number.
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(h=HYPHENS, a=ALPHA),
    # ":", "<", ">", "=" or "/" with an ALPHA character or a digit on the
    # left and an ALPHA character on the right.
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]

TOKENIZER_INFIXES = _infixes

View File

@ -26,9 +26,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
("""'Me too!', Mr. P. Delaware cried. """, 11),
("They ran about 10km.", 6),
pytest.param(
"But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
),
("But then the 6,000-year ice age came...", 10),
],
)
def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):

View File

@ -16,8 +16,6 @@ import pytest
"grand'hamien",
"Châteauneuf-la-Forêt",
"Château-Guibert",
"11-septembre",
"11-Septembre",
"refox-trottâmes",
# u"K-POP",
# u"K-Pop",