Hyphen infix (#5770)

* infix split on hyphen when preceded by number

* clean up

* skip Ukrainian test instead of xfail
This commit is contained in:
Sofie Van Landeghem 2020-07-20 14:48:51 +02:00 committed by GitHub
parent ec819fc311
commit 1b2ec94382
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 22 additions and 5 deletions

View File

@ -3,6 +3,7 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...attrs import LANG
@ -20,6 +21,7 @@ class EnglishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
infixes = TOKENIZER_INFIXES
single_orth_variants = [ single_orth_variants = [
{"tags": ["NFP"], "variants": ["", "..."]}, {"tags": ["NFP"], "variants": ["", "..."]},
{"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]}, {"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]},

View File

@ -0,0 +1,19 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
# Language-specific infix regexes. Each one matches a single separator
# character (or hyphen alternative) using lookbehind/lookahead so that only
# the separator itself is consumed by the match.
_infix_patterns = [
    # An arithmetic-style operator (+, -, *, ^) preceded by a digit and
    # followed by a digit or a hyphen.
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # A period preceded by an ALPHA_LOWER or CONCAT_QUOTES character and
    # followed by an ALPHA_UPPER or CONCAT_QUOTES character.
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    # A comma with an ALPHA character on both sides.
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # One of the HYPHENS alternatives preceded by an ALPHA character or a
    # digit and followed by an ALPHA character.
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    # One of : < > = / preceded by an ALPHA character or a digit and
    # followed by an ALPHA character.
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]

# Full infix list: ellipses and icons first, then the patterns above.
_infixes = LIST_ELLIPSES + LIST_ICONS + _infix_patterns

TOKENIZER_INFIXES = _infixes

View File

@ -26,9 +26,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
("""'Me too!', Mr. P. Delaware cried. """, 11), ("""'Me too!', Mr. P. Delaware cried. """, 11),
("They ran about 10km.", 6), ("They ran about 10km.", 6),
pytest.param( ("But then the 6,000-year ice age came...", 10),
"But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
),
], ],
) )
def test_en_tokenizer_handles_cnts(en_tokenizer, text, length): def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):

View File

@ -16,8 +16,6 @@ import pytest
"grand'hamien", "grand'hamien",
"Châteauneuf-la-Forêt", "Châteauneuf-la-Forêt",
"Château-Guibert", "Château-Guibert",
"11-septembre",
"11-Septembre",
"refox-trottâmes", "refox-trottâmes",
# u"K-POP", # u"K-POP",
# u"K-Pop", # u"K-Pop",