Fixed hyphen handling in the Hungarian tokenizer.

Gyorgy Orosz 2017-01-14 16:30:11 +01:00
parent f77c0284d6
commit 63037e79af
5 changed files with 17 additions and 8 deletions

View File

@@ -15,7 +15,7 @@ update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
-TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
+TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
 TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
 TOKENIZER_INFIXES = TOKENIZER_INFIXES

View File

@@ -6,6 +6,13 @@ from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPH
+CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿"
+TOKENIZER_PREFIXES = (
+    [r'\+'] +
+    LIST_PUNCT +
+    LIST_ELLIPSES +
+    LIST_QUOTES
+)
 TOKENIZER_SUFFIXES = (
     LIST_PUNCT +
     LIST_ELLIPSES +
@@ -32,4 +39,4 @@ TOKENIZER_INFIXES = (
     ]
 )
-__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
+__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

View File

@@ -539,13 +539,13 @@ OTHER_EXC = """
 ORD_NUM_OR_DATE = "([A-Z0-9]+[./-])*(\d+\.?)"
 _NUM = "[+\-]?\d+([,.]\d+)*"
 _OPS = "[=<>+\-\*/^()÷%²]"
-_SUFFIES = "-[{a}]+".format(a=ALPHA_LOWER)
+_SUFFIXES = "-[{a}]+".format(a=ALPHA_LOWER)
 NUMERIC_EXP = "({n})(({o})({n}))*[%]?".format(n=_NUM, o=_OPS)
 TIME_EXP = "\d+(:\d+)*(\.\d+)?"
 NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
     ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
-    c=CURRENCY, s=_SUFFIES
+    c=CURRENCY, s=_SUFFIXES
 )
 TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=_URL_PATTERN, n=NUMS)).match
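
With the _SUFFIXES typo fixed, the hyphenated-suffix branch of NUMS actually fires. A self-contained sketch of the pattern above; ALPHA_LOWER and CURRENCY are simplified assumptions, since their definitions are not part of this diff:

import re

# Assumed stand-ins: the real ALPHA_LOWER is the full lowercase table, and
# CURRENCY is built elsewhere (cf. CURRENCY_SYMBOLS in the previous file).
ALPHA_LOWER = "a-záéíóöőúüű"
CURRENCY = "[$¢£€¥฿]"

ORD_NUM_OR_DATE = r"([A-Z0-9]+[./-])*(\d+\.?)"
_NUM = r"[+\-]?\d+([,.]\d+)*"
_OPS = r"[=<>+\-\*/^()÷%²]"
_SUFFIXES = "-[{a}]+".format(a=ALPHA_LOWER)
NUMERIC_EXP = "({n})(({o})({n}))*[%]?".format(n=_NUM, o=_OPS)
TIME_EXP = r"\d+(:\d+)*(\.\d+)?"
NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
    ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
    c=CURRENCY, s=_SUFFIXES
)

nums_match = re.compile("^({n})$".format(n=NUMS)).match
assert nums_match("7-es")            # number + "-es" suffix stays one token
assert nums_match("$-sal")           # currency symbol + "-sal" suffix
assert nums_match("75%+1-100%-ig")   # arithmetic expression + "-ig" suffix

TOKEN_MATCH is consulted before the affix rules, which is why strings like '75%+1-100%-ig' and '$-sal' survive as single tokens in the test cases below.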

View File

@@ -58,7 +58,7 @@ LIST_HYPHENS = list(_HYPHENS.strip().split())
 ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '')
-ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace(' ', '')
+ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace(' ', '')
 ALPHA = ALPHA_LOWER + ALPHA_UPPER
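
The bug here is easy to miss: the old line applied the space-replace twice instead of also stripping newlines, so ALPHA_UPPER kept '\n' characters and leaked them into every character class built from ALPHA. A quick illustration with a made-up _ALPHA_UPPER excerpt:

# Hypothetical excerpt; the real table lists the full uppercase alphabet.
_ALPHA_UPPER = """
A Á B C
D E É F
"""

broken = _ALPHA_UPPER.strip().replace(' ', '').replace(' ', '')
fixed = _ALPHA_UPPER.strip().replace(' ', '').replace('\n', '')

assert '\n' in broken       # the duplicated replace leaves newlines behind
assert fixed == 'AÁBCDEÉF'  # the corrected line strips them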

View File

@@ -29,7 +29,7 @@ HYPHEN_TESTS = [
     ('Dinnye-domb-.', ['Dinnye-domb-', '.']),
     ('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']),
     ('Lakik-e', ['Lakik', '-e']),
-    ('A--B', ['A', '--' 'B']),
+    ('A--B', ['A', '--', 'B']),
     ('Lakik-e?', ['Lakik', '-e', '?']),
     ('Lakik-e.', ['Lakik', '-e', '.']),
     ('Lakik-e...', ['Lakik', '-e', '...']),
@@ -42,6 +42,7 @@ HYPHEN_TESTS = [
     ('A 7-es.', ['A', '7-es', '.']),
     ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
     ('A %-sal.', ['A', '%-sal', '.']),
+    ('A $-sal.', ['A', '$-sal', '.']),
     ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
 ]
@@ -206,7 +207,7 @@ NUMBER_TESTS = [
     ('A 5$-ban', ['A', '5$-ban']),
     ('A 5$.', ['A', '5', '$', '.']),
     ('A 5$', ['A', '5', '$']),
-    ('A $5', ['A', '$', '5']),
+    ('A $5', ['A', '$5']),
     ('A 5km/h', ['A', '5', 'km/h']),
     ('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']),
     ('A 5km/h.', ['A', '5', 'km/h', '.']),
@@ -247,7 +248,8 @@ WIKI_TESTS = [
     ('"(...)"sokkal ', ['"', '(', '...', ')', '"', 'sokkal']),
 ]
-TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS # + HYPHEN_TESTS # + WIKI_TESTS
+TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS # + WIKI_TESTS
 @pytest.mark.parametrize('text,expected_tokens', TESTCASES)
 def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
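    # (The diff truncates at the function header. A sketch of the body,
    # assuming the suite's usual pattern rather than quoting the file:)
    tokens = hu_tokenizer(text)
    token_list = [token.text for token in tokens]
    assert expected_tokens == token_list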