Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 01:46:28 +03:00)
Fixed hyphen handling in the Hungarian tokenizer.
This commit is contained in:
parent f77c0284d6
commit 63037e79af
|
@ -15,7 +15,7 @@ update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
|
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
|
||||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
|
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
|
||||||
|
|
||||||
TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
|
TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
|
||||||
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
|
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
|
||||||
TOKENIZER_INFIXES = TOKENIZER_INFIXES
|
TOKENIZER_INFIXES = TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,13 @@ from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPH
|
||||||
|
|
||||||
CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿"
|
CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿"
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = (
|
||||||
|
[r'\+'] +
|
||||||
|
LIST_PUNCT +
|
||||||
|
LIST_ELLIPSES +
|
||||||
|
LIST_QUOTES
|
||||||
|
)
|
||||||
|
|
||||||
TOKENIZER_SUFFIXES = (
|
TOKENIZER_SUFFIXES = (
|
||||||
LIST_PUNCT +
|
LIST_PUNCT +
|
||||||
LIST_ELLIPSES +
|
LIST_ELLIPSES +
|
||||||
|
@ -32,4 +39,4 @@ TOKENIZER_INFIXES = (
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||||
|
|
|
@ -539,13 +539,13 @@ OTHER_EXC = """
|
||||||
ORD_NUM_OR_DATE = "([A-Z0-9]+[./-])*(\d+\.?)"
|
ORD_NUM_OR_DATE = "([A-Z0-9]+[./-])*(\d+\.?)"
|
||||||
_NUM = "[+\-]?\d+([,.]\d+)*"
|
_NUM = "[+\-]?\d+([,.]\d+)*"
|
||||||
_OPS = "[=<>+\-\*/^()÷%²]"
|
_OPS = "[=<>+\-\*/^()÷%²]"
|
||||||
_SUFFIES = "-[{a}]+".format(a=ALPHA_LOWER)
|
_SUFFIXES = "-[{a}]+".format(a=ALPHA_LOWER)
|
||||||
NUMERIC_EXP = "({n})(({o})({n}))*[%]?".format(n=_NUM, o=_OPS)
|
NUMERIC_EXP = "({n})(({o})({n}))*[%]?".format(n=_NUM, o=_OPS)
|
||||||
TIME_EXP = "\d+(:\d+)*(\.\d+)?"
|
TIME_EXP = "\d+(:\d+)*(\.\d+)?"
|
||||||
|
|
||||||
NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
|
NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
|
||||||
ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
|
ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
|
||||||
c=CURRENCY, s=_SUFFIES
|
c=CURRENCY, s=_SUFFIXES
|
||||||
)
|
)
|
||||||
|
|
||||||
TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=_URL_PATTERN, n=NUMS)).match
|
TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=_URL_PATTERN, n=NUMS)).match
|
||||||
|
|
|
@ -58,7 +58,7 @@ LIST_HYPHENS = list(_HYPHENS.strip().split())
|
||||||
|
|
||||||
|
|
||||||
ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '')
|
ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '')
|
||||||
ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace(' ', '')
|
ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace('\n', '')
|
||||||
ALPHA = ALPHA_LOWER + ALPHA_UPPER
|
ALPHA = ALPHA_LOWER + ALPHA_UPPER
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ HYPHEN_TESTS = [
|
||||||
('Dinnye-domb-.', ['Dinnye-domb-', '.']),
|
('Dinnye-domb-.', ['Dinnye-domb-', '.']),
|
||||||
('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']),
|
('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']),
|
||||||
('Lakik-e', ['Lakik', '-e']),
|
('Lakik-e', ['Lakik', '-e']),
|
||||||
('A--B', ['A', '--' 'B']),
|
('A--B', ['A', '--', 'B']),
|
||||||
('Lakik-e?', ['Lakik', '-e', '?']),
|
('Lakik-e?', ['Lakik', '-e', '?']),
|
||||||
('Lakik-e.', ['Lakik', '-e', '.']),
|
('Lakik-e.', ['Lakik', '-e', '.']),
|
||||||
('Lakik-e...', ['Lakik', '-e', '...']),
|
('Lakik-e...', ['Lakik', '-e', '...']),
|
||||||
|
@ -42,6 +42,7 @@ HYPHEN_TESTS = [
|
||||||
('A 7-es.', ['A', '7-es', '.']),
|
('A 7-es.', ['A', '7-es', '.']),
|
||||||
('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
|
('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
|
||||||
('A %-sal.', ['A', '%-sal', '.']),
|
('A %-sal.', ['A', '%-sal', '.']),
|
||||||
|
('A $-sal.', ['A', '$-sal', '.']),
|
||||||
('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
|
('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -206,7 +207,7 @@ NUMBER_TESTS = [
|
||||||
('A 5$-ban', ['A', '5$-ban']),
|
('A 5$-ban', ['A', '5$-ban']),
|
||||||
('A 5$.', ['A', '5', '$', '.']),
|
('A 5$.', ['A', '5', '$', '.']),
|
||||||
('A 5$', ['A', '5', '$']),
|
('A 5$', ['A', '5', '$']),
|
||||||
('A $5', ['A', '$', '5']),
|
('A $5', ['A', '$5']),
|
||||||
('A 5km/h', ['A', '5', 'km/h']),
|
('A 5km/h', ['A', '5', 'km/h']),
|
||||||
('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']),
|
('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']),
|
||||||
('A 5km/h.', ['A', '5', 'km/h', '.']),
|
('A 5km/h.', ['A', '5', 'km/h', '.']),
|
||||||
|
@ -247,7 +248,8 @@ WIKI_TESTS = [
|
||||||
('"(...)"–sokkal ', ['"', '(', '...', ')', '"', '–sokkal']),
|
('"(...)"–sokkal ', ['"', '(', '...', ')', '"', '–sokkal']),
|
||||||
]
|
]
|
||||||
|
|
||||||
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS # + HYPHEN_TESTS # + WIKI_TESTS
|
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS # + WIKI_TESTS
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
||||||
def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
|
def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user