Adding partial hyphen and quote handling support.

This commit is contained in:
Gyorgy Orosz 2016-12-11 00:04:19 +01:00
parent 2051726fd3
commit 0cf2144d24
4 changed files with 116 additions and 6 deletions

View File

@ -53,7 +53,8 @@ _
\.\.
\.\.\.
\.\.\.\.
(?<=[a-züóőúéáűíAÜÓŐÚÉÁŰÍ)\]"'´«‘’%\)²“”])\.
(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”-])\.
(?<=[a-züóőúéáűí)])-e
\-\-
´
(?<=[0-9])km²
@ -98,14 +99,17 @@ _
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ]) (?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) '''
r'''(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) (?<=[0-9])-(?=[0-9]) '''
r'''(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])''').split()
ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
_load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
TOKENIZER_EXCEPTIONS = {
"\w*\(\w+\)\w*": [{"F": "???"}],
"-e": [{"F": "-e"}],
"vs.": [{"F": "vs."}],
"''": [{"F": "''"}],

View File

@ -0,0 +1,81 @@
# TOKEN hyphen
-nak, -nek es ehhez hasonlok
IN : Egy -nak, -jaiért, -magyar, bel- van.
OUT: <s><w>Egy</w><ws> </ws><w>-nak</w><c>,</c><ws> </ws><w>-jaiért</w><c>,</c><ws> </ws><w>-magyar</w><c>,</c><ws> </ws><w>bel-</w><ws> </ws><w>van</w><c>.</c></s>
IN : Egy -nak.
OUT: <s><w>Egy</w><ws> </ws><w>-nak</w><c>.</c></s>
IN : Egy bel-.
OUT: <s><w>Egy</w><ws> </ws><w>bel-</w><c>.</c></s>
IN : Dinnye-domb-.
OUT: <s><w>Dinnye-domb-</w><c>.</c></s>
kulonvalt '-e'
IN : Ezen -e elcsatangolt.
OUT: <s><w>Ezen</w><ws> </ws><w>-e</w><ws> </ws><w>elcsatangolt</w><c>.</c></s>
-e levagasa, zarojel nelkul
IN : Lakik-e
OUT: <s><w>Lakik</w><w>-e</w></s>
IN : Lakik-e?
OUT: <s><w>Lakik</w><w>-e</w><c>?</c></s>
IN : Lakik-e.
OUT: <s><w>Lakik</w><w>-e</w><c>.</c></s>
IN : Lakik-e...
OUT: <s><w>Lakik</w><w>-e</w><c>...</c></s>
IN : Lakik-e... van.
OUT: <s><w>Lakik</w><w>-e</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
IN : Lakik-e van?
OUT: <s><w>Lakik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
# TODO: adapt spacy to handle such brackets
zarojeles mondatkozi valtozatok
#IN : (La)kik-e van?
#OUT: <s><w>(La)kik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
#IN : L(a)kik-e van?
#OUT: <s><w>L(a)kik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
#IN : Lak(ik)-e van?
#OUT: <s><w>Lak(ik)</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
# TODO: adapt spacy to handle such brackets
zarojeles mondatvegi valtozatok
#IN : (La)kik-e.
#OUT: <s><w>(La)kik</w><w>-e</w><c>.</c></s>
#IN : L(a)kik-e.
#OUT: <s><w>L(a)kik</w><w>-e</w><c>.</c></s>
#IN : Lak(ik)-e.
#OUT: <s><w>Lak(ik)</w><w>-e</w><c>.</c></s>
kontroll
IN : Lakik-elem van?
OUT: <s><w>Lakik-elem</w><ws> </ws><w>van</w><c>?</c></s>
IN : Van lakik-elem.
OUT: <s><w>Van</w><ws> </ws><w>lakik-elem</w><c>.</c></s>
IN : A 7-es busz?
OUT: <s><w>A</w><ws> </ws><w>7-es</w><ws> </ws><w>busz</w><c>?</c></s>
IN : A 7-es?
OUT: <s><w>A</w><ws> </ws><w>7-es</w><c>?</c></s>
IN : A 7-es.
OUT: <s><w>A</w><ws> </ws><w>7-es</w><c>.</c></s>
problemas eset, megengedjuk # TODO: works erroneously in HunToken, but OK in spacy
IN : Ez (lakik)-e?
OUT: <s><w>Ez</w><ws> </ws><c>(</c><w>lakik</w><c>)</c><w>-e</w><c>?</c></s>
TODO: macska-/kutyavilag
IN : A macska-/kutyavilag van.
OUT: <s><w>A</w><ws> </ws><w>macska-</w><c>/</c><w>kutyavilag</w><ws> </ws><w>van</w><c>.</c></s>
%-, §-
IN : A §-sal.
OUT: <s><w>A</w><ws> </ws><w>§-sal</w><c>.</c></s>
IN : A %-sal.
OUT: <s><w>A</w><ws> </ws><w>%-sal</w><c>.</c></s>
tobb kotojel
IN : A CD-ROM-okrol.
OUT: <s><w>A</w><ws> </ws><w>CD-ROM-okrol</w><c>.</c></s>

View File

@ -0,0 +1,22 @@
# TOKEN quote
mondatban
IN : Az "Ime, hat"-ban irja.
OUT: <s><w>Az</w><ws> </ws><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><w>-ban</w><ws> </ws><w>irja</w><c>.</c></s>
mondat elejen
IN : "Ime, hat"-ban irja.
OUT: <s><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><w>-ban</w><ws> </ws><w>irja</w><c>.</c></s>
mondat vegen
IN : Az "Ime, hat".
OUT: <s><w>Az</w><ws> </ws><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><c>.</c></s>
magaban
IN : Egy 24"-os monitor.
OUT: <s><w>Egy</w><ws> </ws><w>24</w><c>"</c><w>-os</w><ws> </ws><w>monitor</w><c>.</c></s>
aposztrof
IN : A don't van.
OUT: <s><w>A</w><ws> </ws><w>don't</w><ws> </ws><w>van</w><c>.</c></s>

View File

@ -55,6 +55,9 @@ class TokenizerTestCase(object):
_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
_HYPHEN_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_hyphen.txt"))
_QUOTE_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_quote.txt"))
ALL_TESTCASES = _DOTS_CASES + _HYPHEN_CASES
@pytest.fixture(scope="session")
@ -67,8 +70,8 @@ def hu_tokenizer(HU):
return HU.tokenizer
@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
@pytest.mark.parametrize(("test_case"), ALL_TESTCASES)
def test_abbreviations(hu_tokenizer, test_case):
tokens = hu_tokenizer(test_case.input)
token_list = [token.orth_ for token in tokens if not token.is_space]
assert test_case.expected_tokens == token_list, "{} was erronously tokenized as {}".format(test_case, token_list)
assert test_case.expected_tokens == token_list#, "{} was erronously tokenized as {}".format(test_case, token_list)