diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
index 138b3afc8..e446fdcd6 100644
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@@ -53,7 +53,8 @@ _
 \.\.
 \.\.\.
 \.\.\.\.
-(?<=[a-züóőúéáűíAÜÓŐÚÉÁŰÍ)\]"'´«‘’%\)²“”])\.
+(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”-])\.
+(?<=[a-züóőúéáűí)])-e
 \-\-
 ´
 (?<=[0-9])km²
@@ -98,14 +99,17 @@ _
 (?<=[0-9])kb
 '''.strip().split('\n')
 
 
-TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
-                     r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
-                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ]) (?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) '''
+                     r'''(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) (?<=[0-9])-(?=[0-9]) '''
+                     r'''(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])''').split()
 
 ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
                  _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
 
 TOKENIZER_EXCEPTIONS = {
+    "\w*\(\w+\)\w*": [{"F": "???"}],
+    "-e": [{"F": "-e"}],
+    "vs.": [{"F": "vs."}],
     "''": [{"F": "''"}],
 
diff --git a/spacy/tests/hu/tokenizer/test_default_token_hyphen.txt b/spacy/tests/hu/tokenizer/test_default_token_hyphen.txt
new file mode 100644
index 000000000..8b09749d2
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/test_default_token_hyphen.txt
@@ -0,0 +1,81 @@
+# TOKEN hyphen
+
+-nak, -nek es ehhez hasonlok
+IN : Egy -nak, -jaiért, -magyar, bel- van.
+OUT: Egy -nak, -jaiért, -magyar, bel- van.
+IN : Egy -nak.
+OUT: Egy -nak.
+IN : Egy bel-.
+OUT: Egy bel-.
+IN : Dinnye-domb-.
+OUT: Dinnye-domb-.
+
+kulonvalt '-e'
+IN : Ezen -e elcsatangolt.
+OUT: Ezen -e elcsatangolt.
+
+-e levagasa, zarojel nelkul
+IN : Lakik-e
+OUT: Lakik-e
+IN : Lakik-e?
+OUT: Lakik-e?
+IN : Lakik-e.
+OUT: Lakik-e.
+IN : Lakik-e...
+OUT: Lakik-e...
+IN : Lakik-e... van.
+OUT: Lakik-e... van.
+IN : Lakik-e van?
+OUT: Lakik-e van?
+
+# TODO: adapt spacy to handle such brackets
+zarojeles mondatkozi valtozatok
+#IN : (La)kik-e van?
+#OUT: (La)kik-e van?
+#IN : L(a)kik-e van?
+#OUT: L(a)kik-e van?
+#IN : Lak(ik)-e van?
+#OUT: Lak(ik)-e van?
+
+# TODO: adapt spacy to handle such brackets
+zarojeles mondatvegi valtozatok
+#IN : (La)kik-e.
+#OUT: (La)kik-e.
+#IN : L(a)kik-e.
+#OUT: L(a)kik-e.
+#IN : Lak(ik)-e.
+#OUT: Lak(ik)-e.
+
+kontroll
+IN : Lakik-elem van?
+OUT: Lakik-elem van?
+IN : Van lakik-elem.
+OUT: Van lakik-elem.
+IN : A 7-es busz?
+OUT: A 7-es busz?
+IN : A 7-es?
+OUT: A 7-es?
+IN : A 7-es.
+OUT: A 7-es.
+
+problemas eset, megengedjuk # TODO: works erroundously in HunToken, but OK in spacy
+IN : Ez (lakik)-e?
+OUT: Ez (lakik)-e?
+
+ TODO: macska-/kutyavilag
+ IN : A macska-/kutyavilag van.
+ OUT: A macska-/kutyavilag van.
+
+%-, §-
+IN : A §-sal.
+OUT: A §-sal.
+IN : A %-sal.
+OUT: A %-sal.
+
+tobb kotojel
+IN : A CD-ROM-okrol.
+OUT: A CD-ROM-okrol.
+
+
+
+
diff --git a/spacy/tests/hu/tokenizer/test_default_token_quote.txt b/spacy/tests/hu/tokenizer/test_default_token_quote.txt
new file mode 100644
index 000000000..c334e5246
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/test_default_token_quote.txt
@@ -0,0 +1,22 @@
+# TOKEN quote
+
+mondatban
+IN : Az "Ime, hat"-ban irja.
+OUT: Az "Ime, hat"-ban irja.
+
+mondat elejen
+IN : "Ime, hat"-ban irja.
+OUT: "Ime, hat"-ban irja.
+
+mondat vegen
+IN : Az "Ime, hat".
+OUT: Az "Ime, hat".
+
+magaban
+IN : Egy 24"-os monitor.
+OUT: Egy 24"-os monitor.
+
+aposztrof
+IN : A don't van.
+OUT: A don't van.
+
diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py
index 12dbe5b78..67859ce3f 100644
--- a/spacy/tests/hu/tokenizer/test_tokenizer.py
+++ b/spacy/tests/hu/tokenizer/test_tokenizer.py
@@ -55,6 +55,9 @@ class TokenizerTestCase(object):
 
 
 _DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
+_HYPHEN_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_hyphen.txt"))
+_QUOTE_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_quote.txt"))
+ALL_TESTCASES = _DOTS_CASES + _HYPHEN_CASES
 
 
 @pytest.fixture(scope="session")
@@ -67,8 +70,8 @@ def hu_tokenizer(HU):
     return HU.tokenizer
 
 
-@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
+@pytest.mark.parametrize(("test_case"), ALL_TESTCASES)
 def test_abbreviations(hu_tokenizer, test_case):
     tokens = hu_tokenizer(test_case.input)
     token_list = [token.orth_ for token in tokens if not token.is_space]
-    assert test_case.expected_tokens == token_list, "{} was erronously tokenized as {}".format(test_case, token_list)
+    assert test_case.expected_tokens == token_list#, "{} was erronously tokenized as {}".format(test_case, token_list)
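For reference, a minimal standalone sketch (not part of the patch) of what the new '-e' suffix expression is intended to match. It assumes, as spaCy does for suffix rules, that the expression is applied anchored at the end of a token candidate; the candidates below are taken from the hyphen test cases.

import re

# Suffix pattern added above: '-e' is only split off after a letter or a closing bracket.
HYPHEN_E_SUFFIX = r"(?<=[a-züóőúéáűí)])-e"

for candidate in ["Lakik-e", "(lakik)-e", "7-e", "Lakik-elem"]:
    # Anchor at the end to mimic suffix matching (assumption for this sketch).
    matched = bool(re.search(HYPHEN_E_SUFFIX + r"$", candidate))
    print(candidate, matched)

# Lakik-e    True   -- a letter precedes '-e'
# (lakik)-e  True   -- a closing bracket precedes '-e'
# 7-e        False  -- a digit precedes '-e', so it is left alone
# Lakik-elem False  -- '-e' is not at the end of the candidate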