diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
index 138b3afc8..e446fdcd6 100644
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@@ -53,7 +53,8 @@ _
\.\.
\.\.\.
\.\.\.\.
-(?<=[a-züóőúéáűíAÜÓŐÚÉÁŰÍ)\]"'´«‘’%\)²“”])\.
+(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”-])\.
+(?<=[a-züóőúéáűí)])-e
\-\-
´
(?<=[0-9])km²
@@ -98,14 +99,17 @@ _
(?<=[0-9])kb
'''.strip().split('\n')
-TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
- r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
- r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ]) (?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) '''
+ r'''(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) (?<=[0-9])-(?=[0-9]) '''
+ r'''(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])''').split()
ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
_load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
TOKENIZER_EXCEPTIONS = {
+ "\w*\(\w+\)\w*": [{"F": "???"}],
+ "-e": [{"F": "-e"}],
+
"vs.": [{"F": "vs."}],
"''": [{"F": "''"}],
diff --git a/spacy/tests/hu/tokenizer/test_default_token_hyphen.txt b/spacy/tests/hu/tokenizer/test_default_token_hyphen.txt
new file mode 100644
index 000000000..8b09749d2
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/test_default_token_hyphen.txt
@@ -0,0 +1,81 @@
+# TOKEN hyphen
+
+-nak, -nek and the like
+IN : Egy -nak, -jaiért, -magyar, bel- van.
+OUT: Egy -nak, -jaiért, -magyar, bel- van.
+IN : Egy -nak.
+OUT: Egy -nak.
+IN : Egy bel-.
+OUT: Egy bel-.
+IN : Dinnye-domb-.
+OUT: Dinnye-domb-.
+
+detached '-e'
+IN : Ezen -e elcsatangolt.
+OUT: Ezen -e elcsatangolt.
+
+splitting off -e, without brackets
+IN : Lakik-e
+OUT: Lakik-e
+IN : Lakik-e?
+OUT: Lakik-e?
+IN : Lakik-e.
+OUT: Lakik-e.
+IN : Lakik-e...
+OUT: Lakik-e...
+IN : Lakik-e... van.
+OUT: Lakik-e... van.
+IN : Lakik-e van?
+OUT: Lakik-e van?
+
+# TODO: adapt spacy to handle such brackets
+bracketed variants, mid-sentence
+#IN : (La)kik-e van?
+#OUT: (La)kik-e van?
+#IN : L(a)kik-e van?
+#OUT: L(a)kik-e van?
+#IN : Lak(ik)-e van?
+#OUT: Lak(ik)-e van?
+
+# TODO: adapt spacy to handle such brackets
+bracketed variants, sentence-final
+#IN : (La)kik-e.
+#OUT: (La)kik-e.
+#IN : L(a)kik-e.
+#OUT: L(a)kik-e.
+#IN : Lak(ik)-e.
+#OUT: Lak(ik)-e.
+
+control cases
+IN : Lakik-elem van?
+OUT: Lakik-elem van?
+IN : Van lakik-elem.
+OUT: Van lakik-elem.
+IN : A 7-es busz?
+OUT: A 7-es busz?
+IN : A 7-es?
+OUT: A 7-es?
+IN : A 7-es.
+OUT: A 7-es.
+
+problematic case, we allow it # TODO: works erroneously in HunToken, but OK in spacy
+IN : Ez (lakik)-e?
+OUT: Ez (lakik)-e?
+
+# TODO: macska-/kutyavilag
+#IN : A macska-/kutyavilag van.
+#OUT: A macska-/kutyavilag van.
+
+%-, §-
+IN : A §-sal.
+OUT: A §-sal.
+IN : A %-sal.
+OUT: A %-sal.
+
+multiple hyphens
+IN : A CD-ROM-okrol.
+OUT: A CD-ROM-okrol.
+
+
+
+
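The positive and control cases above map directly onto the new (?<=[a-züóőúéáűí)])-e suffix piece. A rough check with plain re (spaCy applies suffix pieces at the end of the token, which the trailing $ stands in for here):

    import re

    E_SUFFIX = r'(?<=[a-züóőúéáűí)])-e$'

    print(bool(re.search(E_SUFFIX, "Lakik-e")))     # True:  the question particle is split off
    print(bool(re.search(E_SUFFIX, "(lakik)-e")))   # True:  ')' is in the lookbehind class
    print(bool(re.search(E_SUFFIX, "Lakik-elem")))  # False: does not end in "-e", stays whole
    print(bool(re.search(E_SUFFIX, "7-es")))        # False: control case, no trailing "-e"

The separate "-e" entry in TOKENIZER_EXCEPTIONS covers the detached particle (the "Ezen -e elcsatangolt." case), presumably so that the leading "-" is not stripped off again by the prefix rules.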
diff --git a/spacy/tests/hu/tokenizer/test_default_token_quote.txt b/spacy/tests/hu/tokenizer/test_default_token_quote.txt
new file mode 100644
index 000000000..c334e5246
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/test_default_token_quote.txt
@@ -0,0 +1,22 @@
+# TOKEN quote
+
+within a sentence
+IN : Az "Ime, hat"-ban irja.
+OUT: Az "Ime, hat"-ban irja.
+
+at the start of a sentence
+IN : "Ime, hat"-ban irja.
+OUT: "Ime, hat"-ban irja.
+
+at the end of a sentence
+IN : Az "Ime, hat".
+OUT: Az "Ime, hat".
+
+on its own
+IN : Egy 24"-os monitor.
+OUT: Egy 24"-os monitor.
+
+apostrophe
+IN : A don't van.
+OUT: A don't van.
+
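These cases exercise the new quote piece in TOKENIZER_INFIXES: a '"' between a letter or digit and a letter or hyphen becomes a split point, while a quote preceded by whitespace is left to the prefix rules. A quick sketch with plain re:

    import re

    QUOTE_INFIX = r'(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])'

    for text in ('Egy 24"-os monitor.', 'Az "Ime, hat"-ban irja.'):
        hits = [m.start() for m in re.finditer(QUOTE_INFIX, text)]
        print(text, hits)  # only the closing quote matches; the opening one follows a space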
diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py
index 12dbe5b78..67859ce3f 100644
--- a/spacy/tests/hu/tokenizer/test_tokenizer.py
+++ b/spacy/tests/hu/tokenizer/test_tokenizer.py
@@ -55,6 +55,9 @@ class TokenizerTestCase(object):
_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
+_HYPHEN_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_hyphen.txt"))
+_QUOTE_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_quote.txt"))
+ALL_TESTCASES = _DOTS_CASES + _HYPHEN_CASES + _QUOTE_CASES
@pytest.fixture(scope="session")
@@ -67,8 +70,8 @@ def hu_tokenizer(HU):
return HU.tokenizer
-@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
+@pytest.mark.parametrize(("test_case"), ALL_TESTCASES)
def test_abbreviations(hu_tokenizer, test_case):
tokens = hu_tokenizer(test_case.input)
token_list = [token.orth_ for token in tokens if not token.is_space]
- assert test_case.expected_tokens == token_list, "{} was erronously tokenized as {}".format(test_case, token_list)
+ assert test_case.expected_tokens == token_list, "{} was erroneously tokenized as {}".format(test_case, token_list)
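With several case lists folded into one parametrized run, pytest's default ids are just indices (test_case0, test_case1, ...). If the failing input should be visible in the test name itself, an ids callback can supply it; a possible variant, assuming TokenizerTestCase keeps the .input attribute used above:

    @pytest.mark.parametrize("test_case", ALL_TESTCASES, ids=lambda tc: tc.input)
    def test_abbreviations(hu_tokenizer, test_case):
        tokens = hu_tokenizer(test_case.input)
        token_list = [token.orth_ for token in tokens if not token.is_space]
        assert test_case.expected_tokens == token_list, \
            "{} was erroneously tokenized as {}".format(test_case, token_list)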