Passing Hungarian abbrev tests.

Gyorgy Orosz 2016-12-10 23:29:41 +01:00
parent 0289b8ceaa
commit 2051726fd3
5 changed files with 139 additions and 57 deletions

View File

@@ -24,34 +24,7 @@ STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))
 HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
-TOKENIZER_PREFIXES = map(re.escape, r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-'
-``
-`
-#
-US$
-C$
-A$
-....
-...
-»
-_
-§
-'''.strip().split('\n'))
+TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- .... ...'''.split()
 TOKENIZER_SUFFIXES = r'''
 ,
@@ -125,11 +98,11 @@ _
 (?<=[0-9])kb
 '''.strip().split('\n')
-TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
                      r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
                      r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
-ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in
+ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
                  _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
 TOKENIZER_EXCEPTIONS = {
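As a sketch (not from the commit itself): the practical effect of narrowing the infix pattern from \.\.\.+ to \.\.+ is that any run of two or more dots now acts as a token separator, which the Egy..ket. case in the new test data exercises; the {"ORTH": abbrev} entries, in turn, describe each listed abbreviation as a single token whose surface form is kept intact. A minimal illustration of the infix behaviour, assuming only Python's re module and a hypothetical standalone pattern rather than spaCy's compiled tokenizer:

import re

# Hypothetical standalone version of the new infix alternative: two or
# more dots separate tokens. re.split only shows where the split points
# fall; spaCy itself also keeps the matched dots as their own token.
ellipsis_infix = re.compile(r"\.\.+")

print(ellipsis_infix.split("Egy..ket."))  # ['Egy', 'ket.']
print(ellipsis_infix.split("Valami..."))  # ['Valami', '']
print(ellipsis_infix.split("egy.ketto"))  # ['egy.ketto'] (a single dot is not an infix match)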

View File

@@ -1,27 +0,0 @@
import pytest

from spacy.hu import Hungarian


@pytest.fixture(scope="session")
def HU():
    return Hungarian()


@pytest.fixture(scope="module")
def hu_tokenizer(HU):
    return HU.tokenizer


@pytest.mark.parametrize(("input_str", "expected_length"), [
    ("A vs. egy", 3),
    ("A dr. egy", 3),
    ("A .hu egy tld.", 5),
    ("A .hu.", 3),
    ("Az egy.ketto pelda.", 4),
    ("A pl. rovidites.", 4),
    ("A S.M.A.R.T. szo.", 4)
])
def test_abbreviations(hu_tokenizer, input_str, expected_length):
    tokens = hu_tokenizer(input_str)
    assert len(tokens) == expected_length

View File

@@ -0,0 +1,4 @@
__author__ = 'gyorgyorosz'

if __name__ == "__main__":
    pass

View File

@@ -0,0 +1,58 @@
# TOKEN dots
0. simple words
IN : N. kormányzósági
IN : székhely.
OUT: <s><w>N.</w><ws> </ws><w>kormányzósági</w><ws>
OUT: </ws><w>székhely</w><c>.</c></s>
1. words with dots
1.1 sentence-internal versions
1.1.1 words starting with a dot
IN : A .hu egy tld.
OUT: <s><w>A</w><ws> </ws><w>.hu</w><ws> </ws><w>egy</w><ws> </ws><w>tld</w><c>.</c></s>
1.1.2 dot inside the word
IN : Az egy.ketto pelda.
OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><ws> </ws><w>pelda</w><c>.</c></s>
1.1.3 dot at the end of the word
IN : A pl. rovidites.
OUT: <s><w>A</w><ws> </ws><w>pl.</w><ws> </ws><w>rovidites</w><c>.</c></s>
1.1.4 dotted word
IN : A S.M.A.R.T. szo.
OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w><ws> </ws><w>szo</w><c>.</c></s>
1.2 sentence-final versions
1.2.1 words starting with a dot
IN : A .hu.
OUT: <s><w>A</w><ws> </ws><w>.hu</w><c>.</c></s>
1.2.2 dot inside the word
IN : Az egy.ketto.
OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><c>.</c></s>
1.2.3 dot at the end of the word
#TODO: cf. Huntoken
IN : A pl.
OUT: <s><w>A</w><ws> </ws><w>pl.</w></s>
1.2.4 dotted word
#TODO: cf. Huntoken
IN : A S.M.A.R.T.
OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w></s>
2. multiple dots
2.1 new word after two or more dots
IN : Egy..ket.
OUT: <s><w>Egy</w><c>..</c><w>ket</w><c>.</c></s>
IN : Valami... van.
OUT: <s><w>Valami</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
IN : Valami ...van...
OUT: <s><w>Valami</w><ws> </ws><c>...</c><w>van</w><c>...</c></s>
IN : Valami...
OUT: <s><w>Valami</w><c>...</c></s>
IN : Valami ...
OUT: <s><w>Valami</w><ws> </ws><c>...</c></s>
IN : Valami ... más.
OUT: <s><w>Valami</w><ws> </ws><c>...</c><ws> </ws><w>más</w><c>.</c></s>
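In this markup, <s> wraps a sentence, <w> a word token, <c> a punctuation token, and <ws> whitespace. As an illustration, the expected token list can be recovered from an OUT line with the same regex the harness below compiles as WORD_PATTERN (a sketch assuming only Python's re module):

import re

# <w>...</w> and <c>...</c> carry expected tokens; <ws>...</ws> never
# matches because the tag group is a single character, w or c.
WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

line = "OUT: <s><w>Valami</w><ws> </ws><c>...</c><ws> </ws><w>más</w><c>.</c></s>"
print([m.group(2) for m in WORD_PATTERN.finditer(line)])
# ['Valami', '...', 'más', '.']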

View File

@@ -0,0 +1,74 @@
import os
import re

import pytest

from spacy.hu import Hungarian

_MODULE_PATH = os.path.dirname(__file__)


class TokenizerTestCase(object):
    INPUT_PREFIX = "IN :"
    OUTPUT_PREFIX = "OUT:"
    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

    def __init__(self, input_str, expected_words):
        self.input = input_str
        self.expected_tokens = expected_words

    def __repr__(self):
        return "TokenizerTestCase<input={}, words={}>".format(repr(self.input), self.expected_tokens)

    def to_tuple(self):
        return (self.input, self.expected_tokens)

    @classmethod
    def _parse_output_line(cls, line):
        for match in cls.WORD_PATTERN.finditer(line):
            yield match.group(2)

    @classmethod
    def read_from_file(cls, path):
        with open(path) as f:
            input_lines = []
            output_words = []
            last_type = None
            for line in f:
                if line.startswith(cls.INPUT_PREFIX):
                    # A new input after an output block closes the previous case
                    if last_type == TokenizerTestCase.OUTPUT_PREFIX and input_lines:
                        yield TokenizerTestCase("\n".join(input_lines), output_words)
                        input_lines = []
                        output_words = []
                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
                    last_type = TokenizerTestCase.INPUT_PREFIX
                elif line.startswith(cls.OUTPUT_PREFIX):
                    output_words.extend(list(cls._parse_output_line(line.strip())))
                    last_type = TokenizerTestCase.OUTPUT_PREFIX
                else:
                    # Comments separate test cases
                    if input_lines:
                        yield TokenizerTestCase("\n".join(input_lines), output_words)
                        input_lines = []
                        output_words = []
                    last_type = None
            # Flush the final case if the file does not end with a comment line
            if input_lines:
                yield TokenizerTestCase("\n".join(input_lines), output_words)
_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))


@pytest.fixture(scope="session")
def HU():
    return Hungarian()


@pytest.fixture(scope="module")
def hu_tokenizer(HU):
    return HU.tokenizer


@pytest.mark.parametrize("test_case", _DOTS_CASES)
def test_abbreviations(hu_tokenizer, test_case):
    tokens = hu_tokenizer(test_case.input)
    token_list = [token.orth_ for token in tokens if not token.is_space]
    assert test_case.expected_tokens == token_list, "{} was erroneously tokenized as {}".format(test_case, token_list)
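As a usage sketch (not part of the commit, and assuming the two new files sit in the same directory), each parsed case pairs the raw input with the ordered token texts before pytest parametrizes over them:

for case in _DOTS_CASES[:2]:
    print(case.to_tuple())
# Expected, given the data file above:
# ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.'])
# ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])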