Passing Hungarian abbrev tests.

Gyorgy Orosz 2016-12-10 23:29:41 +01:00
parent 0289b8ceaa
commit 2051726fd3
5 changed files with 139 additions and 57 deletions

View File

@@ -24,34 +24,7 @@ STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))
 HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
-TOKENIZER_PREFIXES = map(re.escape, r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-'
-``
-`
-#
-US$
-C$
-A$
-....
-...
-»
-_
-§
-'''.strip().split('\n'))
+TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- .... ...'''.split()
 TOKENIZER_SUFFIXES = r'''
 ,
@@ -125,11 +98,11 @@ _
 (?<=[0-9])kb
 '''.strip().split('\n')
-TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
                      r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
                      r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
-ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in
+ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
                  _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
 TOKENIZER_EXCEPTIONS = {
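As a sketch (not from the commit itself): the practical effect of narrowing the infix pattern from \.\.\.+ to \.\.+ is that any run of two or more dots now acts as a token separator, which the Egy..ket. case in the new test data exercises; the {"ORTH": abbrev} entries, in turn, describe each listed abbreviation as a single token whose surface form is kept intact. A minimal illustration of the infix behaviour, assuming only Python's re module and a hypothetical standalone pattern rather than spaCy's compiled tokenizer:

import re

# Hypothetical standalone version of the new infix alternative: two or
# more dots separate tokens. re.split only shows where the split points
# fall; spaCy itself also keeps the matched dots as their own token.
ellipsis_infix = re.compile(r"\.\.+")

print(ellipsis_infix.split("Egy..ket."))  # ['Egy', 'ket.']
print(ellipsis_infix.split("Valami..."))  # ['Valami', '']
print(ellipsis_infix.split("egy.ketto"))  # ['egy.ketto'] (a single dot is not an infix match)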

View File

@@ -1,27 +0,0 @@
import pytest

from spacy.hu import Hungarian


@pytest.fixture(scope="session")
def HU():
    return Hungarian()


@pytest.fixture(scope="module")
def hu_tokenizer(HU):
    return HU.tokenizer


@pytest.mark.parametrize(("input_str", "expected_length"), [
    ("A vs. egy", 3),
    ("A dr. egy", 3),
    ("A .hu egy tld.", 5),
    ("A .hu.", 3),
    ("Az egy.ketto pelda.", 4),
    ("A pl. rovidites.", 4),
    ("A S.M.A.R.T. szo.", 4)
])
def test_abbreviations(hu_tokenizer, input_str, expected_length):
    tokens = hu_tokenizer(input_str)
    assert len(tokens) == expected_length

View File

@@ -0,0 +1,4 @@
__author__ = 'gyorgyorosz'

if __name__ == "__main__":
    pass

View File

@@ -0,0 +1,58 @@
# TOKEN dots
0. simple words
IN : N. kormányzósági
IN : székhely.
OUT: <s><w>N.</w><ws> </ws><w>kormányzósági</w><ws>
OUT: </ws><w>székhely</w><c>.</c></s>
1. words with dots
1.1 sentence-internal versions
1.1.1 words starting with a dot
IN : A .hu egy tld.
OUT: <s><w>A</w><ws> </ws><w>.hu</w><ws> </ws><w>egy</w><ws> </ws><w>tld</w><c>.</c></s>
1.1.2 dot inside the word
IN : Az egy.ketto pelda.
OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><ws> </ws><w>pelda</w><c>.</c></s>
1.1.3 dot at the end of the word
IN : A pl. rovidites.
OUT: <s><w>A</w><ws> </ws><w>pl.</w><ws> </ws><w>rovidites</w><c>.</c></s>
1.1.4 dotted word
IN : A S.M.A.R.T. szo.
OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w><ws> </ws><w>szo</w><c>.</c></s>
1.2 sentence-final versions
1.2.1 words starting with a dot
IN : A .hu.
OUT: <s><w>A</w><ws> </ws><w>.hu</w><c>.</c></s>
1.2.2 dot inside the word
IN : Az egy.ketto.
OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><c>.</c></s>
1.2.3 dot at the end of the word
#TODO: cf. Huntoken
IN : A pl.
OUT: <s><w>A</w><ws> </ws><w>pl.</w></s>
1.2.4 dotted word
#TODO: cf. Huntoken
IN : A S.M.A.R.T.
OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w></s>
2. multiple dots
2.1 new word after two or more dots
IN : Egy..ket.
OUT: <s><w>Egy</w><c>..</c><w>ket</w><c>.</c></s>
IN : Valami... van.
OUT: <s><w>Valami</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
IN : Valami ...van...
OUT: <s><w>Valami</w><ws> </ws><c>...</c><w>van</w><c>...</c></s>
IN : Valami...
OUT: <s><w>Valami</w><c>...</c></s>
IN : Valami ...
OUT: <s><w>Valami</w><ws> </ws><c>...</c></s>
IN : Valami ... más.
OUT: <s><w>Valami</w><ws> </ws><c>...</c><ws> </ws><w>más</w><c>.</c></s>
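In this markup, <s> wraps a sentence, <w> a word token, <c> a punctuation token, and <ws> whitespace. As an illustration, the expected token list can be recovered from an OUT line with the same regex the harness below compiles as WORD_PATTERN (a sketch assuming only Python's re module):

import re

# <w>...</w> and <c>...</c> carry expected tokens; <ws>...</ws> never
# matches because the tag group is a single character, w or c.
WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

line = "OUT: <s><w>Valami</w><ws> </ws><c>...</c><ws> </ws><w>más</w><c>.</c></s>"
print([m.group(2) for m in WORD_PATTERN.finditer(line)])
# ['Valami', '...', 'más', '.']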

View File

@@ -0,0 +1,74 @@
import os
import re

import pytest

from spacy.hu import Hungarian

_MODULE_PATH = os.path.dirname(__file__)


class TokenizerTestCase(object):
    INPUT_PREFIX = "IN :"
    OUTPUT_PREFIX = "OUT:"
    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

    def __init__(self, input_str, expected_words):
        self.input = input_str
        self.expected_tokens = expected_words

    def __repr__(self):
        return "TokenizerTestCase<input={}, words={}>".format(repr(self.input), self.expected_tokens)

    def to_tuple(self):
        return (self.input, self.expected_tokens)

    @classmethod
    def _parse_output_line(cls, line):
        for match in cls.WORD_PATTERN.finditer(line):
            yield match.group(2)

    @classmethod
    def read_from_file(cls, path):
        with open(path) as f:
            input_lines = []
            output_words = []
            last_type = None
            for line in f:
                if line.startswith(cls.INPUT_PREFIX):
                    # A new input after an output block closes the previous case
                    if last_type == TokenizerTestCase.OUTPUT_PREFIX and input_lines:
                        yield TokenizerTestCase("\n".join(input_lines), output_words)
                        input_lines = []
                        output_words = []
                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
                    last_type = TokenizerTestCase.INPUT_PREFIX
                elif line.startswith(cls.OUTPUT_PREFIX):
                    output_words.extend(list(cls._parse_output_line(line.strip())))
                    last_type = TokenizerTestCase.OUTPUT_PREFIX
                else:
                    # Comments separate test cases
                    if input_lines:
                        yield TokenizerTestCase("\n".join(input_lines), output_words)
                        input_lines = []
                        output_words = []
                    last_type = None
            # Flush the final case if the file does not end with a comment line
            if input_lines:
                yield TokenizerTestCase("\n".join(input_lines), output_words)
_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))


@pytest.fixture(scope="session")
def HU():
    return Hungarian()


@pytest.fixture(scope="module")
def hu_tokenizer(HU):
    return HU.tokenizer


@pytest.mark.parametrize("test_case", _DOTS_CASES)
def test_abbreviations(hu_tokenizer, test_case):
    tokens = hu_tokenizer(test_case.input)
    token_list = [token.orth_ for token in tokens if not token.is_space]
    assert test_case.expected_tokens == token_list, "{} was erroneously tokenized as {}".format(test_case, token_list)
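As a usage sketch (not part of the commit, and assuming the two new files sit in the same directory), each parsed case pairs the raw input with the ordered token texts before pytest parametrizes over them:

for case in _DOTS_CASES[:2]:
    print(case.to_tuple())
# Expected, given the data file above:
# ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.'])
# ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])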