# coding: utf-8
# Tests for the default Hungarian tokenizer (spacy/tests/hu/tokenizer/test_tokenizer.py).
# Test cases are loaded from the test_default_token_*.txt data files in this directory.
import os
import re
import pytest
from spacy.hu import Hungarian
# Directory containing this module; the token test data files live alongside it.
_MODULE_PATH = os.path.dirname(__file__)
class TokenizerTestCase(object):
    """One tokenizer test case parsed from a data file.

    Data files interleave lines prefixed with ``IN :`` (raw input text, possibly
    spanning several lines) and ``OUT:`` (expected tokens, each wrapped in
    ``<w>...</w>`` or ``<c>...</c>`` tags).  Any other line acts as a comment
    and separates test cases.
    """

    INPUT_PREFIX = "IN :"
    OUTPUT_PREFIX = "OUT:"
    # Matches <w>token</w> or <c>token</c>; group 2 holds the token text.
    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

    def __init__(self, input_str, expected_words):
        self.input = input_str                 # raw text fed to the tokenizer
        self.expected_tokens = expected_words  # expected token strings, in order

    def __repr__(self):
        return "TokenizerTestCase<input={}, words={}>".format(
            repr(self.input), self.expected_tokens)

    def to_tuple(self):
        """Return the case as an ``(input, expected_tokens)`` pair."""
        return (self.input, self.expected_tokens)

    @classmethod
    def _parse_output_line(cls, line):
        """Yield the expected token strings found on one OUT line."""
        for match in cls.WORD_PATTERN.finditer(line):
            yield match.group(2)

    @classmethod
    def read_from_file(cls, path):
        """Yield TokenizerTestCase objects parsed from the data file at *path*."""
        # Data files contain Hungarian text; read them explicitly as UTF-8.
        with open(path, encoding="utf-8") as f:
            input_lines = []
            output_words = []
            last_type = None
            for line in f:
                if line.startswith(cls.INPUT_PREFIX):
                    # An IN line directly after OUT lines starts a new case,
                    # so flush the one collected so far.
                    if last_type == cls.OUTPUT_PREFIX and input_lines:
                        yield cls("\n".join(input_lines), output_words)
                        input_lines = []
                        output_words = []
                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
                    last_type = cls.INPUT_PREFIX
                elif line.startswith(cls.OUTPUT_PREFIX):
                    output_words.extend(list(cls._parse_output_line(line.strip())))
                    last_type = cls.OUTPUT_PREFIX
                else:
                    # Comments separate test cases
                    if input_lines:
                        yield cls("\n".join(input_lines), output_words)
                        input_lines = []
                        output_words = []
                    last_type = None
            # BUG FIX: flush the final pending case when the file ends without
            # a trailing comment line — the original silently dropped it.
            if input_lines:
                yield cls("\n".join(input_lines), output_words)
# Load each category of test cases from its data file next to this module.
_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
_HYPHEN_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_hyphen.txt"))
_QUOTE_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_quote.txt"))
_NUMBER_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_numbers.txt"))
_MISC_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_misc.txt"))
_IT_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_it.txt"))

# TODO: Until this gets fixed we cannot really test the urls: https://github.com/explosion/spaCy/issues/344
ALL_TESTCASES = _DOTS_CASES + _HYPHEN_CASES + _QUOTE_CASES + _NUMBER_CASES + _MISC_CASES  # + _IT_CASES
@pytest.fixture(scope="session")
def HU():
    """Build the Hungarian language object once for the whole test session."""
    language = Hungarian()
    return language
@pytest.fixture(scope="module")
def hu_tokenizer(HU):
    """Expose the shared Hungarian pipeline's tokenizer to the tests."""
    return HU.tokenizer
@pytest.mark.parametrize("test_case", ALL_TESTCASES)
def test_testcases(hu_tokenizer, test_case):
    """Tokenize each case's input and compare the non-space tokens to the
    expected token list from the data file."""
    tokens = hu_tokenizer(test_case.input)
    token_list = [token.orth_ for token in tokens if not token.is_space]
    # Include the failing case in the assertion message for easier debugging.
    assert test_case.expected_tokens == token_list, \
        "{} was erroneously tokenized as {}".format(test_case, token_list)