From e56fea14eb7e807d5ea4ee5fdd12f7ca0610690a Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 28 Jun 2017 01:24:25 +0900
Subject: [PATCH] Add basic Japanese tokenizer test

---
 spacy/tests/conftest.py          | 8 +++++++-
 spacy/tests/ja/__init__.py       | 0
 spacy/tests/ja/test_tokenizer.py | 8 ++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/ja/__init__.py
 create mode 100644 spacy/tests/ja/test_tokenizer.py

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index b8ada1d9a..b0f11b5a4 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -5,6 +5,7 @@ from ..en import English
 from ..de import German
 from ..es import Spanish
 from ..it import Italian
+from ..ja import Japanese
 from ..fr import French
 from ..pt import Portuguese
 from ..nl import Dutch
@@ -27,7 +28,7 @@ import os
 import pytest
 
 
-LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
+LANGUAGES = [English, German, Spanish, Italian, Japanese, French, Portuguese, Dutch,
              Swedish, Hungarian, Finnish, Bengali, Norwegian]
 
 
@@ -76,6 +77,11 @@ def fi_tokenizer():
     return Finnish.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ja_tokenizer():
+    return Japanese.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def sv_tokenizer():
     return Swedish.Defaults.create_tokenizer()
diff --git a/spacy/tests/ja/__init__.py b/spacy/tests/ja/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py
new file mode 100644
index 000000000..8d45c822d
--- /dev/null
+++ b/spacy/tests/ja/test_tokenizer.py
@@ -0,0 +1,8 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+def test_japanese_tokenizer(ja_tokenizer):
+    tokens = ja_tokenizer("日本語だよ")
+    assert len(tokens) == 3
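
Note: for reference, a minimal sketch of what the new fixture and test
exercise, run standalone rather than through pytest. This is an
illustration, not part of the patch; it assumes the spacy.ja language
data and the external tokenizer backend it wraps are installed.

    # coding: utf-8
    # Standalone sketch of the new test's behavior (assumption: the
    # spacy.ja.Japanese class and its tokenizer dependency are available).
    from __future__ import unicode_literals
    from spacy.ja import Japanese

    tokenizer = Japanese.Defaults.create_tokenizer()
    tokens = tokenizer("日本語だよ")  # "It's Japanese."
    # The patch's test only asserts the token count; a plausible
    # segmentation would be ['日本語', 'だ', 'よ'].
    print(len(tokens))  # expected: 3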