Mirror of https://github.com/explosion/spaCy.git, synced 2024-11-11 12:18:04 +03:00
b892b446cc
* Added the same punctuation rules as the Danish language.
* Added abbreviations, including capitalized variants for some of them, plus a few specific cases.
* Added a test for long texts in Swedish.
* Added morph rules, infixes, and suffixes to __init__.py for Swedish.
* Added tests for prefixes, infixes, and suffixes.
* Added tests for lemmas.
* Renamed files to follow convention.
* [sv] Removed ambiguous abbreviations.
* Added more tests for tokenizer exceptions.
* Added a test for the punctuation problem in issue #2578.
* Contributor agreement.
* Removed the faulty lemmatization of "jag" ("I"), which was being lemmatized to "jaga" ("hunt").
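The commit message mentions abbreviation exceptions that also accept capitalized variants. Below is a minimal sketch of that pattern, assuming spaCy's v2-era tokenizer-exception format (ORTH-keyed token dicts); the abbreviation list here is illustrative, and the real entries live in spacy/lang/sv/tokenizer_exceptions.py.

    # A minimal sketch of the abbreviation-exception pattern the commit
    # message describes, assuming spaCy's v2-era exception format;
    # the real entries live in spacy/lang/sv/tokenizer_exceptions.py.
    from spacy.symbols import ORTH

    _exc = {}

    # Each abbreviation stays a single token; some also get a capitalized
    # variant, as the commit message notes. This list is illustrative.
    for orth in ["bl.a.", "t.ex.", "dvs.", "fr.o.m."]:
        _exc[orth] = [{ORTH: orth}]
        # Capitalized variant, e.g. "Bl.a." at the start of a sentence.
        capitalized = orth.capitalize()
        _exc[capitalized] = [{ORTH: capitalized}]

    TOKENIZER_EXCEPTIONS = _exc

Keeping the exceptions as single-token entries means the tokenizer never splits these abbreviations at their internal periods, which is what the long-text and trailing-dot tests below exercise.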
22 lines · 860 B · Python
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals

import pytest


def test_sv_tokenizer_handles_long_text(sv_tokenizer):
    # Opening of H.C. Andersen's "The Ugly Duckling" in Swedish.
    text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön,
höet var uppställt i stackar nere vid den gröna ängen, och där gick storken på sina långa,
röda ben och snackade engelska, för det språket hade han lärt sig av sin mor.

Runt om åkrar och äng låg den stora skogen, och mitt i skogen fanns djupa sjöar; jo, det var verkligen trevligt ute på landet!"""
    tokens = sv_tokenizer(text)
    assert len(tokens) == 86


def test_sv_tokenizer_handles_trailing_dot_for_i_in_sentence(sv_tokenizer):
    # Regression test for issue #2578: the trailing "i." should split into
    # the word "i" plus a sentence-final period, not stay one token.
    text = "Provar att tokenisera en mening med ord i."
    tokens = sv_tokenizer(text)
    assert len(tokens) == 9
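Both tests receive sv_tokenizer as a pytest fixture. A minimal sketch of that fixture, assuming it follows the shared-conftest pattern spaCy uses for its language tests (the real definition lives in spacy/tests/conftest.py):

    # Sketch of the sv_tokenizer fixture the tests above rely on, assuming
    # the usual pattern from spaCy's shared test conftest.
    import pytest
    from spacy.util import get_lang_class


    @pytest.fixture(scope="session")
    def sv_tokenizer():
        # Build the Swedish language class and return only its tokenizer,
        # so no statistical models need to be installed to run the tests.
        return get_lang_class("sv")().tokenizer

Constructing just the tokenizer keeps these tests fast and free of model dependencies, which is why the token-count assertions can run in any plain checkout of the repository.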