",
+ "''",
+ "C++",
+ "a.",
+ "b.",
+ "c.",
+ "d.",
+ "e.",
+ "f.",
+ "g.",
+ "h.",
+ "i.",
+ "j.",
+ "k.",
+ "l.",
+ "m.",
+ "n.",
+ "o.",
+ "p.",
+ "q.",
+ "r.",
+ "s.",
+ "t.",
+ "u.",
+ "v.",
+ "w.",
+ "x.",
+ "y.",
+ "z.",
+ "ä.",
+ "ö.",
+ "ü."
+]
+
+
+__all__ = [ "ABBREVIATIONS" ]
diff --git a/spacy/language_data/emoticons.py b/spacy/language_data/emoticons.py
index 3fa44368d..bc951a007 100644
--- a/spacy/language_data/emoticons.py
+++ b/spacy/language_data/emoticons.py
@@ -13,6 +13,7 @@ EMOTICONS = set("""
(-:
=)
(=
+")
:]
:-]
[:
diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py
index fb784271e..d8ed19ca1 100644
--- a/spacy/language_data/punctuation.py
+++ b/spacy/language_data/punctuation.py
@@ -1,133 +1,115 @@
# encoding: utf8
from __future__ import unicode_literals
-
-TOKENIZER_PREFIXES = r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-¡
-¿
-„
-“
-'
-``
-`
-#
-‘
-....
-...
-…
-‚
-»
-§
-US$
-C$
-A$
-a-
-'''.strip().split('\n')
+import re
-TOKENIZER_SUFFIXES = r'''
-,
-\"
-\)
-\]
-\}
-\*
-\!
-\?
-%
-\$
->
-:
-;
-'
-”
-“
-«
-_
-''
-'s
-'S
-’s
-’S
-’
-‘
-°
-€
-…
-\.\.
-\.\.\.
-\.\.\.\.
-(?<=[a-z0-9)\]”"'%\)])\.
-(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
-\-\-
-´
-(?<=[0-9])km²
-(?<=[0-9])m²
-(?<=[0-9])cm²
-(?<=[0-9])mm²
-(?<=[0-9])km³
-(?<=[0-9])m³
-(?<=[0-9])cm³
-(?<=[0-9])mm³
-(?<=[0-9])ha
-(?<=[0-9])km
-(?<=[0-9])m
-(?<=[0-9])cm
-(?<=[0-9])mm
-(?<=[0-9])µm
-(?<=[0-9])nm
-(?<=[0-9])yd
-(?<=[0-9])in
-(?<=[0-9])ft
-(?<=[0-9])kg
-(?<=[0-9])g
-(?<=[0-9])mg
-(?<=[0-9])µg
-(?<=[0-9])t
-(?<=[0-9])lb
-(?<=[0-9])oz
-(?<=[0-9])m/s
-(?<=[0-9])km/h
-(?<=[0-9])mph
-(?<=[0-9])°C
-(?<=[0-9])°K
-(?<=[0-9])°F
-(?<=[0-9])hPa
-(?<=[0-9])Pa
-(?<=[0-9])mbar
-(?<=[0-9])mb
-(?<=[0-9])T
-(?<=[0-9])G
-(?<=[0-9])M
-(?<=[0-9])K
-(?<=[0-9])kb
-'''.strip().split('\n')
+_ALPHA_LOWER = """
+a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ẽ ę f g ĝ ğ h i ı
+î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s
+ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ỳ ŷ ỹ z ź ž ż þ
+"""
-TOKENIZER_INFIXES = r'''
-…
-\.\.\.+
-(?<=[a-z])\.(?=[A-Z])
-(?<=[a-z])\.(?=[A-Z])
-(?<=[a-zA-Z])-(?=[a-zA-z])
-(?<=[a-zA-Z])--(?=[a-zA-z])
-(?<=[0-9])-(?=[0-9])
-(?<=[A-Za-z]),(?=[A-Za-z])
-(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
-(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
-(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
-(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
-'''.strip().split('\n')
+_ALPHA_UPPER = """
+A Ä À Á Â Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ẽ Ę F G Ĝ Ğ H I İ
+Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S
+Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ
+"""
+
+
+_UNITS = """
+km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
+µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
+TB T G M K
+"""
+
+
+_CURRENCY = r"""
+\$ £ € ¥ ฿ US\$ C\$ A\$
+"""
+
+
+_QUOTES = r"""
+' '' " ” “ `` ` ‘ ´ ‚ , „ » «
+"""
+
+
+_PUNCT = r"""
+… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
+"""
+
+
+_HYPHENS = r"""
+- – — -- ---
+"""
+
+
+LIST_ELLIPSES = [
+ r'\.\.+',
+ "…"
+]
+
+
+LIST_CURRENCY = list(_CURRENCY.strip().split())
+LIST_QUOTES = list(_QUOTES.strip().split())
+LIST_PUNCT = list(_PUNCT.strip().split())
+LIST_HYPHENS = list(_HYPHENS.strip().split())
+# The tables above span several lines, so entries are separated by newlines as
+# well as spaces: split on any whitespace instead of replacing ' ' only.
+ALPHA_LOWER = ''.join(_ALPHA_LOWER.split())
+ALPHA_UPPER = ''.join(_ALPHA_UPPER.split())
+ALPHA = ALPHA_LOWER + ALPHA_UPPER
+
+# '|'-joined alternations, meant to be wrapped in a (?:...) group.
+QUOTES = '|'.join(_QUOTES.split())
+CURRENCY = '|'.join(_CURRENCY.split())
+UNITS = '|'.join(_UNITS.split())
+HYPHENS = '|'.join(_HYPHENS.split())
+
+
+
+# Prefixes
+
+TOKENIZER_PREFIXES = (
+ ['§', '%', r'\+'] +
+ LIST_PUNCT +
+ LIST_ELLIPSES +
+ LIST_QUOTES +
+ LIST_CURRENCY
+)
+
+
+# Suffixes
+
+TOKENIZER_SUFFIXES = (
+ LIST_PUNCT +
+ LIST_ELLIPSES +
+ LIST_QUOTES +
+ [
+ r'(?<=[0-9])\+',
+ r'(?<=°[FfCcKk])\.',
+ r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
+ r'(?<=[0-9])(?:{u})'.format(u=UNITS),
+ r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
+ r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
+ "'s", "'S", "’s", "’S"
+ ]
+)
+
+
+# Infixes
+
+TOKENIZER_INFIXES = (
+ LIST_ELLIPSES +
+ [
+ r'(?<=[0-9])[+\-\*/^](?=[0-9])',
+ r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+ r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+ r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
+ r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
+ ]
+)
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py
index f5b6b5040..966960721 100644
--- a/spacy/language_data/tag_map.py
+++ b/spacy/language_data/tag_map.py
@@ -20,5 +20,6 @@ TAG_MAP = {
"X": {POS: X},
"CONJ": {POS: CONJ},
"ADJ": {POS: ADJ},
- "VERB": {POS: VERB}
+ "VERB": {POS: VERB},
+ "PART": {POS: PART}
}
diff --git a/spacy/nl/__init__.py b/spacy/nl/__init__.py
index d958783ea..d4aa39506 100644
--- a/spacy/nl/__init__.py
+++ b/spacy/nl/__init__.py
@@ -1,8 +1,6 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
-from os import path
-
from ..language import Language
from ..attrs import LANG
from .language_data import *
diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py
index 8683f83ac..a4a657c33 100644
--- a/spacy/nl/language_data.py
+++ b/spacy/nl/language_data.py
@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py
index 06c6417dc..ed26fb0b3 100644
--- a/spacy/pt/__init__.py
+++ b/spacy/pt/__init__.py
@@ -1,8 +1,6 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
-from os import path
-
from ..language import Language
from ..attrs import LANG
diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py
index 8683f83ac..a4a657c33 100644
--- a/spacy/pt/language_data.py
+++ b/spacy/pt/language_data.py
@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/sv/__init__.py b/spacy/sv/__init__.py
index 25930386a..e03c9a56f 100644
--- a/spacy/sv/__init__.py
+++ b/spacy/sv/__init__.py
@@ -1,8 +1,6 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
-from os import path
-
from ..language import Language
from ..attrs import LANG
from .language_data import *
diff --git a/spacy/sv/language_data.py b/spacy/sv/language_data.py
index 8683f83ac..a4a657c33 100644
--- a/spacy/sv/language_data.py
+++ b/spacy/sv/language_data.py
@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/tests/website/__init__.py b/spacy/tests/de/__init__.py
similarity index 100%
rename from spacy/tests/website/__init__.py
rename to spacy/tests/de/__init__.py
diff --git a/spacy/tests/de/conftest.py b/spacy/tests/de/conftest.py
new file mode 100644
index 000000000..c6b8be26e
--- /dev/null
+++ b/spacy/tests/de/conftest.py
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+from ...de import German
+
+
+@pytest.fixture
+def de_tokenizer():
+ return German.Defaults.create_tokenizer()
diff --git a/spacy/tests/de/tokenizer/__init__.py b/spacy/tests/de/tokenizer/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/de/tokenizer/test_exceptions.py b/spacy/tests/de/tokenizer/test_exceptions.py
new file mode 100644
index 000000000..13da3dc33
--- /dev/null
+++ b/spacy/tests/de/tokenizer/test_exceptions.py
@@ -0,0 +1,27 @@
+# coding: utf-8
+"""Test that tokenizer exceptions and emoticons are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
+def test_tokenizer_splits_contractions(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
+def test_tokenizer_handles_abbr(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 1
+
+
+def test_tokenizer_handles_exc_in_text(de_tokenizer):
+ text = "Ich bin z.Zt. im Urlaub."
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 6
+ assert tokens[2].text == "z.Zt."
+ assert tokens[2].lemma_ == "zur Zeit"
diff --git a/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..dcf4f4ef0
--- /dev/null
+++ b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py
@@ -0,0 +1,116 @@
+# coding: utf-8
+"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["(unter)"])
+def test_tokenizer_splits_no_special(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["unter'm"])
+def test_tokenizer_splits_no_punct(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(unter'm"])
+def test_tokenizer_splits_prefix_punct(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["unter'm)"])
+def test_tokenizer_splits_suffix_punct(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(unter'm)"])
+def test_tokenizer_splits_even_wrap(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["(unter'm?)"])
+def test_tokenizer_splits_uneven_wrap(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 5
+
+
+@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
+def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text', ["z.B.)"])
+def test_tokenizer_splits_suffix_interact(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(z.B.)"])
+def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(z.B.?)"])
+def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["blau-rot"])
+def test_tokenizer_splits_hyphens(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
+def test_tokenizer_splits_numeric_range(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
+def test_tokenizer_splits_period_infix(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
+def test_tokenizer_splits_comma_infix(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[0].text == text.split(",")[0]
+ assert tokens[1].text == ","
+ assert tokens[2].text == text.split(",")[1]
+
+
+@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
+def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 3
+
+
+def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
+ tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
+ assert len(tokens) == 12
+ assert tokens[0].text == "Viele"
+ assert tokens[1].text == "Regeln"
+ assert tokens[2].text == "--"
+ assert tokens[3].text == "wie"
+ assert tokens[4].text == "die"
+ assert tokens[5].text == "Bindestrich"
+ assert tokens[6].text == "-"
+ assert tokens[7].text == "Regeln"
+ assert tokens[8].text == "--"
+ assert tokens[9].text == "sind"
+ assert tokens[10].text == "kompliziert"
diff --git a/spacy/tests/de/tokenizer/test_text.py b/spacy/tests/de/tokenizer/test_text.py
new file mode 100644
index 000000000..84fa6f2a5
--- /dev/null
+++ b/spacy/tests/de/tokenizer/test_text.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""Test that longer and mixed texts are tokenized correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_long_text(de_tokenizer):
+ text = """Die Verwandlung
+
+Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in
+seinem Bett zu einem ungeheueren Ungeziefer verwandelt.
+
+Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig
+hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten
+Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit,
+kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen
+Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
+
+»Was ist mit mir geschehen?«, dachte er."""
+
+ tokens = de_tokenizer(text)
+ assert len(tokens) == 109
+
+
+@pytest.mark.parametrize('text,length', [
+ ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
+ ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
+ ("Kraftfahrzeug-Haftpflichtversicherung", 3),
+ ("Vakuum-Mittelfrequenz-Induktionsofen", 5)
+ ])
+def test_tokenizer_handles_long_words(de_tokenizer, text, length):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [
+ ("»Was ist mit mir geschehen?«, dachte er.", 12),
+ ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)
+ ])
+def test_tokenizer_handles_examples(de_tokenizer, text, length):
+ tokens = de_tokenizer(text)
+ assert len(tokens) == length
diff --git a/spacy/tests/en/__init__.py b/spacy/tests/en/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/en/conftest.py b/spacy/tests/en/conftest.py
new file mode 100644
index 000000000..3a3516c41
--- /dev/null
+++ b/spacy/tests/en/conftest.py
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+from ...en import English
+
+
+@pytest.fixture
+def en_tokenizer():
+ return English.Defaults.create_tokenizer()
diff --git a/spacy/tests/en/tokenizer/__init__.py b/spacy/tests/en/tokenizer/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/en/tokenizer/test_contractions.py b/spacy/tests/en/tokenizer/test_contractions.py
new file mode 100644
index 000000000..a97b8f5ba
--- /dev/null
+++ b/spacy/tests/en/tokenizer/test_contractions.py
@@ -0,0 +1,87 @@
+# coding: utf-8
+"""Test that tokens are created correctly for contractions."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_basic_contraction(en_tokenizer):
+ text = "don't giggle"
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[1].text == "n't"
+ text = "i said don't!"
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 5
+ assert tokens[4].text == "!"
+
+
+@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
+def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
+def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
+ tokens = en_tokenizer(text_poss)
+ assert len(tokens) == 2
+ assert tokens[0].text == text
+ assert tokens[1].text == "'s"
+
+
+@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
+def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 2
+ assert tokens[0].text == text.split("'")[0]
+ assert tokens[1].text == "'"
+
+
+@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
+def test_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 1  # the apostrophe must not be split off as its own token
+    assert tokens[0].text == text
+
+
+@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
+def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 2
+ assert tokens[0].text == text.split("'")[0]
+ assert tokens[1].text == "'ll"
+ assert tokens[1].lemma_ == "will"
+
+
+@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
+def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
+ tokens_lower = en_tokenizer(text_lower)
+ tokens_title = en_tokenizer(text_title)
+ assert tokens_title[0].text == tokens_lower[0].text.title()
+ assert tokens_lower[0].text == tokens_title[0].text.lower()
+ assert tokens_lower[1].text == tokens_title[1].text
+
+
+@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
+@pytest.mark.parametrize('contraction', ["'ll", "'d"])
+def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
+ tokens = en_tokenizer(pron + contraction)
+ assert tokens[0].text == pron
+ assert tokens[1].text == contraction
+
+
+@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
+def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
+ tokens = en_tokenizer(exc)
+ assert len(tokens) == 1
+
+
+@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
+def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
+ tokens = en_tokenizer(wo_punct)
+ assert len(tokens) == 2
+ tokens = en_tokenizer(w_punct)
+ assert len(tokens) == 3
diff --git a/spacy/tests/en/tokenizer/test_exceptions.py b/spacy/tests/en/tokenizer/test_exceptions.py
new file mode 100644
index 000000000..ac7ed452f
--- /dev/null
+++ b/spacy/tests/en/tokenizer/test_exceptions.py
@@ -0,0 +1,20 @@
+# coding: utf-8
+"""Test that tokenizer exceptions are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
+def test_tokenizer_handles_abbr(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 1
+
+
+def test_tokenizer_handles_exc_in_text(en_tokenizer):
+ text = "It's mediocre i.e. bad."
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 6
+ assert tokens[3].text == "i.e."
diff --git a/spacy/tests/tokenizer/test_indices.py b/spacy/tests/en/tokenizer/test_indices.py
similarity index 91%
rename from spacy/tests/tokenizer/test_indices.py
rename to spacy/tests/en/tokenizer/test_indices.py
index 5df7bcc59..0ed6ca4dc 100644
--- a/spacy/tests/tokenizer/test_indices.py
+++ b/spacy/tests/en/tokenizer/test_indices.py
@@ -1,12 +1,14 @@
+# coding: utf-8
"""Test that token.idx correctly computes index into the original string."""
+
from __future__ import unicode_literals
import pytest
def test_simple_punct(en_tokenizer):
- text = 'to walk, do foo'
+ text = "to walk, do foo"
tokens = en_tokenizer(text)
assert tokens[0].idx == 0
assert tokens[1].idx == 3
@@ -16,7 +18,7 @@ def test_simple_punct(en_tokenizer):
def test_complex_punct(en_tokenizer):
- text = 'Tom (D., Ill.)!'
+ text = "Tom (D., Ill.)!"
tokens = en_tokenizer(text)
assert tokens[0].idx == 0
assert len(tokens[0]) == 3
diff --git a/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..042934d4e
--- /dev/null
+++ b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py
@@ -0,0 +1,136 @@
+# coding: utf-8
+"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["(can)"])
+def test_tokenizer_splits_no_special(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["can't"])
+def test_tokenizer_splits_no_punct(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(can't"])
+def test_tokenizer_splits_prefix_punct(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["can't)"])
+def test_tokenizer_splits_suffix_punct(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(can't)"])
+def test_tokenizer_splits_even_wrap(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["(can't?)"])
+def test_tokenizer_splits_uneven_wrap(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 5
+
+
+@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
+def test_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text', ["U.S.)"])
+def test_tokenizer_splits_suffix_interact(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(U.S.)"])
+def test_tokenizer_splits_even_wrap_interact(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(U.S.?)"])
+def test_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["best-known"])
+def test_tokenizer_splits_hyphens(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
+def test_tokenizer_splits_numeric_range(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["best.Known", "Hello.World"])
+def test_tokenizer_splits_period_infix(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["Hello,world", "one,two"])
+def test_tokenizer_splits_comma_infix(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[0].text == text.split(",")[0]
+ assert tokens[1].text == ","
+ assert tokens[2].text == text.split(",")[1]
+
+
+@pytest.mark.parametrize('text', ["best...Known", "best...known"])
+def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 3
+
+
+def test_tokenizer_splits_double_hyphen_infix(en_tokenizer):
+ tokens = en_tokenizer("No decent--let alone well-bred--people.")
+ assert tokens[0].text == "No"
+ assert tokens[1].text == "decent"
+ assert tokens[2].text == "--"
+ assert tokens[3].text == "let"
+ assert tokens[4].text == "alone"
+ assert tokens[5].text == "well"
+ assert tokens[6].text == "-"
+ assert tokens[7].text == "bred"
+ assert tokens[8].text == "--"
+ assert tokens[9].text == "people"
+
+
+@pytest.mark.xfail
+def test_tokenizer_splits_period_abbr(en_tokenizer):
+ text = "Today is Tuesday.Mr."
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 5
+ assert tokens[0].text == "Today"
+ assert tokens[1].text == "is"
+ assert tokens[2].text == "Tuesday"
+ assert tokens[3].text == "."
+ assert tokens[4].text == "Mr."
+
+
+@pytest.mark.xfail
+def test_tokenizer_splits_em_dash_infix(en_tokenizer):
+ # Re Issue #225
+ tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """
+ """you'll have to walk there.\u2014Ariel.""")
+ assert tokens[6].text == "Puddleton"
+ assert tokens[7].text == "?"
+ assert tokens[8].text == "\u2014"
diff --git a/spacy/tests/en/tokenizer/test_punct.py b/spacy/tests/en/tokenizer/test_punct.py
new file mode 100644
index 000000000..b6ae9224d
--- /dev/null
+++ b/spacy/tests/en/tokenizer/test_punct.py
@@ -0,0 +1,132 @@
+# coding: utf-8
+"""Test that open, closed and paired punctuation is split off correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+from ....util import compile_prefix_regex
+from ....language_data import TOKENIZER_PREFIXES
+
+
+
+en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
+
+PUNCT_OPEN = ['(', '[', '{', '*']
+PUNCT_CLOSE = [')', ']', '}', '*']
+PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
+
+
+@pytest.mark.parametrize('text', ["(", "((", "<"])
+def test_tokenizer_handles_only_punct(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == len(text)
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
+ tokens = en_tokenizer(punct + text)
+ assert len(tokens) == 2
+ assert tokens[0].text == punct
+ assert tokens[1].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
+ tokens = en_tokenizer(text + punct)
+ assert len(tokens) == 2
+ assert tokens[0].text == text
+ assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('punct_add', ["`"])
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
+ tokens = en_tokenizer(punct + punct_add + text)
+ assert len(tokens) == 3
+ assert tokens[0].text == punct
+ assert tokens[1].text == punct_add
+ assert tokens[2].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('punct_add', ["'"])
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
+ tokens = en_tokenizer(text + punct + punct_add)
+ assert len(tokens) == 3
+ assert tokens[0].text == text
+ assert tokens[1].text == punct
+ assert tokens[2].text == punct_add
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
+ tokens = en_tokenizer(punct + punct + punct + text)
+ assert len(tokens) == 4
+ assert tokens[0].text == punct
+ assert tokens[3].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
+ tokens = en_tokenizer(text + punct + punct + punct)
+ assert len(tokens) == 4
+ assert tokens[0].text == text
+ assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize('text', ["'The"])
+def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 2
+ assert tokens[0].text == "'"
+
+
+@pytest.mark.parametrize('text', ["Hello''"])
+def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 2
+ tokens_punct = en_tokenizer("''")
+ assert len(tokens_punct) == 1
+
+
+@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
+@pytest.mark.parametrize('text', ["Hello"])
+def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
+ tokens = en_tokenizer(punct_open + text + punct_close)
+ assert len(tokens) == 3
+ assert tokens[0].text == punct_open
+ assert tokens[1].text == text
+ assert tokens[2].text == punct_close
+
+
+@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
+@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
+@pytest.mark.parametrize('text', ["Hello"])
+def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
+ tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
+ assert len(tokens) == 5
+ assert tokens[0].text == punct_open_add
+ assert tokens[1].text == punct_open
+ assert tokens[2].text == text
+ assert tokens[3].text == punct_close
+ assert tokens[4].text == punct_close_add
+
+
+@pytest.mark.parametrize('text,punct', [("(can't", "(")])
+def test_tokenizer_splits_pre_punct_regex(text, punct):
+ match = en_search_prefixes(text)
+ assert match.group() == punct
+
+
+def test_tokenizer_splits_bracket_period(en_tokenizer):
+ text = "(And a 6a.m. run through Washington Park)."
+ tokens = en_tokenizer(text)
+ assert tokens[len(tokens) - 1].text == "."
diff --git a/spacy/tests/en/tokenizer/test_text.py b/spacy/tests/en/tokenizer/test_text.py
new file mode 100644
index 000000000..c7178fbf9
--- /dev/null
+++ b/spacy/tests/en/tokenizer/test_text.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+"""Test that longer and mixed texts are tokenized correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_long_text(en_tokenizer):
+ text = """Tributes pour in for late British Labour Party leader
+
+Tributes poured in from around the world Thursday
+to the late Labour Party leader John Smith, who died earlier from a massive
+heart attack aged 55.
+
+In Washington, the US State Department issued a statement regretting "the
+untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
+
+"Mr. Smith, throughout his distinguished"""
+ tokens = en_tokenizer(text)
+ assert len(tokens) == 76
+
+
+@pytest.mark.parametrize('text,length', [
+ ("The U.S. Army likes Shock and Awe.", 8),
+ ("U.N. regulations are not a part of their concern.", 10),
+ ("“Isn't it?”", 6),
+ ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
+ ("""'Me too!', Mr. P. Delaware cried. """, 11),
+ ("They ran about 10km.", 6),
+ # ("But then the 6,000-year ice age came...", 10)
+ ])
+def test_tokenizer_handles_cnts(en_tokenizer, text, length):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == length
diff --git a/spacy/tests/hu/conftest.py b/spacy/tests/hu/conftest.py
new file mode 100644
index 000000000..222bd1b00
--- /dev/null
+++ b/spacy/tests/hu/conftest.py
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+from ...hu import Hungarian
+
+
+@pytest.fixture
+def hu_tokenizer():
+ return Hungarian.Defaults.create_tokenizer()
diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py
index 2bfbfdf36..0b76da0c6 100644
--- a/spacy/tests/hu/tokenizer/test_tokenizer.py
+++ b/spacy/tests/hu/tokenizer/test_tokenizer.py
@@ -2,25 +2,27 @@
from __future__ import unicode_literals
import pytest
-from spacy.hu import Hungarian
-_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
- ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
- ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
- ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
- ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
- ('A .hu.', ['A', '.hu', '.']),
- ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
- ('A pl.', ['A', 'pl.']),
- ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
- ('Egy..ket.', ['Egy', '..', 'ket', '.']),
- ('Valami... van.', ['Valami', '...', 'van', '.']),
- ('Valami ...van...', ['Valami', '...', 'van', '...']),
- ('Valami...', ['Valami', '...']),
- ('Valami ...', ['Valami', '...']),
- ('Valami ... más.', ['Valami', '...', 'más', '.'])]
-_HYPHEN_TESTS = [
+DEFAULT_TESTS = [
+ ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
+ ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
+ ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
+ ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
+ ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
+ ('A .hu.', ['A', '.hu', '.']),
+ ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
+ ('A pl.', ['A', 'pl.']),
+ ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
+ ('Egy..ket.', ['Egy', '..', 'ket', '.']),
+ ('Valami... van.', ['Valami', '...', 'van', '.']),
+ ('Valami ...van...', ['Valami', '...', 'van', '...']),
+ ('Valami...', ['Valami', '...']),
+ ('Valami ...', ['Valami', '...']),
+ ('Valami ... más.', ['Valami', '...', 'más', '.'])
+]
+
+HYPHEN_TESTS = [
('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']),
('Egy -nak.', ['Egy', '-nak', '.']),
('Egy bel-.', ['Egy', 'bel-', '.']),
@@ -39,195 +41,194 @@ _HYPHEN_TESTS = [
('A 7-es.', ['A', '7-es', '.']),
('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
('A %-sal.', ['A', '%-sal', '.']),
- ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])]
+ ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
+]
-_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']),
- ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']),
- ('A 2b.', ['A', '2b', '.']),
- ('A 2b-ben.', ['A', '2b-ben', '.']),
- ('A 3.b van.', ['A', '3.b', 'van', '.']),
- ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']),
- ('A 3.b.', ['A', '3.b', '.']),
- ('A 3.b-ben.', ['A', '3.b-ben', '.']),
- ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']),
- ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']),
- ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']),
- ('A 1:35 van.', ['A', '1:35', 'van', '.']),
- ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']),
- ('A 1:35-ben.', ['A', '1:35-ben', '.']),
- ('A 1.35 van.', ['A', '1.35', 'van', '.']),
- ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']),
- ('A 1.35-ben.', ['A', '1.35-ben', '.']),
- ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']),
- ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']),
- ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']),
- ('A 10--12 van.', ['A', '10--12', 'van', '.']),
- ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']),
- ('A 10--12-ben.', ['A', '10--12-ben', '.']),
- ('A 10‐12 van.', ['A', '10‐12', 'van', '.']),
- ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']),
- ('A 10‐12-ben.', ['A', '10‐12-ben', '.']),
- ('A 10‑12 van.', ['A', '10‑12', 'van', '.']),
- ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']),
- ('A 10‑12-ben.', ['A', '10‑12-ben', '.']),
- ('A 10‒12 van.', ['A', '10‒12', 'van', '.']),
- ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']),
- ('A 10‒12-ben.', ['A', '10‒12-ben', '.']),
- ('A 10–12 van.', ['A', '10–12', 'van', '.']),
- ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']),
- ('A 10–12-ben.', ['A', '10–12-ben', '.']),
- ('A 10—12 van.', ['A', '10—12', 'van', '.']),
- ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']),
- ('A 10—12-ben.', ['A', '10—12-ben', '.']),
- ('A 10―12 van.', ['A', '10―12', 'van', '.']),
- ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']),
- ('A 10―12-ben.', ['A', '10―12-ben', '.']),
- ('A -23,12 van.', ['A', '-23,12', 'van', '.']),
- ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']),
- ('A -23,12-ben.', ['A', '-23,12-ben', '.']),
- ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']),
- ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']),
- ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']),
- ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']),
- ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']),
- ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']),
- ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']),
- ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']),
- ('A C++ van.', ['A', 'C++', 'van', '.']),
- ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']),
- ('A C++.', ['A', 'C++', '.']),
- ('A C++-ben.', ['A', 'C++-ben', '.']),
- ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']),
- ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']),
- ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']),
- ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']),
- ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']),
- ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']),
- ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']),
- ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']),
- ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']),
- ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']),
- ('A IV. 12.', ['A', 'IV.', '12.']),
- ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']),
- ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']),
- ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']),
- ('A 2003.01.06.', ['A', '2003.01.06.']),
- ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']),
- ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']),
- ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']),
- ('A IV.12.', ['A', 'IV.12.']),
- ('A IV.12-ben.', ['A', 'IV.12-ben', '.']),
- ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']),
- ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']),
- ('A 1.1.2.', ['A', '1.1.2.']),
- ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']),
- ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']),
- ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']),
- ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']),
- ('A 3,14 van.', ['A', '3,14', 'van', '.']),
- ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']),
- ('A 3,14-ben.', ['A', '3,14-ben', '.']),
- ('A 3.14 van.', ['A', '3.14', 'van', '.']),
- ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']),
- ('A 3.14-ben.', ['A', '3.14-ben', '.']),
- ('A 15. van.', ['A', '15.', 'van', '.']),
- ('A 15-ben van.', ['A', '15-ben', 'van', '.']),
- ('A 15-ben.', ['A', '15-ben', '.']),
- ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']),
- ('A 15.-ben.', ['A', '15.-ben', '.']),
- ('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']),
- ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']),
- ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']),
- ('A -0,99% van.', ['A', '-0,99%', 'van', '.']),
- ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']),
- ('A -0,99%.', ['A', '-0,99%', '.']),
- ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']),
- ('A 10--20% van.', ['A', '10--20%', 'van', '.']),
- ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']),
- ('A 10--20%.', ['A', '10--20%', '.']),
- ('A 10--20%-ben.', ['A', '10--20%-ben', '.']),
- ('A 99§ van.', ['A', '99§', 'van', '.']),
- ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']),
- ('A 99§-ben.', ['A', '99§-ben', '.']),
- ('A 10--20§ van.', ['A', '10--20§', 'van', '.']),
- ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']),
- ('A 10--20§-ben.', ['A', '10--20§-ben', '.']),
- ('A 99° van.', ['A', '99°', 'van', '.']),
- ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']),
- ('A 99°-ben.', ['A', '99°-ben', '.']),
- ('A 10--20° van.', ['A', '10--20°', 'van', '.']),
- ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']),
- ('A 10--20°-ben.', ['A', '10--20°-ben', '.']),
- ('A °C van.', ['A', '°C', 'van', '.']),
- ('A °C-ben van.', ['A', '°C-ben', 'van', '.']),
- ('A °C.', ['A', '°C', '.']),
- ('A °C-ben.', ['A', '°C-ben', '.']),
- ('A 100°C van.', ['A', '100°C', 'van', '.']),
- ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']),
- ('A 100°C.', ['A', '100°C', '.']),
- ('A 100°C-ben.', ['A', '100°C-ben', '.']),
- ('A 800x600 van.', ['A', '800x600', 'van', '.']),
- ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']),
- ('A 800x600-ben.', ['A', '800x600-ben', '.']),
- ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']),
- ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']),
- ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']),
- ('A 5/J van.', ['A', '5/J', 'van', '.']),
- ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']),
- ('A 5/J-ben.', ['A', '5/J-ben', '.']),
- ('A 5/J. van.', ['A', '5/J.', 'van', '.']),
- ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']),
- ('A 5/J.-ben.', ['A', '5/J.-ben', '.']),
- ('A III/1 van.', ['A', 'III/1', 'van', '.']),
- ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']),
- ('A III/1-ben.', ['A', 'III/1-ben', '.']),
- ('A III/1. van.', ['A', 'III/1.', 'van', '.']),
- ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']),
- ('A III/1.-ben.', ['A', 'III/1.-ben', '.']),
- ('A III/c van.', ['A', 'III/c', 'van', '.']),
- ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']),
- ('A III/c.', ['A', 'III/c', '.']),
- ('A III/c-ben.', ['A', 'III/c-ben', '.']),
- ('A TU–154 van.', ['A', 'TU–154', 'van', '.']),
- ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']),
- ('A TU–154-ben.', ['A', 'TU–154-ben', '.'])]
+NUMBER_TESTS = [
+ ('A 2b van.', ['A', '2b', 'van', '.']),
+ ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']),
+ ('A 2b.', ['A', '2b', '.']),
+ ('A 2b-ben.', ['A', '2b-ben', '.']),
+ ('A 3.b van.', ['A', '3.b', 'van', '.']),
+ ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']),
+ ('A 3.b.', ['A', '3.b', '.']),
+ ('A 3.b-ben.', ['A', '3.b-ben', '.']),
+ ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']),
+ ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']),
+ ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']),
+ ('A 1:35 van.', ['A', '1:35', 'van', '.']),
+ ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']),
+ ('A 1:35-ben.', ['A', '1:35-ben', '.']),
+ ('A 1.35 van.', ['A', '1.35', 'van', '.']),
+ ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']),
+ ('A 1.35-ben.', ['A', '1.35-ben', '.']),
+ ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']),
+ ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']),
+ ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']),
+ ('A 10--12 van.', ['A', '10--12', 'van', '.']),
+ ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']),
+ ('A 10--12-ben.', ['A', '10--12-ben', '.']),
+ ('A 10‐12 van.', ['A', '10‐12', 'van', '.']),
+ ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']),
+ ('A 10‐12-ben.', ['A', '10‐12-ben', '.']),
+ ('A 10‑12 van.', ['A', '10‑12', 'van', '.']),
+ ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']),
+ ('A 10‑12-ben.', ['A', '10‑12-ben', '.']),
+ ('A 10‒12 van.', ['A', '10‒12', 'van', '.']),
+ ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']),
+ ('A 10‒12-ben.', ['A', '10‒12-ben', '.']),
+ ('A 10–12 van.', ['A', '10–12', 'van', '.']),
+ ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']),
+ ('A 10–12-ben.', ['A', '10–12-ben', '.']),
+ ('A 10—12 van.', ['A', '10—12', 'van', '.']),
+ ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']),
+ ('A 10—12-ben.', ['A', '10—12-ben', '.']),
+ ('A 10―12 van.', ['A', '10―12', 'van', '.']),
+ ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']),
+ ('A 10―12-ben.', ['A', '10―12-ben', '.']),
+ ('A -23,12 van.', ['A', '-23,12', 'van', '.']),
+ ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']),
+ ('A -23,12-ben.', ['A', '-23,12-ben', '.']),
+ ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']),
+ ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']),
+ ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']),
+ ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']),
+ ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']),
+ ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']),
+ ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']),
+ ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']),
+ ('A C++ van.', ['A', 'C++', 'van', '.']),
+ ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']),
+ ('A C++.', ['A', 'C++', '.']),
+ ('A C++-ben.', ['A', 'C++-ben', '.']),
+ ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']),
+ ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']),
+ ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']),
+ ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']),
+ ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']),
+ ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']),
+ ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']),
+ ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']),
+ ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']),
+ ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']),
+ ('A IV. 12.', ['A', 'IV.', '12.']),
+ ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']),
+ ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']),
+ ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']),
+ ('A 2003.01.06.', ['A', '2003.01.06.']),
+ ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']),
+ ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']),
+ ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']),
+ ('A IV.12.', ['A', 'IV.12.']),
+ ('A IV.12-ben.', ['A', 'IV.12-ben', '.']),
+ ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']),
+ ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']),
+ ('A 1.1.2.', ['A', '1.1.2.']),
+ ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']),
+ ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']),
+ ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']),
+ ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']),
+ ('A 3,14 van.', ['A', '3,14', 'van', '.']),
+ ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']),
+ ('A 3,14-ben.', ['A', '3,14-ben', '.']),
+ ('A 3.14 van.', ['A', '3.14', 'van', '.']),
+ ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']),
+ ('A 3.14-ben.', ['A', '3.14-ben', '.']),
+ ('A 15. van.', ['A', '15.', 'van', '.']),
+ ('A 15-ben van.', ['A', '15-ben', 'van', '.']),
+ ('A 15-ben.', ['A', '15-ben', '.']),
+ ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']),
+ ('A 15.-ben.', ['A', '15.-ben', '.']),
+ ('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']),
+ ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']),
+ ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']),
+ ('A -0,99% van.', ['A', '-0,99%', 'van', '.']),
+ ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']),
+ ('A -0,99%.', ['A', '-0,99%', '.']),
+ ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']),
+ ('A 10--20% van.', ['A', '10--20%', 'van', '.']),
+ ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']),
+ ('A 10--20%.', ['A', '10--20%', '.']),
+ ('A 10--20%-ben.', ['A', '10--20%-ben', '.']),
+ ('A 99§ van.', ['A', '99§', 'van', '.']),
+ ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']),
+ ('A 99§-ben.', ['A', '99§-ben', '.']),
+ ('A 10--20§ van.', ['A', '10--20§', 'van', '.']),
+ ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']),
+ ('A 10--20§-ben.', ['A', '10--20§-ben', '.']),
+ ('A 99° van.', ['A', '99°', 'van', '.']),
+ ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']),
+ ('A 99°-ben.', ['A', '99°-ben', '.']),
+ ('A 10--20° van.', ['A', '10--20°', 'van', '.']),
+ ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']),
+ ('A 10--20°-ben.', ['A', '10--20°-ben', '.']),
+ ('A °C van.', ['A', '°C', 'van', '.']),
+ ('A °C-ben van.', ['A', '°C-ben', 'van', '.']),
+ ('A °C.', ['A', '°C', '.']),
+ ('A °C-ben.', ['A', '°C-ben', '.']),
+ ('A 100°C van.', ['A', '100°C', 'van', '.']),
+ ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']),
+ ('A 100°C.', ['A', '100°C', '.']),
+ ('A 100°C-ben.', ['A', '100°C-ben', '.']),
+ ('A 800x600 van.', ['A', '800x600', 'van', '.']),
+ ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']),
+ ('A 800x600-ben.', ['A', '800x600-ben', '.']),
+ ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']),
+ ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']),
+ ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']),
+ ('A 5/J van.', ['A', '5/J', 'van', '.']),
+ ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']),
+ ('A 5/J-ben.', ['A', '5/J-ben', '.']),
+ ('A 5/J. van.', ['A', '5/J.', 'van', '.']),
+ ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']),
+ ('A 5/J.-ben.', ['A', '5/J.-ben', '.']),
+ ('A III/1 van.', ['A', 'III/1', 'van', '.']),
+ ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']),
+ ('A III/1-ben.', ['A', 'III/1-ben', '.']),
+ ('A III/1. van.', ['A', 'III/1.', 'van', '.']),
+ ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']),
+ ('A III/1.-ben.', ['A', 'III/1.-ben', '.']),
+ ('A III/c van.', ['A', 'III/c', 'van', '.']),
+ ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']),
+ ('A III/c.', ['A', 'III/c', '.']),
+ ('A III/c-ben.', ['A', 'III/c-ben', '.']),
+ ('A TU–154 van.', ['A', 'TU–154', 'van', '.']),
+ ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']),
+ ('A TU–154-ben.', ['A', 'TU–154-ben', '.'])
+]
-_QUOTE_TESTS = [('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
- ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
- ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
- ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
- ("A don't van.", ['A', "don't", 'van', '.'])]
+QUOTE_TESTS = [
+ ('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
+ ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
+ ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
+ ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
+ ("A don't van.", ['A', "don't", 'van', '.'])
+]
-_DOT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
- ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
- ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
- ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
- ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
- ('A .hu.', ['A', '.hu', '.']),
- ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
- ('A pl.', ['A', 'pl.']),
- ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
- ('Egy..ket.', ['Egy', '..', 'ket', '.']),
- ('Valami... van.', ['Valami', '...', 'van', '.']),
- ('Valami ...van...', ['Valami', '...', 'van', '...']),
- ('Valami...', ['Valami', '...']),
- ('Valami ...', ['Valami', '...']),
- ('Valami ... más.', ['Valami', '...', 'más', '.'])]
+DOT_TESTS = [
+ ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
+ ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
+ ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
+ ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
+ ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
+ ('A .hu.', ['A', '.hu', '.']),
+ ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
+ ('A pl.', ['A', 'pl.']),
+ ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
+ ('Egy..ket.', ['Egy', '..', 'ket', '.']),
+ ('Valami... van.', ['Valami', '...', 'van', '.']),
+ ('Valami ...van...', ['Valami', '...', 'van', '...']),
+ ('Valami...', ['Valami', '...']),
+ ('Valami ...', ['Valami', '...']),
+ ('Valami ... más.', ['Valami', '...', 'más', '.'])
+]
-@pytest.fixture(scope="session")
-def HU():
- return Hungarian()
+# NOTE: NUMBER_TESTS and HYPHEN_TESTS are defined above but are not part of the
+# parametrized run. DOT_TESTS repeats DEFAULT_TESTS entry for entry, so those
+# cases are exercised twice.
+TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS
-@pytest.fixture(scope="module")
-def hu_tokenizer(HU):
- return HU.tokenizer
-
-
-@pytest.mark.parametrize(("input", "expected_tokens"),
- _DEFAULT_TESTS + _HYPHEN_TESTS + _NUMBER_TESTS + _DOT_TESTS + _QUOTE_TESTS)
-def test_testcases(hu_tokenizer, input, expected_tokens):
- tokens = hu_tokenizer(input)
- token_list = [token.orth_ for token in tokens if not token.is_space]
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
+ tokens = hu_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list
diff --git a/spacy/tests/regression/test_issue351.py b/spacy/tests/regression/test_issue351.py
new file mode 100644
index 000000000..84d4398c5
--- /dev/null
+++ b/spacy/tests/regression/test_issue351.py
@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+from ...en import English
+
+import pytest
+
+
+@pytest.fixture
+def en_tokenizer():  # no scope given, so pytest runs this fixture once per test
+ return English.Defaults.create_tokenizer()
+
+
+def test_issue351(en_tokenizer):  # regression test for issue #351 (token offsets)
+    doc = en_tokenizer("   This is a cat.")  # input starts with three spaces
+    assert doc[0].idx == 0  # first token begins at the very start of the text
+    assert len(doc[0]) == 3  # ...and covers exactly the three leading spaces
+    assert doc[1].idx == 3  # so the following token starts at offset 3
diff --git a/spacy/tests/regression/test_issue360.py b/spacy/tests/regression/test_issue360.py
new file mode 100644
index 000000000..018289030
--- /dev/null
+++ b/spacy/tests/regression/test_issue360.py
@@ -0,0 +1,14 @@
+from __future__ import unicode_literals
+from ...en import English
+
+import pytest
+
+
+@pytest.fixture
+def en_tokenizer():  # no scope given, so pytest runs this fixture once per test
+ return English.Defaults.create_tokenizer()
+
+
+def test_big_ellipsis(en_tokenizer):
+    # Regression test for issue #360: a long run of dots between two words.
+    assert len(en_tokenizer(u'$45...............Asking')) > 2
diff --git a/spacy/tests/sun.tokens b/spacy/tests/sun.tokens
deleted file mode 100644
index 4b912e18e..000000000
--- a/spacy/tests/sun.tokens
+++ /dev/null
@@ -1,4 +0,0 @@
-The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
-
-The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
-Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]
diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/tokenizer/conftest.py
index 06ccde7b3..c8e340208 100644
--- a/spacy/tests/tokenizer/conftest.py
+++ b/spacy/tests/tokenizer/conftest.py
@@ -1,7 +1,23 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
import pytest
-from spacy.en import English
+
+from ...en import English
+from ...de import German
+from ...es import Spanish
+from ...it import Italian
+from ...fr import French
+from ...pt import Portuguese
+from ...nl import Dutch
+from ...sv import Swedish
+from ...hu import Hungarian
-@pytest.fixture(scope="module")
-def en_tokenizer(EN):
- return EN.tokenizer
+# Language classes the parametrized `tokenizer` fixture is instantiated for.
+# Portuguese is imported above and belongs in this list like the others.
+LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
+             Swedish, Hungarian]
+
+
+@pytest.fixture(params=LANGUAGES)
+def tokenizer(request):
+    # request.param is one language class from LANGUAGES per parametrized run.
+    return request.param.Defaults.create_tokenizer()
diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py
deleted file mode 100644
index 568e34704..000000000
--- a/spacy/tests/tokenizer/test_contractions.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-
-
-def test_possess(en_tokenizer):
- tokens = en_tokenizer("Mike's")
- assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
- assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
- assert len(tokens) == 2
-
-
-def test_apostrophe(en_tokenizer):
- tokens = en_tokenizer("schools'")
- assert len(tokens) == 2
- assert tokens[1].orth_ == "'"
- assert tokens[0].orth_ == "schools"
-
-
-def test_LL(en_tokenizer):
- tokens = en_tokenizer("we'll")
- assert len(tokens) == 2
- assert tokens[1].orth_ == "'ll"
- assert tokens[1].lemma_ == "will"
- assert tokens[0].orth_ == "we"
-
-
-def test_aint(en_tokenizer):
- tokens = en_tokenizer("ain't")
- assert len(tokens) == 2
- assert tokens[0].orth_ == "ai"
- assert tokens[0].lemma_ == "be"
- assert tokens[1].orth_ == "n't"
- assert tokens[1].lemma_ == "not"
-
-def test_capitalized(en_tokenizer):
- tokens = en_tokenizer("can't")
- assert len(tokens) == 2
- tokens = en_tokenizer("Can't")
- assert len(tokens) == 2
- tokens = en_tokenizer("Ain't")
- assert len(tokens) == 2
- assert tokens[0].orth_ == "Ai"
- assert tokens[0].lemma_ == "be"
-
-
-def test_punct(en_tokenizer):
- tokens = en_tokenizer("We've")
- assert len(tokens) == 2
- tokens = en_tokenizer("``We've")
- assert len(tokens) == 3
-
-
-@pytest.mark.xfail
-def test_therell(en_tokenizer):
- tokens = en_tokenizer("there'll")
- assert len(tokens) == 2
- assert tokens[0].text == "there"
- assert tokens[1].text == "there"
diff --git a/spacy/tests/tokenizer/test_emoticons.py b/spacy/tests/tokenizer/test_emoticons.py
deleted file mode 100644
index e0022dbbd..000000000
--- a/spacy/tests/tokenizer/test_emoticons.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-
-
-def test_tweebo_challenge(en_tokenizer):
- text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
- tokens = en_tokenizer(text)
- assert tokens[0].orth_ == ":o"
- assert tokens[1].orth_ == ":/"
- assert tokens[2].orth_ == ":'("
- assert tokens[3].orth_ == ">:o"
- assert tokens[4].orth_ == "(:"
- assert tokens[5].orth_ == ":)"
- assert tokens[6].orth_ == ">.<"
- assert tokens[7].orth_ == "XD"
- assert tokens[8].orth_ == "-__-"
- assert tokens[9].orth_ == "o.O"
- assert tokens[10].orth_ == ";D"
- assert tokens[11].orth_ == ":-)"
- assert tokens[12].orth_ == "@_@"
- assert tokens[13].orth_ == ":P"
- assert tokens[14].orth_ == "8D"
- assert tokens[15].orth_ == ":1"
- assert tokens[16].orth_ == ">:("
- assert tokens[17].orth_ == ":D"
- assert tokens[18].orth_ == "=|"
- assert tokens[19].orth_ == '")'
- assert tokens[20].orth_ == ':>'
- assert tokens[21].orth_ == '....'
-
-
-def test_false_positive(en_tokenizer):
- text = "example:)"
- tokens = en_tokenizer(text)
- assert len(tokens) == 3
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
new file mode 100644
index 000000000..aab27714e
--- /dev/null
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -0,0 +1,30 @@
+# coding: utf-8
+"""Test that tokenizer exceptions and emoticons are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_emoticons(tokenizer):
+    # Tweebo challenge (CMU): every emoticon in the input must come back as
+    # its own token, in the order listed below.
+    text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
+    expected = [
+        ":o", ":/", ":'(", ">:o", "(:", ":)", ">.<", "XD", "-__-", "o.O", ";D",
+        ":-)", "@_@", ":P", "8D", ":1", ">:(", ":D", "=|", '")', ":>", "....",
+    ]
+    tokens = tokenizer(text)
+    for position, emoticon in enumerate(expected):
+        assert tokens[position].text == emoticon
+
+
+@pytest.mark.parametrize('text,length', [
+    ("example:)", 3),
+    ("108)", 2),
+    ("XDN", 1),
+])
+def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
+    # Emoticon look-alikes; `length` is the token count expected for `text`.
+    assert len(tokenizer(text)) == length
diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py
deleted file mode 100644
index 1b7cbaa7b..000000000
--- a/spacy/tests/tokenizer/test_infix.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import unicode_literals
-
-import pytest
-
-def test_hyphen(en_tokenizer):
- tokens = en_tokenizer('best-known')
- assert len(tokens) == 3
-
-
-def test_numeric_range(en_tokenizer):
- tokens = en_tokenizer('0.1-13.5')
- assert len(tokens) == 3
-
-def test_period(en_tokenizer):
- tokens = en_tokenizer('best.Known')
- assert len(tokens) == 3
- tokens = en_tokenizer('zombo.com')
- assert len(tokens) == 1
-
-
-def test_ellipsis(en_tokenizer):
- tokens = en_tokenizer('best...Known')
- assert len(tokens) == 3
- tokens = en_tokenizer('best...known')
- assert len(tokens) == 3
-
-def test_big_ellipsis(en_tokenizer):
- '''Test regression identified in Issue #360'''
- tokens = en_tokenizer(u'$45...............Asking')
- assert len(tokens) > 2
-
-
-
-def test_email(en_tokenizer):
- tokens = en_tokenizer('hello@example.com')
- assert len(tokens) == 1
- tokens = en_tokenizer('hi+there@gmail.it')
- assert len(tokens) == 1
-
-
-def test_double_hyphen(en_tokenizer):
- tokens = en_tokenizer(u'No decent--let alone well-bred--people.')
- assert tokens[0].text == u'No'
- assert tokens[1].text == u'decent'
- assert tokens[2].text == u'--'
- assert tokens[3].text == u'let'
- assert tokens[4].text == u'alone'
- assert tokens[5].text == u'well'
- assert tokens[6].text == u'-'
- # TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter
- # on infixes.
- assert tokens[7].text == u'bred'
- assert tokens[8].text == u'--'
- assert tokens[9].text == u'people'
-
-
-def test_infix_comma(en_tokenizer):
- # Re issue #326
- tokens = en_tokenizer(u'Hello,world')
- assert tokens[0].text == u'Hello'
- assert tokens[1].text == u','
- assert tokens[2].text == u'world'
diff --git a/spacy/tests/tokenizer/test_only_punct.py b/spacy/tests/tokenizer/test_only_punct.py
deleted file mode 100644
index 12c958088..000000000
--- a/spacy/tests/tokenizer/test_only_punct.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from __future__ import unicode_literals
-
-
-def test_only_pre1(en_tokenizer):
- assert len(en_tokenizer("(")) == 1
-
-
-def test_only_pre2(en_tokenizer):
- assert len(en_tokenizer("((")) == 2
diff --git a/spacy/tests/tokenizer/test_post_punct.py b/spacy/tests/tokenizer/test_post_punct.py
deleted file mode 100644
index ff1120c63..000000000
--- a/spacy/tests/tokenizer/test_post_punct.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-
-
-@pytest.fixture
-def close_puncts():
- return [')', ']', '}', '*']
-
-
-def test_close(close_puncts, en_tokenizer):
- word_str = 'Hello'
- for p in close_puncts:
- string = word_str + p
- tokens = en_tokenizer(string)
- assert len(tokens) == 2
- assert tokens[1].string == p
- assert tokens[0].string == word_str
-
-
-def test_two_different_close(close_puncts, en_tokenizer):
- word_str = 'Hello'
- for p in close_puncts:
- string = word_str + p + "'"
- tokens = en_tokenizer(string)
- assert len(tokens) == 3
- assert tokens[0].string == word_str
- assert tokens[1].string == p
- assert tokens[2].string == "'"
-
-
-def test_three_same_close(close_puncts, en_tokenizer):
- word_str = 'Hello'
- for p in close_puncts:
- string = word_str + p + p + p
- tokens = en_tokenizer(string)
- assert len(tokens) == 4
- assert tokens[0].string == word_str
- assert tokens[1].string == p
-
-
-def test_double_end_quote(en_tokenizer):
- assert len(en_tokenizer("Hello''")) == 2
- assert len(en_tokenizer("''")) == 1
diff --git a/spacy/tests/tokenizer/test_pre_punct.py b/spacy/tests/tokenizer/test_pre_punct.py
deleted file mode 100644
index 9aec1dc7b..000000000
--- a/spacy/tests/tokenizer/test_pre_punct.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from __future__ import unicode_literals
-
-import pytest
-
-
-@pytest.fixture
-def open_puncts():
- return ['(', '[', '{', '*']
-
-
-def test_open(open_puncts, en_tokenizer):
- word_str = 'Hello'
- for p in open_puncts:
- string = p + word_str
- tokens = en_tokenizer(string)
- assert len(tokens) == 2
- assert tokens[0].orth_ == p
- assert tokens[1].orth_ == word_str
-
-
-def test_two_different_open(open_puncts, en_tokenizer):
- word_str = 'Hello'
- for p in open_puncts:
- string = p + "`" + word_str
- tokens = en_tokenizer(string)
- assert len(tokens) == 3
- assert tokens[0].orth_ == p
- assert tokens[1].orth_ == "`"
- assert tokens[2].orth_ == word_str
-
-
-def test_three_same_open(open_puncts, en_tokenizer):
- word_str = 'Hello'
- for p in open_puncts:
- string = p + p + p + word_str
- tokens = en_tokenizer(string)
- assert len(tokens) == 4
- assert tokens[0].orth_ == p
- assert tokens[3].orth_ == word_str
-
-
-def test_open_appostrophe(en_tokenizer):
- string = "'The"
- tokens = en_tokenizer(string)
- assert len(tokens) == 2
- assert tokens[0].orth_ == "'"
diff --git a/spacy/tests/tokenizer/test_special_affix.py b/spacy/tests/tokenizer/test_special_affix.py
deleted file mode 100644
index 62cf114f1..000000000
--- a/spacy/tests/tokenizer/test_special_affix.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""Test entries in the tokenization special-case interacting with prefix
-and suffix punctuation."""
-from __future__ import unicode_literals
-import pytest
-
-
-def test_no_special(en_tokenizer):
- assert len(en_tokenizer("(can)")) == 3
-
-
-def test_no_punct(en_tokenizer):
- assert len(en_tokenizer("can't")) == 2
-
-
-def test_prefix(en_tokenizer):
- assert len(en_tokenizer("(can't")) == 3
-
-
-def test_suffix(en_tokenizer):
- assert len(en_tokenizer("can't)")) == 3
-
-
-def test_wrap(en_tokenizer):
- assert len(en_tokenizer("(can't)")) == 4
-
-
-def test_uneven_wrap(en_tokenizer):
- assert len(en_tokenizer("(can't?)")) == 5
-
-
-def test_prefix_interact(en_tokenizer):
- assert len(en_tokenizer("U.S.")) == 1
- assert len(en_tokenizer("us.")) == 2
- assert len(en_tokenizer("(U.S.")) == 2
-
-
-def test_suffix_interact(en_tokenizer):
- assert len(en_tokenizer("U.S.)")) == 2
-
-
-def test_even_wrap_interact(en_tokenizer):
- assert len(en_tokenizer("(U.S.)")) == 3
-
-
-def test_uneven_wrap_interact(en_tokenizer):
- assert len(en_tokenizer("(U.S.?)")) == 4
diff --git a/spacy/tests/tokenizer/test_string_loading.py b/spacy/tests/tokenizer/test_string_loading.py
deleted file mode 100644
index 1bc5539bc..000000000
--- a/spacy/tests/tokenizer/test_string_loading.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""Test suspected freeing of strings"""
-from __future__ import unicode_literals
-
-
-def test_one(en_tokenizer):
- tokens = en_tokenizer('Betty Botter bought a pound of butter.')
- assert tokens[0].orth_ == 'Betty'
- tokens2 = en_tokenizer('Betty also bought a pound of butter.')
- assert tokens2[0].orth_ == 'Betty'
diff --git a/spacy/tests/tokenizer/test_surround_punct.py b/spacy/tests/tokenizer/test_surround_punct.py
deleted file mode 100644
index 7c7a50904..000000000
--- a/spacy/tests/tokenizer/test_surround_punct.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-
-
-@pytest.fixture
-def paired_puncts():
- return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
-
-
-def test_token(paired_puncts, en_tokenizer):
- word_str = 'Hello'
- for open_, close_ in paired_puncts:
- string = open_ + word_str + close_
- tokens = en_tokenizer(string)
- assert len(tokens) == 3
- assert tokens[0].orth_ == open_
- assert tokens[1].orth_ == word_str
- assert tokens[2].orth_ == close_
-
-
-def test_two_different(paired_puncts, en_tokenizer):
- word_str = 'Hello'
- for open_, close_ in paired_puncts:
- string = "`" + open_ + word_str + close_ + "'"
- tokens = en_tokenizer(string)
- assert len(tokens) == 5
- assert tokens[0].orth_ == "`"
- assert tokens[1].orth_ == open_
- assert tokens[2].orth_ == word_str
- assert tokens[2].orth_ == word_str
- assert tokens[3].orth_ == close_
- assert tokens[4].orth_ == "'"
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 091561ae3..9db007d7e 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -1,172 +1,83 @@
# coding: utf-8
from __future__ import unicode_literals
+from os import path
import pytest
-import io
-import pickle
-import cloudpickle
-import tempfile
-from ... import util
-from ...language_data import TOKENIZER_PREFIXES
+from ...util import utf8open
-en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
-# @pytest.mark.xfail
-# def test_pickle(en_tokenizer):
-# file_ = io.BytesIO()
-# cloudpickle.dump(en_tokenizer, file_)
-# file_.seek(0)
-# loaded = pickle.load(file_)
-# assert loaded is not None
-
-def test_pre_punct_regex():
- string = "(can't"
- match = en_search_prefixes(string)
- assert match.group() == "("
-
-def test_no_word(en_tokenizer):
- tokens = en_tokenizer(u'')
+def test_tokenizer_handles_no_word(tokenizer):
+ tokens = tokenizer("")
assert len(tokens) == 0
-def test_single_word(en_tokenizer):
- tokens = en_tokenizer(u'hello')
- assert tokens[0].orth_ == 'hello'
+@pytest.mark.parametrize('text', ["lorem"])
+def test_tokenizer_handles_single_word(tokenizer, text):
+ tokens = tokenizer(text)
+ assert tokens[0].text == text
-def test_two_words(en_tokenizer):
- tokens = en_tokenizer('hello possums')
- assert len(tokens) == 2
- assert tokens[0].orth_ != tokens[1].orth_
-
-
-def test_punct(en_tokenizer):
- tokens = en_tokenizer('hello, possums.')
+def test_tokenizer_handles_punct(tokenizer):
+ text = "Lorem, ipsum."
+ tokens = tokenizer(text)
assert len(tokens) == 4
- assert tokens[0].orth_ == 'hello'
- assert tokens[1].orth_ == ','
- assert tokens[2].orth_ == 'possums'
- assert tokens[1].orth_ != 'hello'
+ assert tokens[0].text == "Lorem"
+ assert tokens[1].text == ","
+ assert tokens[2].text == "ipsum"
+ assert tokens[1].text != "Lorem"
-def test_digits(en_tokenizer):
- tokens = en_tokenizer('The year: 1984.')
- assert len(tokens) == 5
- assert tokens[0].orth == en_tokenizer.vocab['The'].orth
- assert tokens[3].orth == en_tokenizer.vocab['1984'].orth
+def test_tokenizer_handles_digits(tokenizer):
+ exceptions = ["hu"]
+ text = "Lorem ipsum: 1984."
+ tokens = tokenizer(text)
+
+ if tokens[0].lang_ not in exceptions:
+ assert len(tokens) == 5
+ assert tokens[0].text == "Lorem"
+ assert tokens[3].text == "1984"
-def test_contraction(en_tokenizer):
- tokens = en_tokenizer("don't giggle")
- assert len(tokens) == 3
- assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
- tokens = en_tokenizer("i said don't!")
- assert len(tokens) == 5
- assert tokens[4].orth == en_tokenizer.vocab['!'].orth
-
-def test_contraction_punct(en_tokenizer):
- tokens = [w.text for w in en_tokenizer("(can't")]
- assert tokens == ['(', 'ca', "n't"]
- tokens = en_tokenizer("`ain't")
- assert len(tokens) == 3
- tokens = en_tokenizer('''"isn't''')
- assert len(tokens) == 3
- tokens = en_tokenizer("can't!")
- assert len(tokens) == 3
+@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
+def test_tokenizer_keep_urls(tokenizer, text):
+ tokens = tokenizer(text)
+ assert len(tokens) == 1
-def test_sample(en_tokenizer):
- text = """Tributes pour in for late British Labour Party leader
+@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
+def test_tokenizer_keeps_email(tokenizer, text):
+ tokens = tokenizer(text)
+ assert len(tokens) == 1
-Tributes poured in from around the world Thursday
-to the late Labour Party leader John Smith, who died earlier from a massive
-heart attack aged 55.
-In Washington, the US State Department issued a statement regretting "the
-untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
+def test_tokenizer_handles_long_text(tokenizer):
+ text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
-"Mr. Smith, throughout his distinguished"""
+Cras egestas orci non porttitor maximus.
+Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.
- tokens = en_tokenizer(text)
+Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.
+
+"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""
+
+ tokens = tokenizer(text)
assert len(tokens) > 5
-def test_cnts1(en_tokenizer):
- text = u"""The U.S. Army likes Shock and Awe."""
- tokens = en_tokenizer(text)
- assert len(tokens) == 8
+@pytest.mark.parametrize('file_name', ["sun.txt"])
+def test_tokenizer_handle_text_from_file(tokenizer, file_name):
+ loc = path.join(path.dirname(__file__), '..', file_name)
+ text = utf8open(loc).read()
+ assert len(text) != 0
+ tokens = tokenizer(text)
+ assert len(tokens) > 100
-def test_cnts2(en_tokenizer):
- text = u"""U.N. regulations are not a part of their concern."""
- tokens = en_tokenizer(text)
- assert len(tokens) == 10
-
-
-def test_cnts3(en_tokenizer):
- text = u"“Isn't it?”"
- tokens = en_tokenizer(text)
- words = [t.orth_ for t in tokens]
- assert len(words) == 6
-
-
-def test_cnts4(en_tokenizer):
- text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
- tokens = en_tokenizer(text)
- words = [t.orth_ for t in tokens]
- assert len(words) == 15
-
-
-def test_cnts5(en_tokenizer):
- text = """'Me too!', Mr. P. Delaware cried. """
- tokens = en_tokenizer(text)
- assert len(tokens) == 11
-
-
-@pytest.mark.xfail
-def test_mr(en_tokenizer):
- text = """Today is Tuesday.Mr."""
- tokens = en_tokenizer(text)
- assert len(tokens) == 5
- assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
-
-
-def test_cnts6(en_tokenizer):
- text = u'They ran about 10km.'
- tokens = en_tokenizer(text)
- words = [t.orth_ for t in tokens]
- assert len(words) == 6
-
-def test_bracket_period(en_tokenizer):
- text = u'(And a 6a.m. run through Washington Park).'
- tokens = en_tokenizer(text)
- assert tokens[len(tokens) - 1].orth_ == u'.'
-
-
-def test_ie(en_tokenizer):
- text = u"It's mediocre i.e. bad."
- tokens = en_tokenizer(text)
- assert len(tokens) == 6
- assert tokens[3].orth_ == "i.e."
-
-
-def test_two_whitespace(en_tokenizer):
- orig_str = u'there are 2 spaces after this  '
- tokens = en_tokenizer(orig_str)
- assert repr(tokens.text_with_ws) == repr(orig_str)
-
-
-@pytest.mark.xfail
-def test_em_dash_infix(en_tokenizer):
- # Re Issue #225
- tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, '''
- '''you'll have to walk there.\u2014Ariel.''')
- assert tokens[6].text == 'Puddleton'
- assert tokens[7].text == '?'
- assert tokens[8].text == '\u2014'
-
-#def test_cnts7():
-# text = 'But then the 6,000-year ice age came...'
-# tokens = EN.tokenize(text)
-# assert len(tokens) == 10
+def test_tokenizer_suspected_freeing_strings(tokenizer):
+ text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
+ text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+ tokens1 = tokenizer(text1)
+ tokens2 = tokenizer(text2)
+ assert tokens1[0].text == "Lorem"
+ assert tokens2[0].text == "Lorem"
diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py
index ad34c8791..7ff3106a8 100644
--- a/spacy/tests/tokenizer/test_whitespace.py
+++ b/spacy/tests/tokenizer/test_whitespace.py
@@ -1,67 +1,51 @@
+# coding: utf-8
"""Test that tokens are created correctly for whitespace."""
+
+
from __future__ import unicode_literals
import pytest
-def test_single_space(en_tokenizer):
- tokens = en_tokenizer('hello possums')
+@pytest.mark.parametrize('text', ["lorem ipsum"])
+def test_tokenizer_splits_single_space(tokenizer, text):
+ tokens = tokenizer(text)
assert len(tokens) == 2
-def test_double_space(en_tokenizer):
- tokens = en_tokenizer('hello  possums')
+@pytest.mark.parametrize('text', ["lorem  ipsum"])
+def test_tokenizer_splits_double_space(tokenizer, text):
+ tokens = tokenizer(text)
assert len(tokens) == 3
- assert tokens[1].orth_ == ' '
+ assert tokens[1].text == " "
-def test_newline(en_tokenizer):
- tokens = en_tokenizer('hello\npossums')
+@pytest.mark.parametrize('text', ["lorem ipsum  "])
+def test_tokenizer_handles_double_trainling_ws(tokenizer, text):
+ tokens = tokenizer(text)
+ assert repr(tokens.text_with_ws) == repr(text)
+
+
+@pytest.mark.parametrize('text', ["lorem\nipsum"])
+def test_tokenizer_splits_newline(tokenizer, text):
+ tokens = tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[1].text == "\n"
+
+
+@pytest.mark.parametrize('text', ["lorem \nipsum"])
+def test_tokenizer_splits_newline_space(tokenizer, text):
+ tokens = tokenizer(text)
assert len(tokens) == 3
-def test_newline_space(en_tokenizer):
- tokens = en_tokenizer('hello \npossums')
+@pytest.mark.parametrize('text', ["lorem  \nipsum"])
+def test_tokenizer_splits_newline_double_space(tokenizer, text):
+ tokens = tokenizer(text)
assert len(tokens) == 3
-def test_newline_double_space(en_tokenizer):
- tokens = en_tokenizer('hello  \npossums')
+@pytest.mark.parametrize('text', ["lorem \n ipsum"])
+def test_tokenizer_splits_newline_space_wrap(tokenizer, text):
+ tokens = tokenizer(text)
assert len(tokens) == 3
-
-
-def test_newline_space_wrap(en_tokenizer):
- tokens = en_tokenizer('hello \n possums')
- assert len(tokens) == 3
-
-
-def test_leading_space_offsets(en_tokenizer):
- '''Issue #351
- # this works
-
- text1 = u"This is a cat."
- a = english_spacy(text1)
-
- tok0 = list(a.sents)[0][0]
- print tok0, tok0.idx, text1[tok0.idx]
-
- tok1 = list(a.sents)[0][1]
- print tok1, tok1.idx, text1[tok1.idx]
-
- print "=="
-
- # this does not work
-
- text2 = u" This is a cat."
- b = english_spacy(text2)
-
- tok0 = list(b.sents)[0][0]
-print tok0, tok0.idx, text2[tok0.idx]
-
- tok1 = list(b.sents)[0][1]
- print tok1, tok1.idx, text2[tok1.idx]
- '''
- doc = en_tokenizer(u" This is a cat.")
- assert doc[0].idx == 0
- assert len(doc[0]) == 3
- assert doc[1].idx == 3
diff --git a/spacy/tests/tokenizer/test_wiki_sun.py b/spacy/tests/tokenizer/test_wiki_sun.py
deleted file mode 100644
index 8d2a6682e..000000000
--- a/spacy/tests/tokenizer/test_wiki_sun.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from __future__ import unicode_literals
-
-from spacy.util import utf8open
-
-import pytest
-from os import path
-
-
-HERE = path.dirname(__file__)
-
-
-@pytest.fixture
-def sun_txt():
- loc = path.join(HERE, '..', 'sun.txt')
- return utf8open(loc).read()
-
-
-def test_tokenize(sun_txt, en_tokenizer):
- assert len(sun_txt) != 0
- tokens = en_tokenizer(sun_txt)
- assert len(tokens) > 100
diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py
deleted file mode 100644
index 4f533ae76..000000000
--- a/spacy/tests/website/conftest.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-import os
-
-
-@pytest.fixture(scope='session')
-def nlp():
- from spacy.en import English
- if os.environ.get('SPACY_DATA'):
- data_dir = os.environ.get('SPACY_DATA')
- else:
- data_dir = True
- return English(path=data_dir)
-
-
-@pytest.fixture()
-def doc(nlp):
- for word in ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']:
- _ = nlp.vocab[word]
- return nlp('Hello, world. Here are two sentences.')
diff --git a/spacy/tests/website/test_api.py b/spacy/tests/website/test_api.py
deleted file mode 100644
index 6a7379d87..000000000
--- a/spacy/tests/website/test_api.py
+++ /dev/null
@@ -1,172 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-from spacy.attrs import HEAD
-import numpy
-
-
-@pytest.mark.xfail
-def test_example_war_and_peace(nlp):
- # from spacy.en import English
- from spacy._doc_examples import download_war_and_peace
-
- unprocessed_unicode = download_war_and_peace()
-
- # nlp = English()
- # TODO: ImportError: No module named _doc_examples
- doc = nlp(unprocessed_unicode)
-
-
-def test_main_entry_point(nlp):
- # from spacy.en import English
- # nlp = English()
- doc = nlp('Some text.') # Applies tagger, parser, entity
- doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser
- doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity
- doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser
- doc = nlp('') # Zero-length tokens, not an error
- # doc = nlp(b'Some text') <-- Error: need unicode
- doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
-
-
-@pytest.mark.models
-def test_sentence_spans(nlp):
- # from spacy.en import English
- # nlp = English()
- doc = nlp("This is a sentence. Here's another...")
- assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
-
-
-@pytest.mark.models
-def test_entity_spans(nlp):
- # from spacy.en import English
- # nlp = English()
- tokens = nlp('Mr. Best flew to New York on Saturday morning.')
- ents = list(tokens.ents)
- assert ents[0].label == 346
- assert ents[0].label_ == 'PERSON'
- assert ents[0].orth_ == 'Best'
- assert ents[0].string == ents[0].string
-
-
-@pytest.mark.models
-def test_noun_chunk_spans(nlp):
- # from spacy.en import English
- # nlp = English()
- doc = nlp('The sentence in this example has three noun chunks.')
- for chunk in doc.noun_chunks:
- print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
-
- # NP The sentence <-- has
- # NP this example <-- in
- # NP three noun chunks <-- has
-
-
-@pytest.mark.models
-def test_count_by(nlp):
- # from spacy.en import English, attrs
- # nlp = English()
- import numpy
- from spacy import attrs
- tokens = nlp('apple apple orange banana')
- assert tokens.count_by(attrs.ORTH) == {3699: 2, 3750: 1, 5965: 1}
- assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[3699],
- [3699],
- [3750],
- [5965]], dtype=numpy.int32))
-
-@pytest.mark.models
-def test_read_bytes(nlp):
- from spacy.tokens.doc import Doc
- loc = 'test_serialize.bin'
- with open(loc, 'wb') as file_:
- file_.write(nlp(u'This is a document.').to_bytes())
- file_.write(nlp(u'This is another.').to_bytes())
- docs = []
- with open(loc, 'rb') as file_:
- for byte_string in Doc.read_bytes(file_):
- docs.append(Doc(nlp.vocab).from_bytes(byte_string))
- assert len(docs) == 2
-
-
-def test_token_span(doc):
- span = doc[4:6]
- token = span[0]
- assert token.i == 4
-
-
-@pytest.mark.models
-def test_example_i_like_new_york1(nlp):
- toks = nlp('I like New York in Autumn.')
-
-
-@pytest.fixture
-def toks(nlp):
- doc = nlp('I like New York in Autumn.')
- doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
- return doc
-
-
-def test_example_i_like_new_york2(toks):
- i, like, new, york, in_, autumn, dot = range(len(toks))
-
-
-@pytest.fixture
-def tok(toks, tok):
- i, like, new, york, in_, autumn, dot = range(len(toks))
- return locals()[tok]
-
-
-@pytest.fixture
-def new(toks):
- return tok(toks, "new")
-
-
-@pytest.fixture
-def york(toks):
- return tok(toks, "york")
-
-
-@pytest.fixture
-def autumn(toks):
- return tok(toks, "autumn")
-
-
-@pytest.fixture
-def dot(toks):
- return tok(toks, "dot")
-
-
-def test_example_i_like_new_york3(toks, new, york):
- assert toks[new].head.orth_ == 'York'
- assert toks[york].head.orth_ == 'like'
-
-
-def test_example_i_like_new_york4(toks, new, york):
- new_york = toks[new:york+1]
- assert new_york.root.orth_ == 'York'
-
-
-def test_example_i_like_new_york5(toks, autumn, dot):
- assert toks[autumn].head.orth_ == 'in'
- assert toks[dot].head.orth_ == 'like'
- autumn_dot = toks[autumn:]
- assert autumn_dot.root.orth_ == 'Autumn'
-
-
-def test_navigating_the_parse_tree_lefts(doc):
- # TODO: where does the span object come from?
- span = doc[:2]
- lefts = [span.doc[i] for i in range(0, span.start)
- if span.doc[i].head in span]
-
-
-def test_navigating_the_parse_tree_rights(doc):
- span = doc[:2]
- rights = [span.doc[i] for i in range(span.end, len(span.doc))
- if span.doc[i].head in span]
-
-
-def test_string_store(doc):
- string_store = doc.vocab.strings
- for i, string in enumerate(string_store):
- assert i == string_store[string]
diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py
deleted file mode 100644
index 95c0ec3bb..000000000
--- a/spacy/tests/website/test_home.py
+++ /dev/null
@@ -1,180 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-import spacy
-import os
-
-
-try:
- xrange
-except NameError:
- xrange = range
-
-
-@pytest.fixture()
-def token(doc):
- return doc[0]
-
-
-@pytest.mark.models
-def test_load_resources_and_process_text():
- from spacy.en import English
- nlp = English()
- doc = nlp(u'Hello, world. Here are two sentences.')
-
-
-@pytest.mark.models
-def test_get_tokens_and_sentences(doc):
- token = doc[0]
- sentence = next(doc.sents)
- assert token is sentence[0]
- assert sentence.text == 'Hello, world.'
-
-
-@pytest.mark.models
-def test_use_integer_ids_for_any_strings(nlp, token):
- hello_id = nlp.vocab.strings['Hello']
- hello_str = nlp.vocab.strings[hello_id]
-
- assert token.orth == hello_id == 3125
- assert token.orth_ == hello_str == 'Hello'
-
-
-def test_get_and_set_string_views_and_flags(nlp, token):
- assert token.shape_ == 'Xxxxx'
- for lexeme in nlp.vocab:
- if lexeme.is_alpha:
- lexeme.shape_ = 'W'
- elif lexeme.is_digit:
- lexeme.shape_ = 'D'
- elif lexeme.is_punct:
- lexeme.shape_ = 'P'
- else:
- lexeme.shape_ = 'M'
- assert token.shape_ == 'W'
-
-
-def test_export_to_numpy_arrays(nlp, doc):
- from spacy.attrs import ORTH, LIKE_URL, IS_OOV
-
- attr_ids = [ORTH, LIKE_URL, IS_OOV]
- doc_array = doc.to_array(attr_ids)
- assert doc_array.shape == (len(doc), len(attr_ids))
- assert doc[0].orth == doc_array[0, 0]
- assert doc[1].orth == doc_array[1, 0]
- assert doc[0].like_url == doc_array[0, 1]
- assert list(doc_array[:, 1]) == [t.like_url for t in doc]
-
-
-@pytest.mark.models
-def test_word_vectors(nlp):
- doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
-
- apples = doc[0]
- oranges = doc[2]
- boots = doc[6]
- hippos = doc[8]
-
- assert apples.similarity(oranges) > boots.similarity(hippos)
-
-
-@pytest.mark.models
-def test_part_of_speech_tags(nlp):
- from spacy.parts_of_speech import ADV
-
- def is_adverb(token):
- return token.pos == spacy.parts_of_speech.ADV
-
- # These are data-specific, so no constants are provided. You have to look
- # up the IDs from the StringStore.
- NNS = nlp.vocab.strings['NNS']
- NNPS = nlp.vocab.strings['NNPS']
- def is_plural_noun(token):
- return token.tag == NNS or token.tag == NNPS
-
- def print_coarse_pos(token):
- print(token.pos_)
-
- def print_fine_pos(token):
- print(token.tag_)
-
-
-@pytest.mark.models
-def test_syntactic_dependencies():
- def dependency_labels_to_root(token):
- '''Walk up the syntactic tree, collecting the arc labels.'''
- dep_labels = []
- while token.head is not token:
- dep_labels.append(token.dep)
- token = token.head
- return dep_labels
-
-
-@pytest.mark.models
-def test_named_entities():
- def iter_products(docs):
- for doc in docs:
- for ent in doc.ents:
- if ent.label_ == 'PRODUCT':
- yield ent
-
- def word_is_in_entity(word):
- return word.ent_type != 0
-
- def count_parent_verb_by_person(docs):
- counts = defaultdict(defaultdict(int))
- for doc in docs:
- for ent in doc.ents:
- if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
- counts[ent.orth_][ent.root.head.lemma_] += 1
- return counts
-
-
-def test_calculate_inline_mark_up_on_original_string():
- def put_spans_around_tokens(doc, get_classes):
- '''Given some function to compute class names, put each token in a
- span element, with the appropriate classes computed.
-
- All whitespace is preserved, outside of the spans. (Yes, I know HTML
- won't display it. But the point is no information is lost, so you can
- calculate what you need, e.g. <br /> tags, <p> tags, etc.)
- '''
- output = []
- template = '<span classes="{classes}">{word}</span>{space}'
- for token in doc:
- if token.is_space:
- output.append(token.orth_)
- else:
- output.append(
- template.format(
- classes=' '.join(get_classes(token)),
- word=token.orth_,
- space=token.whitespace_))
- string = ''.join(output)
- string = string.replace('\n', '')
- string = string.replace('\t', ' ')
- return string
-
-
-@pytest.mark.models
-def test_efficient_binary_serialization(doc):
- from spacy.tokens.doc import Doc
-
- byte_string = doc.to_bytes()
- open('moby_dick.bin', 'wb').write(byte_string)
-
- nlp = spacy.en.English()
- for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
- doc = Doc(nlp.vocab)
- doc.from_bytes(byte_string)
-
-
-@pytest.mark.models
-def test_multithreading(nlp):
- texts = [u'One document.', u'...', u'Lots of documents']
- # .pipe streams input, and produces streaming output
- iter_texts = (texts[i % 3] for i in xrange(100000000))
- for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
- assert doc.is_parsed
- if i == 100:
- break
-
diff --git a/spacy/util.py b/spacy/util.py
index afed4142e..457534302 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -94,8 +94,13 @@ def read_regex(path):
def compile_prefix_regex(entries):
- expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
- return re.compile(expression)
+ if '(' in entries:
+ # Handle deprecated data
+ expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
+ return re.compile(expression)
+ else:
+ expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
+ return re.compile(expression)
def compile_suffix_regex(entries):
diff --git a/website/_harp.json b/website/_harp.json
index bc8cf4d84..04a66f772 100644
--- a/website/_harp.json
+++ b/website/_harp.json
@@ -22,7 +22,8 @@
"twitter": "spacy_io",
"github": "explosion",
"reddit": "spacynlp",
- "codepen": "explosion"
+ "codepen": "explosion",
+ "gitter": "explosion/spaCy"
},
"NAVIGATION": {
@@ -53,7 +54,7 @@
}
},
- "V_CSS": "1.10",
+ "V_CSS": "1.14",
"V_JS": "1.0",
"DEFAULT_SYNTAX" : "python",
"ANALYTICS": "UA-58931649-1",
diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade
index 27f195690..bc8b85557 100644
--- a/website/_includes/_mixins-base.jade
+++ b/website/_includes/_mixins-base.jade
@@ -1,6 +1,7 @@
//- 💫 MIXINS > BASE
//- Aside wrapper
+ label - [string] aside label
mixin aside-wrapper(label)
aside.c-aside
@@ -21,6 +22,10 @@ mixin date(input)
//- SVG from map
+ file - [string] SVG file name in /assets/img/
+ name - [string] SVG symbol id
+ width - [integer] width in px
+ height - [integer] height in px (default: same as width)
mixin svg(file, name, width, height)
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
@@ -28,19 +33,23 @@ mixin svg(file, name, width, height)
//- Icon
+ name - [string] icon name, should be SVG symbol ID
+ size - [integer] icon width and height (default: 20)
mixin icon(name, size)
- +svg("icons", "icon-" + name, size || 20).o-icon&attributes(attributes)
+ +svg("icons", name, size || 20).o-icon&attributes(attributes)
//- Pro/Con/Neutral icon
+ icon - [string] "pro", "con" or "neutral" (default: "neutral")
mixin procon(icon)
- - colors = { pro: "green", con: "red" }
+ - colors = { pro: "green", con: "red", neutral: "yellow" }
+icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
//- Headlines Helper Mixin
+ level - [integer] 1, 2, 3, 4, or 5
mixin headline(level)
if level == 1
@@ -65,6 +74,7 @@ mixin headline(level)
//- Permalink rendering
+ id - [string] permalink ID used for link anchor
mixin permalink(id)
if id
@@ -77,6 +87,7 @@ mixin permalink(id)
//- Terminal-style code window
+ label - [string] title displayed in top bar of terminal window
mixin terminal(label)
.x-terminal
@@ -87,6 +98,18 @@ mixin terminal(label)
block
+//- Gitter chat button and widget
+ button - [string] text shown on button
+ label - [string] title of chat window (default: same as button)
+
+mixin gitter(button, label)
+ aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
+
+ button.js-gitter-button.c-chat__button.u-text-small
+ +icon("chat").o-icon--inline
+ !=button
+
+
//- Logo
mixin logo()
diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade
index 8fe24b11b..8a42024c1 100644
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@@ -44,7 +44,7 @@ mixin api(path)
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
block
- | #[+icon("book", 18).o-icon--inline.u-help.u-color-subtle]
+ | #[+icon("book", 18).o-icon--inline.u-color-subtle]
//- Aside for text
diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade
index 09cbfa6a5..72db134cd 100644
--- a/website/_includes/_page-docs.jade
+++ b/website/_includes/_page-docs.jade
@@ -24,4 +24,6 @@ main.o-main.o-main--sidebar.o-main--aside
.o-inline-list
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)]
+ +gitter("spaCy chat")
+
include _footer
diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade
new file mode 100644
index 000000000..544cf0977
--- /dev/null
+++ b/website/_includes/_scripts.jade
@@ -0,0 +1,23 @@
+//- 💫 INCLUDES > SCRIPTS
+
+script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
+script(src="/assets/js/prism.js", type="text/javascript")
+
+if SECTION == "docs"
+ script.
+ ((window.gitter = {}).chat = {}).options = {
+ useStyles: false,
+ activationElement: '.js-gitter-button',
+ targetElement: '.js-gitter',
+ room: '!{SOCIAL.gitter}'
+ };
+
+ script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
+
+if environment == "deploy"
+ script
+ | window.ga=window.ga||function(){
+ | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
+ | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
+
+ script(async src="https://www.google-analytics.com/analytics.js")
diff --git a/website/_layout.jade b/website/_layout.jade
index b04c4b5f3..d5c52df3f 100644
--- a/website/_layout.jade
+++ b/website/_layout.jade
@@ -52,13 +52,4 @@ html(lang="en")
main!=yield
include _includes/_footer
- script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
- script(src="/assets/js/prism.js", type="text/javascript")
-
- if environment == "deploy"
- script
- | window.ga=window.ga||function(){
- | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
- | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
-
- script(async src="https://www.google-analytics.com/analytics.js")
+ include _includes/_scripts
diff --git a/website/assets/css/_base/_fonts.sass b/website/assets/css/_base/_fonts.sass
index 72aaf97f8..be113798c 100644
--- a/website/assets/css/_base/_fonts.sass
+++ b/website/assets/css/_base/_fonts.sass
@@ -6,36 +6,36 @@
font-family: "Source Sans Pro"
font-style: normal
font-weight: 400
- src: url("../fonts/sourcesanspro-regular.eot")
- src: url("../fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-regular.woff2") format("woff2"), url("../fonts/sourcesanspro-regular.woff") format("woff"), url("../fonts/sourcesanspro-regular.ttf") format("truetype"), url("../fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg")
+ src: url("/assets/fonts/sourcesanspro-regular.eot")
+ src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg")
@font-face
font-family: "Source Sans Pro"
font-style: italic
font-weight: 400
- src: url("../fonts/sourcesanspro-italic.eot")
- src: url("../fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-italic.woff2") format("woff2"), url("../fonts/sourcesanspro-italic.woff") format("woff"), url("../fonts/sourcesanspro-italic.ttf") format("truetype"), url("../fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg")
+ src: url("/assets/fonts/sourcesanspro-italic.eot")
+ src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg")
@font-face
font-family: "Source Sans Pro"
font-style: normal
font-weight: 700
- src: url("../fonts/sourcesanspro-bold.eot")
- src: url("../fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bold.woff2") format("woff2"), url("../fonts/sourcesanspro-bold.woff") format("woff"), url("../fonts/sourcesanspro-bold.ttf") format("truetype"), url("../fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg")
+ src: url("/assets/fonts/sourcesanspro-bold.eot")
+ src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg")
@font-face
font-family: "Source Sans Pro"
font-style: italic
font-weight: 700
- src: url("../fonts/sourcesanspro-bolditalic.eot")
- src: url("../fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("../fonts/sourcesanspro-bolditalic.woff") format("woff"), url("../fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("../fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg")
+ src: url("/assets/fonts/sourcesanspro-bolditalic.eot")
+ src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg")
// Source Code Pro
@font-face
- font-family: "Source Code Pro"
- font-style: normal
- font-weight: 600
- src: url("../fonts/sourcecodepro-semibold.eot")
- src: url("../fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcecodepro-semibold.woff") format("woff"), url("../fonts/sourcecodepro-semibold.ttf") format("truetype"), url("../fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg")
+ font-family: "Source Code Pro"
+ font-style: normal
+ font-weight: 600
+ src: url("/assets/fonts/sourcecodepro-semibold.eot")
+ src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg")
diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass
index 2b037dca7..7aaaef787 100644
--- a/website/assets/css/_base/_objects.sass
+++ b/website/assets/css/_base/_objects.sass
@@ -60,7 +60,7 @@
background: $color-back
border-radius: 2px
border: 1px solid $color-subtle
- padding: 3.5% 2.5%
+ padding: 3rem 2.5%
//- Icons
diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass
index 95be81bcd..2c40858a8 100644
--- a/website/assets/css/_base/_utilities.sass
+++ b/website/assets/css/_base/_utilities.sass
@@ -141,12 +141,6 @@
background: $pattern
-//- Cursors
-
-.u-help
- cursor: help
-
-
//- Hidden elements
.u-hidden
diff --git a/website/assets/css/_components/_chat.sass b/website/assets/css/_components/_chat.sass
new file mode 100644
index 000000000..2a1e5cc3d
--- /dev/null
+++ b/website/assets/css/_components/_chat.sass
@@ -0,0 +1,100 @@
+//- 💫 CSS > COMPONENTS > CHAT
+
+.c-chat
+ @include position(fixed, top, left, 0, 60%)
+ bottom: 0
+ right: 0
+ display: flex
+ flex-flow: column nowrap
+ background: $color-back
+ transition: transform 0.3s cubic-bezier(0.16, 0.22, 0.22, 1.7)
+ box-shadow: -0.25rem 0 1rem 0 rgba($color-front, 0.25)
+ z-index: 100
+
+ @include breakpoint(min, md)
+ left: calc(100% - #{$aside-width} - #{$aside-padding})
+
+ @include breakpoint(max, sm)
+ left: 50%
+
+ @include breakpoint(max, xs)
+ left: 0
+
+ &.is-collapsed:not(.is-loading)
+ transform: translateX(110%)
+
+ &:before
+ @include position(absolute, top, left, 1rem, 2rem)
+ content: attr(data-title)
+ font: bold 1.4rem $font-code
+ text-transform: uppercase
+ color: $color-back
+
+ &:after
+ @include position(absolute, top, left, 0, 100%)
+ content: ""
+ z-index: -1
+ bottom: 0
+ right: -100%
+ background: $color-back
+
+ & > iframe
+ width: 100%
+ flex: 1 1 calc(100% - #{$nav-height})
+ border: 0
+
+ .gitter-chat-embed-loading-wrapper
+ @include position(absolute, top, left, 0, 0)
+ right: 0
+ bottom: 0
+ display: none
+ justify-content: center
+ align-items: center
+
+ .is-loading &
+ display: flex
+
+ .gitter-chat-embed-action-bar,
+ .gitter-chat-embed-action-bar-item
+ display: flex
+
+ .gitter-chat-embed-action-bar
+ align-items: center
+ justify-content: flex-end
+ background: $color-theme
+ padding: 0 1rem 0 2rem
+ flex: 0 0 $nav-height
+
+ .gitter-chat-embed-action-bar-item
+ @include size(40px)
+ padding: 0
+ opacity: 0.75
+ background-position: 50%
+ background-repeat: no-repeat
+ background-size: 22px 22px
+ border: 0
+ cursor: pointer
+ transition: all 0.2s ease
+
+ &:focus,
+ &:hover
+ opacity: 1
+
+ &.gitter-chat-embed-action-bar-item-pop-out
+ background-image: url()
+ margin-right: -4px
+
+ &.gitter-chat-embed-action-bar-item-collapse-chat
+ background-image: url()
+
+.c-chat__button
+ @include position(fixed, bottom, right, 0, 2rem)
+ padding: 1rem 1.5rem
+ background: $color-front
+ color: $color-back
+ border-top-left-radius: 4px
+ border-top-right-radius: 4px
+ z-index: 20
+ border-color: $color-theme
+ border-style: solid
+ border-width: 1px 1px 0 1px
diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass
index 5ab135ab9..a8d2edad4 100644
--- a/website/assets/css/style.sass
+++ b/website/assets/css/style.sass
@@ -24,6 +24,7 @@ $theme: blue !default
@import _components/asides
@import _components/buttons
+@import _components/chat
@import _components/code
@import _components/landing
@import _components/lists
diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg
index 23036f4ca..dc69deda4 100644
--- a/website/assets/img/graphics.svg
+++ b/website/assets/img/graphics.svg
@@ -64,5 +64,6 @@
matt-signature
+
diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg
index 9237c9994..224224084 100644
--- a/website/assets/img/icons.svg
+++ b/website/assets/img/icons.svg
@@ -1,32 +1,28 @@
diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade
index 20995df2e..24f3d4458 100644
--- a/website/docs/api/index.jade
+++ b/website/docs/api/index.jade
@@ -23,7 +23,7 @@ p
+row
+cell Multi-language support
- each icon in [ "con", "pro", "pro", "pro" ]
+ each icon in [ "neutral", "pro", "pro", "pro" ]
+cell.u-text-center #[+procon(icon)]
+row
diff --git a/website/docs/index.jade b/website/docs/index.jade
index d2949b8c4..c19602002 100644
--- a/website/docs/index.jade
+++ b/website/docs/index.jade
@@ -2,8 +2,6 @@
include ../_includes/_mixins
-p=lorem_short
-
+aside("Help us improve the docs")
| Did you spot a mistake or come across explanations that
| are unclear? You can find a "Suggest edits" button at the
diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade
index 4b62a290b..a96df5694 100644
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/docs/usage/entity-recognition.jade
@@ -57,7 +57,7 @@ p
doc.ents = [Span(0, 1, label='GPE')]
assert doc[0].ent_type_ == 'GPE'
doc.ents = []
- doc.ents = [(u'LondonCity', 0, 1, u'GPE')]
+ doc.ents = [(u'LondonCity', u'GPE', 0, 1)]
p
| The value you assign should be a sequence, the values of which
diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade
index a09c7358d..2b80ebe48 100644
--- a/website/docs/usage/resources.jade
+++ b/website/docs/usage/resources.jade
@@ -30,6 +30,13 @@ p Many of the associated tools and resources that we're developing alongside spa
+cell
| REST microservices for spaCy demos and visualisers.
+ +row
+ +cell
+ +src(gh("spacy-notebooks")) spaCy Notebooks
+
+ +cell
+ | Jupyter notebooks for spaCy examples and tutorials.
+
+h(2, "libraries") Libraries and projects
+table(["Name", "Description"])
+row
diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade
index bedadb0d3..fde9ee4d7 100644
--- a/website/docs/usage/rule-based-matching.jade
+++ b/website/docs/usage/rule-based-matching.jade
@@ -141,7 +141,7 @@ p
span.merge(label=label, tag='NNP' if label else span.root.tag_)
matcher.add_entity('GoogleNow', on_match=merge_phrases)
- matcher.add_pattern('GoogleNow', {ORTH: 'Google'}, {ORTH: 'Now'}])
+ matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
matcher(doc)
print([w.text for w in doc])