Merge branch 'master' of ssh://github.com/explosion/spaCy

This commit is contained in:
Matthew Honnibal 2017-01-09 13:21:56 +01:00
commit 3eb6a929f3
84 changed files with 1867 additions and 3255 deletions


@ -33,6 +33,7 @@ We use the following system to tag our issues:
 | [`install`](https://github.com/explosion/spaCy/labels/install) | Installation problems |
 | [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems |
 | [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) |
+| [`english`](https://github.com/explosion/spaCy/labels/english), [`german`](https://github.com/explosion/spaCy/labels/german) | Issues related to the specific languages, models and data |
 | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems |
 | [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers |
 | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before |


@ -3,8 +3,10 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
-It was designed from day 1 to be used in real products. It's commercial
-open-source software, released under the MIT license.
+It was designed from day one to be used in real products. spaCy currently supports
+English and German, as well as tokenization for Chinese, Spanish, Italian, French,
+Portuguese, Dutch, Swedish and Hungarian. It's commercial open-source software,
+released under the MIT license.
 
 💫 **Version 1.5 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
 
@ -24,7 +26,7 @@ open-source software, released under the MIT license.
     :target: https://pypi.python.org/pypi/spacy
     :alt: pypi Version
 
-.. image:: https://badges.gitter.im/spaCy-users.png
+.. image:: https://badges.gitter.im/explosion.png
     :target: https://gitter.im/explosion/spaCy
     :alt: spaCy on Gitter
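(Editorial illustration, not part of the diff: what the updated README paragraph means in terms of the spaCy 1.x API of this commit. The load call assumes an installed English model; the Hungarian lines mirror the `create_tokenizer()` fixtures added in the new tests further down.)

```python
# Sketch against the spaCy 1.x API; assumes an English model is installed.
import spacy

nlp = spacy.load('en')                       # full English pipeline
doc = nlp(u'This is a sentence.')
print([token.text for token in doc])

# Languages with tokenizer-only support can be driven the way the new tests do:
from spacy.hu import Hungarian
hu_tokenizer = Hungarian.Defaults.create_tokenizer()
print([token.text for token in hu_tokenizer(u'A pl. rovidites.')])
```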


@ -71,6 +71,8 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
     features = get_templates('basic')
     model_dir = pathlib.Path(model_dir)
+    if not (model_dir / 'deps').exists():
+        (model_dir / 'deps').mkdir()
     with (model_dir / 'deps' / 'config.json').open('w') as file_:
         json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)
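(Editorial aside, not part of the diff: on Python 3.5+ the new two-line guard could be collapsed into a single call; the exists()/mkdir() pair used here also works on Python 2.)

```python
# Equivalent on Python 3.5+ only; a hypothetical alternative, not what the commit does.
(model_dir / 'deps').mkdir(parents=True, exist_ok=True)
```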


@ -47,8 +47,7 @@ PACKAGES = [
     'spacy.tests.tokenizer',
     'spacy.tests.tokens',
     'spacy.tests.vectors',
-    'spacy.tests.vocab',
-    'spacy.tests.website']
+    'spacy.tests.vocab']
 
 MOD_NAMES = [


@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = {
ORTH_ONLY = [ ORTH_ONLY = [
"'",
"\\\")",
"<space>",
"a.",
"ä.",
"A.C.", "A.C.",
"a.D.", "a.D.",
"A.D.", "A.D.",
@ -530,24 +525,20 @@ ORTH_ONLY = [
"Abs.", "Abs.",
"adv.", "adv.",
"al.", "al.",
"b.",
"B.A.", "B.A.",
"B.Sc.", "B.Sc.",
"betr.", "betr.",
"biol.", "biol.",
"Biol.", "Biol.",
"c.",
"ca.", "ca.",
"Chr.", "Chr.",
"Cie.", "Cie.",
"co.", "co.",
"Co.", "Co.",
"d.",
"D.C.", "D.C.",
"Dipl.-Ing.", "Dipl.-Ing.",
"Dipl.", "Dipl.",
"Dr.", "Dr.",
"e.",
"e.g.", "e.g.",
"e.V.", "e.V.",
"ehem.", "ehem.",
@ -555,79 +546,57 @@ ORTH_ONLY = [
"erm.", "erm.",
"etc.", "etc.",
"ev.", "ev.",
"f.",
"g.",
"G.m.b.H.", "G.m.b.H.",
"geb.", "geb.",
"Gebr.", "Gebr.",
"gem.", "gem.",
"h.",
"h.c.", "h.c.",
"Hg.", "Hg.",
"hrsg.", "hrsg.",
"Hrsg.", "Hrsg.",
"i.",
"i.A.", "i.A.",
"i.e.", "i.e.",
"i.G.", "i.G.",
"i.Tr.", "i.Tr.",
"i.V.", "i.V.",
"Ing.", "Ing.",
"j.",
"jr.", "jr.",
"Jr.", "Jr.",
"jun.", "jun.",
"jur.", "jur.",
"k.",
"K.O.", "K.O.",
"l.",
"L.A.", "L.A.",
"lat.", "lat.",
"m.",
"M.A.", "M.A.",
"m.E.", "m.E.",
"m.M.", "m.M.",
"M.Sc.", "M.Sc.",
"Mr.", "Mr.",
"n.",
"N.Y.", "N.Y.",
"N.Y.C.", "N.Y.C.",
"nat.", "nat.",
"ö." "ö."
"o.",
"o.a.", "o.a.",
"o.ä.", "o.ä.",
"o.g.", "o.g.",
"o.k.", "o.k.",
"O.K.", "O.K.",
"p.",
"p.a.", "p.a.",
"p.s.", "p.s.",
"P.S.", "P.S.",
"pers.", "pers.",
"phil.", "phil.",
"q.",
"q.e.d.", "q.e.d.",
"r.",
"R.I.P.", "R.I.P.",
"rer.", "rer.",
"s.",
"sen.", "sen.",
"St.", "St.",
"std.", "std.",
"t.",
"u.",
"ü.",
"u.a.", "u.a.",
"U.S.", "U.S.",
"U.S.A.", "U.S.A.",
"U.S.S.", "U.S.S.",
"v.",
"Vol.", "Vol.",
"vs.", "vs.",
"w.", "wiss."
"wiss.",
"x.",
"y.",
"z."
] ]


@ -37,14 +37,16 @@ def get_time_exc(hours):
     return exc
 
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", ""))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"]

File diff suppressed because it is too large


@ -40,11 +40,14 @@ def get_time_exc(hours):
     return exc
 
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 STOP_WORDS = set(STOP_WORDS)
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = {
ORTH_ONLY = [ ORTH_ONLY = [
"a.",
"a.C.", "a.C.",
"a.J.C.", "a.J.C.",
"apdo.", "apdo.",
"Av.", "Av.",
"Avda.", "Avda.",
"b.",
"c.",
"Cía.", "Cía.",
"d.",
"e.",
"etc.", "etc.",
"f.",
"g.",
"Gob.", "Gob.",
"Gral.", "Gral.",
"h.",
"i.",
"Ing.", "Ing.",
"j.",
"J.C.", "J.C.",
"k.",
"l.",
"Lic.", "Lic.",
"m.",
"m.n.", "m.n.",
"n.",
"no.", "no.",
"núm.", "núm.",
"o.",
"p.",
"P.D.", "P.D.",
"Prof.", "Prof.",
"Profa.", "Profa.",
"q.",
"q.e.p.d." "q.e.p.d."
"r.",
"s.",
"S.A.", "S.A.",
"S.L.", "S.L.",
"s.s.s.", "s.s.s.",
"Sr.", "Sr.",
"Sra.", "Sra.",
"Srta.", "Srta."
"t.",
"u.",
"v.",
"w.",
"x.",
"y.",
"z."
] ]


@ -2,13 +2,16 @@
 from __future__ import unicode_literals
 
 from .. import language_data as base
-from ..language_data import strings_to_exc
+from ..language_data import strings_to_exc, update_exc
 from .stop_words import STOP_WORDS
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@ -4,21 +4,25 @@ from __future__ import unicode_literals
 import six
 
 from spacy.language_data import strings_to_exc, update_exc
-from .punctuations import *
+from .punctuation import *
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import ABBREVIATIONS
 from .tokenizer_exceptions import OTHER_EXC
 from .. import language_data as base
 
 STOP_WORDS = set(STOP_WORDS)
 
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
-TOKENIZER_INFIXES = TOKENIZER_INFIXES
-# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
+TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
+TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES
+TOKENIZER_INFIXES = TOKENIZER_INFIXES
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

spacy/hu/punctuation.py Normal file

@ -0,0 +1,25 @@
# encoding: utf8
from __future__ import unicode_literals
from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
TOKENIZER_SUFFIXES = [
r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
]
TOKENIZER_INFIXES = [
r'(?<=[0-9])-(?=[0-9])',
r'(?<=[0-9])[+\-\*/^](?=[0-9])',
r'(?<=[{a}])--(?=[{a}])',
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
]
TOKENIZER_INFIXES += LIST_ELLIPSES
__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]


@ -1,89 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
TOKENIZER_PREFIXES = r'''
+
'''.strip().split('\n')
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
\$
>
:
;
'
«
_
''
\.\.
\.\.\.
\.\.\.\.
(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\.
(?<=[a-züóőúéáűí)])-e
\-\-
´
(?<=[0-9])\+
(?<=[a-z0-9üóőúéáűí][\)\]"'%\)§/])\.
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=°[FCK])\.
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = r'''
\.\.+
(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
(?<=[0-9])[+\-\*/^](?=[0-9])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
'''.strip().split('\n')
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]


@ -111,7 +111,6 @@ Vcs.
Vhr. Vhr.
X.Y. X.Y.
Zs. Zs.
a.
a.C. a.C.
ac. ac.
adj. adj.
@ -126,7 +125,6 @@ ang.
arch. arch.
at. at.
aug. aug.
b.
b.a. b.a.
b.s. b.s.
b.sc. b.sc.
@ -141,7 +139,6 @@ br.
bsc. bsc.
bt. bt.
btk. btk.
c.
ca. ca.
cc. cc.
cca. cca.
@ -155,7 +152,6 @@ csc.
csüt. csüt.
cső. cső.
ctv. ctv.
d.
dbj. dbj.
dd. dd.
ddr. ddr.
@ -170,7 +166,6 @@ dolg.
dr. dr.
du. du.
dzs. dzs.
e.
ea. ea.
ed. ed.
eff. eff.
@ -186,7 +181,6 @@ etc.
ev. ev.
ezr. ezr.
. .
f.
f.h. f.h.
f.é. f.é.
fam. fam.
@ -213,7 +207,6 @@ főig.
főisk. főisk.
főtörm. főtörm.
főv. főv.
g.
gazd. gazd.
gimn. gimn.
gk. gk.
@ -225,7 +218,6 @@ gy.
gyak. gyak.
gyártm. gyártm.
gör. gör.
h.
hads. hads.
hallg. hallg.
hdm. hdm.
@ -266,7 +258,6 @@ isk.
ism. ism.
izr. izr.
. .
j.
jan. jan.
jav. jav.
jegyz. jegyz.
@ -278,7 +269,6 @@ jr.
jvb. jvb.
júl. júl.
jún. jún.
k.
karb. karb.
kat. kat.
kb. kb.
@ -313,7 +303,6 @@ közl.
közp. közp.
közt. közt.
. .
l.
lat. lat.
ld. ld.
legs. legs.
@ -324,7 +313,6 @@ lt.
ltd. ltd.
ltp. ltp.
luth. luth.
m.
m.a. m.a.
m.s. m.s.
m.sc. m.sc.
@ -359,7 +347,6 @@ műh.
műsz. műsz.
műv. műv.
művez. művez.
n.
nagyker. nagyker.
nagys. nagys.
nat. nat.
@ -372,7 +359,6 @@ ny.
nyilv. nyilv.
nyrt. nyrt.
nyug. nyug.
o.
obj. obj.
okl. okl.
okt. okt.
@ -381,7 +367,6 @@ orsz.
ort. ort.
ov. ov.
ovh. ovh.
p.
pf. pf.
pg. pg.
ph.d ph.d
@ -404,8 +389,6 @@ pság.
ptk. ptk.
pu. pu.
. .
q.
r.
r.k. r.k.
rac. rac.
rad. rad.
@ -420,7 +403,6 @@ rkt.
rt. rt.
rtg. rtg.
röv. röv.
s.
s.b. s.b.
s.k. s.k.
sa. sa.
@ -450,7 +432,6 @@ szt.
szubj. szubj.
szöv. szöv.
szül. szül.
t.
tanm. tanm.
tb. tb.
tbk. tbk.
@ -476,13 +457,11 @@ tvr.
ty. ty.
törv. törv.
. .
u.
ua. ua.
ui. ui.
unit. unit.
uo. uo.
uv. uv.
v.
vas. vas.
vb. vb.
vegy. vegy.
@ -501,9 +480,6 @@ vv.
vál. vál.
vízv. vízv.
. .
w.
y.
z.
zrt. zrt.
zs. zs.
Ész. Ész.
@ -520,7 +496,6 @@ zs.
évf. évf.
í. í.
ó. ó.
ö.
össz. össz.
ötk. ötk.
özv. özv.
@ -528,7 +503,6 @@ zs.
úm. úm.
ún. ún.
út. út.
ü.
üag. üag.
üd. üd.
üdv. üdv.
@ -544,6 +518,5 @@ zs.
""".strip().split() """.strip().split()
OTHER_EXC = """ OTHER_EXC = """
''
-e -e
""".strip().split() """.strip().split()


@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
 
 from ..language import Language
 from ..attrs import LANG


@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@ -1,3 +1,4 @@
+from .abbreviations import *
 from .emoticons import *
 from .punctuation import *
 from .tag_map import *


@ -0,0 +1,43 @@
# encoding: utf8
from __future__ import unicode_literals
ABBREVIATIONS = [
"'",
"\\\")",
"<space>",
"''",
"C++",
"a.",
"b.",
"c.",
"d.",
"e.",
"f.",
"g.",
"h.",
"i.",
"j.",
"k.",
"l.",
"m.",
"n.",
"o.",
"p.",
"q.",
"r.",
"s.",
"t.",
"u.",
"v.",
"w.",
"x.",
"y.",
"z.",
"ä.",
"ö.",
"ü."
]
__all__ = [ "ABBREVIATIONS" ]


@ -13,6 +13,7 @@ EMOTICONS = set("""
 (-:
 =)
 (=
+")
 :]
 :-]
 [:


@ -1,133 +1,115 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
TOKENIZER_PREFIXES = r'''
,
"
(
[
{
*
<
>
$
£
¡
¿
'
``
`
#
....
...
»
§
US$
C$
A$
a-
'''.strip().split('\n')
TOKENIZER_SUFFIXES = r''' _ALPHA_LOWER = """
, a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ę f g ĝ ğ h i ı
\" î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s
\) ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ŷ z ź ž ż þ
\] """
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = r''' _ALPHA_UPPER = """
A Ä À Á Â Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ę F G Ĝ Ğ H I İ
\.\.\.+ Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S
(?<=[a-z])\.(?=[A-Z]) Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ŷ Z Ź Ž Ż Þ
(?<=[a-z])\.(?=[A-Z]) """
(?<=[a-zA-Z])-(?=[a-zA-z])
(?<=[a-zA-Z])--(?=[a-zA-z])
(?<=[0-9])-(?=[0-9]) _UNITS = """
(?<=[A-Za-z]),(?=[A-Za-z]) km km² km³ m dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ]) µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ]) TB T G M K
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ]) """
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
'''.strip().split('\n')
_CURRENCY = r"""
\$ £ ¥ ฿ US\$ C\$ A\$
"""
_QUOTES = r"""
' '' " ” “ `` ` ´ , „ » «
"""
_PUNCT = r"""
, : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
"""
_HYPHENS = r"""
- -- ---
"""
LIST_ELLIPSES = [
r'\.\.+',
""
]
LIST_CURRENCY = list(_CURRENCY.strip().split())
LIST_QUOTES = list(_QUOTES.strip().split())
LIST_PUNCT = list(_PUNCT.strip().split())
LIST_HYPHENS = list(_HYPHENS.strip().split())
ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '')
ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '')
ALPHA = ALPHA_LOWER + ALPHA_UPPER
QUOTES = _QUOTES.strip().replace(' ', '|')
CURRENCY = _CURRENCY.strip().replace(' ', '|')
UNITS = _UNITS.strip().replace(' ', '|')
HYPHENS = _HYPHENS.strip().replace(' ', '|')
# Prefixes
TOKENIZER_PREFIXES = (
['§', '%', r'\+'] +
LIST_PUNCT +
LIST_ELLIPSES +
LIST_QUOTES +
LIST_CURRENCY
)
# Suffixes
TOKENIZER_SUFFIXES = (
LIST_PUNCT +
LIST_ELLIPSES +
LIST_QUOTES +
[
r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
"'s", "'S", "s", "S"
]
)
# Infixes
TOKENIZER_INFIXES = (
LIST_ELLIPSES +
[
r'(?<=[0-9])[+\-\*/^](?=[0-9])',
r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
]
)
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]


@ -20,5 +20,6 @@ TAG_MAP = {
"X": {POS: X}, "X": {POS: X},
"CONJ": {POS: CONJ}, "CONJ": {POS: CONJ},
"ADJ": {POS: ADJ}, "ADJ": {POS: ADJ},
"VERB": {POS: VERB} "VERB": {POS: VERB},
"PART": {POS: PART}
} }


@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
 
 from ..language import Language
 from ..attrs import LANG
 from .language_data import *


@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
 
 from ..language import Language
 from ..attrs import LANG


@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
 
 from ..language import Language
 from ..attrs import LANG
 from .language_data import *


@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from ...de import German
@pytest.fixture
def de_tokenizer():
return German.Defaults.create_tokenizer()


@ -0,0 +1,27 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handles correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
def test_tokenizer_splits_contractions(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_tokenizer_handles_abbr(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_handles_exc_in_text(de_tokenizer):
text = "Ich bin z.Zt. im Urlaub."
tokens = de_tokenizer(text)
assert len(tokens) == 6
assert tokens[2].text == "z.Zt."
assert tokens[2].lemma_ == "zur Zeit"


@ -0,0 +1,116 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(unter)"])
def test_tokenizer_splits_no_special(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["unter'm"])
def test_tokenizer_splits_no_punct(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(unter'm"])
def test_tokenizer_splits_prefix_punct(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["unter'm)"])
def test_tokenizer_splits_suffix_punct(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(unter'm)"])
def test_tokenizer_splits_even_wrap(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["(unter'm?)"])
def test_tokenizer_splits_uneven_wrap(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 5
@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
tokens = de_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["z.B.)"])
def test_tokenizer_splits_suffix_interact(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(z.B.)"])
def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(z.B.?)"])
def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["blau-rot"])
def test_tokenizer_splits_hyphens(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
def test_tokenizer_splits_period_infix(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
def test_tokenizer_splits_comma_infix(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == text.split(",")[0]
assert tokens[1].text == ","
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
assert len(tokens) == 12
assert tokens[0].text == "Viele"
assert tokens[1].text == "Regeln"
assert tokens[2].text == "--"
assert tokens[3].text == "wie"
assert tokens[4].text == "die"
assert tokens[5].text == "Bindestrich"
assert tokens[6].text == "-"
assert tokens[7].text == "Regeln"
assert tokens[8].text == "--"
assert tokens[9].text == "sind"
assert tokens[10].text == "kompliziert"


@ -0,0 +1,45 @@
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_long_text(de_tokenizer):
text = """Die Verwandlung
Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in
seinem Bett zu einem ungeheueren Ungeziefer verwandelt.
Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig
hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten
Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit,
kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen
Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
»Was ist mit mir geschehen?«, dachte er."""
tokens = de_tokenizer(text)
assert len(tokens) == 109
@pytest.mark.parametrize('text,length', [
("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
("Kraftfahrzeug-Haftpflichtversicherung", 3),
("Vakuum-Mittelfrequenz-Induktionsofen", 5)
])
def test_tokenizer_handles_long_words(de_tokenizer, text, length):
tokens = de_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,length', [
("»Was ist mit mir geschehen?«, dachte er.", 12),
("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)
])
def test_tokenizer_handles_examples(de_tokenizer, text, length):
tokens = de_tokenizer(text)
assert len(tokens) == length


@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from ...en import English
@pytest.fixture
def en_tokenizer():
return English.Defaults.create_tokenizer()


@ -0,0 +1,87 @@
# coding: utf-8
"""Test that tokens are created correctly for contractions."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_basic_contraction(en_tokenizer):
text = "don't giggle"
tokens = en_tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "n't"
text = "i said don't!"
tokens = en_tokenizer(text)
assert len(tokens) == 5
assert tokens[4].text == "!"
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
tokens = en_tokenizer(text_poss)
assert len(tokens) == 2
assert tokens[0].text == text
assert tokens[1].text == "'s"
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'"
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].text == text
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'ll"
assert tokens[1].lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
tokens_lower = en_tokenizer(text_lower)
tokens_title = en_tokenizer(text_title)
assert tokens_title[0].text == tokens_lower[0].text.title()
assert tokens_lower[0].text == tokens_title[0].text.lower()
assert tokens_lower[1].text == tokens_title[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
tokens = en_tokenizer(pron + contraction)
assert tokens[0].text == pron
assert tokens[1].text == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
tokens = en_tokenizer(exc)
assert len(tokens) == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
tokens = en_tokenizer(wo_punct)
assert len(tokens) == 2
tokens = en_tokenizer(w_punct)
assert len(tokens) == 3


@ -0,0 +1,20 @@
# coding: utf-8
"""Test that tokenizer exceptions are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_tokenizer_handles_abbr(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_handles_exc_in_text(en_tokenizer):
text = "It's mediocre i.e. bad."
tokens = en_tokenizer(text)
assert len(tokens) == 6
assert tokens[3].text == "i.e."


@ -1,12 +1,14 @@
+# coding: utf-8
 """Test that token.idx correctly computes index into the original string."""
 
 from __future__ import unicode_literals
 
 import pytest
 
 
 def test_simple_punct(en_tokenizer):
-    text = 'to walk, do foo'
+    text = "to walk, do foo"
     tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
 
@ -16,7 +18,7 @@ def test_simple_punct(en_tokenizer):
 def test_complex_punct(en_tokenizer):
-    text = 'Tom (D., Ill.)!'
+    text = "Tom (D., Ill.)!"
     tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3


@ -0,0 +1,136 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(can)"])
def test_tokenizer_splits_no_special(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["can't"])
def test_tokenizer_splits_no_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(can't"])
def test_tokenizer_splits_prefix_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["can't)"])
def test_tokenizer_splits_suffix_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(can't)"])
def test_tokenizer_splits_even_wrap(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["(can't?)"])
def test_tokenizer_splits_uneven_wrap(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 5
@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
def test_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["U.S.)"])
def test_tokenizer_splits_suffix_interact(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(U.S.)"])
def test_tokenizer_splits_even_wrap_interact(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(U.S.?)"])
def test_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["best-known"])
def test_tokenizer_splits_hyphens(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["best.Known", "Hello.World"])
def test_tokenizer_splits_period_infix(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Hello,world", "one,two"])
def test_tokenizer_splits_comma_infix(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == text.split(",")[0]
assert tokens[1].text == ","
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["best...Known", "best...known"])
def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
def test_tokenizer_splits_double_hyphen_infix(en_tokenizer):
tokens = en_tokenizer("No decent--let alone well-bred--people.")
assert tokens[0].text == "No"
assert tokens[1].text == "decent"
assert tokens[2].text == "--"
assert tokens[3].text == "let"
assert tokens[4].text == "alone"
assert tokens[5].text == "well"
assert tokens[6].text == "-"
assert tokens[7].text == "bred"
assert tokens[8].text == "--"
assert tokens[9].text == "people"
@pytest.mark.xfail
def test_tokenizer_splits_period_abbr(en_tokenizer):
text = "Today is Tuesday.Mr."
tokens = en_tokenizer(text)
assert len(tokens) == 5
assert tokens[0].text == "Today"
assert tokens[1].text == "is"
assert tokens[2].text == "Tuesday"
assert tokens[3].text == "."
assert tokens[4].text == "Mr."
@pytest.mark.xfail
def test_tokenizer_splits_em_dash_infix(en_tokenizer):
# Re Issue #225
tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """
"""you'll have to walk there.\u2014Ariel.""")
assert tokens[6].text == "Puddleton"
assert tokens[7].text == "?"
assert tokens[8].text == "\u2014"


@ -0,0 +1,132 @@
# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""
from __future__ import unicode_literals
import pytest
from ....util import compile_prefix_regex
from ....language_data import TOKENIZER_PREFIXES
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_tokenizer_handles_only_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == len(text)
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(punct + text)
assert len(tokens) == 2
assert tokens[0].text == punct
assert tokens[1].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(text + punct)
assert len(tokens) == 2
assert tokens[0].text == text
assert tokens[1].text == punct
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
tokens = en_tokenizer(punct + punct_add + text)
assert len(tokens) == 3
assert tokens[0].text == punct
assert tokens[1].text == punct_add
assert tokens[2].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
tokens = en_tokenizer(text + punct + punct_add)
assert len(tokens) == 3
assert tokens[0].text == text
assert tokens[1].text == punct
assert tokens[2].text == punct_add
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(punct + punct + punct + text)
assert len(tokens) == 4
assert tokens[0].text == punct
assert tokens[3].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(text + punct + punct + punct)
assert len(tokens) == 4
assert tokens[0].text == text
assert tokens[1].text == punct
@pytest.mark.parametrize('text', ["'The"])
def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == "'"
@pytest.mark.parametrize('text', ["Hello''"])
def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
tokens_punct = en_tokenizer("''")
assert len(tokens_punct) == 1
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
tokens = en_tokenizer(punct_open + text + punct_close)
assert len(tokens) == 3
assert tokens[0].text == punct_open
assert tokens[1].text == text
assert tokens[2].text == punct_close
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
@pytest.mark.parametrize('text', ["Hello"])
def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
assert len(tokens) == 5
assert tokens[0].text == punct_open_add
assert tokens[1].text == punct_open
assert tokens[2].text == text
assert tokens[3].text == punct_close
assert tokens[4].text == punct_close_add
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
def test_tokenizer_splits_pre_punct_regex(text, punct):
match = en_search_prefixes(text)
assert match.group() == punct
def test_tokenizer_splits_bracket_period(en_tokenizer):
text = "(And a 6a.m. run through Washington Park)."
tokens = en_tokenizer(text)
assert tokens[len(tokens) - 1].text == "."

View File

@ -0,0 +1,36 @@
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_long_text(en_tokenizer):
text = """Tributes pour in for late British Labour Party leader
Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.
In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
"Mr. Smith, throughout his distinguished"""
tokens = en_tokenizer(text)
assert len(tokens) == 76
@pytest.mark.parametrize('text,length', [
("The U.S. Army likes Shock and Awe.", 8),
("U.N. regulations are not a part of their concern.", 10),
("“Isn't it?”", 6),
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
("""'Me too!', Mr. P. Delaware cried. """, 11),
("They ran about 10km.", 6),
# ("But then the 6,000-year ice age came...", 10)
])
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length


@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from ...hu import Hungarian
@pytest.fixture
def hu_tokenizer():
return Hungarian.Defaults.create_tokenizer()


@ -2,25 +2,27 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from spacy.hu import Hungarian
_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
('A .hu.', ['A', '.hu', '.']),
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
('A pl.', ['A', 'pl.']),
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
('Egy..ket.', ['Egy', '..', 'ket', '.']),
('Valami... van.', ['Valami', '...', 'van', '.']),
('Valami ...van...', ['Valami', '...', 'van', '...']),
('Valami...', ['Valami', '...']),
('Valami ...', ['Valami', '...']),
('Valami ... más.', ['Valami', '...', 'más', '.'])]
_HYPHEN_TESTS = [ DEFAULT_TESTS = [
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
('A .hu.', ['A', '.hu', '.']),
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
('A pl.', ['A', 'pl.']),
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
('Egy..ket.', ['Egy', '..', 'ket', '.']),
('Valami... van.', ['Valami', '...', 'van', '.']),
('Valami ...van...', ['Valami', '...', 'van', '...']),
('Valami...', ['Valami', '...']),
('Valami ...', ['Valami', '...']),
('Valami ... más.', ['Valami', '...', 'más', '.'])
]
HYPHEN_TESTS = [
('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']),
('Egy -nak.', ['Egy', '-nak', '.']), ('Egy -nak.', ['Egy', '-nak', '.']),
('Egy bel-.', ['Egy', 'bel-', '.']), ('Egy bel-.', ['Egy', 'bel-', '.']),
@ -39,195 +41,194 @@ _HYPHEN_TESTS = [
('A 7-es.', ['A', '7-es', '.']), ('A 7-es.', ['A', '7-es', '.']),
('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']), ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
('A %-sal.', ['A', '%-sal', '.']), ('A %-sal.', ['A', '%-sal', '.']),
('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])] ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
]
_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']), NUMBER_TESTS = [
('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), ('A 2b van.', ['A', '2b', 'van', '.']),
('A 2b.', ['A', '2b', '.']), ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']),
('A 2b-ben.', ['A', '2b-ben', '.']), ('A 2b.', ['A', '2b', '.']),
('A 3.b van.', ['A', '3.b', 'van', '.']), ('A 2b-ben.', ['A', '2b-ben', '.']),
('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), ('A 3.b van.', ['A', '3.b', 'van', '.']),
('A 3.b.', ['A', '3.b', '.']), ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']),
('A 3.b-ben.', ['A', '3.b-ben', '.']), ('A 3.b.', ['A', '3.b', '.']),
('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), ('A 3.b-ben.', ['A', '3.b-ben', '.']),
('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']),
('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']),
('A 1:35 van.', ['A', '1:35', 'van', '.']), ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']),
('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), ('A 1:35 van.', ['A', '1:35', 'van', '.']),
('A 1:35-ben.', ['A', '1:35-ben', '.']), ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']),
('A 1.35 van.', ['A', '1.35', 'van', '.']), ('A 1:35-ben.', ['A', '1:35-ben', '.']),
('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), ('A 1.35 van.', ['A', '1.35', 'van', '.']),
('A 1.35-ben.', ['A', '1.35-ben', '.']), ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']),
('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), ('A 1.35-ben.', ['A', '1.35-ben', '.']),
('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']),
('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']),
('A 10--12 van.', ['A', '10--12', 'van', '.']), ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']),
('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), ('A 10--12 van.', ['A', '10--12', 'van', '.']),
('A 10--12-ben.', ['A', '10--12-ben', '.']), ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']),
('A 1012 van.', ['A', '1012', 'van', '.']), ('A 10--12-ben.', ['A', '10--12-ben', '.']),
('A 1012-ben van.', ['A', '1012-ben', 'van', '.']), ('A 1012 van.', ['A', '1012', 'van', '.']),
('A 1012-ben.', ['A', '1012-ben', '.']), ('A 1012-ben van.', ['A', '1012-ben', 'van', '.']),
('A 1012 van.', ['A', '1012', 'van', '.']), ('A 1012-ben.', ['A', '1012-ben', '.']),
('A 1012-ben van.', ['A', '1012-ben', 'van', '.']), ('A 1012 van.', ['A', '1012', 'van', '.']),
('A 1012-ben.', ['A', '1012-ben', '.']), ('A 1012-ben van.', ['A', '1012-ben', 'van', '.']),
('A 1012 van.', ['A', '1012', 'van', '.']), ('A 1012-ben.', ['A', '1012-ben', '.']),
('A 1012-ben van.', ['A', '1012-ben', 'van', '.']), ('A 1012 van.', ['A', '1012', 'van', '.']),
('A 1012-ben.', ['A', '1012-ben', '.']), ('A 1012-ben van.', ['A', '1012-ben', 'van', '.']),
('A 1012 van.', ['A', '1012', 'van', '.']), ('A 1012-ben.', ['A', '1012-ben', '.']),
('A 1012-ben van.', ['A', '1012-ben', 'van', '.']), ('A 1012 van.', ['A', '1012', 'van', '.']),
('A 1012-ben.', ['A', '1012-ben', '.']), ('A 1012-ben van.', ['A', '1012-ben', 'van', '.']),
('A 10—12 van.', ['A', '10—12', 'van', '.']), ('A 1012-ben.', ['A', '1012-ben', '.']),
('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), ('A 10—12 van.', ['A', '10—12', 'van', '.']),
('A 10—12-ben.', ['A', '10—12-ben', '.']), ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']),
('A 10―12 van.', ['A', '10―12', 'van', '.']), ('A 10—12-ben.', ['A', '10—12-ben', '.']),
('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), ('A 10―12 van.', ['A', '10―12', 'van', '.']),
('A 10―12-ben.', ['A', '10―12-ben', '.']), ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']),
('A -23,12 van.', ['A', '-23,12', 'van', '.']), ('A 10―12-ben.', ['A', '10―12-ben', '.']),
('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), ('A -23,12 van.', ['A', '-23,12', 'van', '.']),
('A -23,12-ben.', ['A', '-23,12-ben', '.']), ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']),
('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), ('A -23,12-ben.', ['A', '-23,12-ben', '.']),
('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']),
('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']),
('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']),
('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']),
('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']),
('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']),
('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']),
('A C++ van.', ['A', 'C++', 'van', '.']), ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']),
('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), ('A C++ van.', ['A', 'C++', 'van', '.']),
('A C++.', ['A', 'C++', '.']), ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']),
('A C++-ben.', ['A', 'C++-ben', '.']), ('A C++.', ['A', 'C++', '.']),
('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), ('A C++-ben.', ['A', 'C++-ben', '.']),
('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']),
('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']),
('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']),
('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']), ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']),
('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']),
('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']),
('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']),
('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']),
('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']),
('A IV. 12.', ['A', 'IV.', '12.']), ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']),
('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), ('A IV. 12.', ['A', 'IV.', '12.']),
('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']),
('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']),
('A 2003.01.06.', ['A', '2003.01.06.']), ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']),
('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), ('A 2003.01.06.', ['A', '2003.01.06.']),
('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']),
('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']),
('A IV.12.', ['A', 'IV.12.']), ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']),
('A IV.12-ben.', ['A', 'IV.12-ben', '.']), ('A IV.12.', ['A', 'IV.12.']),
('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), ('A IV.12-ben.', ['A', 'IV.12-ben', '.']),
('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']),
('A 1.1.2.', ['A', '1.1.2.']),
('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']),
('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']),
('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']),
('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']),
('A 3,14 van.', ['A', '3,14', 'van', '.']),
('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']),
('A 3,14-ben.', ['A', '3,14-ben', '.']),
('A 3.14 van.', ['A', '3.14', 'van', '.']),
('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']),
('A 3.14-ben.', ['A', '3.14-ben', '.']),
('A 15. van.', ['A', '15.', 'van', '.']),
('A 15-ben van.', ['A', '15-ben', 'van', '.']),
('A 15-ben.', ['A', '15-ben', '.']),
('A 15.-ben van.', ['A', '15.-ben', 'van', '.']),
('A 15.-ben.', ['A', '15.-ben', '.']),
('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']),
('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']),
('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']),
('A -0,99% van.', ['A', '-0,99%', 'van', '.']),
('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']),
('A -0,99%.', ['A', '-0,99%', '.']),
('A -0,99%-ben.', ['A', '-0,99%-ben', '.']),
('A 10--20% van.', ['A', '10--20%', 'van', '.']),
('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']),
('A 10--20%.', ['A', '10--20%', '.']),
('A 10--20%-ben.', ['A', '10--20%-ben', '.']),
('A 99§ van.', ['A', '99§', 'van', '.']),
('A 99§-ben van.', ['A', '99§-ben', 'van', '.']),
('A 99§-ben.', ['A', '99§-ben', '.']),
('A 10--20§ van.', ['A', '10--20§', 'van', '.']),
('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']),
('A 10--20§-ben.', ['A', '10--20§-ben', '.']),
('A 99° van.', ['A', '99°', 'van', '.']),
('A 99°-ben van.', ['A', '99°-ben', 'van', '.']),
('A 99°-ben.', ['A', '99°-ben', '.']),
('A 10--20° van.', ['A', '10--20°', 'van', '.']),
('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']),
('A 10--20°-ben.', ['A', '10--20°-ben', '.']),
('A °C van.', ['A', '°C', 'van', '.']),
('A °C-ben van.', ['A', '°C-ben', 'van', '.']),
('A °C.', ['A', '°C', '.']),
('A °C-ben.', ['A', '°C-ben', '.']),
('A 100°C van.', ['A', '100°C', 'van', '.']),
('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']),
('A 100°C.', ['A', '100°C', '.']),
('A 100°C-ben.', ['A', '100°C-ben', '.']),
('A 800x600 van.', ['A', '800x600', 'van', '.']),
('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']),
('A 800x600-ben.', ['A', '800x600-ben', '.']),
('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']),
('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']),
('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']),
('A 5/J van.', ['A', '5/J', 'van', '.']),
('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']),
('A 5/J-ben.', ['A', '5/J-ben', '.']),
('A 5/J. van.', ['A', '5/J.', 'van', '.']),
('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']),
('A 5/J.-ben.', ['A', '5/J.-ben', '.']),
('A III/1 van.', ['A', 'III/1', 'van', '.']),
('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']),
('A III/1-ben.', ['A', 'III/1-ben', '.']),
('A III/1. van.', ['A', 'III/1.', 'van', '.']),
('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']),
('A III/1.-ben.', ['A', 'III/1.-ben', '.']),
('A III/c van.', ['A', 'III/c', 'van', '.']),
('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']),
('A III/c.', ['A', 'III/c', '.']),
('A III/c-ben.', ['A', 'III/c-ben', '.']),
('A TU154 van.', ['A', 'TU154', 'van', '.']),
('A TU154-ben van.', ['A', 'TU154-ben', 'van', '.']),
('A TU154-ben.', ['A', 'TU154-ben', '.'])
]

QUOTE_TESTS = [
    ('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
    ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
    ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
    ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
    ("A don't van.", ['A', "don't", 'van', '.'])
]

DOT_TESTS = [
    ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
    ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
    ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
    ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
    ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
    ('A .hu.', ['A', '.hu', '.']),
    ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
    ('A pl.', ['A', 'pl.']),
    ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
    ('Egy..ket.', ['Egy', '..', 'ket', '.']),
    ('Valami... van.', ['Valami', '...', 'van', '.']),
    ('Valami ...van...', ['Valami', '...', 'van', '...']),
    ('Valami...', ['Valami', '...']),
    ('Valami ...', ['Valami', '...']),
    ('Valami ... más.', ['Valami', '...', 'más', '.'])
]

TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS


@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
    tokens = hu_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

@ -0,0 +1,16 @@
from __future__ import unicode_literals
from ...en import English
import pytest
@pytest.fixture
def en_tokenizer():
return English.Defaults.create_tokenizer()
def test_issue351(en_tokenizer):
doc = en_tokenizer(" This is a cat.")
assert doc[0].idx == 0
assert len(doc[0]) == 3
assert doc[1].idx == 3

View File

@ -0,0 +1,14 @@
from __future__ import unicode_literals
from ...en import English
import pytest
@pytest.fixture
def en_tokenizer():
return English.Defaults.create_tokenizer()
def test_big_ellipsis(en_tokenizer):
tokens = en_tokenizer(u'$45...............Asking')
assert len(tokens) > 2

View File

@ -1,4 +0,0 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of 1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]

View File

@ -1,7 +1,23 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

from ...en import English
from ...de import German
from ...es import Spanish
from ...it import Italian
from ...fr import French
from ...pt import Portuguese
from ...nl import Dutch
from ...sv import Swedish
from ...hu import Hungarian


LANGUAGES = [English, German, Spanish, Italian, French, Dutch, Swedish, Hungarian]


@pytest.fixture(params=LANGUAGES)
def tokenizer(request):
    lang = request.param
    return lang.Defaults.create_tokenizer()
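
Usage sketch (illustration only, not a file in this commit): because the tokenizer fixture above is parametrized over LANGUAGES, pytest builds one tokenizer per language, so a single language-agnostic test like the hypothetical one below runs once for each of the eight tokenizers.

# Hypothetical example test, assuming the shared `tokenizer` fixture above.
import pytest


@pytest.mark.parametrize('text', ["Hello, world!"])
def test_tokenizer_smoke(tokenizer, text):
    # Runs once per language in LANGUAGES; every tokenizer must produce tokens.
    tokens = tokenizer(text)
    assert len(tokens) > 0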

View File

@ -1,58 +0,0 @@
from __future__ import unicode_literals
import pytest
def test_possess(en_tokenizer):
tokens = en_tokenizer("Mike's")
assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
assert len(tokens) == 2
def test_apostrophe(en_tokenizer):
tokens = en_tokenizer("schools'")
assert len(tokens) == 2
assert tokens[1].orth_ == "'"
assert tokens[0].orth_ == "schools"
def test_LL(en_tokenizer):
tokens = en_tokenizer("we'll")
assert len(tokens) == 2
assert tokens[1].orth_ == "'ll"
assert tokens[1].lemma_ == "will"
assert tokens[0].orth_ == "we"
def test_aint(en_tokenizer):
tokens = en_tokenizer("ain't")
assert len(tokens) == 2
assert tokens[0].orth_ == "ai"
assert tokens[0].lemma_ == "be"
assert tokens[1].orth_ == "n't"
assert tokens[1].lemma_ == "not"
def test_capitalized(en_tokenizer):
tokens = en_tokenizer("can't")
assert len(tokens) == 2
tokens = en_tokenizer("Can't")
assert len(tokens) == 2
tokens = en_tokenizer("Ain't")
assert len(tokens) == 2
assert tokens[0].orth_ == "Ai"
assert tokens[0].lemma_ == "be"
def test_punct(en_tokenizer):
tokens = en_tokenizer("We've")
assert len(tokens) == 2
tokens = en_tokenizer("``We've")
assert len(tokens) == 3
@pytest.mark.xfail
def test_therell(en_tokenizer):
tokens = en_tokenizer("there'll")
assert len(tokens) == 2
assert tokens[0].text == "there"
assert tokens[1].text == "there"

View File

@ -1,35 +0,0 @@
from __future__ import unicode_literals
import pytest
def test_tweebo_challenge(en_tokenizer):
text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
tokens = en_tokenizer(text)
assert tokens[0].orth_ == ":o"
assert tokens[1].orth_ == ":/"
assert tokens[2].orth_ == ":'("
assert tokens[3].orth_ == ">:o"
assert tokens[4].orth_ == "(:"
assert tokens[5].orth_ == ":)"
assert tokens[6].orth_ == ">.<"
assert tokens[7].orth_ == "XD"
assert tokens[8].orth_ == "-__-"
assert tokens[9].orth_ == "o.O"
assert tokens[10].orth_ == ";D"
assert tokens[11].orth_ == ":-)"
assert tokens[12].orth_ == "@_@"
assert tokens[13].orth_ == ":P"
assert tokens[14].orth_ == "8D"
assert tokens[15].orth_ == ":1"
assert tokens[16].orth_ == ">:("
assert tokens[17].orth_ == ":D"
assert tokens[18].orth_ == "=|"
assert tokens[19].orth_ == '")'
assert tokens[20].orth_ == ':>'
assert tokens[21].orth_ == '....'
def test_false_positive(en_tokenizer):
text = "example:)"
tokens = en_tokenizer(text)
assert len(tokens) == 3

View File

@ -0,0 +1,41 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_emoticons(tokenizer):
# Tweebo challenge (CMU)
text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
tokens = tokenizer(text)
assert tokens[0].text == ":o"
assert tokens[1].text == ":/"
assert tokens[2].text == ":'("
assert tokens[3].text == ">:o"
assert tokens[4].text == "(:"
assert tokens[5].text == ":)"
assert tokens[6].text == ">.<"
assert tokens[7].text == "XD"
assert tokens[8].text == "-__-"
assert tokens[9].text == "o.O"
assert tokens[10].text == ";D"
assert tokens[11].text == ":-)"
assert tokens[12].text == "@_@"
assert tokens[13].text == ":P"
assert tokens[14].text == "8D"
assert tokens[15].text == ":1"
assert tokens[16].text == ">:("
assert tokens[17].text == ":D"
assert tokens[18].text == "=|"
assert tokens[19].text == '")'
assert tokens[20].text == ':>'
assert tokens[21].text == '....'
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length

View File

@ -1,62 +0,0 @@
from __future__ import unicode_literals
import pytest
def test_hyphen(en_tokenizer):
tokens = en_tokenizer('best-known')
assert len(tokens) == 3
def test_numeric_range(en_tokenizer):
tokens = en_tokenizer('0.1-13.5')
assert len(tokens) == 3
def test_period(en_tokenizer):
tokens = en_tokenizer('best.Known')
assert len(tokens) == 3
tokens = en_tokenizer('zombo.com')
assert len(tokens) == 1
def test_ellipsis(en_tokenizer):
tokens = en_tokenizer('best...Known')
assert len(tokens) == 3
tokens = en_tokenizer('best...known')
assert len(tokens) == 3
def test_big_ellipsis(en_tokenizer):
'''Test regression identified in Issue #360'''
tokens = en_tokenizer(u'$45...............Asking')
assert len(tokens) > 2
def test_email(en_tokenizer):
tokens = en_tokenizer('hello@example.com')
assert len(tokens) == 1
tokens = en_tokenizer('hi+there@gmail.it')
assert len(tokens) == 1
def test_double_hyphen(en_tokenizer):
tokens = en_tokenizer(u'No decent--let alone well-bred--people.')
assert tokens[0].text == u'No'
assert tokens[1].text == u'decent'
assert tokens[2].text == u'--'
assert tokens[3].text == u'let'
assert tokens[4].text == u'alone'
assert tokens[5].text == u'well'
assert tokens[6].text == u'-'
# TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter
# on infixes.
assert tokens[7].text == u'bred'
assert tokens[8].text == u'--'
assert tokens[9].text == u'people'
def test_infix_comma(en_tokenizer):
# Re issue #326
tokens = en_tokenizer(u'Hello,world')
assert tokens[0].text == u'Hello'
assert tokens[1].text == u','
assert tokens[2].text == u'world'

View File

@ -1,9 +0,0 @@
from __future__ import unicode_literals
def test_only_pre1(en_tokenizer):
assert len(en_tokenizer("(")) == 1
def test_only_pre2(en_tokenizer):
assert len(en_tokenizer("((")) == 2

View File

@ -1,43 +0,0 @@
from __future__ import unicode_literals
import pytest
@pytest.fixture
def close_puncts():
return [')', ']', '}', '*']
def test_close(close_puncts, en_tokenizer):
word_str = 'Hello'
for p in close_puncts:
string = word_str + p
tokens = en_tokenizer(string)
assert len(tokens) == 2
assert tokens[1].string == p
assert tokens[0].string == word_str
def test_two_different_close(close_puncts, en_tokenizer):
word_str = 'Hello'
for p in close_puncts:
string = word_str + p + "'"
tokens = en_tokenizer(string)
assert len(tokens) == 3
assert tokens[0].string == word_str
assert tokens[1].string == p
assert tokens[2].string == "'"
def test_three_same_close(close_puncts, en_tokenizer):
word_str = 'Hello'
for p in close_puncts:
string = word_str + p + p + p
tokens = en_tokenizer(string)
assert len(tokens) == 4
assert tokens[0].string == word_str
assert tokens[1].string == p
def test_double_end_quote(en_tokenizer):
assert len(en_tokenizer("Hello''")) == 2
assert len(en_tokenizer("''")) == 1

View File

@ -1,46 +0,0 @@
from __future__ import unicode_literals
import pytest
@pytest.fixture
def open_puncts():
return ['(', '[', '{', '*']
def test_open(open_puncts, en_tokenizer):
word_str = 'Hello'
for p in open_puncts:
string = p + word_str
tokens = en_tokenizer(string)
assert len(tokens) == 2
assert tokens[0].orth_ == p
assert tokens[1].orth_ == word_str
def test_two_different_open(open_puncts, en_tokenizer):
word_str = 'Hello'
for p in open_puncts:
string = p + "`" + word_str
tokens = en_tokenizer(string)
assert len(tokens) == 3
assert tokens[0].orth_ == p
assert tokens[1].orth_ == "`"
assert tokens[2].orth_ == word_str
def test_three_same_open(open_puncts, en_tokenizer):
word_str = 'Hello'
for p in open_puncts:
string = p + p + p + word_str
tokens = en_tokenizer(string)
assert len(tokens) == 4
assert tokens[0].orth_ == p
assert tokens[3].orth_ == word_str
def test_open_appostrophe(en_tokenizer):
string = "'The"
tokens = en_tokenizer(string)
assert len(tokens) == 2
assert tokens[0].orth_ == "'"

View File

@ -1,46 +0,0 @@
"""Test entries in the tokenization special-case interacting with prefix
and suffix punctuation."""
from __future__ import unicode_literals
import pytest
def test_no_special(en_tokenizer):
assert len(en_tokenizer("(can)")) == 3
def test_no_punct(en_tokenizer):
assert len(en_tokenizer("can't")) == 2
def test_prefix(en_tokenizer):
assert len(en_tokenizer("(can't")) == 3
def test_suffix(en_tokenizer):
assert len(en_tokenizer("can't)")) == 3
def test_wrap(en_tokenizer):
assert len(en_tokenizer("(can't)")) == 4
def test_uneven_wrap(en_tokenizer):
assert len(en_tokenizer("(can't?)")) == 5
def test_prefix_interact(en_tokenizer):
assert len(en_tokenizer("U.S.")) == 1
assert len(en_tokenizer("us.")) == 2
assert len(en_tokenizer("(U.S.")) == 2
def test_suffix_interact(en_tokenizer):
assert len(en_tokenizer("U.S.)")) == 2
def test_even_wrap_interact(en_tokenizer):
assert len(en_tokenizer("(U.S.)")) == 3
def test_uneven_wrap_interact(en_tokenizer):
assert len(en_tokenizer("(U.S.?)")) == 4

View File

@ -1,9 +0,0 @@
"""Test suspected freeing of strings"""
from __future__ import unicode_literals
def test_one(en_tokenizer):
tokens = en_tokenizer('Betty Botter bought a pound of butter.')
assert tokens[0].orth_ == 'Betty'
tokens2 = en_tokenizer('Betty also bought a pound of butter.')
assert tokens2[0].orth_ == 'Betty'

View File

@ -1,32 +0,0 @@
from __future__ import unicode_literals
import pytest
@pytest.fixture
def paired_puncts():
return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
def test_token(paired_puncts, en_tokenizer):
word_str = 'Hello'
for open_, close_ in paired_puncts:
string = open_ + word_str + close_
tokens = en_tokenizer(string)
assert len(tokens) == 3
assert tokens[0].orth_ == open_
assert tokens[1].orth_ == word_str
assert tokens[2].orth_ == close_
def test_two_different(paired_puncts, en_tokenizer):
word_str = 'Hello'
for open_, close_ in paired_puncts:
string = "`" + open_ + word_str + close_ + "'"
tokens = en_tokenizer(string)
assert len(tokens) == 5
assert tokens[0].orth_ == "`"
assert tokens[1].orth_ == open_
assert tokens[2].orth_ == word_str
assert tokens[2].orth_ == word_str
assert tokens[3].orth_ == close_
assert tokens[4].orth_ == "'"

View File

@ -1,172 +1,83 @@
# coding: utf-8
from __future__ import unicode_literals

from os import path

import pytest

from ...util import utf8open


def test_tokenizer_handles_no_word(tokenizer):
    tokens = tokenizer("")
    assert len(tokens) == 0


@pytest.mark.parametrize('text', ["lorem"])
def test_tokenizer_handles_single_word(tokenizer, text):
    tokens = tokenizer(text)
    assert tokens[0].text == text


def test_tokenizer_handles_punct(tokenizer):
    text = "Lorem, ipsum."
    tokens = tokenizer(text)
    assert len(tokens) == 4
    assert tokens[0].text == "Lorem"
    assert tokens[1].text == ","
    assert tokens[2].text == "ipsum"
    assert tokens[1].text != "Lorem"


def test_tokenizer_handles_digits(tokenizer):
    exceptions = ["hu"]
    text = "Lorem ipsum: 1984."
    tokens = tokenizer(text)

    if tokens[0].lang_ not in exceptions:
        assert len(tokens) == 5
        assert tokens[0].text == "Lorem"
        assert tokens[3].text == "1984"


@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
def test_tokenizer_keep_urls(tokenizer, text):
    tokens = tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
def test_tokenizer_keeps_email(tokenizer, text):
    tokens = tokenizer(text)
    assert len(tokens) == 1


def test_tokenizer_handles_long_text(tokenizer):
    text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
Cras egestas orci non porttitor maximus.
Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.
Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.
"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""
    tokens = tokenizer(text)
    assert len(tokens) > 5


@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    loc = path.join(path.dirname(__file__), '..', file_name)
    text = utf8open(loc).read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100


def test_tokenizer_suspected_freeing_strings(tokenizer):
    text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
    text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
    tokens1 = tokenizer(text1)
    tokens2 = tokenizer(text2)
    assert tokens1[0].text == "Lorem"
    assert tokens2[0].text == "Lorem"

View File

@ -1,67 +1,51 @@
# coding: utf-8
"""Test that tokens are created correctly for whitespace.""" """Test that tokens are created correctly for whitespace."""
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
def test_single_space(en_tokenizer): @pytest.mark.parametrize('text', ["lorem ipsum"])
tokens = en_tokenizer('hello possums') def test_tokenizer_splits_single_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 2 assert len(tokens) == 2
def test_double_space(en_tokenizer): @pytest.mark.parametrize('text', ["lorem ipsum"])
tokens = en_tokenizer('hello possums') def test_tokenizer_splits_double_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
assert tokens[1].orth_ == ' ' assert tokens[1].text == " "
def test_newline(en_tokenizer): @pytest.mark.parametrize('text', ["lorem ipsum "])
tokens = en_tokenizer('hello\npossums') def test_tokenizer_handles_double_trainling_ws(tokenizer, text):
tokens = tokenizer(text)
assert repr(tokens.text_with_ws) == repr(text)
@pytest.mark.parametrize('text', ["lorem\nipsum"])
def test_tokenizer_splits_newline(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "\n"
@pytest.mark.parametrize('text', ["lorem \nipsum"])
def test_tokenizer_splits_newline_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
def test_newline_space(en_tokenizer): @pytest.mark.parametrize('text', ["lorem \nipsum"])
tokens = en_tokenizer('hello \npossums') def test_tokenizer_splits_newline_double_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
def test_newline_double_space(en_tokenizer): @pytest.mark.parametrize('text', ["lorem \n ipsum"])
tokens = en_tokenizer('hello \npossums') def test_tokenizer_splits_newline_space_wrap(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
def test_newline_space_wrap(en_tokenizer):
tokens = en_tokenizer('hello \n possums')
assert len(tokens) == 3
def test_leading_space_offsets(en_tokenizer):
'''Issue #351
# this works
text1 = u"This is a cat."
a = english_spacy(text1)
tok0 = list(a.sents)[0][0]
print tok0, tok0.idx, text1[tok0.idx]
tok1 = list(a.sents)[0][1]
print tok1, tok1.idx, text1[tok1.idx]
print "=="
# this does not work
text2 = u" This is a cat."
b = english_spacy(text2)
tok0 = list(b.sents)[0][0]
print tok0, tok0.idx, text2[tok0.idx]
tok1 = list(b.sents)[0][1]
print tok1, tok1.idx, text2[tok1.idx]
'''
doc = en_tokenizer(u" This is a cat.")
assert doc[0].idx == 0
assert len(doc[0]) == 3
assert doc[1].idx == 3

View File

@ -1,21 +0,0 @@
from __future__ import unicode_literals
from spacy.util import utf8open
import pytest
from os import path
HERE = path.dirname(__file__)
@pytest.fixture
def sun_txt():
loc = path.join(HERE, '..', 'sun.txt')
return utf8open(loc).read()
def test_tokenize(sun_txt, en_tokenizer):
assert len(sun_txt) != 0
tokens = en_tokenizer(sun_txt)
assert len(tokens) > 100

View File

@ -1,20 +0,0 @@
from __future__ import unicode_literals
import pytest
import os
@pytest.fixture(scope='session')
def nlp():
from spacy.en import English
if os.environ.get('SPACY_DATA'):
data_dir = os.environ.get('SPACY_DATA')
else:
data_dir = True
return English(path=data_dir)
@pytest.fixture()
def doc(nlp):
for word in ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']:
_ = nlp.vocab[word]
return nlp('Hello, world. Here are two sentences.')

View File

@ -1,172 +0,0 @@
from __future__ import unicode_literals
import pytest
from spacy.attrs import HEAD
import numpy
@pytest.mark.xfail
def test_example_war_and_peace(nlp):
# from spacy.en import English
from spacy._doc_examples import download_war_and_peace
unprocessed_unicode = download_war_and_peace()
# nlp = English()
# TODO: ImportError: No module named _doc_examples
doc = nlp(unprocessed_unicode)
def test_main_entry_point(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp('Some text.') # Applies tagger, parser, entity
doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser
doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity
doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser
doc = nlp('') # Zero-length tokens, not an error
# doc = nlp(b'Some text') <-- Error: need unicode
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
@pytest.mark.models
def test_sentence_spans(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp("This is a sentence. Here's another...")
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
@pytest.mark.models
def test_entity_spans(nlp):
# from spacy.en import English
# nlp = English()
tokens = nlp('Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 346
assert ents[0].label_ == 'PERSON'
assert ents[0].orth_ == 'Best'
assert ents[0].string == ents[0].string
@pytest.mark.models
def test_noun_chunk_spans(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp('The sentence in this example has three noun chunks.')
for chunk in doc.noun_chunks:
print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
# NP The sentence <-- has
# NP this example <-- in
# NP three noun chunks <-- has
@pytest.mark.models
def test_count_by(nlp):
# from spacy.en import English, attrs
# nlp = English()
import numpy
from spacy import attrs
tokens = nlp('apple apple orange banana')
assert tokens.count_by(attrs.ORTH) == {3699: 2, 3750: 1, 5965: 1}
assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[3699],
[3699],
[3750],
[5965]], dtype=numpy.int32))
@pytest.mark.models
def test_read_bytes(nlp):
from spacy.tokens.doc import Doc
loc = 'test_serialize.bin'
with open(loc, 'wb') as file_:
file_.write(nlp(u'This is a document.').to_bytes())
file_.write(nlp(u'This is another.').to_bytes())
docs = []
with open(loc, 'rb') as file_:
for byte_string in Doc.read_bytes(file_):
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
assert len(docs) == 2
def test_token_span(doc):
span = doc[4:6]
token = span[0]
assert token.i == 4
@pytest.mark.models
def test_example_i_like_new_york1(nlp):
toks = nlp('I like New York in Autumn.')
@pytest.fixture
def toks(nlp):
doc = nlp('I like New York in Autumn.')
doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
return doc
def test_example_i_like_new_york2(toks):
i, like, new, york, in_, autumn, dot = range(len(toks))
@pytest.fixture
def tok(toks, tok):
i, like, new, york, in_, autumn, dot = range(len(toks))
return locals()[tok]
@pytest.fixture
def new(toks):
return tok(toks, "new")
@pytest.fixture
def york(toks):
return tok(toks, "york")
@pytest.fixture
def autumn(toks):
return tok(toks, "autumn")
@pytest.fixture
def dot(toks):
return tok(toks, "dot")
def test_example_i_like_new_york3(toks, new, york):
assert toks[new].head.orth_ == 'York'
assert toks[york].head.orth_ == 'like'
def test_example_i_like_new_york4(toks, new, york):
new_york = toks[new:york+1]
assert new_york.root.orth_ == 'York'
def test_example_i_like_new_york5(toks, autumn, dot):
assert toks[autumn].head.orth_ == 'in'
assert toks[dot].head.orth_ == 'like'
autumn_dot = toks[autumn:]
assert autumn_dot.root.orth_ == 'Autumn'
def test_navigating_the_parse_tree_lefts(doc):
# TODO: where does the span object come from?
span = doc[:2]
lefts = [span.doc[i] for i in range(0, span.start)
if span.doc[i].head in span]
def test_navigating_the_parse_tree_rights(doc):
span = doc[:2]
rights = [span.doc[i] for i in range(span.end, len(span.doc))
if span.doc[i].head in span]
def test_string_store(doc):
string_store = doc.vocab.strings
for i, string in enumerate(string_store):
assert i == string_store[string]

View File

@ -1,180 +0,0 @@
from __future__ import unicode_literals
import pytest
import spacy
import os
try:
xrange
except NameError:
xrange = range
@pytest.fixture()
def token(doc):
return doc[0]
@pytest.mark.models
def test_load_resources_and_process_text():
from spacy.en import English
nlp = English()
doc = nlp(u'Hello, world. Here are two sentences.')
@pytest.mark.models
def test_get_tokens_and_sentences(doc):
token = doc[0]
sentence = next(doc.sents)
assert token is sentence[0]
assert sentence.text == 'Hello, world.'
@pytest.mark.models
def test_use_integer_ids_for_any_strings(nlp, token):
hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id]
assert token.orth == hello_id == 3125
assert token.orth_ == hello_str == 'Hello'
def test_get_and_set_string_views_and_flags(nlp, token):
assert token.shape_ == 'Xxxxx'
for lexeme in nlp.vocab:
if lexeme.is_alpha:
lexeme.shape_ = 'W'
elif lexeme.is_digit:
lexeme.shape_ = 'D'
elif lexeme.is_punct:
lexeme.shape_ = 'P'
else:
lexeme.shape_ = 'M'
assert token.shape_ == 'W'
def test_export_to_numpy_arrays(nlp, doc):
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
attr_ids = [ORTH, LIKE_URL, IS_OOV]
doc_array = doc.to_array(attr_ids)
assert doc_array.shape == (len(doc), len(attr_ids))
assert doc[0].orth == doc_array[0, 0]
assert doc[1].orth == doc_array[1, 0]
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
@pytest.mark.models
def test_word_vectors(nlp):
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
apples = doc[0]
oranges = doc[2]
boots = doc[6]
hippos = doc[8]
assert apples.similarity(oranges) > boots.similarity(hippos)
@pytest.mark.models
def test_part_of_speech_tags(nlp):
from spacy.parts_of_speech import ADV
def is_adverb(token):
return token.pos == spacy.parts_of_speech.ADV
# These are data-specific, so no constants are provided. You have to look
# up the IDs from the StringStore.
NNS = nlp.vocab.strings['NNS']
NNPS = nlp.vocab.strings['NNPS']
def is_plural_noun(token):
return token.tag == NNS or token.tag == NNPS
def print_coarse_pos(token):
print(token.pos_)
def print_fine_pos(token):
print(token.tag_)
@pytest.mark.models
def test_syntactic_dependencies():
def dependency_labels_to_root(token):
'''Walk up the syntactic tree, collecting the arc labels.'''
dep_labels = []
while token.head is not token:
dep_labels.append(token.dep)
token = token.head
return dep_labels
@pytest.mark.models
def test_named_entities():
def iter_products(docs):
for doc in docs:
for ent in doc.ents:
if ent.label_ == 'PRODUCT':
yield ent
def word_is_in_entity(word):
return word.ent_type != 0
def count_parent_verb_by_person(docs):
counts = defaultdict(defaultdict(int))
for doc in docs:
for ent in doc.ents:
if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
counts[ent.orth_][ent.root.head.lemma_] += 1
return counts
def test_calculate_inline_mark_up_on_original_string():
def put_spans_around_tokens(doc, get_classes):
'''Given some function to compute class names, put each token in a
span element, with the appropriate classes computed.
All whitespace is preserved, outside of the spans. (Yes, I know HTML
won't display it. But the point is no information is lost, so you can
calculate what you need, e.g. <br /> tags, <p> tags, etc.)
'''
output = []
template = '<span classes="{classes}">{word}</span>{space}'
for token in doc:
if token.is_space:
output.append(token.orth_)
else:
output.append(
template.format(
classes=' '.join(get_classes(token)),
word=token.orth_,
space=token.whitespace_))
string = ''.join(output)
string = string.replace('\n', '')
string = string.replace('\t', ' ')
return string
@pytest.mark.models
def test_efficient_binary_serialization(doc):
from spacy.tokens.doc import Doc
byte_string = doc.to_bytes()
open('moby_dick.bin', 'wb').write(byte_string)
nlp = spacy.en.English()
for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
doc = Doc(nlp.vocab)
doc.from_bytes(byte_string)
@pytest.mark.models
def test_multithreading(nlp):
texts = [u'One document.', u'...', u'Lots of documents']
# .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in xrange(100000000))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
assert doc.is_parsed
if i == 100:
break

View File

@ -94,8 +94,13 @@ def read_regex(path):
def compile_prefix_regex(entries):
    if '(' in entries:
        # Handle deprecated data
        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
        return re.compile(expression)
    else:
        expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
        return re.compile(expression)


def compile_suffix_regex(entries):
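
A minimal standalone sketch of the hunk above (illustration only; the sample entry lists are made up, not spaCy data): deprecated-format lists contain raw punctuation such as '(' and are therefore escaped, while current-format lists already hold regex pieces and are joined unescaped.

# Standalone restatement of compile_prefix_regex above, with hypothetical inputs.
import re

def compile_prefix_regex(entries):
    if '(' in entries:
        # Deprecated data: plain strings, escape them before joining into one pattern
        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    else:
        # Current data: entries are already regex pieces, join them as-is
        expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
    return re.compile(expression)

# Deprecated-style list (contains a literal '('): special characters get escaped.
assert compile_prefix_regex(['(', '[', '"']).search('"Hello').group() == '"'

# Current-style list (regex pieces such as '\\('): passed through unescaped.
assert compile_prefix_regex(['\\(', '\\[', '"']).search('(Hello').group() == '('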

View File

@ -22,7 +22,8 @@
"twitter": "spacy_io", "twitter": "spacy_io",
"github": "explosion", "github": "explosion",
"reddit": "spacynlp", "reddit": "spacynlp",
"codepen": "explosion" "codepen": "explosion",
"gitter": "explosion/spaCy"
}, },
"NAVIGATION": { "NAVIGATION": {
@ -53,7 +54,7 @@
        }
    },

    "V_CSS": "1.14",
    "V_JS": "1.0",
    "DEFAULT_SYNTAX" : "python",
    "ANALYTICS": "UA-58931649-1",

View File

@ -1,6 +1,7 @@
//- 💫 MIXINS > BASE

//- Aside wrapper
    label - [string] aside label

mixin aside-wrapper(label)
    aside.c-aside

@ -21,6 +22,10 @@ mixin date(input)

//- SVG from map
    file - [string] SVG file name in /assets/img/
    name - [string] SVG symbol id
    width - [integer] width in px
    height - [integer] height in px (default: same as width)

mixin svg(file, name, width, height)
    svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)

@ -28,19 +33,23 @@ mixin svg(file, name, width, height)

//- Icon
    name - [string] icon name, should be SVG symbol ID
    size - [integer] icon width and height (default: 20)

mixin icon(name, size)
    +svg("icons", name, size || 20).o-icon&attributes(attributes)


//- Pro/Con/Neutral icon
    icon - [string] "pro", "con" or "neutral" (default: "neutral")

mixin procon(icon)
    - colors = { pro: "green", con: "red", neutral: "yellow" }
    +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)


//- Headlines Helper Mixin
    level - [integer] 1, 2, 3, 4, or 5

mixin headline(level)
    if level == 1

@ -65,6 +74,7 @@ mixin headline(level)

//- Permalink rendering
    id - [string] permalink ID used for link anchor

mixin permalink(id)
    if id

@ -77,6 +87,7 @@ mixin permalink(id)

//- Terminal-style code window
    label - [string] title displayed in top bar of terminal window

mixin terminal(label)
    .x-terminal

@ -87,6 +98,18 @@ mixin terminal(label)
        block


//- Gitter chat button and widget
    button - [string] text shown on button
    label - [string] title of chat window (default: same as button)

mixin gitter(button, label)
    aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))

    button.js-gitter-button.c-chat__button.u-text-small
        +icon("chat").o-icon--inline
        !=button


//- Logo

mixin logo()

View File

@ -44,7 +44,7 @@ mixin api(path)
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
block block
| #[+icon("book", 18).o-icon--inline.u-help.u-color-subtle] | #[+icon("book", 18).o-icon--inline.u-color-subtle]
//- Aside for text //- Aside for text

View File

@ -24,4 +24,6 @@ main.o-main.o-main--sidebar.o-main--aside
.o-inline-list
    +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)]

+gitter("spaCy chat")

include _footer

View File

@ -0,0 +1,23 @@
//- 💫 INCLUDES > SCRIPTS
script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
script(src="/assets/js/prism.js", type="text/javascript")
if SECTION == "docs"
script.
((window.gitter = {}).chat = {}).options = {
useStyles: false,
activationElement: '.js-gitter-button',
targetElement: '.js-gitter',
room: '!{SOCIAL.gitter}'
};
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
if environment == "deploy"
script
| window.ga=window.ga||function(){
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
script(async src="https://www.google-analytics.com/analytics.js")

View File

@ -52,13 +52,4 @@ html(lang="en")
main!=yield

include _includes/_footer

include _includes/_scripts

View File

@ -6,36 +6,36 @@
font-family: "Source Sans Pro" font-family: "Source Sans Pro"
font-style: normal font-style: normal
font-weight: 400 font-weight: 400
src: url("../fonts/sourcesanspro-regular.eot") src: url("/assets/fonts/sourcesanspro-regular.eot")
src: url("../fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-regular.woff2") format("woff2"), url("../fonts/sourcesanspro-regular.woff") format("woff"), url("../fonts/sourcesanspro-regular.ttf") format("truetype"), url("../fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg")
@font-face @font-face
font-family: "Source Sans Pro" font-family: "Source Sans Pro"
font-style: italic font-style: italic
font-weight: 400 font-weight: 400
src: url("../fonts/sourcesanspro-italic.eot") src: url("/assets/fonts/sourcesanspro-italic.eot")
src: url("../fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-italic.woff2") format("woff2"), url("../fonts/sourcesanspro-italic.woff") format("woff"), url("../fonts/sourcesanspro-italic.ttf") format("truetype"), url("../fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg")
@font-face @font-face
font-family: "Source Sans Pro" font-family: "Source Sans Pro"
font-style: normal font-style: normal
font-weight: 700 font-weight: 700
src: url("../fonts/sourcesanspro-bold.eot") src: url("/assets/fonts/sourcesanspro-bold.eot")
src: url("../fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bold.woff2") format("woff2"), url("../fonts/sourcesanspro-bold.woff") format("woff"), url("../fonts/sourcesanspro-bold.ttf") format("truetype"), url("../fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg")
@font-face @font-face
font-family: "Source Sans Pro" font-family: "Source Sans Pro"
font-style: italic font-style: italic
font-weight: 700 font-weight: 700
src: url("../fonts/sourcesanspro-bolditalic.eot") src: url("/assets/fonts/sourcesanspro-bolditalic.eot")
src: url("../fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("../fonts/sourcesanspro-bolditalic.woff") format("woff"), url("../fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("../fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg")
// Source Code Pro // Source Code Pro
@font-face @font-face
font-family: "Source Code Pro" font-family: "Source Code Pro"
font-style: normal font-style: normal
font-weight: 600 font-weight: 600
src: url("../fonts/sourcecodepro-semibold.eot") src: url("/assets/fonts/sourcecodepro-semibold.eot")
src: url("../fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcecodepro-semibold.woff") format("woff"), url("../fonts/sourcecodepro-semibold.ttf") format("truetype"), url("../fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg")

View File

@ -60,7 +60,7 @@
    background: $color-back
    border-radius: 2px
    border: 1px solid $color-subtle
    padding: 3rem 2.5%


//- Icons

View File

@ -141,12 +141,6 @@
    background: $pattern


//- Hidden elements

.u-hidden

View File

@ -0,0 +1,100 @@
//- 💫 CSS > COMPONENTS > CHAT
.c-chat
@include position(fixed, top, left, 0, 60%)
bottom: 0
right: 0
display: flex
flex-flow: column nowrap
background: $color-back
transition: transform 0.3s cubic-bezier(0.16, 0.22, 0.22, 1.7)
box-shadow: -0.25rem 0 1rem 0 rgba($color-front, 0.25)
z-index: 100
@include breakpoint(min, md)
left: calc(100% - #{$aside-width} - #{$aside-padding})
@include breakpoint(max, sm)
left: 50%
@include breakpoint(max, xs)
left: 0
&.is-collapsed:not(.is-loading)
transform: translateX(110%)
&:before
@include position(absolute, top, left, 1rem, 2rem)
content: attr(data-title)
font: bold 1.4rem $font-code
text-transform: uppercase
color: $color-back
&:after
@include position(absolute, top, left, 0, 100%)
content: ""
z-index: -1
bottom: 0
right: -100%
background: $color-back
& > iframe
width: 100%
flex: 1 1 calc(100% - #{$nav-height})
border: 0
.gitter-chat-embed-loading-wrapper
@include position(absolute, top, left, 0, 0)
right: 0
bottom: 0
display: none
justify-content: center
align-items: center
.is-loading &
display: flex
.gitter-chat-embed-action-bar,
.gitter-chat-embed-action-bar-item
display: flex
.gitter-chat-embed-action-bar
align-items: center
justify-content: flex-end
background: $color-theme
padding: 0 1rem 0 2rem
flex: 0 0 $nav-height
.gitter-chat-embed-action-bar-item
@include size(40px)
padding: 0
opacity: 0.75
background-position: 50%
background-repeat: no-repeat
background-size: 22px 22px
border: 0
cursor: pointer
transition: all 0.2s ease
&:focus,
&:hover
opacity: 1
&.gitter-chat-embed-action-bar-item-pop-out
background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyMCIgaGVpZ2h0PSIyMCIgdmlld0JveD0iMCAwIDIwIDIwIj48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTYgMmgtOC4wMjFjLTEuMDk5IDAtMS45NzkgMC44OC0xLjk3OSAxLjk4djguMDIwYzAgMS4xIDAuOSAyIDIgMmg4YzEuMSAwIDItMC45IDItMnYtOGMwLTEuMS0wLjktMi0yLTJ6TTE2IDEyaC04di04aDh2OHpNNCAxMGgtMnY2YzAgMS4xIDAuOSAyIDIgMmg2di0yaC02di02eiI+PC9wYXRoPjwvc3ZnPg==)
margin-right: -4px
&.gitter-chat-embed-action-bar-item-collapse-chat
background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0Ij48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTguOTg0IDYuNDIybC01LjU3OCA1LjU3OCA1LjU3OCA1LjU3OC0xLjQwNiAxLjQwNi01LjU3OC01LjU3OC01LjU3OCA1LjU3OC0xLjQwNi0xLjQwNiA1LjU3OC01LjU3OC01LjU3OC01LjU3OCAxLjQwNi0xLjQwNiA1LjU3OCA1LjU3OCA1LjU3OC01LjU3OHoiPjwvcGF0aD48L3N2Zz4=)
.c-chat__button
@include position(fixed, bottom, right, 0, 2rem)
padding: 1rem 1.5rem
background: $color-front
color: $color-back
border-top-left-radius: 4px
border-top-right-radius: 4px
z-index: 20
border-color: $color-theme
border-style: solid
border-width: 1px 1px 0 1px

View File

@ -24,6 +24,7 @@ $theme: blue !default
@import _components/asides
@import _components/buttons
@import _components/chat
@import _components/code
@import _components/landing
@import _components/lists

View File

@ -64,5 +64,6 @@
<symbol id="matt-signature" viewBox="0 0 500 250"> <symbol id="matt-signature" viewBox="0 0 500 250">
<title>matt-signature</title> <title>matt-signature</title>
<path fill="currentColor" d="M18.6 207c-.3-18.8-.8-37.5-1.4-56.2-.6-18.7-1-37.5-1-56.2v-7.2c0-3.5 0-7 .2-11v-18c.8-2.7 1.8-5 3-6.5 1.6-2 3.6-3 6.4-3 3 0 5.4 1 7.6 2 2.2 2 4 4 5.3 6l36.6 71 1.8 3c1 1 2 3 3 3h1l1 1 1-3 22-76c2-3 3-5 4-8l2-9c1-3 2-6 4-8 1-3 4-5 7-7h2c5 0 8 1 10 4 3 2 4 5 5 9 1 3 2 7 1 12v11l1 7c0 3 0 7 1 12 0 4 1 9 1 14l1 14.2 1 12 .6 6v1l1 7.5 1 11.6 1.4 12 1.4 8 1 4 1.7 5.5 1.7 6c.7 1.7 1 3 1.5 3.6-.5 4-1.5 7-3 9-1 2-4 3-8 3h-6l-3-3c-1-1.4-2-2.3-2-3l-4-14-7.6-58V88c0-3.5-1-7-2-10l-2 1.7-18 74v6c0 2-.2 4-1 6 0 2-1 3.5-3 5-1 1.3-3 2-5 2.2-1 0-2 0-3-1l-3.4-2-3-3c-1-1-1.7-2-2-3l-35-52-5.3-10.6v22c0 10.2.2 20.3.6 30.2.4 10 .6 20 .6 30.2v22c0 2-1 4-3 5.4s-3 3-5 3c-3 0-5 0-7-1-1-1-3-3-4-5zm205-63.2c-1.6 2.7-3.4 6-5.3 9.8l-6.2 12.2c-2 4.3-4 8.6-7 13-2 4.2-5 8.2-8 11.7s-5 6.6-9 9c-3 2.5-6 4-9 4.4-1 0-3-1-4-1l-5-2c-1-1-3-2-4-3s-1-3-1-5c1-18 2-33 4-47s6-27 11-38 12-20 20-27 18-12 29-15l2-1h2c5 0 9 2 11 7s4 12 5 23c1 10 2 24 2 40 1 16 2 36 3 59l1 4v5c0 2.6-1 4.5-2 6s-3 2-5 2c-5 0-8-1.7-10-4s-3-6.6-4-11v-4l-1-9s-1-6.7-1-10l-1-8.5v-1l-.2-6-1-7-.5-8.6-1-1zM218 93.5c-4.7 3.4-9.2 8-13.6 13.7-4.4 5.8-7.5 11.3-9.4 16.8-.8 2.5-1.8 6-2.8 10.4-1 4.4-2 8.8-2.7 13l-2 12-.7 7c.2 0 .4-.2.6-.5l.6-1c10.5-10 18-21 22.2-33 4.6-12 7-25 7.7-39zm72 47c-2.3 0-4.4.6-6.2 1.8-2 1.2-4 1.8-6.6 1.8h-5.4c-.7-1-1.4-1-2.3-2l-2.5-2c-.8 0-1.6-1-2.2-2-.6-1-1-2-1-3 0-2 1-4 3-6 2-1 4.5-3 7.2-4l8.3-3s5-2 6.7-3v-11c0-12-.6-25-1.8-38-1.2-12-1.8-25-1.8-37 0-3 .8-6 2.5-7 1-1 4-1 6-1 3 0 6 1 7 3s2 4 3 7c0 3 1 6 1 9v20l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2.3 9-3.4 14-3.4 3 0 6 1 7 3.5s3 5 3 8c0 2-1 4-3 5l-6 3-46 17-1.5 1s-1 0-1.5 1v8c0 6 0 12 .5 18s1 12.3 2 18.3l3 15c1 5 1.4 10 1.4 15 0 1.4-.6 3.5-1.6 6s-2 4-4.7 4c-5 0-8.7-1.6-11.6-4-3-3-4.3-6.6-4.6-11l-2.2-29-2.7-30h-1zm112 0c-2.4 0-4.5.6-6.3 1.8-2 1.2-4 1.8-6.6 1.8h-5c0-1-1-1-2-2l-2-2c-1 0-1-1-2-2 0-1-1-2-1-3 0-2 1-4 3-6 2-1 5-3 7-4l8-3s5-2 7-3v-11c0-12 0-25-2-38-1-12-1-25-1-37 0-3 1-6 3-7s4-1 7-1c4 0 6 1 8 3s3 4 3 7c1 3 1 6 1 9s0 6 1 8v11l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2 9-4 14-4 4 0 6 1 8 4s3 5 3 8c0 2-1 4-2 5l-5.3 3-49 13.8-1.5 1s-1 .5-1.5 1V157l1 18.3c0 5 1 10 2 15s1 10 1 15c0 1.5-1 3.6-2 6s-3 4-5 4c-5 0-9-1.5-12-4.2s-5-6-5-11l-3-28.3-3-30.3h-1z"/> <path fill="currentColor" d="M18.6 207c-.3-18.8-.8-37.5-1.4-56.2-.6-18.7-1-37.5-1-56.2v-7.2c0-3.5 0-7 .2-11v-18c.8-2.7 1.8-5 3-6.5 1.6-2 3.6-3 6.4-3 3 0 5.4 1 7.6 2 2.2 2 4 4 5.3 6l36.6 71 1.8 3c1 1 2 3 3 3h1l1 1 1-3 22-76c2-3 3-5 4-8l2-9c1-3 2-6 4-8 1-3 4-5 7-7h2c5 0 8 1 10 4 3 2 4 5 5 9 1 3 2 7 1 12v11l1 7c0 3 0 7 1 12 0 4 1 9 1 14l1 14.2 1 12 .6 6v1l1 7.5 1 11.6 1.4 12 1.4 8 1 4 1.7 5.5 1.7 6c.7 1.7 1 3 1.5 3.6-.5 4-1.5 7-3 9-1 2-4 3-8 3h-6l-3-3c-1-1.4-2-2.3-2-3l-4-14-7.6-58V88c0-3.5-1-7-2-10l-2 1.7-18 74v6c0 2-.2 4-1 6 0 2-1 3.5-3 5-1 1.3-3 2-5 2.2-1 0-2 0-3-1l-3.4-2-3-3c-1-1-1.7-2-2-3l-35-52-5.3-10.6v22c0 10.2.2 20.3.6 30.2.4 10 .6 20 .6 30.2v22c0 2-1 4-3 5.4s-3 3-5 3c-3 0-5 0-7-1-1-1-3-3-4-5zm205-63.2c-1.6 2.7-3.4 6-5.3 9.8l-6.2 12.2c-2 4.3-4 8.6-7 13-2 4.2-5 8.2-8 11.7s-5 6.6-9 9c-3 2.5-6 4-9 4.4-1 0-3-1-4-1l-5-2c-1-1-3-2-4-3s-1-3-1-5c1-18 2-33 4-47s6-27 11-38 12-20 20-27 18-12 29-15l2-1h2c5 0 9 2 11 7s4 12 5 23c1 10 2 24 2 40 1 16 2 36 3 59l1 4v5c0 2.6-1 4.5-2 6s-3 2-5 2c-5 0-8-1.7-10-4s-3-6.6-4-11v-4l-1-9s-1-6.7-1-10l-1-8.5v-1l-.2-6-1-7-.5-8.6-1-1zM218 93.5c-4.7 3.4-9.2 8-13.6 13.7-4.4 5.8-7.5 11.3-9.4 16.8-.8 2.5-1.8 6-2.8 10.4-1 4.4-2 8.8-2.7 13l-2 12-.7 7c.2 0 .4-.2.6-.5l.6-1c10.5-10 18-21 22.2-33 4.6-12 7-25 7.7-39zm72 47c-2.3 0-4.4.6-6.2 1.8-2 1.2-4 1.8-6.6 1.8h-5.4c-.7-1-1.4-1-2.3-2l-2.5-2c-.8 
0-1.6-1-2.2-2-.6-1-1-2-1-3 0-2 1-4 3-6 2-1 4.5-3 7.2-4l8.3-3s5-2 6.7-3v-11c0-12-.6-25-1.8-38-1.2-12-1.8-25-1.8-37 0-3 .8-6 2.5-7 1-1 4-1 6-1 3 0 6 1 7 3s2 4 3 7c0 3 1 6 1 9v20l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2.3 9-3.4 14-3.4 3 0 6 1 7 3.5s3 5 3 8c0 2-1 4-3 5l-6 3-46 17-1.5 1s-1 0-1.5 1v8c0 6 0 12 .5 18s1 12.3 2 18.3l3 15c1 5 1.4 10 1.4 15 0 1.4-.6 3.5-1.6 6s-2 4-4.7 4c-5 0-8.7-1.6-11.6-4-3-3-4.3-6.6-4.6-11l-2.2-29-2.7-30h-1zm112 0c-2.4 0-4.5.6-6.3 1.8-2 1.2-4 1.8-6.6 1.8h-5c0-1-1-1-2-2l-2-2c-1 0-1-1-2-2 0-1-1-2-1-3 0-2 1-4 3-6 2-1 5-3 7-4l8-3s5-2 7-3v-11c0-12 0-25-2-38-1-12-1-25-1-37 0-3 1-6 3-7s4-1 7-1c4 0 6 1 8 3s3 4 3 7c1 3 1 6 1 9s0 6 1 8v11l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2 9-4 14-4 4 0 6 1 8 4s3 5 3 8c0 2-1 4-2 5l-5.3 3-49 13.8-1.5 1s-1 .5-1.5 1V157l1 18.3c0 5 1 10 2 15s1 10 1 15c0 1.5-1 3.6-2 6s-3 4-5 4c-5 0-9-1.5-12-4.2s-5-6-5-11l-3-28.3-3-30.3h-1z"/>
</symbol>
</defs> </defs>
</svg> </svg>

View File

@ -1,32 +1,28 @@
<svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> <svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<defs> <defs>
<symbol id="icon-github" viewBox="0 0 27 32"> <symbol id="github" viewBox="0 0 27 32">
<title>github</title> <path d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z"></path>
<path class="path1" d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z"></path>
</symbol> </symbol>
<symbol id="icon-code" viewBox="0 0 20 20"> <symbol id="code" viewBox="0 0 20 20">
<title>code</title> <path d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z"></path>
<path class="path1" d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z"></path>
</symbol> </symbol>
<symbol id="icon-anchor" viewBox="0 0 16 16"> <symbol id="anchor" viewBox="0 0 16 16">
<title>anchor</title> <path d="M14.779 12.779c-1.471 1.993-4.031 3.245-6.779 3.221-2.748 0.023-5.309-1.229-6.779-3.221l-1.221 1.221v-4h4l-1.1 1.099c0.882 1.46 2.357 2.509 4.1 2.807v-6.047c-1.723-0.446-3-1.997-3-3.858 0-2.209 1.791-4 4-4s4 1.791 4 4c0 1.862-1.277 3.413-3 3.858v6.047c1.742-0.297 3.218-1.347 4.099-2.807l-1.1-1.099h4v4l-1.221-1.221zM10 4c0-1.104-0.895-2-2-2s-2 0.895-2 2c0 1.104 0.895 2 2 2s2-0.896 2-2z"></path>
<path class="path1" d="M14.779 12.779c-1.471 1.993-4.031 3.245-6.779 3.221-2.748 0.023-5.309-1.229-6.779-3.221l-1.221 1.221v-4h4l-1.1 1.099c0.882 1.46 2.357 2.509 4.1 2.807v-6.047c-1.723-0.446-3-1.997-3-3.858 0-2.209 1.791-4 4-4s4 1.791 4 4c0 1.862-1.277 3.413-3 3.858v6.047c1.742-0.297 3.218-1.347 4.099-2.807l-1.1-1.099h4v4l-1.221-1.221zM10 4c0-1.104-0.895-2-2-2s-2 0.895-2 2c0 1.104 0.895 2 2 2s2-0.896 2-2z"></path>
</symbol> </symbol>
<symbol id="icon-book" viewBox="0 0 24 24"> <symbol id="book" viewBox="0 0 24 24">
<title>book</title> <path d="M18.984 6.984v-1.969h-9.984v1.969h9.984zM15 15v-2.016h-6v2.016h6zM18.984 11.016v-2.016h-9.984v2.016h9.984zM20.016 2.016c1.078 0 1.969 0.891 1.969 1.969v12c0 1.078-0.891 2.016-1.969 2.016h-12c-1.078 0-2.016-0.938-2.016-2.016v-12c0-1.078 0.938-1.969 2.016-1.969h12zM3.984 6v14.016h14.016v1.969h-14.016c-1.078 0-1.969-0.891-1.969-1.969v-14.016h1.969z"></path>
<path class="path1" d="M18.984 6.984v-1.969h-9.984v1.969h9.984zM15 15v-2.016h-6v2.016h6zM18.984 11.016v-2.016h-9.984v2.016h9.984zM20.016 2.016c1.078 0 1.969 0.891 1.969 1.969v12c0 1.078-0.891 2.016-1.969 2.016h-12c-1.078 0-2.016-0.938-2.016-2.016v-12c0-1.078 0.938-1.969 2.016-1.969h12zM3.984 6v14.016h14.016v1.969h-14.016c-1.078 0-1.969-0.891-1.969-1.969v-14.016h1.969z"></path>
</symbol> </symbol>
<symbol id="icon-pro" viewBox="0 0 20 20"> <symbol id="pro" viewBox="0 0 20 20">
<title>pro</title> <path d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-4v4h-2v-4h-4v-2h4v-4h2v4h4v2z"></path>
<path class="path1" d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-4v4h-2v-4h-4v-2h4v-4h2v4h4v2z"></path>
</symbol> </symbol>
<symbol id="icon-con" viewBox="0 0 20 20"> <symbol id="con" viewBox="0 0 20 20">
<title>con</title> <path d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-10v-2h10v2z"></path>
<path class="path1" d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-10v-2h10v2z"></path>
</symbol> </symbol>
<symbol id="icon-neutral" viewBox="0 0 20 20"> <symbol id="neutral" viewBox="0 0 20 20">
<title>neutral</title> <path d="M9.999 0.8c-5.081 0-9.199 4.119-9.199 9.201 0 5.080 4.118 9.199 9.199 9.199s9.2-4.119 9.2-9.199c0-5.082-4.119-9.201-9.2-9.201zM10 13.001c-1.657 0-3-1.344-3-3s1.343-3 3-3c1.656 0 3 1.344 3 3s-1.344 3-3 3z"></path>
<path class="path1" d="M9.999 0.8c-5.081 0-9.199 4.119-9.199 9.201 0 5.080 4.118 9.199 9.199 9.199s9.2-4.119 9.2-9.199c0-5.082-4.119-9.201-9.2-9.201zM10 13.001c-1.657 0-3-1.344-3-3s1.343-3 3-3c1.656 0 3 1.344 3 3s-1.344 3-3 3z"></path> </symbol>
<symbol id="chat" viewBox="0 0 24 24">
<path d="M18 8.016v-2.016h-12v2.016h12zM18 11.016v-2.016h-12v2.016h12zM18 14.016v-2.016h-12v2.016h12zM21.984 3.984v18l-3.984-3.984h-14.016c-1.078 0-1.969-0.938-1.969-2.016v-12c0-1.078 0.891-1.969 1.969-1.969h16.031c1.078 0 1.969 0.891 1.969 1.969z"></path>
</symbol> </symbol>
</defs> </defs>
</svg> </svg>

View File

@ -23,7 +23,7 @@ p
+row +row
+cell Multi-language support +cell Multi-language support
each icon in [ "con", "pro", "pro", "pro" ] each icon in [ "neutral", "pro", "pro", "pro" ]
+cell.u-text-center #[+procon(icon)] +cell.u-text-center #[+procon(icon)]
+row +row

View File

@ -2,8 +2,6 @@
include ../_includes/_mixins include ../_includes/_mixins
p=lorem_short
+aside("Help us improve the docs") +aside("Help us improve the docs")
| Did you spot a mistake or come across explanations that | Did you spot a mistake or come across explanations that
| are unclear? You can find a "Suggest edits" button at the | are unclear? You can find a "Suggest edits" button at the

View File

@ -57,7 +57,7 @@ p
doc.ents = [Span(0, 1, label='GPE')] doc.ents = [Span(0, 1, label='GPE')]
assert doc[0].ent_type_ == 'GPE' assert doc[0].ent_type_ == 'GPE'
doc.ents = [] doc.ents = []
doc.ents = [(u'LondonCity', 0, 1, u'GPE')] doc.ents = [(u'LondonCity', u'GPE', 0, 1)]
p p
| The value you assign should be a sequence, the values of which | The value you assign should be a sequence, the values of which
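
For quick reference, the fix above changes the tuple order in the doc.ents example to (entity id, label, start token, end token). A minimal, hypothetical sketch of how the corrected assignment could be exercised end to end (assuming a spaCy 1.x install with an English model loadable as 'en'; the sentence and the final print line are illustrative additions, not docs code):

# Sketch only: assumes spaCy 1.x and an installed English model.
import spacy

nlp = spacy.load('en')
doc = nlp(u'London is a big city in the United Kingdom.')

# Clear whatever the entity recognizer predicted ...
doc.ents = []

# ... then set an entity by hand as an (entity id, label, start, end) tuple,
# where start and end are token offsets, matching the corrected example.
doc.ents = [(u'LondonCity', u'GPE', 0, 1)]

assert doc[0].ent_type_ == 'GPE'
print([(ent.text, ent.label_) for ent in doc.ents])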

View File

@ -30,6 +30,13 @@ p Many of the associated tools and resources that we're developing alongside spa
+cell +cell
| REST microservices for spaCy demos and visualisers. | REST microservices for spaCy demos and visualisers.
+row
+cell
+src(gh("spacy-notebooks")) spaCy Notebooks
+cell
| Jupyter notebooks for spaCy examples and tutorials.
+h(2, "libraries") Libraries and projects +h(2, "libraries") Libraries and projects
+table(["Name", "Description"]) +table(["Name", "Description"])
+row +row

View File

@ -141,7 +141,7 @@ p
span.merge(label=label, tag='NNP' if label else span.root.tag_) span.merge(label=label, tag='NNP' if label else span.root.tag_)
matcher.add_entity('GoogleNow', on_match=merge_phrases) matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', {ORTH: 'Google'}, {ORTH: 'Now'}]) matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded']) doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
matcher(doc) matcher(doc)
print([w.text for w in doc]) print([w.text for w in doc])
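
The fix above wraps the token specs for add_pattern in a single list. Pieced together, the surrounding 1.x Matcher example might look roughly as follows. This is a sketch, not the verbatim docs snippet: it assumes the on_match callback receives (matcher, doc, i, matches) with each match an (entity id, label, start, end) tuple, and the merge-on-last-match body of merge_phrases is filled in as an illustration.

# Sketch under the assumptions stated above (spaCy 1.x style Matcher API).
import spacy
from spacy.attrs import ORTH
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)

def merge_phrases(matcher, doc, i, matches):
    # Merging changes token indices, so only merge once we are
    # called for the last match of this batch.
    if i != len(matches) - 1:
        return None
    spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
    for ent_id, label, span in spans:
        span.merge(label=label, tag='NNP' if label else span.root.tag_)

matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])

doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
matcher(doc)
print([w.text for w in doc])  # 'Google' and 'Now' should now surface as one merged token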