From f0c3b09242e92fd07cbf5806055aef76bce982cd Mon Sep 17 00:00:00 2001
From: Gyorgy Orosz <oroszgy@gmail.com>
Date: Wed, 31 May 2017 22:22:42 +0200
Subject: [PATCH 1/5] More robust Hungarian tokenizer.

---
 spacy/lang/hu/punctuation.py          |  5 +++--
 spacy/tests/lang/hu/test_tokenizer.py | 32 ++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index b758e0104..ce6134927 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -9,7 +9,8 @@ LIST_ICONS = [r'[\p{So}--[°]]']
 _currency = r'\$|¢|£|€|¥|฿'
 _quotes = QUOTES.replace("'", '')
 
-_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
+_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             [r'[,.:](?=[{a}])'.format(a=ALPHA)])
 
 _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
              [r'(?<=[0-9])\+',
@@ -21,7 +22,7 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
 
 _infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py
index d88b7b7b7..1a4ee1a27 100644
--- a/spacy/tests/lang/hu/test_tokenizer.py
+++ b/spacy/tests/lang/hu/test_tokenizer.py
@@ -5,11 +5,11 @@ import pytest
 
 DEFAULT_TESTS = [
     ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
-    ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
+    pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail),
     ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
     ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
     ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
-    ('A .hu.', ['A', '.hu', '.']),
+    pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail),
     ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
     ('A pl.', ['A', 'pl.']),
     ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
@@ -18,7 +18,9 @@ DEFAULT_TESTS = [
     ('Valami ...van...', ['Valami', '...', 'van', '...']),
     ('Valami...', ['Valami', '...']),
     ('Valami ...', ['Valami', '...']),
-    ('Valami ... más.', ['Valami', '...', 'más', '.'])
+    ('Valami ... más.', ['Valami', '...', 'más', '.']),
+    ('Soha nem lesz!', ['Soha', 'nem', 'lesz', '!']),
+    ('Soha nem lesz?', ['Soha', 'nem', 'lesz', '?'])
 ]
 
 HYPHEN_TESTS = [
@@ -225,11 +227,11 @@ QUOTE_TESTS = [
 
 DOT_TESTS = [
     ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
-    ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
+    pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail),
     ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
     ('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
     ('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
-    ('A .hu.', ['A', '.hu', '.']),
+    pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail),
     ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
     ('A pl.', ['A', 'pl.']),
     ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
@@ -241,6 +243,24 @@ DOT_TESTS = [
     ('Valami ... más.', ['Valami', '...', 'más', '.'])
 ]
 
+TYPO_TESTS = [
+    (
+    'Ez egy mondat vége.Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']),
+    ('Ez egy mondat vége .Ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']),
+    (
+    'Ez egy mondat vége!ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']),
+    ('Ez egy mondat vége !ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']),
+    (
+    'Ez egy mondat vége?Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']),
+    ('Ez egy mondat vége ?Ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']),
+    ('egy,kettő', ['egy', ',', 'kettő']),
+    ('egy ,kettő', ['egy', ',', 'kettő']),
+    ('egy :kettő', ['egy', ':', 'kettő']),
+]
+
 WIKI_TESTS = [
     ('!"', ['!', '"']),
     ('lány"a', ['lány', '"', 'a']),
@@ -253,7 +273,7 @@ WIKI_TESTS = [
     ('cérium(IV)-oxid', ['cérium', '(', 'IV', ')', '-oxid'])
 ]
 
-TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS
+TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
 
 
 @pytest.mark.parametrize('text,expected_tokens', TESTCASES)

From 6438428ce8f1e6b70e4bcb2931e69922d5e6faa0 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sun, 4 Jun 2017 22:09:33 +0200
Subject: [PATCH 2/5] Update v2 infobox

---
 website/_includes/_page-docs.jade | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade
index d11e22502..7afbc6bdc 100644
--- a/website/_includes/_page-docs.jade
+++ b/website/_includes/_page-docs.jade
@@ -22,12 +22,12 @@ main.o-main.o-main--sidebar.o-main--aside
             +infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
                 strong This page is part of the alpha documentation for spaCy v2.0.
                 |  It does not reflect the state of the latest stable release.
-                |  Because v2.0 is still under development, the actual
-                |  implementation may differ from the intended state described
-                |  here.
-                |  #[+a("#") See here] for more information on how to install
-                |  and test the new version. To read the official docs for
-                |  v1.x, #[+a("https://spacy.io/docs") go here].
+                |  Because v2.0 is still under development, the implementation
+                |  may differ from the intended state described here. See the
+                |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
+                |  for details on how to install and test the new version. To
+                |  read the official docs for spaCy v1.x,
+                |  #[+a("https://spacy.io/docs") go here].
 
         !=yield
 

From f432bb4b48d84d541420d3888c4487b4e0d57622 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sun, 4 Jun 2017 22:34:31 +0200
Subject: [PATCH 3/5] Fix fixture scopes

---
 spacy/tests/conftest.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index b5a34cb2d..dc5f26536 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -22,48 +22,48 @@ _models = {'en': ['en_core_web_sm', 'en_core_web_md'],
 # only used for tests that require loading the models
 # in all other cases, use specific instances
 
-@pytest.fixture(params=_models['en'], scope="session")
+@pytest.fixture(params=_models['en'], scope='session')
 def EN(request):
     return load_test_model(request.param)
 
 
-@pytest.fixture(params=_models['de'], scope="session")
+@pytest.fixture(params=_models['de'], scope='session')
 def DE(request):
     return load_test_model(request.param)
 
 
-@pytest.fixture(params=_models['fr'], scope="session")
+@pytest.fixture(params=_models['fr'], scope='session')
 def FR(request):
     return load_test_model(request.param)
 
 
-@pytest.fixture(params=_languages)
+@pytest.fixture(params=_languages, scope='session')
 def tokenizer(request):
     lang = util.get_lang_class(request.param)
     return lang.Defaults.create_tokenizer()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def en_tokenizer():
     return util.get_lang_class('en').Defaults.create_tokenizer()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def en_vocab():
     return util.get_lang_class('en').Defaults.create_vocab()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def en_parser():
     return util.get_lang_class('en').Defaults.create_parser()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def es_tokenizer():
     return util.get_lang_class('es').Defaults.create_tokenizer()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def de_tokenizer():
     return util.get_lang_class('de').Defaults.create_tokenizer()
 
@@ -73,31 +73,31 @@ def fr_tokenizer():
     return util.get_lang_class('fr').Defaults.create_tokenizer()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def hu_tokenizer():
     return util.get_lang_class('hu').Defaults.create_tokenizer()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def fi_tokenizer():
     return util.get_lang_class('fi').Defaults.create_tokenizer()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def sv_tokenizer():
     return util.get_lang_class('sv').Defaults.create_tokenizer()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def bn_tokenizer():
     return util.get_lang_class('bn').Defaults.create_tokenizer()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def he_tokenizer():
     return util.get_lang_class('he').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def nb_tokenizer():
     return util.get_lang_class('nb').Defaults.create_tokenizer()
 
@@ -107,7 +107,7 @@ def stringstore():
     return StringStore()
 
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def en_entityrecognizer():
      return util.get_lang_class('en').Defaults.create_entity()
 

From 96867a24aec5a1bc2378c5237612e520c51ec196 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sun, 4 Jun 2017 22:36:40 +0200
Subject: [PATCH 4/5] Fix typo

---
 spacy/tests/regression/test_issue910.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue910.py b/spacy/tests/regression/test_issue910.py
index cc6610e0d..e7f360273 100644
--- a/spacy/tests/regression/test_issue910.py
+++ b/spacy/tests/regression/test_issue910.py
@@ -79,7 +79,8 @@ def test_issue910(EN, train_data, additional_entity_types):
     2) There's no way to set the learning rate for the weight update, so we
         end up out-of-scale, causing it to learn too fast.
     '''
-    doc = EN(u"I am looking for a restaurant in Berlin")
+    nlp = EN
+    doc = nlp(u"I am looking for a restaurant in Berlin")
     ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
     # Fine tune the ner model
     for entity_type in additional_entity_types:

From 8a29308d0bb7fcfa6947b83fa6522c1eda2b6cbf Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sun, 4 Jun 2017 22:39:29 +0200
Subject: [PATCH 5/5] Remove unused imports

---
 spacy/tests/regression/test_issue910.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue910.py b/spacy/tests/regression/test_issue910.py
index e7f360273..94f26e49e 100644
--- a/spacy/tests/regression/test_issue910.py
+++ b/spacy/tests/regression/test_issue910.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 import json
-import os
 import random
 import contextlib
 import shutil
@@ -9,7 +8,6 @@ import tempfile
 from pathlib import Path
 
 
-import pathlib
 from ...gold import GoldParse
 from ...pipeline import EntityRecognizer
 from ...lang.en import English