From 953f638aa503703edad17d0c6c9adac8c948a67b Mon Sep 17 00:00:00 2001 From: Phaninder Pasupula Date: Mon, 8 May 2017 11:48:05 +0100 Subject: [PATCH 001/195] Update _data.json --- website/docs/usage/_data.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 2ffbf9d68..0632d3972 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -149,6 +149,11 @@ "author": "Johannes Gontrum", "description": "spaCy accessed by a REST API, wrapped in a Docker container." }, + "spacy-nlp-zeromq": { + "url": "https://github.com/pasupulaphani/spacy-nlp-docker", + "author": "Phaninder Pasupula", + "description": "SpaCy with zeromq bindings docker." + }, "textacy": { "url": "https://github.com/chartbeat-labs/textacy", "author": " Burton DeWilde (Chartbeat)", From cdaefae60ac08fc0093f86a83d0c5953197eb9fd Mon Sep 17 00:00:00 2001 From: oeg Date: Fri, 12 May 2017 16:15:19 +0200 Subject: [PATCH 002/195] feature(populate_vocab): Enable pruning out rare words from clusters data --- spacy/cli/model.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/cli/model.py b/spacy/cli/model.py index 3b9a77b93..4e7e0282b 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -98,10 +98,6 @@ def read_clusters(clusters_path): def populate_vocab(vocab, clusters, probs, oov_prob): - # Ensure probs has entries for all words seen during clustering. - for word in clusters: - if word not in probs: - probs[word] = oov_prob for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): lexeme = vocab[word] lexeme.prob = prob From e506811a93962c194c195ee277ca374267ba7cf0 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 03:27:50 +0200 Subject: [PATCH 003/195] Update description --- website/docs/usage/_data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 0632d3972..78e8b3e27 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -152,7 +152,7 @@ "spacy-nlp-zeromq": { "url": "https://github.com/pasupulaphani/spacy-nlp-docker", "author": "Phaninder Pasupula", - "description": "SpaCy with zeromq bindings docker." + "description": "Docker image exposing spaCy with ZeroMQ bindings." }, "textacy": { "url": "https://github.com/chartbeat-labs/textacy", From ad590feaa88f245b206daefe5743811de2ee2102 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 May 2017 11:36:19 +0200 Subject: [PATCH 004/195] Fix test, which imported English incorrectly --- spacy/tests/doc/test_doc_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 6c8f61a81..064cbecfd 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -219,7 +219,6 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): def test_parse_tree(EN): text = 'I like New York in Autumn.' 
- EN = English(parser=False) doc = EN(text, tag=True) doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) # full method parse_tree(text) is a trivial composition From c5669450a06f182bb369bc36fdaaaac5140d91a6 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 12:33:57 +0200 Subject: [PATCH 005/195] Fix formatting --- spacy/tokens/printers.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index d70088540..1cadfc5bf 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -1,13 +1,20 @@ from copy import deepcopy +# coding: utf8 +from __future__ import unicode_literals def merge_ents(doc): - '''Helper: merge adjacent entities into single tokens; modifies the doc.''' + """ + Helper: merge adjacent entities into single tokens; modifies the doc. + """ for ent in doc.ents: ent.merge(ent.root.tag_, ent.text, ent.label_) return doc + def format_POS(token, light, flat): - '''helper: form the POS output for a token''' + """ + Helper: form the POS output for a token. + """ subtree = dict([ ("word", token.text), ("lemma", token.lemma_), # trigger @@ -26,16 +33,21 @@ def format_POS(token, light, flat): return subtree def POS_tree(root, light, flat): - '''Helper: generate a POS tree for a root token. - The doc must have merge_ents(doc) ran on it. - ''' + + """ + Helper: generate a POS tree for a root token. The doc must have + merge_ents(doc) ran on it. + """ subtree = format_POS(root, light=light, flat=flat) for c in root.children: subtree["modifiers"].append(POS_tree(c)) return subtree + def parse_tree(doc, light=False, flat=False): - """Makes a copy of the doc, then construct a syntactic parse tree, similar to the one used in displaCy. Generates the POS tree for all sentences in a doc + """ + Makes a copy of the doc, then construct a syntactic parse tree, similar to + the one used in displaCy. Generates the POS tree for all sentences in a doc. Args: doc: The doc for parsing. From bd428c0a70e589457d1112b4ef5d674cba4c82dd Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 12:34:05 +0200 Subject: [PATCH 006/195] Set defaults for light and flat kwargs --- spacy/tokens/printers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index 1cadfc5bf..8ab111120 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -32,8 +32,8 @@ def format_POS(token, light, flat): subtree.pop("modifiers") return subtree -def POS_tree(root, light, flat): +def POS_tree(root, light=False, flat=False): """ Helper: generate a POS tree for a root token. The doc must have merge_ents(doc) ran on it. From 573f0ba867d41c81a0f9c0af3f2158463e4c972c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 12:34:14 +0200 Subject: [PATCH 007/195] Replace deepcopy --- spacy/tokens/printers.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index 8ab111120..f9b1f3972 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -1,7 +1,10 @@ -from copy import deepcopy # coding: utf8 from __future__ import unicode_literals +from .doc import Doc +from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE + + def merge_ents(doc): """ Helper: merge adjacent entities into single tokens; modifies the doc. 
@@ -61,6 +64,8 @@ def parse_tree(doc, light=False, flat=False): >>> trees = doc.print_tree() [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] """ - doc_clone = deepcopy(doc) + doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) + doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], + doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])) merge_ents(doc_clone) # merge the entities into single tokens first return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] From 6e1dbc608eaa1725566cb48dddf4d969a027c88c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 12:34:20 +0200 Subject: [PATCH 008/195] Fix parse_tree test --- spacy/tests/doc/test_doc_api.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 064cbecfd..1bc534ecd 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -217,10 +217,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): assert doc.has_vector -def test_parse_tree(EN): +def test_parse_tree(en_tokenizer): + """Tests doc.print_tree() method.""" text = 'I like New York in Autumn.' 
- doc = EN(text, tag=True) - doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) + heads = [1, 0, 1, -2, -3, -1, -5] + tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, tags=tags) # full method parse_tree(text) is a trivial composition trees = doc.print_tree() assert len(trees) > 0 From 24e973b17f23ce3be4abcf4ad7b7b3a6506a2701 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 13:09:00 +0200 Subject: [PATCH 009/195] Rename about.__docs__ to about.__docs_models__ --- spacy/about.py | 2 +- spacy/cli/download.py | 2 +- spacy/deprecated.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index ad4a021c2..8c0e0afd3 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -10,7 +10,7 @@ __author__ = 'Matthew Honnibal' __email__ = 'matt@explosion.ai' __license__ = 'MIT' -__docs__ = 'https://spacy.io/docs/usage' +__docs_models__ = 'https://spacy.io/docs/usage' __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0419de118..70ca64b22 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -79,5 +79,5 @@ def check_error_depr(model): "As of v1.7.0, the download all command is deprecated. Please " "download the models individually via spacy.download [model name] " "or pip install. For more info on this, see the documentation: " - "{d}".format(d=about.__docs__), + "{d}".format(d=about.__docs_models__), title="Deprecated command") diff --git a/spacy/deprecated.py b/spacy/deprecated.py index 65053089a..3f3c69b88 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -146,7 +146,7 @@ class ModelDownload(): "The spacy.{l}.download command is now deprecated. Please use " "python -m spacy download [model name or shortcut] instead. For more " "info and available models, see the documentation: {d}. " - "Downloading default '{l}' model now...".format(d=about.__docs__, l=lang), + "Downloading default '{l}' model now...".format(d=about.__docs_models__, l=lang), title="Warning: deprecated command") download(lang) From 9003fd25e5e966bd8d1b67a18f3ebd6010d6f718 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 13:14:02 +0200 Subject: [PATCH 010/195] Fix error messages if model is required (resolves #1051) Rename about.__docs__ to about.__docs_models__. --- spacy/lexeme.pyx | 10 +++++----- spacy/tokens/doc.pyx | 13 ++++++------- spacy/tokens/span.pyx | 6 +++--- spacy/tokens/token.pyx | 10 +++++----- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5d9ce7b98..05d8bddc6 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -24,6 +24,7 @@ from .attrs cimport IS_QUOTE from .attrs cimport IS_LEFT_PUNCT from .attrs cimport IS_RIGHT_PUNCT from .attrs cimport IS_OOV +from . import about memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) @@ -137,11 +138,10 @@ cdef class Lexeme: cdef int length = self.vocab.vectors_length if length == 0: raise ValueError( - "Word vectors set to length 0. This may be because the " - "data is not installed. If you haven't already, run" - "\npython -m spacy download %s\n" - "to install the data." 
% self.vocab.lang - ) + "Word vectors set to length 0. This may be because you " + "don't have a model installed or loaded, or because your " + "model doesn't include word vectors. For more info, see " + "the documentation: \n%s\n" % about.__docs_models__) vector_view = self.c.vector return numpy.asarray(vector_view) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index fe8e019ec..2089199a0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -29,6 +29,7 @@ from ..serialize.bits cimport BitArray from ..util import normalize_slice from ..syntax.iterators import CHUNKERS from ..compat import is_config +from .. import about DEF PADDING = 5 @@ -403,9 +404,8 @@ cdef class Doc: if not self.is_parsed: raise ValueError( "noun_chunks requires the dependency parse, which " - "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy download %s\n" - "to install the data" % self.vocab.lang) + "requires data to be installed. For more info, see the " + "documentation: \n%s\n" % about.__docs_models__) # Accumulate the result before beginning to iterate over it. This prevents # the tokenisation from being changed out from under us during the iteration. # The tricky thing here is that Span accepts its tokenisation changing, @@ -435,10 +435,9 @@ cdef class Doc: if not self.is_parsed: raise ValueError( - "sentence boundary detection requires the dependency parse, which " - "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy download %s\n" - "to install the data" % self.vocab.lang) + "Sentence boundary detection requires the dependency parse, which " + "requires data to be installed. For more info, see the " + "documentation: \n%s\n" % about.__docs_models__) cdef int i start = 0 for i in range(1, self.length): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index fb1e5c732..09927ab4c 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -16,6 +16,7 @@ from ..util import normalize_slice from ..attrs cimport IS_PUNCT, IS_SPACE from ..lexeme cimport Lexeme from ..compat import is_config +from .. import about cdef class Span: @@ -221,9 +222,8 @@ cdef class Span: if not self.doc.is_parsed: raise ValueError( "noun_chunks requires the dependency parse, which " - "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy download %s\n" - "to install the data" % self.vocab.lang) + "requires data to be installed. For more info, see the " + "documentation: \n%s\n" % about.__docs_models__) # Accumulate the result before beginning to iterate over it. This prevents # the tokenisation from being changed out from under us during the iteration. # The tricky thing here is that Span accepts its tokenisation changing, diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index f146f5cd6..daef48233 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -26,6 +26,7 @@ from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV from ..lexeme cimport Lexeme from ..compat import is_config +from .. import about cdef class Token: @@ -237,11 +238,10 @@ cdef class Token: cdef int length = self.vocab.vectors_length if length == 0: raise ValueError( - "Word vectors set to length 0. This may be because the " - "data is not installed. If you haven't already, run" - "\npython -m spacy download %s\n" - "to install the data." % self.vocab.lang - ) + "Word vectors set to length 0. 
This may be because you " + "don't have a model installed or loaded, or because your " + "model doesn't include word vectors. For more info, see " + "the documentation: \n%s\n" % about.__docs_models__) vector_view = self.c.lex.vector return numpy.asarray(vector_view) From e6f850f0148c84ca1bc6bd0563a989ce0400e762 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 16 May 2017 14:45:15 +0200 Subject: [PATCH 011/195] Add pip to requirements.txt and setup.py (resolves #1064) --- requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 42910d1be..8194dee58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ requests>=2.13.0,<3.0.0 regex==2017.4.5 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 +pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index a112a6e80..89aaf8eba 100755 --- a/setup.py +++ b/setup.py @@ -197,6 +197,7 @@ def setup_package(): 'preshed>=1.0.0,<2.0.0', 'thinc>=6.5.0,<6.6.0', 'plac<1.0.0,>=0.9.6', + 'pip>=9.0.0,<10.0.0', 'six', 'pathlib', 'ujson>=1.35', From ce0658d75bda75a508fd75fac5da2443093c0623 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:20:33 +0200 Subject: [PATCH 012/195] Add help icon --- website/assets/img/icons.svg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg index f62901592..e970bb52c 100644 --- a/website/assets/img/icons.svg +++ b/website/assets/img/icons.svg @@ -27,5 +27,8 @@ + + + From 8a415fc402c878900bf00068166193c681389292 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:20:44 +0200 Subject: [PATCH 013/195] Fix light version of colours to be more explicit --- website/assets/css/_variables.sass | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass index 5f9453ea6..ad0739838 100644 --- a/website/assets/css/_variables.sass +++ b/website/assets/css/_variables.sass @@ -27,6 +27,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier, // Colors $colors: ( blue: #09a3d5, red: #d9515d ) +$colors-light: (blue: #cceaf4, red: #f9d7da) $color-back: #fff !default $color-front: #1a1e23 !default @@ -34,7 +35,7 @@ $color-dark: lighten($color-front, 20) !default $color-theme: map-get($colors, $theme) $color-theme-dark: darken(map-get($colors, $theme), 5) -$color-theme-light: saturate(lighten(map-get($colors, $theme), 35), 5) +$color-theme-light: map-get($colors-light, $theme) $color-subtle: #ddd !default $color-subtle-light: #f6f6f6 !default From 841ad29f6187afcd79b0f1db20d1b1b887817f63 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:20:53 +0200 Subject: [PATCH 014/195] Add tooltips component --- website/assets/css/_components/_tooltips.sass | 29 +++++++++++++++++++ website/assets/css/style.sass | 1 + 2 files changed, 30 insertions(+) create mode 100644 website/assets/css/_components/_tooltips.sass diff --git a/website/assets/css/_components/_tooltips.sass b/website/assets/css/_components/_tooltips.sass new file mode 100644 index 000000000..a19456aa5 --- /dev/null +++ b/website/assets/css/_components/_tooltips.sass @@ -0,0 +1,29 @@ +//- đŸ’Ģ CSS > COMPONENTS > TOOLTIPS + +[data-tooltip] + position: relative + + @include breakpoint(min, sm) + &:before + @include position(absolute, top, left, 125%, 50%) + display: inline-block + content: attr(data-tooltip) + background: $color-front + border-radius: 2px + color: $color-back + font-family: inherit + font-size: 1.3rem + line-height: 
1.25 + opacity: 0 + padding: 0.5em 0.75em + transform: translateX(-50%) translateY(-2px) + transition: opacity 0.1s ease-out, transform 0.1s ease-out + visibility: hidden + min-width: 200px + max-width: 300px + z-index: 200 + + &:hover:before + opacity: 1 + transform: translateX(-50%) translateY(0) + visibility: visible diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass index a8d2edad4..259d563c3 100644 --- a/website/assets/css/style.sass +++ b/website/assets/css/style.sass @@ -32,3 +32,4 @@ $theme: blue !default @import _components/navigation @import _components/sidebar @import _components/tables +@import _components/tooltips From 8a5f1cd35abc11d4f1eca2942a239b28a6fee032 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:21:01 +0200 Subject: [PATCH 015/195] Fix font weight on code blocks --- website/assets/css/_components/_code.sass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/assets/css/_components/_code.sass b/website/assets/css/_components/_code.sass index 83462ef72..b2ba9c60e 100644 --- a/website/assets/css/_components/_code.sass +++ b/website/assets/css/_components/_code.sass @@ -18,7 +18,7 @@ .c-code-block__content display: block - font: normal normal 1.1rem/#{2} $font-code + font: normal 600 1.1rem/#{2} $font-code padding: 1em 2em From 22a4d19fd8365c55cf966a4a34f315ab967d548c Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:21:13 +0200 Subject: [PATCH 016/195] Add help mixin that displays icon with tooltip --- website/_includes/_mixins.jade | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 2f89b0ec4..aeee54f52 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -47,6 +47,14 @@ mixin api(path) | #[+icon("book", 18).o-icon--inline.u-color-subtle] +//- Help icon with tooltip + tooltip - [string] Tooltip text + +mixin help(tooltip) + span(data-tooltip=tooltip)&attributes(attributes) + +icon("help", 16).i-icon--inline + + //- Aside for text label - [string] aside title (optional) From 35795c88c4dd562a48c703e7ae0150950be87684 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:22:04 +0200 Subject: [PATCH 017/195] Add quickstart.js widget --- website/_harp.json | 27 +++++- website/_includes/_mixins-base.jade | 41 +++++++++ website/_includes/_scripts.jade | 8 +- .../assets/css/_components/_quickstart.sass | 90 +++++++++++++++++++ website/assets/css/style.sass | 1 + website/assets/js/quickstart.js | 8 ++ website/docs/usage/_data.json | 1 + website/docs/usage/index.jade | 43 +++++++-- 8 files changed, 207 insertions(+), 12 deletions(-) create mode 100644 website/assets/css/_components/_quickstart.sass create mode 100644 website/assets/js/quickstart.js diff --git a/website/_harp.json b/website/_harp.json index 672640405..ef2e48239 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -55,7 +55,32 @@ } }, - "V_CSS": "1.6", + "QUICKSTART": [ + { "id": "os", "title": "Operating system", "options": [ + { "id": "mac", "title": "macOS / OSX", "checked": true }, + { "id": "windows", "title": "Windows" }, + { "id": "linux", "title": "Linux" }] + }, + { "id": "package", "title": "Package manager", "options": [ + { "id": "pip", "title": "pip", "checked": true }, + { "id": "conda", "title": "conda" }, + { "id": "source", "title": "from source" }] + }, + { "id": "python", "title": "Python version", "options": [ + { "id": 2, "title": "2.x" }, + { "id": 3, "title": "3.x", "checked": true }] + }, + { "id": 
"config", "title": "Configuration", "multiple": true, "options": [ + {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }] + }, + { "id": "model", "title": "Models", "multiple": true, "options": [ + { "id": "en", "title": "English", "meta": "50MB" }, + { "id": "de", "title": "German", "meta": "645MB" }, + { "id": "fr", "title": "French", "meta": "1.33GB" }] + } + ], + + "V_CSS": "1.7", "V_JS": "1.2", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 94b1bfd84..359839d67 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -121,6 +121,47 @@ mixin badge(name) img(src=site.badge alt="{name} version" height="20") +//- Quickstart widget + quickstart.js with manual markup, inspired by PyTorch's "Getting started" + groups - [object] option groups, uses global variable QUICKSTART + headline - [string] optional text to be rendered as widget headline + +mixin quickstart(groups, headline) + .c-quickstart.o-block-small#qs + .c-quickstart__content + if headline + +h(2)=headline + for group in groups + .c-quickstart__group.u-text-small(data-qs-group=group.id) + .c-quickstart__legend=group.title + .c-quickstart__fields + for option in group.options + input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked) + label.c-quickstart__label(for=option.id)=option.title + if option.meta + | #[span.c-quickstart__label__meta (#{option.meta})] + if option.help + | #[+help(option.help).c-quickstart__label__meta] + + pre.c-code-block + code.c-code-block__content.c-quickstart__code(data-qs-results="") + block + + .c-quickstart__info.u-text-tiny.o-block.u-text-right + | Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]! + + +//- Quickstart code item + data [object] - Rendering conditions (keyed by option group ID, value: option) + +mixin qs(data) + - args = {} + for value, setting in data + - args['data-qs-' + setting] = value + span.c-quickstart__line&attributes(args) + block + + //- Logo mixin logo() diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade index 544cf0977..b31c7119e 100644 --- a/website/_includes/_scripts.jade +++ b/website/_includes/_scripts.jade @@ -1,9 +1,13 @@ //- đŸ’Ģ INCLUDES > SCRIPTS -script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") -script(src="/assets/js/prism.js", type="text/javascript") +script(src="/assets/js/main.js?v#{V_JS}") +script(src="/assets/js/prism.js") if SECTION == "docs" + if quickstart + script(src="/assets/js/quickstart.js") + script var qs = new Quickstart("#qs"); + script. 
((window.gitter = {}).chat = {}).options = { useStyles: false, diff --git a/website/assets/css/_components/_quickstart.sass b/website/assets/css/_components/_quickstart.sass new file mode 100644 index 000000000..a3e0bff9c --- /dev/null +++ b/website/assets/css/_components/_quickstart.sass @@ -0,0 +1,90 @@ +//- đŸ’Ģ CSS > COMPONENTS > QUICKSTART + +.c-quickstart + border: 1px solid $color-subtle + border-radius: 2px + display: none + background: $color-subtle-light + + &:not([style]) + .c-quickstart__info + display: none + +.c-quickstart__content + padding: 2rem 3rem + +.c-quickstart__input + @include size(0) + opacity: 0 + position: absolute + left: -9999px + +.c-quickstart__label + cursor: pointer + background: $color-back + border: 1px solid $color-subtle + border-radius: 2px + display: inline-block + padding: 0.75rem 1.25rem + margin: 0 0.5rem 0.5rem 0 + font-weight: bold + + &:hover + background: lighten($color-theme-light, 5) + + .c-quickstart__input:focus + & + border: 1px solid $color-theme + + .c-quickstart__input--radio:checked + & + color: $color-back + border-color: $color-theme + background: $color-theme + + .c-quickstart__input--check + &:before + content: "" + background: $color-back + display: inline-block + width: 20px + height: 20px + border: 1px solid $color-subtle + vertical-align: middle + margin-right: 1rem + cursor: pointer + border-radius: 50% + + .c-quickstart__input--check:checked + &:before + background: $color-theme url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0Ij4gICAgPHBhdGggZmlsbD0iI2ZmZiIgZD0iTTkgMTYuMTcybDEwLjU5NC0xMC41OTQgMS40MDYgMS40MDYtMTIgMTItNS41NzgtNS41NzggMS40MDYtMS40MDZ6Ii8+PC9zdmc+) + background-size: contain + border-color: $color-theme + +.c-quickstart__label__meta + font-weight: normal + color: $color-subtle-dark + +.c-quickstart__group + @include breakpoint(min, md) + display: flex + flex-flow: row nowrap + + &:not(:last-child) + margin-bottom: 1rem + +.c-quickstart__fields + flex: 100% + +.c-quickstart__legend + color: $color-subtle-dark + margin-right: 2rem + padding-top: 0.75rem + flex: 1 1 35% + font-weight: bold + +.c-quickstart__line + display: block + + &:before + color: $color-theme + margin-right: 1em + content: "$" + +.c-quickstart__code + font-size: 1.6rem diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass index 259d563c3..809598663 100644 --- a/website/assets/css/style.sass +++ b/website/assets/css/style.sass @@ -33,3 +33,4 @@ $theme: blue !default @import _components/sidebar @import _components/tables @import _components/tooltips +@import _components/quickstart diff --git a/website/assets/js/quickstart.js b/website/assets/js/quickstart.js new file mode 100644 index 000000000..d062aa91f --- /dev/null +++ b/website/assets/js/quickstart.js @@ -0,0 +1,8 @@ +/** + * quickstart.js + * A micro-form for user-specific installation instructions + * + * @author Ines Montani + * @version 0.0.1 + * @license MIT + */'use strict';var _createClass=function(){function a(b,c){for(var e,d=0;d['+this.dpfx+'-'+c+']'+e+' {display: none}';this._$('['+this.dpfx+'-style="'+c+'"]').textContent=g}},{key:'updateContainer',value:function updateContainer(){if(!this._$('['+this.dpfx+'-results]')){var b=this.childNodes(this.container,'pre'),c=b?b[0]:this._c('pre',this.pfx+'-code'),d=this.childNodes(c,'code')||this.childNodes(this.container,'code'),e=d?d[0]:this._c('code',this.pfx+'-results');e.setAttribute(this.dpfx+'-results','');var 
f=this.childNodes(e,'span')||this.childNodes(c,'span')||this.childNodes(this.container,'span');f&&f.forEach(function(g){return e.appendChild(g)}),c.appendChild(e),this.container.appendChild(c)}}},{key:'createGroup',value:function createGroup(b){var d=this,c=this._c('fieldset',this.pfx+'-group');c.setAttribute(this.dpfx+'-group',b.id),c.innerHTML=this.createStyles(b.id).outerHTML,c.innerHTML+=''+b.title+'',c.innerHTML+=b.options.map(function(e){var f=b.multiple?'checkbox':'radio';return''}).join(''),this.container.insertBefore(c,this.container.firstChild),this.initGroup(c,b.id)}},{key:'createStyles',value:function createStyles(b){var c=this._c('style');return c.setAttribute(this.dpfx+'-style',b),c.textContent='['+this.dpfx+'-results]>['+this.dpfx+'-'+b+'] {display: none}',c}},{key:'childNodes',value:function childNodes(b,c){var d=c.toUpperCase();if(!b.hasChildNodes)return!1;var e=[].concat(_toConsumableArray(b.childNodes)).filter(function(f){return f.nodeName===d});return!!e.length&&e}},{key:'_$',value:function _$(b){return document.querySelector(b)}},{key:'_$$',value:function _$$(b){return[].concat(_toConsumableArray(document.querySelectorAll(b)))}},{key:'_c',value:function _c(b,c){var d=document.createElement(b);return c&&(d.className=c),d}}]),a}(); diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 78e8b3e27..703a185d6 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -33,6 +33,7 @@ "index": { "title": "Install spaCy", + "quickstart": true, "next": "models" }, diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 48fe6b783..da13f4d81 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -12,6 +12,39 @@ p | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X] | and #[a(href="#source-windows") Windows] for details. ++quickstart(QUICKSTART, "Quickstart") + +qs({config: 'venv', python: 2}) python -m pip install -U virtualenv + +qs({config: 'venv', python: 3}) python -m pip install -U venv + +qs({config: 'venv', python: 2}) virtualenv .env + +qs({config: 'venv', python: 3}) venv .env + +qs({config: 'venv', os: 'mac'}) source .env/bin/activate + +qs({config: 'venv', os: 'linux'}) source .env/bin/activate + +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate + + +qs({package: 'pip'}) pip install -U spacy + + +qs({package: 'conda'}) conda config --add channels conda-forge + +qs({package: 'conda'}) conda install spacy + + +qs({package: 'source'}) git clone https://github.com/explosion/spaCy + +qs({package: 'source'}) cd spaCy + +qs({package: 'source'}) pip install -r requirements.txt + +qs({package: 'source'}) pip install -e . + + +qs({model: 'en'}) python -m spacy download en + +qs({model: 'de'}) python -m spacy download de + +qs({model: 'fr'}) python -m spacy download fr + ++h(2, "installation") Installation instructions + ++h(3, "pip") pip + +badge("pipy") + +p Using pip, spaCy releases are currently only available as source packages. + ++code(false, "bash"). + pip install -U spacy + +aside("Download models") | After installation you need to download a language model. For more info | and available models, see the #[+a("/docs/usage/models") docs on models]. @@ -22,14 +55,6 @@ p >>> import spacy >>> nlp = spacy.load('en') -+h(2, "pip") pip - +badge("pipy") - -p Using pip, spaCy releases are currently only available as source packages. - -+code(false, "bash"). 
- pip install -U spacy - p | When using pip it is generally recommended to install packages in a | #[code virtualenv] to avoid modifying system state: @@ -39,7 +64,7 @@ p source .env/bin/activate pip install spacy -+h(2, "conda") conda ++h(3, "conda") conda +badge("conda") p From f37d078d6a840db1a47066c97d4ec01ba966a2b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 18 May 2017 09:59:38 +0200 Subject: [PATCH 018/195] Fix issue #1069 with custom hook `Doc.sents` definition --- spacy/tokens/doc.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2089199a0..cfc146e6a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -431,7 +431,9 @@ cdef class Doc: """ def __get__(self): if 'sents' in self.user_hooks: - return self.user_hooks['sents'](self) + for sent in self.user_hooks['sents'](self): + yield sent + return if not self.is_parsed: raise ValueError( From 6381ebfb14c537c5525e3a240c8d3b2bd6f3cc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 18 May 2017 10:42:35 +0200 Subject: [PATCH 019/195] Use yield from syntax --- spacy/tokens/doc.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cfc146e6a..ca5a3d696 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -431,8 +431,7 @@ cdef class Doc: """ def __get__(self): if 'sents' in self.user_hooks: - for sent in self.user_hooks['sents'](self): - yield sent + yield from self.user_hooks['sents'](self) return if not self.is_parsed: From c56c264510c701b4f72d2ccdc358177e4dcf6716 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 18 May 2017 13:49:43 +0200 Subject: [PATCH 020/195] Tidy up .gitignore --- .gitignore | 94 ++++++++++++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 52 deletions(-) diff --git a/.gitignore b/.gitignore index 8716a8ef0..b165abf4b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,50 +1,43 @@ -# Vim -*.swp -*.sw* -Profile.prof -tmp/ -.dev -.denv -.pypyenv -.eggs -*.tgz -.sass-cache -.python-version - -MANIFEST - +# spaCy +spacy/data/ corpora/ models/ keys/ -spacy/syntax/*.cpp -spacy/syntax/*.html -spacy/en/*.cpp -spacy/tokens/*.cpp -spacy/serialize/*.cpp -spacy/en/data/* -spacy/*.cpp -spacy/ner/*.cpp -spacy/orthography/*.cpp -ext/murmurhash.cpp -ext/sparsehash.cpp +# Website +website/www/ +website/_deploy.sh +website/package.json +website/announcement.jade +website/.gitignore -/spacy/data/ - -_build/ -.env/ -tmp/ +# Cython / C extensions cythonize.json - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] - -# C extensions +spacy/*.html +*.cpp *.so -# Distribution / packaging +# Vim / VSCode / editors +*.swp +*.sw* +Profile.prof +.vscode +.sass-cache + +# Python .Python +.python-version +__pycache__/ +*.py[cod] +.env/ +.~env/ +.venv +venv/ +.dev +.denv +.pypyenv + +# Distribution / packaging env/ bin/ build/ @@ -59,6 +52,12 @@ var/ *.egg-info/ .installed.cfg *.egg +.eggs +MANIFEST + +# Temporary files +*.~* +tmp/ # Installer logs pip-log.txt @@ -87,25 +86,16 @@ coverage.xml *.log *.pot -# Windows local helper files +# Windows *.bat +Thumbs.db +Desktop.ini # Mac OS X *.DS_Store -# Temporary files / Dropbox hack -*.~* - # Komodo project files *.komodoproject -# Website -website/_deploy.sh -website/package.json -website/announcement.jade -website/www/ -website/.gitignore - -# Python virtualenv -venv -venv/* +# Other +*.tgz From 
d40b0839345dabd4cf8c1d981640c15d86ddc5dc Mon Sep 17 00:00:00 2001 From: Niko Rebenich Date: Thu, 18 May 2017 14:50:43 -0700 Subject: [PATCH 021/195] Print list comprehension Turn the generator expression into a list comprehension before printing --- website/docs/usage/language-processing-pipeline.jade | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index c372dfbf4..3ddf9179c 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -17,10 +17,10 @@ p | trying to do. +code. - import spacy # See "Installing spaCy" - nlp = spacy.load('en') # You are here. - doc = nlp(u'Hello, spacy!') # See "Using the pipeline" - print((w.text, w.pos_) for w in doc) # See "Doc, Span and Token" + import spacy # See "Installing spaCy" + nlp = spacy.load('en') # You are here. + doc = nlp(u'Hello, spacy!') # See "Using the pipeline" + print([(w.text, w.pos_) for w in doc]) # See "Doc, Span and Token" +aside("Why do we have to preload?") | Loading the models takes ~200x longer than From 7e4f31c36224d9c97539e456232a30780b3472aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 19 May 2017 21:12:41 +0200 Subject: [PATCH 022/195] Deleting (legacy?) whitespace attribute token.whitespace raises an AttributeError --- website/docs/api/token.jade | 5 ----- 1 file changed, 5 deletions(-) diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 7a09f9d11..de3498353 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -238,11 +238,6 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc. +cell #[code text_with_ws] +cell unicode +cell Text content, with trailing space character if present. - - +row - +cell #[code whitespace] - +cell int - +cell Trailing space character if present. +row +cell #[code whitespace_] +cell unicode From a3302873048ba6f549ff4f9aa7b2a9bfe78e26bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 19 May 2017 21:17:31 +0200 Subject: [PATCH 023/195] Add Token.orth and Token.orth_ description in doc --- website/docs/api/token.jade | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 7a09f9d11..ee837b54a 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -67,6 +67,16 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc. +cell unicode +cell Base form of the word, with no inflectional suffixes. + +row + +cell #[code orth] + +cell int + +cell word's string. + + +row + +cell #[code orth_] + +cell unicode + +cell word's string. 
+ +row +cell #[code lower] +cell int From af3d121ec9f8b4320521b783346a84f66adce0ce Mon Sep 17 00:00:00 2001 From: Yuval Pinter Date: Mon, 22 May 2017 10:56:03 -0400 Subject: [PATCH 024/195] extend suffixes from first to last reverse suffix list in `tokenizer_pseudo_code()` so the order of returned tokens matches input order --- website/docs/usage/customizing-tokenizer.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index d43fb438f..b1fbba652 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -113,7 +113,7 @@ p else: tokens.append(substring) substring = '' - tokens.extend(suffixes) + tokens.extend(reversed(suffixes)) return tokens p From 7f6be41f212c2a6f65612beeccf170665e0ba106 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 23 May 2017 12:18:00 +0200 Subject: [PATCH 025/195] Fix typo in English tokenizer exceptions (resolves #1071) --- spacy/en/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 3d009241b..d9aa01734 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -178,7 +178,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: EXC[orth + "ve"] = [ {ORTH: orth, LEMMA: word}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + {ORTH: "ve", LEMMA: "have", TAG: "VB"} ] EXC[orth + "'d"] = [ From 68b387ffc30b6b542c4a09cc38a2d3a89bbac77a Mon Sep 17 00:00:00 2001 From: Yuval Pinter Date: Tue, 23 May 2017 10:46:17 -0400 Subject: [PATCH 026/195] Fixed link link to Doc API documentation fixed --- website/docs/usage/deep-learning.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/deep-learning.jade b/website/docs/usage/deep-learning.jade index fec01b4ba..739cf858c 100644 --- a/website/docs/usage/deep-learning.jade +++ b/website/docs/usage/deep-learning.jade @@ -36,7 +36,7 @@ p | to #[code spacy.load()]. The function should take a | #[code spacy.language.Language] object as its only argument, and return | a sequence of callables. Each callable should accept a - | #[+api("docs") #[code Doc]] object, modify it in place, and return + | #[+api("doc") #[code Doc]] object, modify it in place, and return | #[code None]. p From cb418c7aef475607088b80d5acf083198b5ee2d4 Mon Sep 17 00:00:00 2001 From: Yuval Pinter Date: Tue, 23 May 2017 10:54:13 -0400 Subject: [PATCH 027/195] Fixed span example error Span as written gives empty text. --- website/docs/api/doc.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 72fe34f8c..adcd111a3 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -103,7 +103,7 @@ p Get a #[code Token] object. doc = nlp(u'Give it back! He pleaded.') assert doc[0].text == 'Give' assert doc[-1].text == '.' 
- span = doc[1:1] + span = doc[1:3] assert span.text == 'it back' +table(["Name", "Type", "Description"]) From ab83dd5d25498a3814a2b302f0f75d66e8b935de Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 17:53:41 +0200 Subject: [PATCH 028/195] Fix lightning tour example --- website/docs/usage/lightning-tour.jade | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 967d0c61e..138b0058d 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -148,24 +148,20 @@ p +code. def put_spans_around_tokens(doc, get_classes): - '''Given some function to compute class names, put each token in a - span element, with the appropriate classes computed. - - All whitespace is preserved, outside of the spans. (Yes, I know HTML - won't display it. But the point is no information is lost, so you can - calculate what you need, e.g.
<br /> tags, <p>
tags, etc.) - ''' + """Given some function to compute class names, put each token in a + span element, with the appropriate classes computed. All whitespace is + preserved, outside of the spans. (Of course, HTML won't display more than + one whitespace character it – but the point is, no information is lost + and you can calculate what you need, e.g. <br />, <p> etc.) + """ output = [] - template = '{word}{space}' + html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: - output.append(token.orth_) + output.append(token.text) else: - output.append( - template.format( - classes=' '.join(get_classes(token)), - word=token.orth_, - space=token.whitespace_)) + classes = ' '.join(get_classes(token)) + output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) string = ''.join(output) string = string.replace('\n', '') string = string.replace('\t', ' ') From 1e918b871cb4b7144979a223c94b1eee611e5da7 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 17:53:47 +0200 Subject: [PATCH 029/195] Remove infoboxes --- website/docs/api/_annotation/_dep-labels.jade | 5 ----- website/docs/api/_annotation/_named-entities.jade | 5 ----- website/docs/api/_annotation/_pos-tags.jade | 5 ----- 3 files changed, 15 deletions(-) diff --git a/website/docs/api/_annotation/_dep-labels.jade b/website/docs/api/_annotation/_dep-labels.jade index 9e1e89324..427b2f53a 100644 --- a/website/docs/api/_annotation/_dep-labels.jade +++ b/website/docs/api/_annotation/_dep-labels.jade @@ -1,10 +1,5 @@ //- đŸ’Ģ DOCS > API > ANNOTATION > DEPENDENCY LABELS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a label. For example, - | #[code spacy.explain("prt")] will return "particle". - +h(3, "dependency-parsing-english") English dependency labels p diff --git a/website/docs/api/_annotation/_named-entities.jade b/website/docs/api/_annotation/_named-entities.jade index 68b3bd17d..476659d4a 100644 --- a/website/docs/api/_annotation/_named-entities.jade +++ b/website/docs/api/_annotation/_named-entities.jade @@ -1,10 +1,5 @@ //- đŸ’Ģ DOCS > API > ANNOTATION > NAMED ENTITIES -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of an entity label. For example, - | #[code spacy.explain("LANGUAGE")] will return "any named language". - +table([ "Type", "Description" ]) +row +cell #[code PERSON] diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index d3ceef777..ea3a225bf 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -1,10 +1,5 @@ //- đŸ’Ģ DOCS > API > ANNOTATION > POS TAGS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a tag. For example, - | #[code spacy.explain("RB")] will return "adverb". 
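The infobox text removed above describes spacy.explain(), which maps a tag or label string to a short human-readable description. For reference, a minimal sketch of that helper in use; the outputs shown are the ones quoted in the removed infoboxes, and it assumes spaCy v1.8.3+ is installed:

    import spacy

    # spacy.explain() looks up descriptions for tag, dependency and entity label strings
    print(spacy.explain("RB"))        # 'adverb'
    print(spacy.explain("prt"))       # 'particle'
    print(spacy.explain("LANGUAGE"))  # 'any named language'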
- +h(3, "pos-tagging-english") English part-of-speech tag scheme p From a6d99f8dabeca07f339bedec3d48f0bad7b45be9 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 17:56:18 +0200 Subject: [PATCH 030/195] Add prefix to option IDs to avoid conflicts --- website/_includes/_mixins-base.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 359839d67..106f8f1ca 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -136,8 +136,8 @@ mixin quickstart(groups, headline) .c-quickstart__legend=group.title .c-quickstart__fields for option in group.options - input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked) - label.c-quickstart__label(for=option.id)=option.title + input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) + label.c-quickstart__label(for="qs-#{option.id}")=option.title if option.meta | #[span.c-quickstart__label__meta (#{option.meta})] if option.help From 36b20d66bfab75cfc931358688ba17c79fd1d7cc Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 18:11:49 +0200 Subject: [PATCH 031/195] Add alpha banner --- website/assets/img/graphics.svg | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg index c24473b4c..a449c3d04 100644 --- a/website/assets/img/graphics.svg +++ b/website/assets/img/graphics.svg @@ -1,5 +1,16 @@ + + spaCy v2.0.0 alpha + + + + + + + + + spaCy user survey 2017 From 5b385e7d78fd955d97b59024645d2592bdbc0949 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Fri, 2 Jun 2017 08:14:06 +0200 Subject: [PATCH 032/195] feat(spanish model): add the spanish noun chunker --- spacy/syntax/iterators.pyx | 55 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index e1c44da7f..b0d1c78ca 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from ..parts_of_speech cimport NOUN, PROPN, PRON +from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX def english_noun_chunks(obj): @@ -66,4 +66,55 @@ def german_noun_chunks(obj): yield word.left_edge.i, rbracket, np_label -CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks} +def es_noun_chunks(obj): + + doc = obj.doc + np_label = doc.vocab.strings['NP'] + + left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed'] + right_labels = ['flat', 'fixed', 'compound', 'neg'] + stop_labels = ['punct'] + + np_left_deps = [doc.vocab.strings[label] for label in left_labels] + np_right_deps = [doc.vocab.strings[label] for label in right_labels] + stop_deps = [doc.vocab.strings[label] for label in stop_labels] + + def next_token(token): + try: + return token.nbor() + except: + return None + + def noun_bounds(root): + + def is_verb_token(token): + return token.pos in [VERB, AUX] + + left_bound = root + for token in reversed(list(root.lefts)): + if token.dep in np_left_deps: + left_bound = token + + right_bound = root + for token in root.rights: + if (token.dep in np_right_deps): + left, right = 
noun_bounds(token) + + if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, doc[left_bound.i: right.i])): + break + else: + right_bound = right + + return left_bound, right_bound + + + token = doc[0] + while token and token.i < len(doc): + if token.pos in [PROPN, NOUN, PRON]: + left, right = noun_bounds(token) + yield left.i, right.i+1, np_label + token = right + token = next_token(token) + + +CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks} From 70a21801994d7c9023f050ecfa2e3ec8a5d52d04 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Fri, 2 Jun 2017 08:19:57 +0200 Subject: [PATCH 033/195] fix(spanish sentence segmentation): remove tokenizer exceptions the break sentence segmentation. Aligned with training corpus --- spacy/es/tokenizer_exceptions.py | 33 ++------------------------------ 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py index e60bcd104..fb274f907 100644 --- a/spacy/es/tokenizer_exceptions.py +++ b/spacy/es/tokenizer_exceptions.py @@ -6,44 +6,15 @@ from ..language_data import PRON_LEMMA, DET_LEMMA TOKENIZER_EXCEPTIONS = { - "al": [ - {ORTH: "a", LEMMA: "a", TAG: ADP}, - {ORTH: "el", LEMMA: "el", TAG: DET} - ], - - "consigo": [ - {ORTH: "con", LEMMA: "con"}, - {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"} - ], - - "conmigo": [ - {ORTH: "con", LEMMA: "con"}, - {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"} - ], - - "contigo": [ - {ORTH: "con", LEMMA: "con"}, - {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"} - ], - - "del": [ - {ORTH: "de", LEMMA: "de", TAG: ADP}, - {ORTH: "l", LEMMA: "el", TAG: DET} - ], - - "pel": [ - {ORTH: "pe", LEMMA: "per", TAG: ADP}, - {ORTH: "l", LEMMA: "el", TAG: DET} - ], "pal": [ {ORTH: "pa", LEMMA: "para"}, - {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"} + {ORTH: "el", LEMMA: DET_LEMMA, NORM: "el"} ], "pala": [ {ORTH: "pa", LEMMA: "para"}, - {ORTH: "la", LEMMA: DET_LEMMA} + {ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"} ], "aprox.": [ From 86277d4ef2efb80f8a30559700c9e724a1c08aff Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:13:59 +0200 Subject: [PATCH 034/195] Create appveyor.yml --- appveyor.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 000000000..2cca96974 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,9 @@ +branches: + only: + - master + - develop + +notifications: + - provider: Slack + incoming_webhook: https://hooks.slack.com/services/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 + channel: '#dev' From 0404b5f43b5821196d4a5e8b9d4d3cfcfef260b6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:18:51 +0200 Subject: [PATCH 035/195] Update appveyor.yml --- appveyor.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index 2cca96974..30626c977 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,3 +1,5 @@ +build: off + branches: only: - master From 3c2cce8efc56ed00759c2ad10076c2f6cde42585 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:27:36 +0200 Subject: [PATCH 036/195] Update appveyor.yml --- appveyor.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 30626c977..d1c70a166 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -4,8 +4,3 @@ branches: only: - master - develop - -notifications: - - provider: Slack - incoming_webhook: 
https://hooks.slack.com/services/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 - channel: '#dev' From 3e16535fef0c6025ad937261e9eb3276df3d1e60 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:31:31 +0200 Subject: [PATCH 037/195] Update appveyor.yml --- appveyor.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index d1c70a166..4dd7b0a31 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1 @@ build: off - -branches: - only: - - master - - develop From af466496f1f659236eee60e7aea22c3d7a0f4440 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:33:57 +0200 Subject: [PATCH 038/195] Rename appveyor.yml to .appveyor.yml --- appveyor.yml => .appveyor.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename appveyor.yml => .appveyor.yml (100%) diff --git a/appveyor.yml b/.appveyor.yml similarity index 100% rename from appveyor.yml rename to .appveyor.yml From c4e62c76519f534ff83d67f30275f164762ebd1e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:39:44 +0200 Subject: [PATCH 039/195] Update README.rst --- README.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 24b0c232a..4a34770e8 100644 --- a/README.rst +++ b/README.rst @@ -15,7 +15,11 @@ MIT license. .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy - :alt: Build Status + :alt: Travis Build Status + +.. image:: https://img.shields.io/appveyor/ci/explosion/spacy.svg?style=flat-square + :target: https://ci.appveyor.com/project/explosion/spacy + :alt: Appveyor Build Status .. image:: https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square :target: https://github.com/explosion/spaCy/releases From 83467a00a76ace3b1e858374bbcd5081555447ee Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:42:22 +0200 Subject: [PATCH 040/195] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 4a34770e8..76bd333d8 100644 --- a/README.rst +++ b/README.rst @@ -17,7 +17,7 @@ MIT license. :target: https://travis-ci.org/explosion/spaCy :alt: Travis Build Status -.. image:: https://img.shields.io/appveyor/ci/explosion/spacy.svg?style=flat-square +.. 
image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square :target: https://ci.appveyor.com/project/explosion/spacy :alt: Appveyor Build Status From e7ef51b3828f9acf0dc815c2d5eeddca5eda3d28 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 19:00:01 +0200 Subject: [PATCH 041/195] Update tokenizer_exceptions.py --- spacy/es/tokenizer_exceptions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py index fb274f907..f9c955338 100644 --- a/spacy/es/tokenizer_exceptions.py +++ b/spacy/es/tokenizer_exceptions.py @@ -6,10 +6,9 @@ from ..language_data import PRON_LEMMA, DET_LEMMA TOKENIZER_EXCEPTIONS = { - "pal": [ {ORTH: "pa", LEMMA: "para"}, - {ORTH: "el", LEMMA: DET_LEMMA, NORM: "el"} + {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"} ], "pala": [ From e66cd9cc70f40288aef24b241d22fadbd5c7bc59 Mon Sep 17 00:00:00 2001 From: Pascal van Kooten Date: Mon, 5 Jun 2017 20:41:28 +0200 Subject: [PATCH 042/195] for easy copy & paste --- website/docs/usage/rule-based-matching.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index aea943a61..db7c70608 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -19,11 +19,11 @@ p Here's a minimal example. We first add a pattern that specifies three tokens: p | Once we've added the pattern, we can use the #[code matcher] as a | callable, to receive a list of #[code (ent_id, start, end)] tuples. - | Note that #[code LOWER] and #[code IS_PUNCT] are data attributes - | of #[code spacy.attrs]. +code. from spacy.matcher import Matcher + from spacy.attrs import IS_PUNCT, LOWER + matcher = Matcher(nlp.vocab) matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}]) From 4cbe55622d921569943147a6a0eb1ab3fb489686 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Jun 2017 21:32:36 +0200 Subject: [PATCH 043/195] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 76bd333d8..3eeca36bc 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,7 @@ Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software, released under the MIT license. -📊 **Help us improve the library!** `Take the spaCy user survey `_. +⭐ī¸ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes here. `_ đŸ’Ģ **Version 1.8 out now!** `Read the release notes here. 
`_ From 99d02b2bb626c80ef53d1d03e226aee322b6fa57 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 6 Jun 2017 03:20:20 +0200 Subject: [PATCH 044/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b64dc8db3..ea6096a52 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -16,6 +16,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo) * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi) * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28) +* Francisco Aranda, [@frascuchon](https://github.com/frascuchon) * Greg Baker, [@solresol](https://github.com/solresol) * GrÊgory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard) * GyÃļrgy Orosz, [@oroszgy](https://github.com/oroszgy) From 6071c727d263d97c9251b076858d24de30da5f78 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 6 Jun 2017 12:49:17 +0200 Subject: [PATCH 045/195] Add more env options to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b165abf4b..2209f5b4a 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,8 @@ Profile.prof __pycache__/ *.py[cod] .env/ +.env2/ +.env3/ .~env/ .venv venv/ From 6ef04afdc89eab35e6351c90b8be2c56c025324b Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 6 Jun 2017 12:49:25 +0200 Subject: [PATCH 046/195] Update docs with Spanish model --- website/_harp.json | 7 ++++--- website/docs/usage/_models-list.jade | 1 + website/docs/usage/index.jade | 1 + website/index.jade | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index ef2e48239..cb476541a 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -14,8 +14,8 @@ "SPACY_VERSION": "1.8", "LATEST_NEWS": { - "url": "https://survey.spacy.io/", - "title": "Take the spaCy user survey and help us improve the library!" + "url": "/docs/usage/models", + "title": "The first official Spanish model is here!" 
}, "SOCIAL": { @@ -76,7 +76,8 @@ { "id": "model", "title": "Models", "multiple": true, "options": [ { "id": "en", "title": "English", "meta": "50MB" }, { "id": "de", "title": "German", "meta": "645MB" }, - { "id": "fr", "title": "French", "meta": "1.33GB" }] + { "id": "fr", "title": "French", "meta": "1.33GB" }, + { "id": "es", "title": "Spanish", "meta": "377MB"}] } ], diff --git a/website/docs/usage/_models-list.jade b/website/docs/usage/_models-list.jade index 942de28c4..36de137e5 100644 --- a/website/docs/usage/_models-list.jade +++ b/website/docs/usage/_models-list.jade @@ -25,3 +25,4 @@ p +model-row("en_vectors_glove_md", "English", [1, 0, 0, 1], "727 MB", "CC BY-SA") +model-row("de_core_news_md", "German", [1, 1, 1, 1], "645 MB", "CC BY-SA", true, true) +model-row("fr_depvec_web_lg", "French", [1, 1, 0, 1], "1.33 GB", "CC BY-NC", true, true) + +model-row("es_core_web_md", "Spanish", [1, 1, 1, 1], "377 MB", "CC BY-SA", true, true) diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index da13f4d81..9ad2fde5f 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -34,6 +34,7 @@ p +qs({model: 'en'}) python -m spacy download en +qs({model: 'de'}) python -m spacy download de +qs({model: 'fr'}) python -m spacy download fr + +qs({model: 'es'}) python -m spacy download es +h(2, "installation") Installation instructions diff --git a/website/index.jade b/website/index.jade index 17b564b42..df5428316 100644 --- a/website/index.jade +++ b/website/index.jade @@ -11,7 +11,7 @@ include _includes/_mixins h2.c-landing__title.o-block.u-heading-1 | in Python - +landing-badge("https://survey.spacy.io", "usersurvey", "Take the user survey!") + +landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!") +grid.o-content +grid-col("third").o-card From 3cceabbf32df2897eff6d694da9284e87dbea735 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 6 Jun 2017 14:39:54 +0200 Subject: [PATCH 047/195] Update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3eeca36bc..0f3efc146 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ spaCy: Industrial-strength NLP spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. It was designed from day one to be used in real products. spaCy currently supports -English, German and French, as well as tokenization for Spanish, Italian, +English, German, French and Spanish, as well as tokenization for Italian, Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software, released under the MIT license. 
@@ -89,7 +89,7 @@ Features * GIL-free **multi-threading** * Efficient binary serialization * Easy **deep learning** integration -* Statistical models for **English** and **German** +* Statistical models for **English**, **German**, **French** and **Spanish** * State-of-the-art speed * Robust, rigorously evaluated accuracy From fa26041da62321d6f34306ce4f287cc938f27b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Orosz?= Date: Wed, 7 Jun 2017 16:19:08 +0200 Subject: [PATCH 048/195] Fixed typo in cli/package.py --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 102b07472..74086613a 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -47,7 +47,7 @@ def package(input_dir, output_dir, meta_path, force): def check_dirs(input_path, output_path, meta_path): if not input_path.exists(): - util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found") + util.sys_exit(unicode_(input_path.as_posix()), title="Model directory not found") if not output_path.exists(): util.sys_exit(unicode_(output_path), title="Output directory not found") if meta_path and not meta_path.exists(): From e55199d454c6490775cc3403793768da88b0d2dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 9 May 2017 22:50:50 +0200 Subject: [PATCH 049/195] Implementation of Pattern --- spacy/compat.py | 6 + spacy/pattern/__init__.py | 4 + spacy/pattern/parser.py | 364 ++++++++++++++++++++++++++++++++++++++ spacy/pattern/pattern.py | 312 ++++++++++++++++++++++++++++++++ 4 files changed, 686 insertions(+) create mode 100644 spacy/pattern/__init__.py create mode 100644 spacy/pattern/parser.py create mode 100644 spacy/pattern/pattern.py diff --git a/spacy/compat.py b/spacy/compat.py index 1ca8a59fe..8d962976b 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -16,6 +16,10 @@ try: except ImportError: import copyreg as copy_reg +try: + import Queue as queue +except ImportError: + import queue is_python2 = six.PY2 is_python3 = six.PY3 @@ -32,6 +36,7 @@ if is_python2: basestring_ = basestring input_ = raw_input json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8') + intern = intern elif is_python3: bytes_ = bytes @@ -39,6 +44,7 @@ elif is_python3: basestring_ = str input_ = input json_dumps = lambda data: ujson.dumps(data, indent=2) + intern = sys.intern def symlink_to(orig, dest): diff --git a/spacy/pattern/__init__.py b/spacy/pattern/__init__.py new file mode 100644 index 000000000..325ba04ea --- /dev/null +++ b/spacy/pattern/__init__.py @@ -0,0 +1,4 @@ +# coding: utf-8 + +from .pattern import DependencyTree +from .parser import PatternParser diff --git a/spacy/pattern/parser.py b/spacy/pattern/parser.py new file mode 100644 index 000000000..9ebb9bd5c --- /dev/null +++ b/spacy/pattern/parser.py @@ -0,0 +1,364 @@ +# coding: utf-8 + +from spacy.compat import intern, queue +from operator import itemgetter +import re +from hashlib import md5 +import json + +from .pattern import DependencyPattern + +TOKEN_INITIAL = intern('initial') + + +class PatternParser(object): + """Compile a Pattern query into a :class:`Pattern`, that can be used to + match :class:`DependencyTree`s.""" + whitespace_re = re.compile(r'\s+', re.U) + newline_re = re.compile(r'(\r\n|\r|\n)') + name_re = re.compile(r'\w+', re.U) + + TOKEN_BLOCK_BEGIN = '[' + TOKEN_BLOCK_END = ']' + EDGE_BLOCK_BEGIN = '>' + WHITESPACE = ' ' + + @classmethod + def parse(cls, query): + """Parse the given `query`, and compile 
it into a :class:`Pattern`.""" + pattern = DependencyPattern() + + for lineno, token_stream in enumerate(cls.tokenize(query)): + try: + cls._parse_line(token_stream, pattern, lineno+1) + except StopIteration: + raise SyntaxError("A token is missing, please check your " + "query.") + + if not pattern.nodes: + return + + return pattern + + @classmethod + def _parse_line(cls, stream, pattern, lineno): + while not stream.closed: + token = stream.current + + if token.type == 'name': + next_token = stream.look() + + if next_token.type == 'node': + cls.parse_node_def(stream, pattern) + + elif next_token.type == 'edge': + cls.parse_edge_def(stream, pattern) + + else: + raise SyntaxError("line %d: A 'node' or 'edge' token must " + "follow a 'name' token." % lineno) + + elif token.type == 'node': + next_token = stream.look() + + if next_token.type == 'edge': + cls.parse_edge_def(stream, pattern) + else: + raise SyntaxError("line %d: an 'edge' token is " + "expected." % lineno) + + if not stream.closed: + next(stream) + + @classmethod + def parse_node_def(cls, stream, pattern): + name_token = stream.current + next(stream) + node_token = stream.current + cls.add_node(node_token, pattern, name_token) + + @classmethod + def add_node(cls, node_token, pattern, name_token=None): + token_name = None + if name_token is not None: + token_id = name_token.value + token_name = name_token.value + else: + token_id = node_token.hash() + + if token_id in pattern.nodes: + raise SyntaxError("Token with ID '{}' already registered.".format( + token_id)) + + token_attr = cls.parse_node_attributes(node_token.value) + token_attr['_name'] = token_name + pattern.add_node(token_id, token_attr) + + @classmethod + def parse_edge_def(cls, stream, pattern): + token = stream.current + + if token.type == 'name': + token_id = token.value + if token_id not in pattern.nodes: + raise SyntaxError("Token '{}' with ID '{}' is not " + "defined.".format(token, token_id)) + + elif token.type == 'node': + token_id = token.hash() + cls.add_node(token, pattern) + + next(stream) + edge_attr = cls.parse_edge_attributes(stream.current.value) + next(stream) + + head_token = stream.current + if head_token.type == 'name': + head_token_id = head_token.value + if head_token_id not in pattern.nodes: + raise SyntaxError("Token '{}' with ID '{}' is not " + "defined.".format(head_token, head_token_id)) + elif head_token.type == 'node': + head_token_id = head_token.hash() + cls.add_node(head_token, pattern) + else: + raise SyntaxError("A 'node' or 'name' token was expected.") + + # inverse the dependency to have an actual tree + pattern.add_edge(head_token_id, token_id, edge_attr) + + @classmethod + def parse_node_attributes(cls, string): + string = string[1:] # remove the trailing '[' + end_delimiter_idx = string.find(']') + + attr_str = string[:end_delimiter_idx] + attr = {} + + try: + attr = json.loads(attr_str) + except json.JSONDecodeError: + for pair in attr_str.split(","): + key, value = pair.split(':') + attr[key] = value + + for key, value in attr.items(): + attr[key] = cls.compile_expression(value) + + alias = string[end_delimiter_idx+2:] + + if alias: + attr['_alias'] = alias + + return attr + + @classmethod + def parse_edge_attributes(cls, string): + string = string[1:] # remove the trailing '>' + + if not string: + return None + + return cls.compile_expression(string) + + @staticmethod + def compile_expression(expr): + if expr.startswith('/') and expr.endswith('/'): + string = expr[1:-1] + return re.compile(string, re.U) + + return expr + + 
@classmethod + def tokenize(cls, text): + lines = text.splitlines() + + for lineno, line in enumerate(lines): + yield TokenStream(cls._tokenize_line(line, lineno+1)) + + @classmethod + def _tokenize_line(cls, line, lineno): + reader = Reader(line) + + while reader.remaining(): + char = reader.next() + + if char == cls.TOKEN_BLOCK_BEGIN: + token = 'node' + idx = reader.find(cls.TOKEN_BLOCK_END) + + if idx == -1: + raise SyntaxError("A token block end ']' was expected.") + + idx += 1 + if len(reader) > idx and reader[idx] == '=': + # The node has a name + idx = reader.find(cls.WHITESPACE, start=idx) + + if idx == -1: + idx = reader.remaining() + + elif char == cls.EDGE_BLOCK_BEGIN: + token = 'edge' + idx = reader.find(cls.WHITESPACE) + + elif cls.name_re.match(char): + token = 'name' + idx = reader.find(cls.WHITESPACE) + + if idx == -1: + whole_name_match = cls.name_re.match(str(reader)) + idx = whole_name_match.end() + + elif cls.newline_re.match(char) or cls.whitespace_re.match(char): + # skip the whitespace + reader.consume() + continue + + else: + raise SyntaxError("Unrecognized token BEGIN char: '{" + "}'".format(char)) + + if idx == -1: + raise SyntaxError("Ending character of token '{}' not " + "found.".format(token)) + value = reader.consume(idx) + + yield Token(lineno, token, value) + + +class Token(tuple): + """Token class.""" + __slots__ = () + lineno, type, value = (property(itemgetter(x)) for x in range(3)) + + def __new__(cls, lineno, type, value): + return tuple.__new__(cls, (lineno, intern(str(type)), value)) + + def hash(self): + string = str(self.value) + return md5(string.encode('utf-8')).hexdigest() + + def __repr__(self): + return 'Token(%r, %r, %r)' % ( + self.lineno, + self.type, + self.value) + + +class Reader(object): + """A class used by the :class:`PatternParser` to tokenize the `text`.""" + __slots__ = ('text', 'pos') + + def __init__(self, text): + self.text = text + self.pos = 0 + + def find(self, needle, start=0, end=None): + pos = self.pos + start += pos + if end is None: + index = self.text.find(needle, start) + else: + end += pos + index = self.text.find(needle, start, end) + if index != -1: + index -= pos + return index + + def consume(self, count=1): + new_pos = self.pos + count + s = self.text[self.pos:new_pos] + self.pos = new_pos + return s + + def next(self): + return self.text[self.pos:self.pos+1] + + def remaining(self): + return len(self.text) - self.pos + + def __len__(self): + return self.remaining() + + def __getitem__(self, key): + if key < 0: + return self.text[key] + else: + return self.text[self.pos + key] + + def __str__(self): + return self.text[self.pos:] + + +class TokenStreamIterator(object): + """The iterator for tokenstreams. Iterate over the stream until the + stream is empty. + """ + + def __init__(self, stream): + self.stream = stream + + def __iter__(self): + return self + + def __next__(self): + token = self.stream.current + try: + next(self.stream) + except StopIteration: + self.stream.close() + raise StopIteration() + + return token + + +class TokenStream(object): + """A token stream is an iterable that yields :class:`Token`s. The + current active token is stored as :attr:`current`. 
+ """ + + def __init__(self, generator): + self._iter = iter(generator) + self._pushed = queue.deque() + self.closed = False + self.current = Token(1, TOKEN_INITIAL, '') + next(self) + + def __iter__(self): + return TokenStreamIterator(self) + + def __bool__(self): + return bool(self._pushed) + __nonzero__ = __bool__ # py2 + + def push(self, token): + """Push a token back to the stream.""" + self._pushed.append(token) + + def look(self): + """Look at the next token.""" + old_token = next(self) + result = self.current + self.push(result) + self.current = old_token + return result + + def __next__(self): + """Go one token ahead and return the old one.""" + rv = self.current + if self._pushed: + self.current = self._pushed.popleft() + else: + if self.closed: + raise StopIteration("No token left.") + try: + self.current = next(self._iter) + except StopIteration: + self.close() + return rv + + def close(self): + """Close the stream.""" + self._iter = None + self.closed = True diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py new file mode 100644 index 000000000..42a15f769 --- /dev/null +++ b/spacy/pattern/pattern.py @@ -0,0 +1,312 @@ +# coding: utf-8 + +import logging +from collections import defaultdict + + +logger = logging.getLogger(__name__) + + +class Tree(object): + def __init__(self): + self.adjacency = defaultdict(dict) + self.nodes = {} + + def __getitem__(self, item): + return self.nodes[item] + + def number_of_nodes(self): + return len(self) + + def __len__(self): + return len(self.nodes) + + def number_of_edges(self): + return sum(len(adj_dict) for adj_dict in self.adjacency.values()) + + def edges_iter(self, origin=None, data=True): + nbunch = (self.adjacency.items() if origin is None + else [(origin, self.adjacency[origin])]) + + for u, nodes in nbunch: + for v, dep in nodes.items(): + if data: + yield (u, v, dep) + else: + yield (u, v) + + def nodes_iter(self): + for node in self.nodes.keys(): + yield node + + def is_connected(self): + if len(self) == 0: + raise ValueError('Connectivity is undefined for the null graph.') + return len(set(self._plain_bfs(next(self.nodes_iter()), + undirected=True))) == len(self) + + def _plain_bfs(self, source, undirected=False): + """A fast BFS node generator. 
+ :param: source: the source node + """ + seen = set() + next_level = {source} + while next_level: + this_level = next_level + next_level = set() + for v in this_level: + if v not in seen: + yield v + seen.add(v) + next_level.update(self.adjacency[v].keys()) + + if undirected: + for n, adj in self.adjacency.items(): + if v in adj.keys(): + next_level.add(n) + + +class DependencyPattern(Tree): + def add_node(self, node, attr_dict=None): + attr_dict = attr_dict or {} + self.nodes[node] = attr_dict + + def add_edge(self, u, v, dep=None): + if u not in self.nodes or v not in self.nodes: + raise ValueError("Each node must be defined before adding an edge.") + + self.adjacency[u][v] = dep + + @property + def root_node(self): + if self.number_of_nodes() == 1: + # if the graph has a single node, it is the root + return next(iter(self.nodes.keys())) + + if not self.is_connected(): + return None + + in_node = set() + out_node = set() + for u, v in self.edges_iter(data=False): + in_node.add(v) + out_node.add(u) + + try: + return list(out_node.difference(in_node))[0] + except IndexError: + return None + + +class DependencyTree(Tree): + def __init__(self, doc): + super(DependencyTree, self).__init__() + + for token in doc: + self.nodes[token.i] = token + # inverse the dependency to have an actual tree + self.adjacency[token.head.i][token.i] = token.dep_ + + def __getitem__(self, item): + return self.nodes[item] + + def match_nodes(self, attr_dict, **kwargs): + results = [] + for token_idx, token in self.nodes.items(): + if match_token(token, attr_dict, **kwargs): + results.append(token_idx) + + return results + + def match(self, pattern): + """Return a list of matches between the given + :class:`DependencyPattern` and `self` if any, or None. + + :param pattern: a :class:`DependencyPattern` + """ + pattern_root_node = pattern.root_node + pattern_root_node_attr = pattern[pattern_root_node] + dep_root_nodes = self.match_nodes(pattern_root_node_attr) + + matches = [] + for candidate_root_node in dep_root_nodes: + match_list = subtree_in_graph(candidate_root_node, self, + pattern_root_node, pattern) + for mapping in match_list: + match = PatternMatch(mapping, pattern, self) + matches.append(match) + + return matches + + +class PatternMatch(object): + def __init__(self, mapping, pattern, tree): + for pattern_node_id, tree_node_id in mapping.items(): + mapping[pattern_node_id] = tree[tree_node_id] + self.mapping = mapping + self.pattern = pattern + self.tree = tree + + self.alias_map = {} + for pattern_node_id in self.mapping: + pattern_node = self.pattern[pattern_node_id] + + alias = pattern_node.get('_alias') + if alias: + self.alias_map[alias] = self.mapping[pattern_node_id] + + def __repr__(self): + return "".format(len(self.mapping)) + + def __getitem__(self, item): + return self.alias_map[item] + + +def subtree_in_graph(dep_tree_node, dep_tree, pattern_node, pattern): + """Return a list of matches of `pattern` as a subtree of `dep_tree`. 
+ :param dep_tree_node: the token (identified by its index) to start from + (int) + :param dep_tree: a :class:`DependencyTree` + :param pattern_node: the pattern node to start from + :param pattern: a :class:`DependencyPattern` + :return: found matches (list) + """ + results = [] + association_dict = {pattern_node: dep_tree_node} + _subtree_in_graph(dep_tree_node, dep_tree, pattern_node, + pattern, results=results, + association_dict=association_dict) + results = results or [] + return results + + +def _subtree_in_graph(dep_tree_node, dep_tree, pattern_node, pattern, + association_dict=None, results=None): + token = dep_tree[dep_tree_node] + logger.debug("Starting from token '{}'".format(token.orth_)) + + adjacent_edges = list(pattern.edges_iter(origin=pattern_node)) + if adjacent_edges: + for (_, adjacent_pattern_node, + dep) in adjacent_edges: + adjacent_pattern_node_attr = pattern[adjacent_pattern_node] + logger.debug("Exploring relation {} -[{}]-> {} from " + "pattern".format(pattern[pattern_node], + dep, + adjacent_pattern_node_attr)) + + adjacent_nodes = find_adjacent_nodes(dep_tree, + dep_tree_node, + dep, + adjacent_pattern_node_attr) + + if not adjacent_nodes: + logger.debug("No adjacent nodes in dep_tree satisfying these " + "conditions.") + return None + + for adjacent_node in adjacent_nodes: + logger.debug("Found adjacent node '{}' in " + "dep_tree".format(dep_tree[adjacent_node].orth_)) + association_dict[adjacent_pattern_node] = adjacent_node + recursive_return = _subtree_in_graph(adjacent_node, + dep_tree, + adjacent_pattern_node, + pattern, + association_dict, + results=results) + + if recursive_return is None: + # No Match + return None + + association_dict, results = recursive_return + + else: + if len(association_dict) == pattern.number_of_nodes(): + logger.debug("Add to results: {}".format(association_dict)) + results.append(dict(association_dict)) + + else: + logger.debug("{} nodes in subgraph, only {} " + "mapped".format(pattern.number_of_nodes(), + len(association_dict))) + + logger.debug("Return intermediate: {}".format(association_dict)) + return association_dict, results + + +def find_adjacent_nodes(dep_tree, node, target_dep, node_attributes): + """Find nodes adjacent to ``node`` that fulfill specified attributes + values on edge and node. 
+ + :param dep_tree: a :class:`DependencyTree` + :param node: initial node to search from + :param target_dep: edge attributes that must be fulfilled (pair-value) + :type target_dep: dict + :param node_attributes: node attributes that must be fulfilled (pair-value) + :type node_attributes: dict + :return: adjacent nodes that fulfill the given criteria (list) + """ + results = [] + for _, adj_node, adj_dep in dep_tree.edges_iter(origin=node): + adj_token = dep_tree[adj_node] + if (match_edge(adj_dep, target_dep) + and match_token(adj_token, node_attributes)): + results.append(adj_node) + + return results + + +def match_edge(token_dep, target_dep): + if target_dep is None: + return True + + if hasattr(target_dep, 'match'): + return target_dep.match(token_dep) is not None + + if token_dep == target_dep: + return True + + return False + + +def match_token(token, + target_attributes, + ignore_special_key=True, + lower=True): + bind_map = { + 'word': lambda t: t.orth_, + 'lemma': lambda t: t.lemma_, + } + + if lower: + bind_map = {key: lambda t: func(t).lower() for key, func in + bind_map.items()} + + for target_key, target_value in target_attributes.items(): + is_special_key = target_key[0] == '_' + + if ignore_special_key and is_special_key: + continue + + if lower and hasattr(target_value, 'lower'): + target_value = target_value.lower() + + if target_key in bind_map: + token_attr = bind_map[target_key](token) + + if hasattr(target_value, 'match'): # if it is a compiled regex + if target_value.match(token_attr) is None: + break + else: + if not token_attr == target_value: + break + + else: + raise ValueError("Unknown key: '{}'".format(target_key)) + + else: # the loop was not broken + return True + + return False From 8ff4f512a25f2ad9607d5491f054d07e90128c0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:28:36 +0200 Subject: [PATCH 050/195] Check in PatternParser that the generated Pattern is valid --- spacy/pattern/parser.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spacy/pattern/parser.py b/spacy/pattern/parser.py index 9ebb9bd5c..a36446a1a 100644 --- a/spacy/pattern/parser.py +++ b/spacy/pattern/parser.py @@ -38,8 +38,18 @@ class PatternParser(object): if not pattern.nodes: return + cls.check_pattern(pattern) return pattern + @staticmethod + def check_pattern(pattern): + if not pattern.is_connected(): + raise ValueError("The pattern tree must be a fully connected " + "graph.") + + if pattern.root_node is None: + raise ValueError("The root node of the tree could not be found.") + @classmethod def _parse_line(cls, stream, pattern, lineno): while not stream.closed: From d9c567371f9ae85cb75b86ecd90bae5b00544905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:29:28 +0200 Subject: [PATCH 051/195] Move add_node and add_edge methods to the Tree base class --- spacy/pattern/pattern.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index 42a15f769..de7a54e05 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -15,6 +15,16 @@ class Tree(object): def __getitem__(self, item): return self.nodes[item] + def add_node(self, node, attr_dict=None): + attr_dict = attr_dict or {} + self.nodes[node] = attr_dict + + def add_edge(self, u, v, dep=None): + if u not in self.nodes or v not in self.nodes: + raise ValueError("Each node must be defined before adding an edge.") + + 
self.adjacency[u][v] = dep + def number_of_nodes(self): return len(self) @@ -67,16 +77,6 @@ class Tree(object): class DependencyPattern(Tree): - def add_node(self, node, attr_dict=None): - attr_dict = attr_dict or {} - self.nodes[node] = attr_dict - - def add_edge(self, u, v, dep=None): - if u not in self.nodes or v not in self.nodes: - raise ValueError("Each node must be defined before adding an edge.") - - self.adjacency[u][v] = dep - @property def root_node(self): if self.number_of_nodes() == 1: From 4ca8a396a2934ddef3b7c71bab932f6bbe649759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:30:01 +0200 Subject: [PATCH 052/195] Do not add the root token to the adjacency map --- spacy/pattern/pattern.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index de7a54e05..f21edf5a6 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -104,8 +104,10 @@ class DependencyTree(Tree): for token in doc: self.nodes[token.i] = token - # inverse the dependency to have an actual tree - self.adjacency[token.head.i][token.i] = token.dep_ + + if token.head.i != token.i: + # inverse the dependency to have an actual tree + self.adjacency[token.head.i][token.i] = token.dep_ def __getitem__(self, item): return self.nodes[item] From d010f5a123e724a168e2f30a3a6c903f7c0443d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:30:28 +0200 Subject: [PATCH 053/195] Fix node matching bug caused by lower function --- spacy/pattern/pattern.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index f21edf5a6..d47022fec 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -282,10 +282,6 @@ def match_token(token, 'lemma': lambda t: t.lemma_, } - if lower: - bind_map = {key: lambda t: func(t).lower() for key, func in - bind_map.items()} - for target_key, target_value in target_attributes.items(): is_special_key = target_key[0] == '_' @@ -298,6 +294,9 @@ def match_token(token, if target_key in bind_map: token_attr = bind_map[target_key](token) + if lower: + token_attr = token_attr.lower() + if hasattr(target_value, 'match'): # if it is a compiled regex if target_value.match(token_attr) is None: break From 4289a21703d97e72c3dc81105c897efe69a45b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:30:53 +0200 Subject: [PATCH 054/195] Add 'ent' to node matching key --- spacy/pattern/pattern.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index d47022fec..282cea0e3 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -280,6 +280,7 @@ def match_token(token, bind_map = { 'word': lambda t: t.orth_, 'lemma': lambda t: t.lemma_, + 'ent': lambda t: t.ent_type_, } for target_key, target_value in target_attributes.items(): From 1849a110e3abf9938f92563213f6583ab459931b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:31:19 +0200 Subject: [PATCH 055/195] Improve logging --- spacy/pattern/pattern.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index 282cea0e3..552283066 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -130,6 +130,10 @@ class DependencyTree(Tree): pattern_root_node_attr = 
pattern[pattern_root_node] dep_root_nodes = self.match_nodes(pattern_root_node_attr) + if not dep_root_nodes: + logger.debug("No node matches the pattern root " + "'{}'".format(pattern_root_node_attr)) + matches = [] for candidate_root_node in dep_root_nodes: match_list = subtree_in_graph(candidate_root_node, self, From 46637369aaffc0ba0e62cec675289b8275d149c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:34:38 +0200 Subject: [PATCH 056/195] Add basic unit tests for Pattern --- spacy/tests/pattern/__init__.py | 1 + spacy/tests/pattern/parser.py | 68 +++++++++++++++++++++++++++++++++ spacy/tests/pattern/pattern.py | 61 +++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 spacy/tests/pattern/__init__.py create mode 100644 spacy/tests/pattern/parser.py create mode 100644 spacy/tests/pattern/pattern.py diff --git a/spacy/tests/pattern/__init__.py b/spacy/tests/pattern/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/spacy/tests/pattern/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/spacy/tests/pattern/parser.py b/spacy/tests/pattern/parser.py new file mode 100644 index 000000000..a56bda20a --- /dev/null +++ b/spacy/tests/pattern/parser.py @@ -0,0 +1,68 @@ +# coding: utf-8 + + +import re +from ...pattern.parser import PatternParser + + +class TestPatternParser: + def test_empty_query(self): + assert PatternParser.parse('') is None + assert PatternParser.parse(' ') is None + + def test_define_node(self): + query = "fox [lemma:fox,word:fox]=alias" + pattern = PatternParser.parse(query) + + assert pattern is not None + assert pattern.number_of_nodes() == 1 + assert pattern.number_of_edges() == 0 + + assert 'fox' in pattern.nodes + + attrs = pattern['fox'] + assert attrs.get('lemma') == 'fox' + assert attrs.get('word') == 'fox' + assert attrs.get('_name') == 'fox' + assert attrs.get('_alias') == 'alias' + + for adj_list in pattern.adjacency.values(): + assert not adj_list + + def test_define_node_with_regex(self): + query = "fox [lemma:/fo.*/]" + pattern = PatternParser.parse(query) + + attrs = pattern['fox'] + assert attrs.get('lemma') == re.compile(r'fo.*', re.U) + + def test_define_edge(self): + query = "[word:quick] >amod [word:fox]" + pattern = PatternParser.parse(query) + + assert pattern is not None + assert pattern.number_of_nodes() == 2 + assert pattern.number_of_edges() == 1 + + base_node_id = list(pattern.adjacency.keys())[0] + adj_map = pattern.adjacency[base_node_id] + + assert len(adj_map) == 1 + head_node_id = list(adj_map.keys())[0] + dep = adj_map[head_node_id] + + assert dep == 'amod' + assert pattern[base_node_id]['word'] == 'fox' + assert pattern[head_node_id]['word'] == 'quick' + + def test_define_edge_with_regex(self): + query = "[word:quick] >/amod|nsubj/ [word:fox]" + pattern = PatternParser.parse(query) + + base_node_id = list(pattern.adjacency.keys())[0] + adj_map = pattern.adjacency[base_node_id] + + assert len(adj_map) == 1 + head_node_id = list(adj_map.keys())[0] + dep = adj_map[head_node_id] + assert dep == re.compile(r'amod|nsubj', re.U) diff --git a/spacy/tests/pattern/pattern.py b/spacy/tests/pattern/pattern.py new file mode 100644 index 000000000..a476f92f7 --- /dev/null +++ b/spacy/tests/pattern/pattern.py @@ -0,0 +1,61 @@ +# coding: utf-8 + +from ..util import get_doc +from ...pattern.pattern import Tree, DependencyTree +from ...pattern.parser import PatternParser + +import pytest + +import logging +logger = logging.getLogger() 
+logger.addHandler(logging.StreamHandler()) +logger.setLevel(logging.DEBUG) + + +@pytest.fixture +def doc(en_vocab): + words = ['I', "'m", 'going', 'to', 'the', 'zoo', 'next', 'week', '.'] + doc = get_doc(en_vocab, + words=words, + deps=['nsubj', 'aux', 'ROOT', 'prep', 'det', 'pobj', + 'amod', 'npadvmod', 'punct'], + heads=[2, 1, 0, -1, 1, -2, 1, -5, -6]) + return doc + + +class TestTree: + def test_is_connected(self): + tree = Tree() + tree.add_node(1) + tree.add_node(2) + tree.add_edge(1, 2) + + assert tree.is_connected() + + tree.add_node(3) + assert not tree.is_connected() + + +class TestDependencyTree: + def test_from_doc(self, doc): + dep_tree = DependencyTree(doc) + + assert len(dep_tree) == len(doc) + assert dep_tree.is_connected() + assert dep_tree.number_of_edges() == len(doc) - 1 + + def test_simple_matching(self, doc): + dep_tree = DependencyTree(doc) + pattern = PatternParser.parse("""root [word:going] + to [word:to] + [word:week]=date > root + [word:/zoo|park/]=place >pobj to + to >prep root + """) + assert pattern is not None + matches = dep_tree.match(pattern) + assert len(matches) == 1 + + match = matches[0] + assert match['place'] == doc[5] + assert match['date'] == doc[7] From e4a45ae55fba89a65fc0851783fd712ae6d1755d Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 12 Jun 2017 12:28:51 +0200 Subject: [PATCH 057/195] Very minor documentation fix --- website/docs/usage/customizing-tokenizer.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index b1fbba652..354a56c22 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -214,7 +214,7 @@ p def __call__(self, text): words = text.split(' ') # All tokens 'own' a subsequent space character in this tokenizer - spaces = [True] * len(word) + spaces = [True] * len(words) return Doc(self.vocab, words=words, spaces=spaces) p From d19ce29a23de1805be3bb2b0a694a38d671fdfb3 Mon Sep 17 00:00:00 2001 From: Ian Mobbs Date: Mon, 12 Jun 2017 13:21:44 -0400 Subject: [PATCH 058/195] Create requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 8194dee58..20c587841 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ thinc>=6.5.0,<6.6.0 murmurhash>=0.26,<0.27 plac<1.0.0,>=0.9.6 six +html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 From 81166c3d563bf5c3ca86924b06c4fd44dd6e3a11 Mon Sep 17 00:00:00 2001 From: Nathan Glenn Date: Wed, 21 Jun 2017 19:22:30 +0200 Subject: [PATCH 059/195] fix confusing typo This document describes the `Vocab` class, not the `Span` class. --- website/docs/api/vocab.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/vocab.jade b/website/docs/api/vocab.jade index 7490bccf4..c036c650b 100644 --- a/website/docs/api/vocab.jade +++ b/website/docs/api/vocab.jade @@ -124,7 +124,7 @@ p +cell #[code Lexeme] +cell The lexeme indicated by the given ID. -+h(2, "iter") Span.__iter__ ++h(2, "iter") Vocab.__iter__ +tag method p Iterate over the lexemes in the vocabulary. 
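The customizing-tokenizer patch above corrects the whitespace tokenizer example in the docs (`len(word)` -> `len(words)`). For reference, a minimal runnable sketch of that corrected tokenizer; the `WhitespaceTokenizer` class name, the bare `Vocab()` and the demo sentence are illustrative assumptions, not part of the patched docs page:

    # Minimal sketch: a whitespace-only tokenizer that builds a Doc directly.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc


    class WhitespaceTokenizer(object):
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(' ')
            # All tokens 'own' a subsequent space character in this tokenizer
            spaces = [True] * len(words)
            return Doc(self.vocab, words=words, spaces=spaces)


    if __name__ == '__main__':
        tokenizer = WhitespaceTokenizer(Vocab())
        doc = tokenizer(u'the quick brown fox jumps over the lazy dog')
        print([token.text for token in doc])  # one token per whitespace-delimited word

The customizing-tokenizer docs page that this patch touches covers how such an object is plugged into the processing pipeline.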
From f69ff1508959e60ced2a0bf329aae07710bc9bde Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 27 Jun 2017 14:49:02 +0200 Subject: [PATCH 060/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ea6096a52..c419a03cf 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -25,6 +25,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Regan, [@jimregan](https://github.com/jimregan) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) From 84041a2bb517841d725781bdd72b1daf4f8e603d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 28 Jun 2017 01:18:05 +0900 Subject: [PATCH 061/195] Make create_tokenizer work with Japanese --- spacy/ja/__init__.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 07e40ada6..1c85ded95 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function from os import path -from ..language import Language +from ..language import Language, BaseDefaults +from ..tokenizer import Tokenizer from ..attrs import LANG from ..tokens import Doc from .language_data import * - -class Japanese(Language): - lang = 'ja' - - def make_doc(self, text): +class JapaneseTokenizer(object): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) try: from janome.tokenizer import Tokenizer except ImportError: raise ImportError("The Japanese tokenizer requires the Janome library: " "https://github.com/mocobeta/janome") - words = [x.surface for x in Tokenizer().tokenize(text)] + self.tokenizer = Tokenizer() + + def __call__(self, text): + words = [x.surface for x in self.tokenizer.tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) + +class JapaneseDefaults(BaseDefaults): + @classmethod + def create_tokenizer(cls, nlp=None): + return JapaneseTokenizer(cls, nlp) + +class Japanese(Language): + lang = 'ja' + + Defaults = JapaneseDefaults + + def make_doc(self, text): + words = self.tokenizer(text) + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + + From e56fea14eb7e807d5ea4ee5fdd12f7ca0610690a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 28 Jun 2017 01:24:25 +0900 Subject: [PATCH 062/195] Add basic Japanese tokenizer test --- spacy/tests/conftest.py | 8 +++++++- spacy/tests/ja/__init__.py | 0 spacy/tests/ja/test_tokenizer.py | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/ja/__init__.py create mode 100644 spacy/tests/ja/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b8ada1d9a..b0f11b5a4 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -5,6 +5,7 @@ from ..en import English from ..de import German from ..es import Spanish from ..it import Italian +from ..ja import Japanese from ..fr import French from ..pt import Portuguese from ..nl import Dutch @@ -27,7 +28,7 @@ import os import pytest -LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, +LANGUAGES = 
[English, German, Spanish, Italian, Japanese, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] @@ -76,6 +77,11 @@ def fi_tokenizer(): return Finnish.Defaults.create_tokenizer() +@pytest.fixture +def ja_tokenizer(): + return Japanese.Defaults.create_tokenizer() + + @pytest.fixture def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() diff --git a/spacy/tests/ja/__init__.py b/spacy/tests/ja/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py new file mode 100644 index 000000000..8d45c822d --- /dev/null +++ b/spacy/tests/ja/test_tokenizer.py @@ -0,0 +1,8 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_japanese_tokenizer(ja_tokenizer): + tokens = ja_tokenizer("æ—ĨæœŦčĒžã ã‚ˆ") + assert len(tokens) == 3 From 1b3a5d87bad69dcb8ec9cdb26ec030f7894708ec Mon Sep 17 00:00:00 2001 From: Alexis Date: Wed, 28 Jun 2017 14:11:20 +0200 Subject: [PATCH 063/195] French NUM_WORDS and ORDINAL_WORDS --- spacy/fr/stop_words.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/spacy/fr/stop_words.py b/spacy/fr/stop_words.py index d9b820537..71f124d6c 100644 --- a/spacy/fr/stop_words.py +++ b/spacy/fr/stop_words.py @@ -86,3 +86,28 @@ votre vous vous-mÃĒmes vu vÊ vôtre vôtres zut """.split()) + + + +# Number words + +NUM_WORDS = set(""" +zero un deux trois quatre cinq six sept huit neuf dix +onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf +vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante +cent mille mil million milliard billion quadrillion quintillion +sextillion septillion octillion nonillion decillion +""".split()) + +# Ordinal words + +ORDINAL_WORDS = set(""" +premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième +onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième +vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième +centième millième millionnième milliardième billionnième quadrillionnième quintillionnième +sextillionnième septillionnième octillionnième nonillionnième decillionnième +""".split()) + + + From 30a34ebb6edb513e262d1f47b6742b4480282f3c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 29 Jun 2017 00:09:20 +0900 Subject: [PATCH 064/195] Add importorskip for janome --- spacy/tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b0f11b5a4..222f9aa1d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -79,6 +79,7 @@ def fi_tokenizer(): @pytest.fixture def ja_tokenizer(): + janome = pytest.importorskip("janome") return Japanese.Defaults.create_tokenizer() From c33619339217dbeff75243d7493dc60685ddf28c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 29 Jun 2017 00:09:40 +0900 Subject: [PATCH 065/195] Parametrize and extend Japanese tokenizer tests --- spacy/tests/ja/test_tokenizer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py index 8d45c822d..58700b353 100644 --- a/spacy/tests/ja/test_tokenizer.py +++ b/spacy/tests/ja/test_tokenizer.py @@ -3,6 +3,15 @@ from __future__ import unicode_literals import pytest -def test_japanese_tokenizer(ja_tokenizer): - tokens = ja_tokenizer("æ—ĨæœŦčĒžã ã‚ˆ") - assert 
len(tokens) == 3 +TOKENIZER_TESTS = [ + ("æ—ĨæœŦčĒžã ã‚ˆ", ['æ—ĨæœŦčĒž', 'だ', 'よ']), + ("æąäēŦã‚ŋワãƒŧぎčŋ‘くãĢäŊã‚“でいぞす。", ['æąäēŦ', 'ã‚ŋワãƒŧ', 'ぎ', 'čŋ‘く', 'ãĢ', 'äŊã‚“', 'で', 'い', 'ぞす', '。']), + ("吞čŧŠã¯įŒĢである。", ['吞čŧŠ', 'は', 'įŒĢ', 'で', 'ある', '。']), + ("月ãĢäģŖわãŖãĻ、おäģ•įŊŽãã‚ˆ!", ['月', 'ãĢ', 'äģŖわãŖ', 'ãĻ', '、', 'おäģ•įŊŽã', 'よ', '!']), + ("ã™ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ãŽã†ãĄ", ['すもも', 'も', 'もも', 'も', 'もも', 'ぎ', 'ã†ãĄ']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens): + tokens = [token.text for token in ja_tokenizer(text)] + assert tokens == expected_tokens From dfaeee1f37d8b7b614e55cd732c6c89abb9afd92 Mon Sep 17 00:00:00 2001 From: Callum Kift Date: Fri, 30 Jun 2017 09:56:33 +0200 Subject: [PATCH 066/195] fixed bug in training ner documentation and example --- examples/training/train_new_entity_type.py | 2 +- website/docs/usage/training-ner.jade | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..987ab5859 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -52,6 +52,7 @@ def train_ner(nlp, train_data, output_dir): random.shuffle(train_data) loss = 0. for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) gold = GoldParse(doc, entities=entity_offsets) # By default, the GoldParse class assumes that the entities # described by offset are complete, and all other words should @@ -63,7 +64,6 @@ def train_ner(nlp, train_data, output_dir): #for i in range(len(gold.ner)): #if not gold.ner[i].endswith('ANIMAL'): # gold.ner[i] = '-' - doc = nlp.make_doc(raw_text) nlp.tagger(doc) # As of 1.9, spaCy's parser now lets you supply a dropout probability # This might help the model generalize better from only a few diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 78eb4905e..52eedd21e 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -150,8 +150,8 @@ p for itn in range(20): random.shuffle(train_data) for raw_text, entity_offsets in train_data: - gold = GoldParse(doc, entities=entity_offsets) doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) nlp.tagger(doc) loss = nlp.entity.update(doc, gold) nlp.end_training() From 669bd142130f3e3c66b253efd0df1dd7ce2ba3f4 Mon Sep 17 00:00:00 2001 From: gispk47 Date: Sat, 1 Jul 2017 13:12:00 +0800 Subject: [PATCH 067/195] Update __init__.py remove the empty string return from jieba.cut,this will cause the list of tokens cant be pushed assert error --- spacy/zh/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py index 1847a7d8d..0f407dec6 100644 --- a/spacy/zh/__init__.py +++ b/spacy/zh/__init__.py @@ -8,4 +8,5 @@ class Chinese(Language): def make_doc(self, text): import jieba words = list(jieba.cut(text, cut_all=True)) + words=[x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From c3d722d66f150a69037340e4daf03ec921f4e489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 1 Jul 2017 13:09:50 +0200 Subject: [PATCH 068/195] Add a disclaimer about classes copied from the Jinja2 project --- spacy/pattern/parser.py | 43 ++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git 
a/spacy/pattern/parser.py b/spacy/pattern/parser.py index a36446a1a..4b6fbc8dd 100644 --- a/spacy/pattern/parser.py +++ b/spacy/pattern/parser.py @@ -237,25 +237,6 @@ class PatternParser(object): yield Token(lineno, token, value) -class Token(tuple): - """Token class.""" - __slots__ = () - lineno, type, value = (property(itemgetter(x)) for x in range(3)) - - def __new__(cls, lineno, type, value): - return tuple.__new__(cls, (lineno, intern(str(type)), value)) - - def hash(self): - string = str(self.value) - return md5(string.encode('utf-8')).hexdigest() - - def __repr__(self): - return 'Token(%r, %r, %r)' % ( - self.lineno, - self.type, - self.value) - - class Reader(object): """A class used by the :class:`PatternParser` to tokenize the `text`.""" __slots__ = ('text', 'pos') @@ -283,7 +264,7 @@ class Reader(object): return s def next(self): - return self.text[self.pos:self.pos+1] + return self.text[self.pos:self.pos + 1] def remaining(self): return len(self.text) - self.pos @@ -301,6 +282,28 @@ class Reader(object): return self.text[self.pos:] +# The following classes were copied from Jinja2, a BSD-licensed project, +# and slightly modified: Token, TokenStreamIterator, TokenStream. + +class Token(tuple): + """Token class.""" + __slots__ = () + lineno, type, value = (property(itemgetter(x)) for x in range(3)) + + def __new__(cls, lineno, type, value): + return tuple.__new__(cls, (lineno, intern(str(type)), value)) + + def hash(self): + string = str(self.value) + return md5(string.encode('utf-8')).hexdigest() + + def __repr__(self): + return 'Token(%r, %r, %r)' % ( + self.lineno, + self.type, + self.value) + + class TokenStreamIterator(object): """The iterator for tokenstreams. Iterate over the stream until the stream is empty. From f4748834d973a525024cac18fb58fe9934957170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 1 Jul 2017 13:17:26 +0200 Subject: [PATCH 069/195] Use spacy hash_string function instead of md5 --- spacy/pattern/parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pattern/parser.py b/spacy/pattern/parser.py index 4b6fbc8dd..122d2b8f3 100644 --- a/spacy/pattern/parser.py +++ b/spacy/pattern/parser.py @@ -1,9 +1,9 @@ # coding: utf-8 from spacy.compat import intern, queue +from spacy.strings import hash_string from operator import itemgetter import re -from hashlib import md5 import json from .pattern import DependencyPattern @@ -294,8 +294,8 @@ class Token(tuple): return tuple.__new__(cls, (lineno, intern(str(type)), value)) def hash(self): - string = str(self.value) - return md5(string.encode('utf-8')).hexdigest() + string = self.value + return hash_string(string) def __repr__(self): return 'Token(%r, %r, %r)' % ( From 8592f3de47406daed2a26e3d0927a7706b1191d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 1 Jul 2017 15:03:32 +0200 Subject: [PATCH 070/195] Fix fuzzy unit tests --- spacy/tests/pattern/parser.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/spacy/tests/pattern/parser.py b/spacy/tests/pattern/parser.py index a56bda20a..50dd3ac60 100644 --- a/spacy/tests/pattern/parser.py +++ b/spacy/tests/pattern/parser.py @@ -44,25 +44,33 @@ class TestPatternParser: assert pattern.number_of_nodes() == 2 assert pattern.number_of_edges() == 1 - base_node_id = list(pattern.adjacency.keys())[0] - adj_map = pattern.adjacency[base_node_id] + quick_id = [node_id for node_id, node_attr in pattern.nodes.items() + if 
node_attr['word'] == 'quick'][0] - assert len(adj_map) == 1 - head_node_id = list(adj_map.keys())[0] - dep = adj_map[head_node_id] + fox_id = [node_id for node_id, node_attr in pattern.nodes.items() + if node_attr['word'] == 'fox'][0] + + quick_map = pattern.adjacency[quick_id] + fox_map = pattern.adjacency[fox_id] + + assert len(quick_map) == 0 + assert len(fox_map) == 1 + + dep = fox_map[quick_id] assert dep == 'amod' - assert pattern[base_node_id]['word'] == 'fox' - assert pattern[head_node_id]['word'] == 'quick' def test_define_edge_with_regex(self): query = "[word:quick] >/amod|nsubj/ [word:fox]" pattern = PatternParser.parse(query) - base_node_id = list(pattern.adjacency.keys())[0] - adj_map = pattern.adjacency[base_node_id] + quick_id = [node_id for node_id, node_attr in pattern.nodes.items() + if node_attr['word'] == 'quick'][0] + + fox_id = [node_id for node_id, node_attr in pattern.nodes.items() + if node_attr['word'] == 'fox'][0] + + fox_map = pattern.adjacency[fox_id] + dep = fox_map[quick_id] - assert len(adj_map) == 1 - head_node_id = list(adj_map.keys())[0] - dep = adj_map[head_node_id] assert dep == re.compile(r'amod|nsubj', re.U) From 5357874bf74b05a40961ba05936f6009453a48b8 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:03:30 +0200 Subject: [PATCH 071/195] add Dutch numbers and ordinals --- spacy/nl/stop_words.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/spacy/nl/stop_words.py b/spacy/nl/stop_words.py index 22f1d714c..d19515262 100644 --- a/spacy/nl/stop_words.py +++ b/spacy/nl/stop_words.py @@ -41,3 +41,22 @@ want waren was wat we wel werd wezen wie wij wil worden zal ze zei zelf zich zij zijn zo zonder zou """.split()) + + +# Number words + +NUM_WORDS = set(""" +nul een ÊÊn twee drie vier vijf zes zeven acht negen tien elf twaalf dertien +veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd +duizend miljoen miljard biljoen biljard triljoen triljard +""".split()) + + +# Ordinal words + +ORDINAL_WORDS = set(""" +eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde +twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste +zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste +miljardste biljoenste biljardste triljoenste triljardste +""".split()) From f377c9c952ed6b42086c0ee9fcedb5a67af963b4 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:06:28 +0200 Subject: [PATCH 072/195] Rename stop_words.py to word_sets.py --- spacy/nl/{stop_words.py => word_sets.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/nl/{stop_words.py => word_sets.py} (100%) diff --git a/spacy/nl/stop_words.py b/spacy/nl/word_sets.py similarity index 100% rename from spacy/nl/stop_words.py rename to spacy/nl/word_sets.py From 29720150f9960c1a57b2d463d4653e0a8f3211e0 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:08:04 +0200 Subject: [PATCH 073/195] fix import of stop words in language data --- spacy/nl/language_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index f9899d8d1..b3ca1aef9 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .. 
import language_data as base from ..language_data import update_exc, strings_to_exc -from .stop_words import STOP_WORDS +from .word_sets import STOP_WORDS, NUM_WORDS STOP_WORDS = set(STOP_WORDS) From 19d4706f69b8788bffc43ab0bf07a80a1ed5bdab Mon Sep 17 00:00:00 2001 From: val314159 Date: Fri, 7 Jul 2017 13:18:17 -0700 Subject: [PATCH 074/195] make this work in python2.7 --- website/docs/usage/lightning-tour.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 138b0058d..2fd390d26 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -83,7 +83,7 @@ p +h(2, "examples-word-vectors") Word vectors +code. - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") + doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.") apples = doc[0] oranges = doc[2] From 04e6a6518869b1ca15beb79694049e0fb164a2aa Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 9 Jul 2017 16:23:26 +0900 Subject: [PATCH 075/195] Remove Japanese from LANGUAGES LANGUAGES is a list of languages whose tokenizers get run through a variety of generic tests. Since the generic tests don't check the JA fixture, it blows up when it can't find janome. -POLM --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 222f9aa1d..29d896a5d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -28,7 +28,7 @@ import os import pytest -LANGUAGES = [English, German, Spanish, Italian, Japanese, French, Portuguese, Dutch, +LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] From bc87b815cc34d375e1a4b4c9b54c296691cee237 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 9 Jul 2017 16:28:55 +0900 Subject: [PATCH 076/195] Add comment clarifying what LANGUAGES does --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 29d896a5d..6e00b1513 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -27,7 +27,7 @@ from pathlib import Path import os import pytest - +# These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] From 6cf26909438230b4f9626d6cf25a19ecd0d1555c Mon Sep 17 00:00:00 2001 From: lgenerknol Date: Wed, 12 Jul 2017 11:06:16 -0400 Subject: [PATCH 077/195] Missing markup char Frontend displayed: ``` If start_idx and do not mark[...] ``` Note the missing "end_idx" after 'and'. --- website/docs/api/doc.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index adcd111a3..1c2911f52 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -272,7 +272,7 @@ p Import the document contents from a binary string. p | Retokenize the document, such that the span at | #[code doc.text[start_idx : end_idx]] is merged into a single token. If - | #[code start_idx] and #[end_idx] do not mark start and end token + | #[code start_idx] and #[code end_idx] do not mark start and end token | boundaries, the document remains unchanged. 
+table(["Name", "Type", "Description"]) From 2b219caf0d01e98e10b82b940ba184a63ead64a5 Mon Sep 17 00:00:00 2001 From: lgenerknol Date: Wed, 12 Jul 2017 13:12:24 -0400 Subject: [PATCH 078/195] .../cli/#foo is 404 https://spacy.io/docs/usage/cli/#package is a 404. Changed to https://spacy.io/docs/usage/cli#package Definitely a larger fix possible to deal with trailing slashes --- website/docs/usage/saving-loading.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index c4eb08f04..8978cce7a 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -28,7 +28,7 @@ p | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a | path to it using the #[code --meta] flag. For more info on this, see the - | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + | #[+a("/docs/usage/cli#package") #[code package] command] documentation. +aside-code("meta.json", "json"). { From fadacd0d47a898173ae68bdfb758e688f7a176ce Mon Sep 17 00:00:00 2001 From: Jorge Paredes Date: Sun, 16 Jul 2017 10:06:32 -0500 Subject: [PATCH 079/195] Fix url broken The related url to **custom named entities** was broken --- website/docs/usage/models.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 9bb75ba9a..30863720c 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -203,7 +203,7 @@ p p | If you've trained your own model, for example for | #[+a("/docs/usage/adding-languages") additional languages] or - | #[+a("/docs/usage/train-ner") custom named entities], you can save its + | #[+a("/docs/usage/training-ner") custom named entities], you can save its | state using the #[code Language.save_to_directory()] method. To make the | model more convenient to deploy, we recommend wrapping it as a Python | package. 
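Note on the workflow the models.jade passage above describes: it points to
Language.save_to_directory() for persisting a trained pipeline before wrapping
it as a Python package. A rough sketch of that workflow for spaCy 1.x follows;
the output path is illustrative and not taken from the patch.

    # Hedged sketch: saving a trained spaCy 1.x pipeline to disk, assuming
    # `nlp` already holds the Language object you trained.
    from pathlib import Path
    import spacy

    nlp = spacy.load('en')                  # or your own trained pipeline
    output_dir = Path('/tmp/my_model')      # hypothetical location
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.save_to_directory(str(output_dir))  # writes out vocab, tagger, parser, NER
    # The resulting directory can then be wrapped with the `spacy package`
    # command, as the documentation above describes.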
From 8bb443be4fc63fd76e6ddf48008aacfe3a716398 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:28:51 +0200 Subject: [PATCH 080/195] Add standalone tagger training example --- examples/training/train_tagger_ud.py | 150 +++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 examples/training/train_tagger_ud.py diff --git a/examples/training/train_tagger_ud.py b/examples/training/train_tagger_ud.py new file mode 100644 index 000000000..3015c52e8 --- /dev/null +++ b/examples/training/train_tagger_ud.py @@ -0,0 +1,150 @@ +from __future__ import unicode_literals +from __future__ import print_function + +import plac +import codecs +import spacy.symbols as symbols +import spacy +from pathlib import Path + +from spacy.vocab import Vocab +from spacy.tagger import Tagger +from spacy.tokens import Doc +from spacy.gold import GoldParse +from spacy.language import Language +from spacy import orth +from spacy import attrs + +import random + +TAG_MAP = { + 'ADJ': {symbols.POS: symbols.ADJ}, + 'ADP': {symbols.POS: symbols.ADP}, + 'PUNCT': {symbols.POS: symbols.PUNCT}, + 'ADV': {symbols.POS: symbols.ADV}, + 'AUX': {symbols.POS: symbols.AUX}, + 'SYM': {symbols.POS: symbols.SYM}, + 'INTJ': {symbols.POS: symbols.INTJ}, + 'CCONJ': {symbols.POS: symbols.CCONJ}, + 'X': {symbols.POS: symbols.X}, + 'NOUN': {symbols.POS: symbols.NOUN}, + 'DET': {symbols.POS: symbols.DET}, + 'PROPN': {symbols.POS: symbols.PROPN}, + 'NUM': {symbols.POS: symbols.NUM}, + 'VERB': {symbols.POS: symbols.VERB}, + 'PART': {symbols.POS: symbols.PART}, + 'PRON': {symbols.POS: symbols.PRON}, + 'SCONJ': {symbols.POS: symbols.SCONJ}, +} + +LEX_ATTR_GETTERS = { + attrs.LOWER: lambda string: string.lower(), + attrs.NORM: lambda string: string, + attrs.SHAPE: orth.word_shape, + attrs.PREFIX: lambda string: string[0], + attrs.SUFFIX: lambda string: string[-3:], + attrs.CLUSTER: lambda string: 0, + attrs.IS_ALPHA: orth.is_alpha, + attrs.IS_ASCII: orth.is_ascii, + attrs.IS_DIGIT: lambda string: string.isdigit(), + attrs.IS_LOWER: orth.is_lower, + attrs.IS_PUNCT: orth.is_punct, + attrs.IS_SPACE: lambda string: string.isspace(), + attrs.IS_TITLE: orth.is_title, + attrs.IS_UPPER: orth.is_upper, + attrs.IS_BRACKET: orth.is_bracket, + attrs.IS_QUOTE: orth.is_quote, + attrs.IS_LEFT_PUNCT: orth.is_left_punct, + attrs.IS_RIGHT_PUNCT: orth.is_right_punct, + attrs.LIKE_URL: orth.like_url, + attrs.LIKE_NUM: orth.like_number, + attrs.LIKE_EMAIL: orth.like_email, + attrs.IS_STOP: lambda string: False, + attrs.IS_OOV: lambda string: True +} + + +def read_ud_data(path): + data = [] + last_number = -1 + sentence_words = [] + sentence_tags = [] + with codecs.open(path, encoding="utf-8") as f: + while True: + line = f.readline() + if not line: + break + + if line[0].isdigit(): + d = line.split() + if not "-" in d[0]: + number = int(line[0]) + if number < last_number: + data.append((sentence_words, sentence_tags),) + sentence_words = [] + sentence_tags = [] + sentence_words.append(d[2]) + sentence_tags.append(d[3]) + last_number = number + if len(sentence_words) > 0: + data.append((sentence_words, sentence_tags,)) + return data + +def ensure_dir(path): + if not path.exists(): + path.mkdir() + + +def main(train_loc, dev_loc, output_dir=None): + if output_dir is not None: + output_dir = Path(output_dir) + ensure_dir(output_dir) + ensure_dir(output_dir / "pos") + ensure_dir(output_dir / "vocab") + + train_data = read_ud_data(train_loc) + vocab = Vocab(tag_map=TAG_MAP, lex_attr_getters=LEX_ATTR_GETTERS) + # Populate vocab + 
for words, _ in train_data: + for word in words: + _ = vocab[word] + + model = spacy.tagger.TaggerModel(spacy.tagger.Tagger.feature_templates) + tagger = Tagger(vocab, model) + print(tagger.tag_names) + for i in range(30): + print("training model (iteration " + str(i) + ")...") + score = 0. + num_samples = 0. + for words, tags in train_data: + doc = Doc(vocab, words=words) + gold = GoldParse(doc, tags=tags) + cost = tagger.update(doc, gold) + for i, word in enumerate(doc): + num_samples += 1 + if word.tag_ == tags[i]: + score += 1 + print('Train acc', score/num_samples) + random.shuffle(train_data) + tagger.model.end_training() + + score = 0.0 + test_data = read_ud_data(dev_loc) + num_samples = 0 + for words, tags in test_data: + doc = Doc(vocab, words) + tagger(doc) + for i, word in enumerate(doc): + num_samples += 1 + if word.tag_ == tags[i]: + score += 1 + print("score: " + str(score / num_samples * 100.0)) + + if output_dir is not None: + tagger.model.dump(str(output_dir / 'pos' / 'model')) + with (output_dir / 'vocab' / 'strings.json').open('w') as file_: + tagger.vocab.strings.dump(file_) + + +if __name__ == '__main__': + plac.call(main) From 3fef5f642bd7f40cbc41319e51e71579bde791f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:29:15 +0200 Subject: [PATCH 081/195] Rename tagger training example --- .../{train_tagger_ud.py => train_tagger_standalone_ud.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/training/{train_tagger_ud.py => train_tagger_standalone_ud.py} (100%) diff --git a/examples/training/train_tagger_ud.py b/examples/training/train_tagger_standalone_ud.py similarity index 100% rename from examples/training/train_tagger_ud.py rename to examples/training/train_tagger_standalone_ud.py From a405660068f9f1c17a71a54866f475b2b13eef6c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:32:48 +0200 Subject: [PATCH 082/195] Add commit to tagger example --- examples/training/train_new_entity_type.py | 4 ++-- examples/training/train_tagger_standalone_ud.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..6c432acdf 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -24,8 +24,8 @@ For more details, see the documentation: * Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner * Saving and loading models: https://spacy.io/docs/usage/saving-loading -Developed for: spaCy 1.7.6 -Last tested for: spaCy 1.7.6 +Developed for: spaCy 1.9.0 +Last tested for: spaCy 1.9.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/training/train_tagger_standalone_ud.py b/examples/training/train_tagger_standalone_ud.py index 3015c52e8..ce1ab50d6 100644 --- a/examples/training/train_tagger_standalone_ud.py +++ b/examples/training/train_tagger_standalone_ud.py @@ -1,3 +1,17 @@ +''' +This example shows training of the POS tagger without the Language class, +showing the APIs of the atomic components. + +This example was adapted from the gist here: + +https://gist.github.com/kamac/a7bc139f62488839a8118214a4d932f2 + +Issue discussing the gist: + +https://github.com/explosion/spaCy/issues/1179 + +The example was written for spaCy 1.8.2. 
+''' from __future__ import unicode_literals from __future__ import print_function From 5916d46ba8a9c85f5f8c115bb831561e3c64d256 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:34:01 +0200 Subject: [PATCH 083/195] Avoid use of deepcopy in printer --- spacy/tokens/printers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index d70088540..487d74167 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -49,6 +49,7 @@ def parse_tree(doc, light=False, flat=False): >>> trees = doc.print_tree() [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] """ - doc_clone = deepcopy(doc) + doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) + doc_clone.from_array(doc.to_array([HEAD, DEP, TAG, ENT_IOB, ENT_TYPE]) merge_ents(doc_clone) # merge the entities into single tokens first return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] From 8b581fdac515173f80a2b1560f2b58286d3c92e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:36:54 +0200 Subject: [PATCH 084/195] Remove unused example --- examples/chainer_sentiment.py | 322 ---------------------------------- 1 file changed, 322 deletions(-) delete mode 100644 examples/chainer_sentiment.py diff --git a/examples/chainer_sentiment.py b/examples/chainer_sentiment.py deleted file mode 100644 index 747ef508a..000000000 --- a/examples/chainer_sentiment.py +++ /dev/null @@ -1,322 +0,0 @@ -'''WIP --- Doesn't work well yet''' -import plac -import random -import six - -import cProfile -import pstats - -import pathlib -import cPickle as pickle -from itertools import izip - -import spacy - -import cytoolz -import cupy as xp -import cupy.cuda -import chainer.cuda - -import chainer.links as L -import chainer.functions as F -from chainer import Chain, Variable, report -import chainer.training -import chainer.optimizers -from chainer.training import extensions -from chainer.iterators import SerialIterator -from chainer.datasets import TupleDataset - - -class SentimentAnalyser(object): - @classmethod - def load(cls, path, nlp, max_length=100): - raise NotImplementedError - #with (path / 'config.json').open() as file_: - # model = model_from_json(file_.read()) - #with (path / 
'model').open('rb') as file_: - # lstm_weights = pickle.load(file_) - #embeddings = get_embeddings(nlp.vocab) - #model.set_weights([embeddings] + lstm_weights) - #return cls(model, max_length=max_length) - - def __init__(self, model, max_length=100): - self._model = model - self.max_length = max_length - - def __call__(self, doc): - X = get_features([doc], self.max_length) - y = self._model.predict(X) - self.set_sentiment(doc, y) - - def pipe(self, docs, batch_size=1000, n_threads=2): - for minibatch in cytoolz.partition_all(batch_size, docs): - minibatch = list(minibatch) - sentences = [] - for doc in minibatch: - sentences.extend(doc.sents) - Xs = get_features(sentences, self.max_length) - ys = self._model.predict(Xs) - for sent, label in zip(sentences, ys): - sent.doc.sentiment += label - 0.5 - for doc in minibatch: - yield doc - - def set_sentiment(self, doc, y): - doc.sentiment = float(y[0]) - # Sentiment has a native slot for a single float. - # For arbitrary data storage, there's: - # doc.user_data['my_data'] = y - - -class Classifier(Chain): - def __init__(self, predictor): - super(Classifier, self).__init__(predictor=predictor) - - def __call__(self, x, t): - y = self.predictor(x) - loss = F.softmax_cross_entropy(y, t) - accuracy = F.accuracy(y, t) - report({'loss': loss, 'accuracy': accuracy}, self) - return loss - - -class SentimentModel(Chain): - def __init__(self, nlp, shape, **settings): - Chain.__init__(self, - embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'], - set_vectors=lambda arr: set_vectors(arr, nlp.vocab)), - encode=_Encode(shape['nr_hidden'], shape['nr_hidden']), - attend=_Attend(shape['nr_hidden'], shape['nr_hidden']), - predict=_Predict(shape['nr_hidden'], shape['nr_class'])) - self.to_gpu(0) - - def __call__(self, sentence): - return self.predict( - self.attend( - self.encode( - self.embed(sentence)))) - - -class _Embed(Chain): - def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None): - Chain.__init__(self, - embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors), - project=L.Linear(None, nr_out, nobias=True)) - self.embed.W.volatile = False - - def __call__(self, sentence): - return [self.project(self.embed(ts)) for ts in F.transpose(sentence)] - - -class _Encode(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - fwd=L.LSTM(nr_in, nr_out), - bwd=L.LSTM(nr_in, nr_out), - mix=L.Bilinear(nr_out, nr_out, nr_out)) - - def __call__(self, sentence): - self.fwd.reset_state() - fwds = map(self.fwd, sentence) - self.bwd.reset_state() - bwds = reversed(map(self.bwd, reversed(sentence))) - return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)] - - -class _Attend(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self) - - def __call__(self, sentence): - sent = sum(sentence) - return sent - - -class _Predict(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - l1=L.Linear(nr_in, nr_in), - l2=L.Linear(nr_in, nr_out)) - - def __call__(self, vector): - vector = self.l1(vector) - vector = F.elu(vector) - vector = self.l2(vector) - return vector - - -class SentenceDataset(TupleDataset): - def __init__(self, nlp, texts, labels, max_length): - self.max_length = max_length - sents, labels = self._get_labelled_sentences( - nlp.pipe(texts, batch_size=5000, n_threads=3), - labels) - TupleDataset.__init__(self, - get_features(sents, max_length), - labels) - - def __getitem__(self, index): - batches = [dataset[index] for dataset in self._datasets] - if isinstance(index, slice): - length = len(batches[0]) - 
returns = [tuple([batch[i] for batch in batches]) - for i in six.moves.range(length)] - return returns - else: - return tuple(batches) - - def _get_labelled_sentences(self, docs, doc_labels): - labels = [] - sentences = [] - for doc, y in izip(docs, doc_labels): - for sent in doc.sents: - sentences.append(sent) - labels.append(y) - return sentences, xp.asarray(labels, dtype='i') - - -class DocDataset(TupleDataset): - def __init__(self, nlp, texts, labels): - self.max_length = max_length - DatasetMixin.__init__(self, - get_features( - nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length), - labels) - -def read_data(data_dir, limit=0): - examples = [] - for subdir, label in (('pos', 1), ('neg', 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - examples.append((text, label)) - random.shuffle(examples) - if limit >= 1: - examples = examples[:limit] - return zip(*examples) # Unzips into two lists - - -def get_features(docs, max_length): - docs = list(docs) - Xs = xp.zeros((len(docs), max_length), dtype='i') - for i, doc in enumerate(docs): - j = 0 - for token in doc: - if token.has_vector and not token.is_punct and not token.is_space: - Xs[i, j] = token.norm - j += 1 - if j >= max_length: - break - return Xs - - -def set_vectors(vectors, vocab): - for lex in vocab: - if lex.has_vector and (lex.rank+1) < vectors.shape[0]: - lex.norm = lex.rank+1 - vectors[lex.rank + 1] = lex.vector - else: - lex.norm = 0 - return vectors - - -def train(train_texts, train_labels, dev_texts, dev_labels, - lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, - by_sentence=True): - nlp = spacy.load('en', entity=False) - if 'nr_vector' not in lstm_shape: - lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector) - if 'nr_dim' not in lstm_shape: - lstm_shape['nr_dim'] = nlp.vocab.vectors_length - print("Make model") - model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings)) - print("Parsing texts...") - if by_sentence: - train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length']) - dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length']) - else: - train_data = DocDataset(nlp, train_texts, train_labels) - dev_data = DocDataset(nlp, dev_texts, dev_labels) - train_iter = SerialIterator(train_data, batch_size=batch_size, - shuffle=True, repeat=True) - dev_iter = SerialIterator(dev_data, batch_size=batch_size, - shuffle=False, repeat=False) - optimizer = chainer.optimizers.Adam() - optimizer.setup(model) - updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0) - trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result') - - trainer.extend(extensions.Evaluator(dev_iter, model, device=0)) - trainer.extend(extensions.LogReport()) - trainer.extend(extensions.PrintReport([ - 'epoch', 'main/accuracy', 'validation/main/accuracy'])) - trainer.extend(extensions.ProgressBar()) - - trainer.run() - - -def evaluate(model_dir, texts, labels, max_length=100): - def create_pipeline(nlp): - ''' - This could be a lambda, but named functions are easier to read in Python. 
- ''' - return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp, - max_length=max_length)] - - nlp = spacy.load('en') - nlp.pipeline = create_pipeline(nlp) - - correct = 0 - i = 0 - for doc in nlp.pipe(texts, batch_size=1000, n_threads=4): - correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) - i += 1 - return float(correct) / i - - -@plac.annotations( - train_dir=("Location of training file or directory"), - dev_dir=("Location of development file or directory"), - model_dir=("Location of output model directory",), - is_runtime=("Demonstrate run-time usage", "flag", "r", bool), - nr_hidden=("Number of hidden units", "option", "H", int), - max_length=("Maximum sentence length", "option", "L", int), - dropout=("Dropout", "option", "d", float), - learn_rate=("Learn rate", "option", "e", float), - nb_epoch=("Number of training epochs", "option", "i", int), - batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int) -) -def main(model_dir, train_dir, dev_dir, - is_runtime=False, - nr_hidden=64, max_length=100, # Shape - dropout=0.5, learn_rate=0.001, # General NN config - nb_epoch=5, batch_size=32, nr_examples=-1): # Training params - model_dir = pathlib.Path(model_dir) - train_dir = pathlib.Path(train_dir) - dev_dir = pathlib.Path(dev_dir) - if is_runtime: - dev_texts, dev_labels = read_data(dev_dir) - acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) - print(acc) - else: - print("Read data") - train_texts, train_labels = read_data(train_dir, limit=nr_examples) - dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples) - print("Using GPU 0") - #chainer.cuda.get_device(0).use() - train_labels = xp.asarray(train_labels, dtype='i') - dev_labels = xp.asarray(dev_labels, dtype='i') - lstm = train(train_texts, train_labels, dev_texts, dev_labels, - {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2, - 'nr_vector': 5000}, - {'dropout': 0.5, 'lr': learn_rate}, - {}, - nb_epoch=nb_epoch, batch_size=batch_size) - - -if __name__ == '__main__': - #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - #s = pstats.Stats("Profile.prof") - #s.strip_dirs().sort_stats("time").print_stats() - plac.call(main) From 69396dcfd35cf40c9706bf1199f3de8b8e7a06a5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 13:43:15 +0200 Subject: [PATCH 085/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index c419a03cf..bfdbf5c4f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,7 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! 
* Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary) @@ -47,6 +48,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Sam Bozek, [@sambozek](https://github.com/sambozek) * Sasho Savkov, [@savkov](https://github.com/savkov) * Shuvanon Razik, [@shuvanon](https://github.com/shuvanon) +* Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) From 8b9c4c5e1c80e7e3814b39a64e58a24c005b15f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:43:47 +0200 Subject: [PATCH 086/195] Add missing SP symbol to tag map, re #1052 --- spacy/language_data/tag_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index ead6dd1c6..65dab9b0d 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -22,5 +22,6 @@ TAG_MAP = { "CCONJ": {POS: CCONJ}, # U20 "ADJ": {POS: ADJ}, "VERB": {POS: VERB}, - "PART": {POS: PART} + "PART": {POS: PART}, + 'SP': {POS: SPACE} } From 45f6961ae0f54f1e6cbb6fb59158e2ce03e27417 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:45:21 +0200 Subject: [PATCH 087/195] Add __version__ symbol in __init__.py --- spacy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/__init__.py b/spacy/__init__.py index 2308ce7e4..3afb38cfb 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,6 +5,7 @@ from . import util from .deprecated import resolve_model_name from .cli.info import info from .glossary import explain +from .about import __version__ from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja From 0ae3807d7df39b70cc45fc973b84701d9c4f9e25 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:53:48 +0200 Subject: [PATCH 088/195] Fix gaps in Lexeme API. 
Closes #1031 --- spacy/lexeme.pyx | 9 +++++++++ spacy/tests/regression/test_issue1031.py | 13 +++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 spacy/tests/regression/test_issue1031.py diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 05d8bddc6..dc0440486 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -159,6 +159,10 @@ cdef class Lexeme: def __get__(self): return self.c.id + property lex_id: + def __get__(self): + return self.c.id + property repvec: def __get__(self): raise AttributeError("lex.repvec has been renamed to lex.vector") @@ -173,6 +177,11 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.orth] + property text: + def __get__(self): + return self.vocab.strings[self.c.orth] + + property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x diff --git a/spacy/tests/regression/test_issue1031.py b/spacy/tests/regression/test_issue1031.py new file mode 100644 index 000000000..1ac14eb7b --- /dev/null +++ b/spacy/tests/regression/test_issue1031.py @@ -0,0 +1,13 @@ +from ...vocab import Vocab + +def test_lexeme_text(): + vocab = Vocab() + lex = vocab[u'the'] + assert lex.text == u'the' + + +def test_lexeme_lex_id(): + vocab = Vocab() + lex1 = vocab[u'the'] + lex2 = vocab[u'be'] + assert lex1.lex_id != lex2.lex_id From dfbc7e49de96c9e8980c89706d4889244d1f6e39 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:14:01 +0200 Subject: [PATCH 089/195] Add test for Issue #1207 --- spacy/tests/regression/test_issue1307.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 spacy/tests/regression/test_issue1307.py diff --git a/spacy/tests/regression/test_issue1307.py b/spacy/tests/regression/test_issue1307.py new file mode 100644 index 000000000..a71faebcb --- /dev/null +++ b/spacy/tests/regression/test_issue1307.py @@ -0,0 +1,25 @@ +from __future__ import unicode_literals +from ..util import get_doc +from ...vocab import Vocab +from ...en import English + + +def test_span_noun_chunks(): + vocab = Vocab(lang='en', tag_map=English.Defaults.tag_map) + words = "Employees are recruiting talented staffers from overseas .".split() + heads = [1, 1, 0, 1, -2, -1, -5] + deps = ['nsubj', 'aux', 'ROOT', 'nmod', 'dobj', 'adv', 'pobj'] + tags = ['NNS', 'VBP', 'VBG', 'JJ', 'NNS', 'IN', 'NN', '.'] + doc = get_doc(vocab, words=words, heads=heads, deps=deps, tags=tags) + doc.is_parsed = True + + noun_chunks = [np.text for np in doc.noun_chunks] + assert noun_chunks == ['Employees', 'talented staffers', 'overseas'] + + span = doc[0:4] + noun_chunks = [np.text for np in span.noun_chunks] + assert noun_chunks == ['Employees'] + + for sent in doc.sents: + noun_chunks = [np.text for np in sent.noun_chunks] + assert noun_chunks == ['Employees', 'talented staffers', 'overseas'] From d9b85675d79553b4435aef1140354161c3f5dc91 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:14:35 +0200 Subject: [PATCH 090/195] Rename regression test --- spacy/tests/regression/{test_issue1307.py => test_issue1207.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/tests/regression/{test_issue1307.py => test_issue1207.py} (100%) diff --git a/spacy/tests/regression/test_issue1307.py b/spacy/tests/regression/test_issue1207.py similarity index 100% rename from spacy/tests/regression/test_issue1307.py rename to spacy/tests/regression/test_issue1207.py From 9750a0128cf211dac80217eee38e41c38f2c761c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 
2017 14:14:57 +0200 Subject: [PATCH 091/195] Fix Span.noun_chunks. Closes #1207 --- spacy/tokens/span.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 09927ab4c..d8890addc 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -230,7 +230,7 @@ cdef class Span: # so it's okay once we have the Span objects. See Issue #375 spans = [] for start, end, label in self.doc.noun_chunks_iterator(self): - spans.append(Span(self, start, end, label=label)) + spans.append(Span(self.doc, start, end, label=label)) for span in spans: yield span From 23a55b40ca8af1af588b6cbf5504b8d87e3b91d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:15:25 +0200 Subject: [PATCH 092/195] Default to English noun chunks iterator if no lang set --- spacy/syntax/iterators.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index b0d1c78ca..0fe724622 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -117,4 +117,5 @@ def es_noun_chunks(obj): token = next_token(token) -CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks} +CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks, + None: english_noun_chunks, '': english_noun_chunks} From e3f23f9d910b0fa0e5c71b5b4c5c2a243fe66e60 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 14:57:51 +0200 Subject: [PATCH 093/195] Use latest available version in examples --- website/docs/usage/models.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 30863720c..42a3c0bbf 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -67,7 +67,7 @@ p python -m spacy download en_core_web_md # download exact model version (doesn't create shortcut link) - python -m spacy download en_core_web_md-1.2.0 --direct + python -m spacy download en_core_web_md-1.2.1 --direct p | The download command will #[+a("#download-pip") install the model] via @@ -96,10 +96,10 @@ p +code(false, "bash"). # with external URL - pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.0/en_core_web_md-1.2.0.tar.gz + pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.1/en_core_web_md-1.2.1.tar.gz # with local file - pip install /Users/you/en_core_web_md-1.2.0.tar.gz + pip install /Users/you/en_core_web_md-1.2.1.tar.gz p | By default, this will install the model into your #[code site-packages] From b22b18a0199ae2856f8f6923fb0db1cebe74dbb5 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:02:15 +0200 Subject: [PATCH 094/195] Add notes on spacy.explain() to annotation docs --- website/docs/api/annotation.jade | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 8c6b8fb10..30080dfd9 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -38,6 +38,11 @@ p +h(2, "pos-tagging") Part-of-speech Tagging ++infobox("Tip: Understanding tags") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a tag. For example, + | #[code spacy.explain("RB")] will return "adverb". 
+ include _annotation/_pos-tags +h(2, "lemmatization") Lemmatization @@ -65,10 +70,20 @@ p +h(2, "dependency-parsing") Syntactic Dependency Parsing ++infobox("Tip: Understanding labels") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a label. For example, + | #[code spacy.explain("prt")] will return "particle". + include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition ++infobox("Tip: Understanding entity types") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". + include _annotation/_named-entities +h(2, "json-input") JSON input format for training From 96df9c7154b7967a145423200be62fa245039e8b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 15:05:46 +0200 Subject: [PATCH 095/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index bfdbf5c4f..995f6901f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -58,3 +58,4 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) * Yubing Dong, [@tomtung](https://github.com/tomtung) +* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) From 4b2e5e59eda15c5f60710acbfb8624f748a169fc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:06:50 +0200 Subject: [PATCH 096/195] Add flush_cache method to tokenizer, to fix #1061 The tokenizer caches output for common chunks, for efficiency. This cache is be invalidated when the tokenizer rules change, e.g. when a new special-case rule is introduced. That's what was causing #1061. When the cache is flushed, we free the intermediate token chunks. I *think* this is safe --- but if we start getting segfaults, this patch is to blame. The resolution would be to simply not free those bits of memory. They'll be freed when the tokenizer exits anyway. --- spacy/tests/regression/test_issue1061.py | 27 ++++++++++++++ spacy/tokenizer.pyx | 46 +++++++++++++++++++++--- 2 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/regression/test_issue1061.py diff --git a/spacy/tests/regression/test_issue1061.py b/spacy/tests/regression/test_issue1061.py new file mode 100644 index 000000000..821ca2bfc --- /dev/null +++ b/spacy/tests/regression/test_issue1061.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals + +from ...symbols import ORTH + +from ...vocab import Vocab +from ...en import English + + +def test_issue1061(): + '''Test special-case works after tokenizing. Was caching problem.''' + text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.' + tokenizer = English.Defaults.create_tokenizer() + doc = tokenizer(text) + assert 'MATH' in [w.text for w in doc] + assert '_MATH_' not in [w.text for w in doc] + + tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) + doc = tokenizer(text) + assert '_MATH_' in [w.text for w in doc] + assert 'MATH' not in [w.text for w in doc] + + # For sanity, check it works when pipeline is clean. 
+ tokenizer = English.Defaults.create_tokenizer() + tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) + doc = tokenizer(text) + assert '_MATH_' in [w.text for w in doc] + assert 'MATH' not in [w.text for w in doc] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index c094bea0d..276f0ef20 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -186,7 +186,13 @@ cdef class Tokenizer: cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: - return False + # See 'flush_cache' below for hand-wringing about + # how to handle this. + cached = <_Cached*>self._specials.get(key) + if cached == NULL: + return False + else: + self._cache.set(key, cached) cdef int i if cached.is_lex: for i in range(cached.length): @@ -201,9 +207,15 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + special_case = self._specials.get(orig_key) + if special_case is not NULL: + for i in range(special_case.length): + tokens.push_back(&special_case.data.tokens[i], False) + self._cache.set(orig_key, special_case) + else: + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -389,5 +401,29 @@ cdef class Tokenizer: cached.data.tokens = self.vocab.make_fused_token(substrings) key = hash_string(string) self._specials.set(key, cached) - self._cache.set(key, cached) self._rules[string] = substrings + # After changing the tokenization rules, the previous tokenization + # may be stale. + self.flush_cache() + + def flush_cache(self): + '''Flush the tokenizer's cache. May not free memory immediately. + + This is called automatically after `add_special_case`, but if you + write to the prefix or suffix functions, you'll have to call this + yourself. You may also need to flush the tokenizer cache after + changing the lex_attr_getter functions. + ''' + cdef hash_t key + for key in self._cache.keys(): + special_case = self._specials.get(key) + # Don't free data shared with special-case rules + if special_case is not NULL: + continue + cached = <_Cached*>self._cache.get(key) + if cached is not NULL: + self.mem.free(cached) + self._cache = PreshMap(1000) + # We could here readd the data from specials --- but if we loop over + # a bunch of special-cases, we'll get a quadratic behaviour. The extra + # lookup isn't so bad? Tough to tell. 
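The patch above makes add_special_case() flush the tokenizer cache
automatically; per the new flush_cache() docstring, changes to the
prefix/suffix functions or lex_attr_getters still require an explicit call.
A minimal sketch of the behaviour the regression test exercises (the _MATH_
rule is taken from that test):

    # Sketch of the scenario covered by test_issue1061: a special case added
    # after the tokenizer has already cached a chunk containing it.
    from spacy.en import English
    from spacy.symbols import ORTH

    tokenizer = English.Defaults.create_tokenizer()
    text = 'I like _MATH_ even _MATH_ when _MATH_.'

    doc = tokenizer(text)              # '_MATH_' is split up and cached here
    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
    doc = tokenizer(text)              # cache was flushed, the rule now applies
    assert '_MATH_' in [w.text for w in doc]

    # After modifying prefix/suffix search functions or lexical attribute
    # getters directly, flush the cache yourself:
    tokenizer.flush_cache()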
From d7560047c5038fb4bf8a3f3a52b7a02ab6e88b25 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:24:31 +0200 Subject: [PATCH 097/195] Fix version --- website/docs/api/annotation.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 30080dfd9..d4b01a819 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -39,7 +39,7 @@ p +h(2, "pos-tagging") Part-of-speech Tagging +infobox("Tip: Understanding tags") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of a tag. For example, | #[code spacy.explain("RB")] will return "adverb". @@ -71,7 +71,7 @@ p +h(2, "dependency-parsing") Syntactic Dependency Parsing +infobox("Tip: Understanding labels") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of a label. For example, | #[code spacy.explain("prt")] will return "particle". @@ -80,7 +80,7 @@ include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition +infobox("Tip: Understanding entity types") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of an entity label. For example, | #[code spacy.explain("LANGUAGE")] will return "any named language". From de25bad036c7ddcf30181e71c4c1750ff6b93c18 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:29:10 +0200 Subject: [PATCH 098/195] Use lower min version for requests dependency (fixes #1137) Ensure compatibility with docker-compose and other packages --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 20c587841..fe273ee53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ six html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 -requests>=2.13.0,<3.0.0 +requests>=2.11.0,<3.0.0 regex==2017.4.5 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 From 7c4bf9994d23f5b07ebed24034b8d8eee2eaa6f6 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:40:12 +0200 Subject: [PATCH 099/195] Add note on requirements and preventing model re-downloads (closes #1143) --- website/docs/usage/models.jade | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 42a3c0bbf..2d0f83663 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -198,6 +198,37 @@ p nlp = en_core_web_md.load() doc = nlp(u'This is a sentence.') ++h(3, "models-download") Downloading and requiring model dependencies + +p + | spaCy's built-in #[+api("cli#download") #[code download]] command + | is mostly intended as a convenient, interactive wrapper. It performs + | compatibility checks and prints detailed error messages and warnings. + | However, if you're downloading models as part of an automated build + | process, this only adds an unecessary layer of complexity. If you know + | which models your application needs, you should be specifying them directly. 
+ ++aside("Prevent re-downloading models") + | If you're installing a model from a URL, pip will usually re-download and + | re-install the package, even if you already have a matching + | version installed. To prevent this, simply add #[code #egg=] and the + | package name after the URL, e.g. #[code #egg=en_core_web_sm] or + | #[code #egg=en_core_web_sm-1.2.0]. This tells pip which package and version + | you're trying to download, and will skip the package if a matching + | installation is found. + +p + | Because all models are valid Python packages, you can add them to your + | application's #[code requirements.txt]. If you're running your own + | internal PyPi installation, you can simply upload the models there. pip's + | #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format] + | supports both package names to download via a PyPi server, as well as direct + | URLs. + ++code("requirements.txt", "text"). + spacy>=1.8.0,<2.0.0 + -e #{gh("spacy-models")}/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz#egg=en_core_web_sm-1.2.0 + +h(2, "own-models") Using your own models p From 796b2f4c1b49401f7cb490df174fe32f0186bc56 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:42:38 +0200 Subject: [PATCH 100/195] Remove print statements in tests --- spacy/tests/regression/test_issue693.py | 2 -- spacy/tests/regression/test_issue995.py | 1 - 2 files changed, 3 deletions(-) diff --git a/spacy/tests/regression/test_issue693.py b/spacy/tests/regression/test_issue693.py index e4d907716..5deeb3215 100644 --- a/spacy/tests/regression/test_issue693.py +++ b/spacy/tests/regression/test_issue693.py @@ -14,7 +14,5 @@ def test_issue693(EN): doc2 = EN(text2) chunks1 = [chunk for chunk in doc1.noun_chunks] chunks2 = [chunk for chunk in doc2.noun_chunks] - for word in doc1: - print(word.text, word.dep_, word.head.text) assert len(chunks1) == 2 assert len(chunks2) == 2 diff --git a/spacy/tests/regression/test_issue995.py b/spacy/tests/regression/test_issue995.py index 633e96fb5..108b434a2 100644 --- a/spacy/tests/regression/test_issue995.py +++ b/spacy/tests/regression/test_issue995.py @@ -15,7 +15,6 @@ def test_issue955(doc): '''Test that we don't have any nested noun chunks''' seen_tokens = set() for np in doc.noun_chunks: - print(np.text, np.root.text, np.root.dep_, np.root.tag_) for word in np: key = (word.i, word.text) assert key not in seen_tokens From d51d55bba673cbe35784589825ff88fd33bb1f73 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:43:16 +0200 Subject: [PATCH 101/195] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8c0e0afd3..d34c6f948 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.8.2' +__version__ = '1.9.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 78fcf56dd5ce0beeebdcd58c0082f78d751ba206 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:57:58 +0200 Subject: [PATCH 102/195] Update version pin for regex library --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fe273ee53..6f7d067a5 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -11,7 +11,7 @@ html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.11.0,<3.0.0 -regex==2017.4.5 +regex==2017.7.11 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index 89aaf8eba..61bd6b6bb 100755 --- a/setup.py +++ b/setup.py @@ -203,7 +203,7 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.4.5', + 'regex==2017.7.11', 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', From 5494605689758238e92703fa759a2f56cbb00598 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 16:09:50 +0200 Subject: [PATCH 103/195] Fiddle with regex pin --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6f7d067a5..9d6f34133 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.11.0,<3.0.0 -regex==2017.7.11 +regex>=2017.4.1,<2017.12.1 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index 61bd6b6bb..1b127962b 100755 --- a/setup.py +++ b/setup.py @@ -203,7 +203,7 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.7.11', + 'regex>=2017.4.1,<2017.12.1', 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', From 570964e67f0c7a12e64551cd4b71dca3c40b6ad8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 16:20:19 +0200 Subject: [PATCH 104/195] Update README.rst --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 0f3efc146..9d52a6c9d 100644 --- a/README.rst +++ b/README.rst @@ -63,11 +63,12 @@ MIT license. đŸ’Ŧ Where to ask questions ========================== +Please understand that we won't be able to provide individual support via email. We also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. + ====================== === **Bug reports** `GitHub issue tracker`_ **Usage questions** `StackOverflow`_, `Gitter chat`_, `Reddit user group`_ **General discussion** `Gitter chat`_, `Reddit user group`_ -**Commercial support** contact@explosion.ai ====================== === .. _GitHub issue tracker: https://github.com/explosion/spaCy/issues From e349271506b66e4257e8c69e02c664bbb0442fda Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 18:29:30 +0200 Subject: [PATCH 105/195] Increment version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index cb476541a..37a0b54dd 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,7 +12,7 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.8", + "SPACY_VERSION": "1.9", "LATEST_NEWS": { "url": "/docs/usage/models", "title": "The first official Spanish model is here!" From 864cefd3b267e08a843703687fcd0b2587c8d080 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 18:29:55 +0200 Subject: [PATCH 106/195] Update README.rst --- README.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9d52a6c9d..4efd5b1de 100644 --- a/README.rst +++ b/README.rst @@ -9,14 +9,14 @@ Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, Chinese and Japanese. 
It's commercial open-source software, released under the MIT license. -⭐ī¸ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes here. `_ +⭐ī¸ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes. `_ -đŸ’Ģ **Version 1.8 out now!** `Read the release notes here. `_ +đŸ’Ģ **Version 1.9 out now!** `Read the release notes here. `_ .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy :alt: Travis Build Status - + .. image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square :target: https://ci.appveyor.com/project/explosion/spacy :alt: Appveyor Build Status @@ -326,6 +326,7 @@ and ``--model`` are optional and enable additional tests: =========== ============== =========== Version Date Description =========== ============== =========== +`v1.9.0`_ ``2017-07-22`` Spanish model, alpha support for Norwegian & Japanese, and bug fixes `v1.8.2`_ ``2017-04-26`` French model and small improvements `v1.8.1`_ ``2017-04-23`` Saving, loading and training bug fixes `v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading @@ -359,6 +360,7 @@ Version Date Description `v0.93`_ ``2015-09-22`` Bug fixes to word vectors =========== ============== =========== +.. _v1.9.0: https://github.com/explosion/spaCy/releases/tag/v1.9.0 .. _v1.8.2: https://github.com/explosion/spaCy/releases/tag/v1.8.2 .. _v1.8.1: https://github.com/explosion/spaCy/releases/tag/v1.8.1 .. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 From 7e98a3613c4934709f3358594a928f476e2fa8f2 Mon Sep 17 00:00:00 2001 From: Gideon Dresdner Date: Sun, 6 Aug 2017 13:21:45 +0200 Subject: [PATCH 107/195] improve pipe, tee, izip explanation Use an example from an old issue https://github.com/explosion/spaCy/issues/172#issuecomment-183963403. --- website/docs/usage/processing-text.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade index 4bd6132d2..600654f65 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/processing-text.jade @@ -98,7 +98,8 @@ p | important metadata, e.g. a JSON document. To pair up the metadata | with the processed #[code Doc] object, you should use the tee | function to split the generator in two, and then #[code izip] the - | extra stream to the document stream. + | extra stream to the document stream. 
Here's an + | #[a(href="https://github.com/explosion/spaCy/issues/172#issuecomment-183963403")= "example"] +h(2, "own-annotations") Bringing your own annotations From d3b03f05441de59cfd45b7414d4aab6fd1b32242 Mon Sep 17 00:00:00 2001 From: Delirious Lettuce Date: Sun, 6 Aug 2017 21:31:39 -0600 Subject: [PATCH 108/195] Fix typos: * `auxillary` -> `auxiliary` * `consistute` -> `constitute` * `earlist` -> `earliest` * `prefered` -> `preferred` * `direcory` -> `directory` * `reuseable` -> `reusable` * `idiosyncracies` -> `idiosyncrasies` * `enviroment` -> `environment` * `unecessary` -> `unnecessary` * `yesteday` -> `yesterday` * `resouces` -> `resources` --- spacy/glossary.py | 4 ++-- website/docs/api/_annotation/_pos-tags.jade | 4 ++-- website/docs/api/features.jade | 2 +- website/docs/api/span.jade | 2 +- website/docs/usage/adding-languages.jade | 4 ++-- website/docs/usage/customizing-tokenizer.jade | 2 +- website/docs/usage/index.jade | 2 +- website/docs/usage/models.jade | 2 +- website/docs/usage/pos-tagging.jade | 2 +- website/docs/usage/saving-loading.jade | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/spacy/glossary.py b/spacy/glossary.py index 4df5264a6..ed1c22c21 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -60,7 +60,7 @@ GLOSSARY = { 'JJR': 'adjective, comparative', 'JJS': 'adjective, superlative', 'LS': 'list item marker', - 'MD': 'verb, modal auxillary', + 'MD': 'verb, modal auxiliary', 'NIL': 'missing tag', 'NN': 'noun, singular or mass', 'NNP': 'noun, proper singular', @@ -91,7 +91,7 @@ GLOSSARY = { 'NFP': 'superfluous punctuation', 'GW': 'additional word in multi-word expression', 'XX': 'unknown', - 'BES': 'auxillary "be"', + 'BES': 'auxiliary "be"', 'HVS': 'forms of "have"', diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index ea3a225bf..51db4f4e2 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -21,7 +21,7 @@ p +pos-row("$", "SYM", "SymType=currency", "symbol, currency") +pos-row("ADD", "X", "", "email") +pos-row("AFX", "ADJ", "Hyph=yes", "affix") - +pos-row("BES", "VERB", "", 'auxillary "be"') + +pos-row("BES", "VERB", "", 'auxiliary "be"') +pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating") +pos-row("CD", "NUM", "NumType=card", "cardinal number") +pos-row("DT", "DET", "determiner") @@ -35,7 +35,7 @@ p +pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative") +pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative") +pos-row("LS", "PUNCT", "NumType=ord", "list item marker") - +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary") + +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary") +pos-row("NFP", "PUNCT", "", "superfluous punctuation") +pos-row("NIL", "", "", "missing tag") +pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass") diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade index 018790145..21481cf65 100644 --- a/website/docs/api/features.jade +++ b/website/docs/api/features.jade @@ -18,7 +18,7 @@ p | consisting of the words to be processed. p - | Each state consists of the words on the stack (if any), which consistute + | Each state consists of the words on the stack (if any), which constitute | the current entity being constructed. We also have the current word, and | the two subsequent words. Finally, we also have the entities previously | built. 
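The processing-text change above defers to an external link for the tee/izip
pattern that keeps metadata aligned with nlp.pipe() output. A minimal sketch
of that pattern; the `records` input here is hypothetical.

    # Sketch: pair each Doc streaming out of nlp.pipe() with its metadata.
    from __future__ import print_function
    from itertools import tee
    try:
        from itertools import izip   # Python 2
    except ImportError:
        izip = zip                   # Python 3
    import spacy

    nlp = spacy.load('en')
    # Hypothetical (text, metadata) pairs, e.g. read from JSON lines.
    records = [('This is a text.', {'id': 1}),
               ('And another one.', {'id': 2})]

    texts, metadata = tee(records)   # split the stream in two
    texts = (text for text, meta in texts)
    metadata = (meta for text, meta in metadata)
    for doc, meta in izip(nlp.pipe(texts), metadata):
        print(meta['id'], len(doc), doc.is_parsed)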
diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 770ee3e9b..d2d3d0f27 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -222,7 +222,7 @@ p The sentence span that this span is a part of. p | The token within the span that's highest in the parse tree. If there's a - | tie, the earlist is prefered. + | tie, the earliest is preferred. +table(["Name", "Type", "Description"]) +footrow diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 30c4486b0..7d893b4eb 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -28,7 +28,7 @@ p | #[a(href="#word-vectors") word vectors]. +item - | #[strong Set up] a #[a(href="#model-directory") model direcory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. + | #[strong Set up] a #[a(href="#model-directory") model directory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. p | For some languages, you may also want to develop a solution for @@ -303,7 +303,7 @@ p p | Because languages can vary in quite arbitrary ways, spaCy avoids | organising the language data into an explicit inheritance hierarchy. - | Instead, reuseable functions and data are collected as atomic pieces in + | Instead, reusable functions and data are collected as atomic pieces in | the #[code spacy.language_data] package. +aside-code("Example"). diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index 354a56c22..ca5be9ef1 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -21,7 +21,7 @@ p +h(2, "special-cases") Adding special case tokenization rules p - | Most domains have at least some idiosyncracies that require custom + | Most domains have at least some idiosyncrasies that require custom | tokenization rules. Here's how to add a special case rule to an existing | #[+api("tokenizer") #[code Tokenizer]] instance: diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 9ad2fde5f..092c996b3 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -87,7 +87,7 @@ p | The other way to install spaCy is to clone its | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is | the common way if you want to make changes to the code base. You'll need to - | make sure that you have a development enviroment consisting of a Python + | make sure that you have a development environment consisting of a Python | distribution including header files, a compiler, | #[+a("https://pip.pypa.io/en/latest/installing/") pip], | #[+a("https://virtualenv.pypa.io/") virtualenv] and diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 2d0f83663..4951ea211 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -205,7 +205,7 @@ p | is mostly intended as a convenient, interactive wrapper. It performs | compatibility checks and prints detailed error messages and warnings. | However, if you're downloading models as part of an automated build - | process, this only adds an unecessary layer of complexity. If you know + | process, this only adds an unnecessary layer of complexity. If you know | which models your application needs, you should be specifying them directly. 
+aside("Prevent re-downloading models") diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index cded00b6c..3f22ab43f 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -50,7 +50,7 @@ p +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres] +row - +cell I read the paper yesteday + +cell I read the paper yesterday +cell read +cell read +cell verb diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 8978cce7a..56b218c29 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -58,7 +58,7 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The From 6e9e686568ab1f70d0b517e0d5f2bcbb894eb17a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 8 Aug 2017 01:27:15 +0900 Subject: [PATCH 109/195] Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Janome for MeCab to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated.
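A minimal usage sketch of what this enables (not part of the diff below; it assumes MeCab and the mecab-python3 bindings are installed, and the calls mirror the code and tests added in this patch):

    from spacy.ja import Japanese

    nlp = Japanese()
    text = u'...'                        # placeholder: any Japanese sentence
    doc = nlp.make_doc(text)
    analysis = [(w.text, w.tag_, w.pos_) for w in doc]
    # w.tag_ holds the raw Unidic tag (plus the extra disambiguation field where needed),
    # w.pos_ holds the Universal Dependencies tag it maps to via TAG_MAP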
-POLM --- spacy/ja/__init__.py | 92 +++++++++++++++++++++++++++++---- spacy/ja/tag_map.py | 97 +++++++++++++++++++++++++++++------ spacy/tests/conftest.py | 5 +- spacy/tests/ja/test_tagger.py | 10 ++++ 4 files changed, 177 insertions(+), 27 deletions(-) create mode 100644 spacy/tests/ja/test_tagger.py diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 1c85ded95..5f49f0b1b 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -5,37 +5,111 @@ from os import path from ..language import Language, BaseDefaults from ..tokenizer import Tokenizer +from ..tagger import Tagger from ..attrs import LANG from ..tokens import Doc from .language_data import * +import re +from collections import namedtuple + +ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) + class JapaneseTokenizer(object): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) try: - from janome.tokenizer import Tokenizer + import MeCab except ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: " - "https://github.com/mocobeta/janome") - self.tokenizer = Tokenizer() + raise ImportError("The Japanese tokenizer requires the MeCab library: " + "https://github.com/SamuraiT/mecab-python3") + self.tokenizer = MeCab.Tagger() def __call__(self, text): - words = [x.surface for x in self.tokenizer.tokenize(text)] + words = [x.surface for x in detailed_tokens(self.tokenizer, text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) +def resolve_pos(token): + """If necessary, add a field to the POS tag for UD mapping. + + Under Universal Dependencies, sometimes the same Unidic POS tag can + be mapped differently depending on the literal token or its context + in the sentence. This function adds information to the POS tag to + resolve ambiguous mappings. + """ + + # NOTE: This is a first take. The rules here are crude approximations. + # For many of these, full dependencies are needed to properly resolve + # PoS mappings. + + if token.part_of_speech == 'é€ŖäŊ“芞,*,*,*': + # determiner-likes get DET, otherwise ADJ + if re.match('^[こそあお此å…ļåŊŧ]ぎ', token.surface): + return token.part_of_speech + ',DET' + else: + return token.part_of_speech + ',ADJ' + return token.part_of_speech + +def detailed_tokens(tokenizer, text): + """Format Mecab output into a nice data structure, based on Janome.""" + + node = tokenizer.parseToNode(text) + node = node.next # first node is beginning of sentence and empty, skip it + words = [] + while node.posid != 0: + parts = node.feature.split(',') + pos = ','.join(parts[0:4]) + reading = parts[6] + base = parts[7] + surface = parts[8] + + words.append( ShortUnitWord(surface, base, pos) ) + node = node.next + return words + +class JapaneseTagger(object): + def __init__(self, vocab): + try: + import MeCab + except ImportError: + raise ImportError("The Japanese tagger requires the MeCab library: " + "https://github.com/SamuraiT/mecab-python3") + + self.tagger = Tagger(vocab) + self.tokenizer = MeCab.Tagger() + + def __call__(self, tokens): + # two parts to this: + # 1. get raw JP tags + # 2. add features to tags as necessary for UD + + # TODO: if the text has been tokenized, this info is already available + # How to set the data when tokenizing or save it for the tagger to find? 
+ + dtokens = detailed_tokens(self.tokenizer, tokens.text) + rawtags = list(map(resolve_pos, dtokens)) + self.tagger.tag_from_strings(tokens, rawtags) + class JapaneseDefaults(BaseDefaults): + tag_map = TAG_MAP + @classmethod def create_tokenizer(cls, nlp=None): return JapaneseTokenizer(cls, nlp) + @classmethod + def create_tagger(cls, tokenizer): + return JapaneseTagger(tokenizer.vocab) + class Japanese(Language): lang = 'ja' Defaults = JapaneseDefaults def make_doc(self, text): - words = self.tokenizer(text) - return Doc(self.vocab, words=words, spaces=[False]*len(words)) - - + words = [str(t) for t in self.tokenizer(text)] + doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + tagger = JapaneseDefaults.create_tagger(self.tokenizer) + tagger(doc) + return doc diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index f5b6b5040..609739c2f 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -3,22 +3,85 @@ from __future__ import unicode_literals from ..symbols import * - TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} + # Explanation of Unidic tags: + # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf + + # Universal Dependencies Mapping: + # http://universaldependencies.org/ja/overview/morphology.html + # http://universaldependencies.org/ja/pos/all.html + + "č¨˜åˇ,一čˆŦ,*,*":{POS: PUNCT}, # this includes characters used to represent sounds like ドãƒŦミ + "č¨˜åˇ,文字,*,*":{POS: PUNCT}, # this is for Greek and Latin characters used as sumbols, as in math + + "æ„Ÿå‹•čŠž,フã‚Ŗナãƒŧ,*,*": {POS: INTJ}, + "æ„Ÿå‹•čŠž,一čˆŦ,*,*": {POS: INTJ}, + + # this is specifically for unicode full-width space + "įŠēį™Ŋ,*,*,*": {POS: X}, + + "åŊĸįŠļ詞,一čˆŦ,*,*":{POS: ADJ}, + "åŊĸįŠļ詞,ã‚ŋãƒĒ,*,*":{POS: ADJ}, + "åŊĸįŠļ詞,åŠŠå‹•čŠžčĒžåšš,*,*":{POS: ADJ}, + "åŊĸåŽščŠž,一čˆŦ,*,*":{POS: ADJ}, + "åŊĸåŽščŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*":{POS: AUX}, # XXX ADJ if alone, AUX otherwise + + "åŠŠčŠž,æ ŧåŠŠčŠž,*,*":{POS: ADP}, + "åŠŠčŠž,äŋ‚åŠŠčŠž,*,*":{POS: ADP}, + "åŠŠčŠž,įĩ‚åŠŠčŠž,*,*":{POS: PART}, + "åŠŠčŠž,æē–äŊ“åŠŠčŠž,*,*":{POS: SCONJ}, # ぎ as in čĩ°ã‚‹ãŽãŒé€Ÿã„ + "åŠŠčŠž,æŽĨįļšåŠŠčŠž,*,*":{POS: SCONJ}, # verb ending ãĻ + "åŠŠčŠž,å‰¯åŠŠčŠž,*,*":{POS: PART}, # ばかり, つつ after a verb + "åŠŠå‹•čŠž,*,*,*":{POS: AUX}, + "æŽĨįļščŠž,*,*,*":{POS: SCONJ}, # XXX: might need refinement + + "æŽĨ頭辞,*,*,*":{POS: NOUN}, + "æŽĨå°žčžž,åŊĸįŠļ詞įš„,*,*":{POS: ADJ}, # ãŒãĄ, チック + "æŽĨå°žčžž,åŊĸåŽščŠžįš„,*,*":{POS: ADJ}, # -らしい + "æŽĨå°žčžž,å‹•čŠžįš„,*,*":{POS: NOUN}, # -じãŋ + "æŽĨå°žčžž,åčŠžįš„,ã‚ĩ変可čƒŊ,*":{POS: NOUN}, # XXX see åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,* + "æŽĨå°žčžž,åčŠžįš„,一čˆŦ,*":{POS: NOUN}, + "æŽĨå°žčžž,åčŠžįš„,åŠŠæ•°čŠž,*":{POS: NOUN}, + "æŽĨå°žčžž,åčŠžįš„,å‰¯čŠžå¯čƒŊ,*":{POS: NOUN}, # -垌, -過ぎ + + "äģŖåčŠž,*,*,*":{POS: PRON}, + "å‹•čŠž,一čˆŦ,*,*":{POS: VERB}, + "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*":{POS: AUX}, # XXX VERB if alone, AUX otherwise + "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*,AUX":{POS: AUX}, + "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*,VERB":{POS: VERB}, + "å‰¯čŠž,*,*,*":{POS: ADV}, + + "čŖœåŠŠč¨˜åˇ,īŧĄīŧĄ,一čˆŦ,*":{POS: SYM}, # text art + "čŖœåŠŠč¨˜åˇ,īŧĄīŧĄ,éĄ”æ–‡å­—,*":{POS: SYM}, # kaomoji + "čŖœåŠŠč¨˜åˇ,一čˆŦ,*,*":{POS: SYM}, + "čŖœåŠŠč¨˜åˇ,æ‹Ŧåŧ§é–‹,*,*":{POS: PUNCT}, # open bracket + 
"čŖœåŠŠč¨˜åˇ,æ‹Ŧåŧ§é–‰,*,*":{POS: PUNCT}, # close bracket + "čŖœåŠŠč¨˜åˇ,åĨį‚š,*,*":{POS: PUNCT}, # period or other EOS marker + "čŖœåŠŠč¨˜åˇ,čĒ­į‚š,*,*":{POS: PUNCT}, # comma + + "åčŠž,å›ēæœ‰åčŠž,一čˆŦ,*":{POS: PROPN}, # general proper noun + "åčŠž,å›ēæœ‰åčŠž,äēē名,一čˆŦ":{POS: PROPN}, # person's name + "åčŠž,å›ēæœ‰åčŠž,äēē名,姓":{POS: PROPN}, # surname + "åčŠž,å›ēæœ‰åčŠž,äēē名,名":{POS: PROPN}, # first name + "åčŠž,å›ēæœ‰åčŠž,地名,一čˆŦ":{POS: PROPN}, # place name + "åčŠž,å›ēæœ‰åčŠž,地名,å›Ŋ":{POS: PROPN}, # country name + + "åčŠž,åŠŠå‹•čŠžčĒžåšš,*,*":{POS: AUX}, + "åčŠž,æ•°čŠž,*,*":{POS: NUM}, # includes Chinese numerals + + "åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*":{POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + "åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*,NOUN":{POS: NOUN}, + "åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*,VERB":{POS: VERB}, + + "åčŠž,æ™Žé€šåčŠž,ã‚ĩ変åŊĸįŠļčŠžå¯čƒŊ,*":{POS: NOUN}, # ex: 下手 + "åčŠž,æ™Žé€šåčŠž,一čˆŦ,*":{POS: NOUN}, + "åčŠž,æ™Žé€šåčŠž,åŊĸįŠļčŠžå¯čƒŊ,*":{POS: NOUN}, # XXX: sometimes ADJ in UDv2 + "åčŠž,æ™Žé€šåčŠž,åŊĸįŠļčŠžå¯čƒŊ,*,NOUN":{POS: NOUN}, + "åčŠž,æ™Žé€šåčŠž,åŊĸįŠļčŠžå¯čƒŊ,*,ADJ":{POS: ADJ}, + "åčŠž,æ™Žé€šåčŠž,åŠŠæ•°čŠžå¯čƒŊ,*":{POS: NOUN}, # counter / unit + "åčŠž,æ™Žé€šåčŠž,å‰¯čŠžå¯čƒŊ,*":{POS: NOUN}, + + "é€ŖäŊ“芞,*,*,*":{POS: ADJ}, # XXX note こぎ、そぎ etc. should be DET + "é€ŖäŊ“芞,*,*,*,ADJ":{POS: ADJ}, + "é€ŖäŊ“芞,*,*,*,DET":{POS: DET}, } diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6e00b1513..52b9bdd57 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -79,9 +79,12 @@ def fi_tokenizer(): @pytest.fixture def ja_tokenizer(): - janome = pytest.importorskip("janome") + pytest.importorskip("MeCab") return Japanese.Defaults.create_tokenizer() +@pytest.fixture +def japanese(): + return Japanese() @pytest.fixture def sv_tokenizer(): diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py new file mode 100644 index 000000000..43259fb49 --- /dev/null +++ b/spacy/tests/ja/test_tagger.py @@ -0,0 +1,10 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_japanese_tagger(japanese): + doc = japanese.make_doc("ã“ãŽãƒ•ã‚Ąã‚¤ãƒĢãĢは小さãĒテ゚トがå…ĨãŖãĻいるよ") + # note these both have the same raw tag, 'é€ŖäŊ“芞,*,*,*' + assert doc[0].pos_ == "DET" + assert doc[4].pos_ == "ADJ" From e3738aba0dc562cdd87133fdfd58a9741b0c08f2 Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Tue, 15 Aug 2017 21:50:09 +0100 Subject: [PATCH 110/195] Fix broken tutorial link on website --- website/docs/usage/_data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 703a185d6..c2ce271aa 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -313,7 +313,7 @@ "author": "Clark Grubb" }, "A very (very) short primer on spacy.io": { - "url": "http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html", + "url": "https://web.archive.org/web/20161219095416/http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html", "author": "Nimrod Milo " } }, From 234a8a75917aa01c48e06ed4767d6f47cdfead22 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 21 Aug 2017 00:21:45 +0900 Subject: [PATCH 111/195] =?UTF-8?q?Change=20default=20tag=20for=20?= =?UTF-8?q?=E5=8B=95=E8=A9=9E,=E9=9D=9E=E8=87=AA=E7=AB=8B=E5=8F=AF?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Example 
of this is いる in these sentences: åŊŧはそこãĢいる。# should be VERB åŊŧはåē•ãĢįĢ‹ãŖãĻいる。# should be AUX Unclear which case is more numerous - need to check a large corpus - but in keeping with the other ambiguous tags, this is mapped to the "dominant" or first part of the tag. -POLM --- spacy/ja/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 609739c2f..8436f07ff 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -46,7 +46,7 @@ TAG_MAP = { "äģŖåčŠž,*,*,*":{POS: PRON}, "å‹•čŠž,一čˆŦ,*,*":{POS: VERB}, - "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*":{POS: AUX}, # XXX VERB if alone, AUX otherwise + "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*":{POS: VERB}, # XXX VERB if alone, AUX otherwise "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*,AUX":{POS: AUX}, "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*,VERB":{POS: VERB}, "å‰¯čŠž,*,*,*":{POS: ADV}, From c5c3f4c7d9a9c715110040d1e75e08d8a7b8dc20 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:08:40 +0200 Subject: [PATCH 112/195] Use more generous .env ignore rule --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2209f5b4a..84ced41f8 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ Profile.prof .python-version __pycache__/ *.py[cod] +.env*/ .env/ .env2/ .env3/ From edc596d9a77cf0281b3641297fd5abd62a74edf2 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:11:36 +0200 Subject: [PATCH 113/195] Add missing tokenizer exceptions (resolves #1281) --- spacy/en/tokenizer_exceptions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index d9aa01734..29447314a 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -276,7 +276,10 @@ for verb_data in [ {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, {ORTH: "was", LEMMA: "be"}, - {ORTH: "were", LEMMA: "be"} + {ORTH: "were", LEMMA: "be"}, + {ORTH: "have"}, + {ORTH: "has", LEMMA: "have"}, + {ORTH: "dare"} ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() From dcff10abe98c844f2f66ff22835c9eb8ea8e7138 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:11:47 +0200 Subject: [PATCH 114/195] Add regression test for #1281 --- spacy/tests/regression/test_issue1281.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 spacy/tests/regression/test_issue1281.py diff --git a/spacy/tests/regression/test_issue1281.py b/spacy/tests/regression/test_issue1281.py new file mode 100644 index 000000000..17307b1d6 --- /dev/null +++ b/spacy/tests/regression/test_issue1281.py @@ -0,0 +1,13 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', [ + "She hasn't done the housework.", + "I haven't done it before.", + "you daren't do that"]) +def test_issue1281(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[2].text == "n't" From c435f748d743b1ee407c02c14223679769fa52b2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:01:28 +0900 Subject: [PATCH 115/195] Put Mecab import in utility function --- spacy/ja/__init__.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 5f49f0b1b..c82591f58 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -16,14 +16,21 @@ from collections import namedtuple ShortUnitWord = 
namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) +def try_mecab_import(): + """Mecab is required for Japanese support, so check for it. + + It it's not available blow up and explain how to fix it.""" + try: + import MeCab + return MeCab + except ImportError: + raise ImportError("Japanese support requires MeCab: " + "https://github.com/SamuraiT/mecab-python3") + class JapaneseTokenizer(object): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - try: - import MeCab - except ImportError: - raise ImportError("The Japanese tokenizer requires the MeCab library: " - "https://github.com/SamuraiT/mecab-python3") + MeCab = try_mecab_import() self.tokenizer = MeCab.Tagger() def __call__(self, text): @@ -70,12 +77,7 @@ def detailed_tokens(tokenizer, text): class JapaneseTagger(object): def __init__(self, vocab): - try: - import MeCab - except ImportError: - raise ImportError("The Japanese tagger requires the MeCab library: " - "https://github.com/SamuraiT/mecab-python3") - + MeCab = try_mecab_import() self.tagger = Tagger(vocab) self.tokenizer = MeCab.Tagger() From 53e17296e98ba8db1b9b99fec0a39aaa56d12e5c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:01:49 +0900 Subject: [PATCH 116/195] Fix pronoun handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missed this case earlier. é€ŖäŊ“芞 have three classes for UD purposes: - そぎ -> DET - それ -> PRON - 同じ -> ADJ -POLM --- spacy/ja/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index c82591f58..8cd48ad84 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -51,9 +51,10 @@ def resolve_pos(token): # PoS mappings. if token.part_of_speech == 'é€ŖäŊ“芞,*,*,*': - # determiner-likes get DET, otherwise ADJ if re.match('^[こそあお此å…ļåŊŧ]ぎ', token.surface): return token.part_of_speech + ',DET' + if re.match('^[こそあお此å…ļåŊŧ]', token.surface): + return token.part_of_speech + ',PRON' else: return token.part_of_speech + ',ADJ' return token.part_of_speech From adfd98731655cc3f351e0042353ea850ef7d23c2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:02:55 +0900 Subject: [PATCH 117/195] Update the TAG_MAP --- spacy/ja/__init__.py | 3 --- spacy/ja/tag_map.py | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 8cd48ad84..dfd0bca5b 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -87,9 +87,6 @@ class JapaneseTagger(object): # 1. get raw JP tags # 2. add features to tags as necessary for UD - # TODO: if the text has been tokenized, this info is already available - # How to set the data when tokenizing or save it for the tagger to find? - dtokens = detailed_tokens(self.tokenizer, tokens.text) rawtags = list(map(resolve_pos, dtokens)) self.tagger.tag_from_strings(tokens, rawtags) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 8436f07ff..191865ed2 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -81,7 +81,8 @@ TAG_MAP = { "åčŠž,æ™Žé€šåčŠž,åŠŠæ•°čŠžå¯čƒŊ,*":{POS: NOUN}, # counter / unit "åčŠž,æ™Žé€šåčŠž,å‰¯čŠžå¯čƒŊ,*":{POS: NOUN}, - "é€ŖäŊ“芞,*,*,*":{POS: ADJ}, # XXX note こぎ、そぎ etc. 
should be DET + "é€ŖäŊ“芞,*,*,*":{POS: ADJ}, # XXX this has exceptions based on literal token "é€ŖäŊ“芞,*,*,*,ADJ":{POS: ADJ}, + "é€ŖäŊ“芞,*,*,*,PRON":{POS: PRON}, "é€ŖäŊ“芞,*,*,*,DET":{POS: DET}, } From bcf2b9b4f5e12951394bbc2e77daf5a1763ec9e5 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:03:11 +0900 Subject: [PATCH 118/195] Update tagger & tokenizer tests Tagger is now parametrized and has two sentences with more tag coverage. The tokenizer tests are updated to reflect differences in tokenization between IPAdic and Unidic. -POLM --- spacy/tests/ja/test_tagger.py | 33 +++++++++++++++++++++++++++----- spacy/tests/ja/test_tokenizer.py | 4 ++-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py index 43259fb49..629cc795f 100644 --- a/spacy/tests/ja/test_tagger.py +++ b/spacy/tests/ja/test_tagger.py @@ -3,8 +3,31 @@ from __future__ import unicode_literals import pytest -def test_japanese_tagger(japanese): - doc = japanese.make_doc("ã“ãŽãƒ•ã‚Ąã‚¤ãƒĢãĢは小さãĒテ゚トがå…ĨãŖãĻいるよ") - # note these both have the same raw tag, 'é€ŖäŊ“芞,*,*,*' - assert doc[0].pos_ == "DET" - assert doc[4].pos_ == "ADJ" +TAGGER_TESTS = [ + ('あれãĒらそこãĢあるよ', + (('äģŖåčŠž,*,*,*', 'PRON'), + ('åŠŠå‹•čŠž,*,*,*', 'AUX'), + ('äģŖåčŠž,*,*,*', 'PRON'), + ('åŠŠčŠž,æ ŧåŠŠčŠž,*,*', 'ADP'), + ('å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*', 'VERB'), + ('åŠŠčŠž,įĩ‚åŠŠčŠž,*,*', 'PART'))), + ('ã“ãŽãƒ•ã‚Ąã‚¤ãƒĢãĢは小さãĒテ゚トがå…ĨãŖãĻいるよ', + (('é€ŖäŊ“芞,*,*,*,DET', 'DET'), + ('åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*', 'NOUN'), + ('åŠŠčŠž,æ ŧåŠŠčŠž,*,*', 'ADP'), + ('åŠŠčŠž,äŋ‚åŠŠčŠž,*,*', 'ADP'), + ('é€ŖäŊ“芞,*,*,*,ADJ', 'ADJ'), + ('åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*', 'NOUN'), + ('åŠŠčŠž,æ ŧåŠŠčŠž,*,*', 'ADP'), + ('å‹•čŠž,一čˆŦ,*,*', 'VERB'), + ('åŠŠčŠž,æŽĨįļšåŠŠčŠž,*,*', 'SCONJ'), + ('å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*', 'VERB'), + ('åŠŠčŠž,įĩ‚åŠŠčŠž,*,*', 'PART'))) +] + +@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS) +def test_japanese_tagger(japanese, text, expected_tags): + tokens = japanese.make_doc(text) + assert len(tokens) == len(expected_tags) + for token, res in zip(tokens, expected_tags): + assert token.tag_ == res[0] and token.pos_ == res[1] diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py index 58700b353..17411aee2 100644 --- a/spacy/tests/ja/test_tokenizer.py +++ b/spacy/tests/ja/test_tokenizer.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import pytest TOKENIZER_TESTS = [ - ("æ—ĨæœŦčĒžã ã‚ˆ", ['æ—ĨæœŦčĒž', 'だ', 'よ']), + ("æ—ĨæœŦčĒžã ã‚ˆ", ['æ—ĨæœŦ', 'čĒž', 'だ', 'よ']), ("æąäēŦã‚ŋワãƒŧぎčŋ‘くãĢäŊã‚“でいぞす。", ['æąäēŦ', 'ã‚ŋワãƒŧ', 'ぎ', 'čŋ‘く', 'ãĢ', 'äŊã‚“', 'で', 'い', 'ぞす', '。']), ("吞čŧŠã¯įŒĢである。", ['吞čŧŠ', 'は', 'įŒĢ', 'で', 'ある', '。']), - ("月ãĢäģŖわãŖãĻ、おäģ•įŊŽãã‚ˆ!", ['月', 'ãĢ', 'äģŖわãŖ', 'ãĻ', '、', 'おäģ•įŊŽã', 'よ', '!']), + ("月ãĢäģŖわãŖãĻ、おäģ•įŊŽãã‚ˆ!", ['月', 'ãĢ', 'äģŖわãŖ', 'ãĻ', '、', 'お', 'äģ•įŊŽã', 'よ', '!']), ("ã™ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ãŽã†ãĄ", ['すもも', 'も', 'もも', 'も', 'もも', 'ぎ', 'ã†ãĄ']) ] From 95050201ce095e2328be383beec3025a5e64fb0a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 21:30:59 +0900 Subject: [PATCH 119/195] Add importorskip for Japanese fixture --- spacy/tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 52b9bdd57..5fad6e429 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -84,6 +84,7 @@ def ja_tokenizer(): @pytest.fixture def japanese(): 
+ pytest.importorskip("MeCab") return Japanese() @pytest.fixture From 884ba168a88699bedecf55888b670cbf2040a539 Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Wed, 23 Aug 2017 21:18:53 -0700 Subject: [PATCH 120/195] Capture more noun chunks --- spacy/syntax/iterators.pyx | 2 +- spacy/tests/parser/test_noun_chunks.py | 30 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index 0fe724622..14dba5f9b 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -9,7 +9,7 @@ def english_noun_chunks(obj): Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos', 'attr', 'ROOT'] doc = obj.doc # Ensure works on both Doc and Span. np_deps = [doc.vocab.strings[label] for label in labels] diff --git a/spacy/tests/parser/test_noun_chunks.py b/spacy/tests/parser/test_noun_chunks.py index 5e8c7659a..ddebca8b8 100644 --- a/spacy/tests/parser/test_noun_chunks.py +++ b/spacy/tests/parser/test_noun_chunks.py @@ -47,6 +47,36 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer): assert chunks[1].text_with_ws == "another phrase " +def test_parser_noun_chunks_appositional_modifiers(en_tokenizer): + text = "Sam, my brother, arrived to the house." + heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4] + tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.'] + deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "Sam " + assert chunks[1].text_with_ws == "my brother " + assert chunks[2].text_with_ws == "the house " + + +def test_parser_noun_chunks_dative(en_tokenizer): + text = "She gave Bob a raise." + heads = [1, 0, -1, 1, -3, -4] + tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.'] + deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "She " + assert chunks[1].text_with_ws == "Bob " + assert chunks[2].text_with_ws == "a raise " + + def test_parser_noun_chunks_standard_de(de_tokenizer): text = "Eine Tasse steht auf dem Tisch." heads = [1, 1, 0, -1, 1, -2, -4] From 8b3e1f7b5b2d29ca3b70e5681daa095574b694be Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 29 Aug 2017 23:58:42 +0900 Subject: [PATCH 121/195] Handle out-of-vocab words Wasn't handling words out of the tokenizer dictionary vocabulary properly. This adds a fix and test for that. 
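Concretely: for a word that isn't in the tokenizer dictionary, MeCab's node.feature string can carry fewer comma-separated fields, so the old parts[6]/parts[7] lookups fail. In sketch form (parse_feature is a hypothetical name used only for illustration; the body mirrors the guard added in the diff below):

    def parse_feature(surface, feature):
        # hypothetical stand-alone helper; same logic as the hunk in detailed_tokens()
        base = surface                        # fall back to the surface form for OOV words
        parts = feature.split(',')
        pos = ','.join(parts[0:4])
        if len(parts) > 6:
            # reading and base form are only present for words in the tokenizer dictionary
            reading = parts[6]
            base = parts[7]
        return pos, base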
-POLM --- spacy/ja/__init__.py | 10 +++++++--- spacy/tests/ja/test_tagger.py | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index dfd0bca5b..2f85406c0 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text): node = node.next # first node is beginning of sentence and empty, skip it words = [] while node.posid != 0: + surface = node.surface + base = surface parts = node.feature.split(',') pos = ','.join(parts[0:4]) - reading = parts[6] - base = parts[7] - surface = parts[8] + + if len(parts) > 6: + # this information is only available for words in the tokenizer dictionary + reading = parts[6] + base = parts[7] words.append( ShortUnitWord(surface, base, pos) ) node = node.next diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py index 629cc795f..85f653836 100644 --- a/spacy/tests/ja/test_tagger.py +++ b/spacy/tests/ja/test_tagger.py @@ -22,7 +22,12 @@ TAGGER_TESTS = [ ('å‹•čŠž,一čˆŦ,*,*', 'VERB'), ('åŠŠčŠž,æŽĨįļšåŠŠčŠž,*,*', 'SCONJ'), ('å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*', 'VERB'), - ('åŠŠčŠž,įĩ‚åŠŠčŠž,*,*', 'PART'))) + ('åŠŠčŠž,įĩ‚åŠŠčŠž,*,*', 'PART'))), + ('プププナãƒŗドãĢčĄŒããŸã„', + (('åčŠž,æ™Žé€šåčŠž,一čˆŦ,*', 'NOUN'), + ('åŠŠčŠž,æ ŧåŠŠčŠž,*,*', 'ADP'), + ('å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*', 'VERB'), + ('åŠŠå‹•čŠž,*,*,*', 'AUX'))) ] @pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS) From a6d9fb5bb65066887e5a7e5d44b078e722b2b002 Mon Sep 17 00:00:00 2001 From: Vimos Tan Date: Wed, 30 Aug 2017 14:49:14 +0800 Subject: [PATCH 122/195] fix issue #1292 --- .../tokenizer/test_customized_tokenizer.py | 46 +++++++++++++++++++ spacy/tokenizer.pyx | 3 +- 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/tokenizer/test_customized_tokenizer.py diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py new file mode 100644 index 000000000..97a7db64c --- /dev/null +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ... import load +from ...tokenizer import Tokenizer +from ... import util + +import pytest + + +def test_customized_tokenizer_handles_infixes(): + def custom_tokenizer(nlp_model): + prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + custom_infixes = ['\.\.\.+', + '(?<=[0-9])-(?=[0-9])', + # '(?<=[0-9]+),(?=[0-9]+)', + '[0-9]+(,[0-9]+)+', + u'[\[\]!&:,()\*—–\/-]'] + + infix_re = util.compile_infix_regex(custom_infixes) + + # infix_re = re.compile(ur'[\[\]!&:,()]') + + tokenizer = Tokenizer(nlp_model.vocab, + nlp_model.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None) + return lambda text: tokenizer(text) + + nlp = load('en', create_make_doc=custom_tokenizer) + + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." + context = [word.text for word in nlp(sentence)] + assert context == [u'The', u'8', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', + u'for', + u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] + + # the trailing '-' may cause Assertion Error + sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." 
+ context = [word.text for word in nlp(sentence)] + assert context == [u'The', u'8', u'-', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', + u'for', + u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 276f0ef20..799e4bdaa 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -312,7 +312,8 @@ cdef class Tokenizer: start = infix_end span = string[start:] - tokens.push_back(self.vocab.get(tokens.mem, span), False) + if span: + tokens.push_back(self.vocab.get(tokens.mem, span), False) cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): lexeme = deref(it) From 9bffcaa73df60794c63f428f5f83f06bd5a271e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 1 Sep 2017 21:16:56 +0200 Subject: [PATCH 123/195] Update test to make it slightly more direct The `nlp` container should be unnecessary here. If so, we can test the tokenizer class just a little more directly. --- .../tokenizer/test_customized_tokenizer.py | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 97a7db64c..695f8c649 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -1,46 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals -from ... import load +from ...lang.en import English from ...tokenizer import Tokenizer from ... import util import pytest +@pytest.fixture +def tokenizer(en_vocab): + prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + custom_infixes = ['\.\.\.+', + '(?<=[0-9])-(?=[0-9])', + # '(?<=[0-9]+),(?=[0-9]+)', + '[0-9]+(,[0-9]+)+', + u'[\[\]!&:,()\*—–\/-]'] -def test_customized_tokenizer_handles_infixes(): - def custom_tokenizer(nlp_model): - prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) - custom_infixes = ['\.\.\.+', - '(?<=[0-9])-(?=[0-9])', - # '(?<=[0-9]+),(?=[0-9]+)', - '[0-9]+(,[0-9]+)+', - u'[\[\]!&:,()\*—–\/-]'] - - infix_re = util.compile_infix_regex(custom_infixes) - - # infix_re = re.compile(ur'[\[\]!&:,()]') - - tokenizer = Tokenizer(nlp_model.vocab, - nlp_model.Defaults.tokenizer_exceptions, - prefix_re.search, - suffix_re.search, - infix_re.finditer, - token_match=None) - return lambda text: tokenizer(text) - - nlp = load('en', create_make_doc=custom_tokenizer) + infix_re = util.compile_infix_regex(custom_infixes) + return Tokenizer(en_vocab, + English.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None) +def test_customized_tokenizer_handles_infixes(tokenizer): sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." - context = [word.text for word in nlp(sentence)] + context = [word.text for word in tokenizer(sentence)] assert context == [u'The', u'8', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', u'for', u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] # the trailing '-' may cause Assertion Error sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." 
- context = [word.text for word in nlp(sentence)] + context = [word.text for word in tokenizer(sentence)] assert context == [u'The', u'8', u'-', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', u'for', u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] From d61c117081a57f7788e7e709abfd9adcd6e39df8 Mon Sep 17 00:00:00 2001 From: Eric Zhao Date: Sun, 3 Sep 2017 12:16:59 -0700 Subject: [PATCH 124/195] Lowest common ancestor matrix for spans and docs Added functionality for spans and docs to get lowest common ancestor matrix by simply calling: doc.get_lca_matrix() or doc[:3].get_lca_matrix(). Corresponding unit tests were also added under spacy/tests/doc and spacy/tests/spans. Designed to address: https://github.com/explosion/spaCy/issues/969. --- spacy/tests/doc/test_doc_api.py | 7 +++++ spacy/tests/spans/test_span.py | 11 +++++++ spacy/tokens/doc.pyx | 43 +++++++++++++++++++++++++++ spacy/tokens/span.pyx | 52 +++++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 1bc534ecd..d1a6316d5 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -216,6 +216,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): doc = en_tokenizer(text) assert doc.has_vector +def test_lowest_common_ancestor(en_tokenizer): + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc.get_lca_matrix() + assert(lca[1, 1] == 1) + assert(lca[0, 1] == 2) + assert(lca[1, 2] == 2) def test_parse_tree(en_tokenizer): """Tests doc.print_tree() method.""" diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index d22fa52ae..29aefe5c7 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -54,6 +54,17 @@ def test_spans_span_sent(doc): assert doc[6:7].sent.root.left_edge.text == 'This' +def test_spans_lca_matrix(en_tokenizer): + """Test span's lca matrix generation""" + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc[:2].get_lca_matrix() + assert(lca[0, 0] == 0) + assert(lca[0, 1] == -1) + assert(lca[1, 0] == -1) + assert(lca[1, 1] == 1) + + def test_spans_default_sentiment(en_tokenizer): """Test span.sentiment property's default averaging behaviour""" text = "good stuff bad stuff" diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ca5a3d696..aa888382e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -614,6 +614,49 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy doc. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. 
+ ''' + + def __pairwise_lca(token_j, token_k, lca_matrix): + if lca_matrix[token_j.i][token_k.i] != -2: + return lca_matrix[token_j.i][token_k.i] + elif token_j == token_k: + lca_index = token_j.i + elif token_k.head == token_j: + lca_index = token_j.i + elif token_j.head == token_k: + lca_index = token_k.i + elif (token_j.head == token_j) and (token_k.head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix) + lca_matrix[token_j.i][token_k.i] = lca_index + lca_matrix[token_k.i][token_j.i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + + def to_bytes(self): """ Serialize, producing a byte string. diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index d8890addc..ae28f698a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -130,6 +130,58 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy span. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. + ''' + + def __pairwise_lca(token_j, token_k, lca_matrix, margins): + offset = margins[0] + token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k + token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j + token_j_i = token_j.i - offset + token_k_i = token_k.i - offset + + if lca_matrix[token_j_i][token_k_i] != -2: + return lca_matrix[token_j_i][token_k_i] + elif token_j == token_k: + lca_index = token_j_i + elif token_k_head == token_j: + lca_index = token_j_i + elif token_j_head == token_k: + lca_index = token_k_i + elif (token_j_head == token_j) and (token_k_head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) + + lca_matrix[token_j_i][token_k_i] = lca_index + lca_matrix[token_k_i][token_j_i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + margins = [self.start, self.end] + + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + + + cpdef int _recalculate_indices(self) except -1: if self.end > self.doc.length \ or self.doc.c[self.start].idx != self.start_char \ From e8a26ebfabec51327b2948fba95d6fa87f77eaa5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 15:43:52 +0200 Subject: [PATCH 125/195] Add efficiency note to new get_lca_matrix() method --- spacy/tokens/doc.pyx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aa888382e..aca35a73f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -626,7 +626,14 @@ cdef class Doc: impact on performance is negligible given the natural 
limitations on the depth of a typical human sentence. ''' - + # Efficiency notes: + # + # We can easily improve the performance here by iterating in Cython. + # To loop over the tokens in Cython, the easiest way is: + # for token in doc.c[:doc.c.length]: + # head = token + token.head + # Both token and head will be TokenC* here. The token.head attribute + # is an integer offset. def __pairwise_lca(token_j, token_k, lca_matrix): if lca_matrix[token_j.i][token_k.i] != -2: return lca_matrix[token_j.i][token_k.i] @@ -649,7 +656,7 @@ cdef class Doc: lca_matrix.fill(-2) for j in range(len(self)): token_j = self[j] - for k in range(len(self)): + for k in range(j, len(self)): token_k = self[k] lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) lca_matrix[k][j] = lca_matrix[j][k] From c68f188eb035ed67e2df905dd5e483f0261a8ace Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 18:59:36 +0200 Subject: [PATCH 126/195] Fix error on test --- spacy/tests/tokenizer/test_customized_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 695f8c649..19909ceba 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from ...lang.en import English +from ...en import English from ...tokenizer import Tokenizer from ... import util From 45029a550e128e887fe1a6d826c04923991d98e2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 20:13:13 +0200 Subject: [PATCH 127/195] Fix customized-tokenizer tests --- spacy/tests/tokenizer/test_customized_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 19909ceba..855f3386c 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -9,8 +9,8 @@ import pytest @pytest.fixture def tokenizer(en_vocab): - prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + prefix_re = util.compile_prefix_regex(English.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(English.Defaults.suffixes) custom_infixes = ['\.\.\.+', '(?<=[0-9])-(?=[0-9])', # '(?<=[0-9]+),(?=[0-9]+)', From 7692b8c071af51165c732474978b032ca85f262f Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 12 Sep 2017 16:23:47 +0800 Subject: [PATCH 128/195] Update __init__.py Set the "cut_all" parameter to False, or jieba will return ALL POSSIBLE word segmentations. --- spacy/zh/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py index 0f407dec6..bde0054b5 100644 --- a/spacy/zh/__init__.py +++ b/spacy/zh/__init__.py @@ -7,6 +7,6 @@ class Chinese(Language): def make_doc(self, text): import jieba - words = list(jieba.cut(text, cut_all=True)) + words = list(jieba.cut(text, cut_all=False)) words=[x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From 1f1f35dcd07d419a2aca449c0ef738e098e37b68 Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 19 Sep 2017 16:57:24 +0800 Subject: [PATCH 129/195] Add Chinese punctuation Add Chinese punctuation. 
--- spacy/language_data/punctuation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index f23b15bbc..fe636fa4b 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -19,11 +19,13 @@ _CURRENCY = r""" _QUOTES = r""" ' '' " ” “ `` ` ‘ ´ ‚ , „ Âģ ÂĢ +「 」 『 』 īŧˆ īŧ‰ 〔 〕 【 】 《 》 〈 〉 """ _PUNCT = r""" â€Ļ , : ; \! \? Âŋ ÂĄ \( \) \[ \] \{ \} < > _ # \* & +。īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž """ From 188b439b25dbe020977761cc719efaf452e79423 Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 19 Sep 2017 16:58:42 +0800 Subject: [PATCH 130/195] Add Chinese punctuation Add Chinese punctuation. --- spacy/language_data/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index fe636fa4b..6229eff21 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -25,7 +25,7 @@ _QUOTES = r""" _PUNCT = r""" â€Ļ , : ; \! \? Âŋ ÂĄ \( \) \[ \] \{ \} < > _ # \* & -。īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž +。 īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž """ From 978b24ccd44a80f9ea2f8ae781e9b3a2164f68c4 Mon Sep 17 00:00:00 2001 From: Yam Date: Wed, 20 Sep 2017 23:02:22 +0800 Subject: [PATCH 131/195] Update punctuation.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Chinese, `~` and `——` is hyphens, `¡` is intermittent symbol --- spacy/language_data/punctuation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 6229eff21..58ec73f2d 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -25,12 +25,12 @@ _QUOTES = r""" _PUNCT = r""" â€Ļ , : ; \! \? Âŋ ÂĄ \( \) \[ \] \{ \} < > _ # \* & -。 īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž +。 īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž ¡ """ _HYPHENS = r""" -- – — -- --- +- – — -- --- —— ~ """ From 44291f6697e3707c8730153c78cc547fc2e8f9e4 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 20 Sep 2017 23:26:34 +0700 Subject: [PATCH 132/195] add thai --- spacy/__init__.py | 5 +- spacy/th/__init__.py | 30 ++++++++++++ spacy/th/language_data.py | 25 ++++++++++ spacy/th/stop_words.py | 62 ++++++++++++++++++++++++ spacy/th/tag_map.py | 81 ++++++++++++++++++++++++++++++++ spacy/th/tokenizer_exceptions.py | 80 +++++++++++++++++++++++++++++++ 6 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 spacy/th/__init__.py create mode 100644 spacy/th/language_data.py create mode 100644 spacy/th/stop_words.py create mode 100644 spacy/th/tag_map.py create mode 100644 spacy/th/tokenizer_exceptions.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 3afb38cfb..f0d5ea0fc 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,12 +7,13 @@ from .cli.info import info from .glossary import explain from .about import __version__ -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja +from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, - fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese) + fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese, + th.Thai) for _lang in _languages: diff --git a/spacy/th/__init__.py b/spacy/th/__init__.py new file mode 100644 index 000000000..0b6f8cf76 --- /dev/null +++ b/spacy/th/__init__.py @@ -0,0 +1,30 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .language_data import * +from ..language import Language, BaseDefaults +from ..attrs import LANG +from ..tokenizer import Tokenizer +from ..tokens import Doc +class ThaiDefaults(BaseDefaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'th' + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + tag_map = TAG_MAP + stop_words = set(STOP_WORDS) + + +class Thai(Language): + lang = 'th' + Defaults = ThaiDefaults + def make_doc(self, text): + try: + from pythainlp.tokenize import word_tokenize + except ImportError: + raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " + "https://github.com/wannaphongcom/pythainlp/") + words = [x for x in list(word_tokenize(text,"newmm"))] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + +__all__ = ['Thai'] \ No newline at end of file diff --git a/spacy/th/language_data.py b/spacy/th/language_data.py new file mode 100644 index 000000000..03800ba19 --- /dev/null +++ b/spacy/th/language_data.py @@ -0,0 +1,25 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# import base language data +from .. 
import language_data as base + + +# import util functions +from ..language_data import update_exc, strings_to_exc + + +# import language-specific data from files +#from .tag_map import TAG_MAP +from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +TAG_MAP = dict(TAG_MAP) +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + +# export __all__ = ["TAG_MAP", "STOP_WORDS"] +__all__ = ["TAG_MAP", "STOP_WORDS","TOKENIZER_EXCEPTIONS"] \ No newline at end of file diff --git a/spacy/th/stop_words.py b/spacy/th/stop_words.py new file mode 100644 index 000000000..e13dec984 --- /dev/null +++ b/spacy/th/stop_words.py @@ -0,0 +1,62 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt +# stop words as whitespace-separated list +STOP_WORDS = set(""" +ā¸™ā¸ĩāš‰ ā¸™āšā¸˛ ā¸™ā¸ąāš‰ā¸™ ā¸™ā¸ąā¸ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ ā¸—ā¸¸ā¸ ā¸—ā¸ĩāšˆā¸Ēā¸¸ā¸” ā¸—ā¸ĩāšˆ ā¸—āšā¸˛āšƒā¸Ģāš‰ ā¸—āšā¸˛ ā¸—ā¸˛ā¸‡ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ĩāš‰ ā¸”ā¸ąā¸‡ ā¸‹ā¸ļāšˆā¸‡ ā¸Šāšˆā¸§ā¸‡ ā¸ˆā¸˛ā¸ ā¸ˆā¸ąā¸” ā¸ˆā¸° ā¸„ā¸ˇā¸­ ā¸„ā¸§ā¸˛ā¸Ą ā¸„ā¸Ŗā¸ąāš‰ā¸‡ ā¸„ā¸‡ ā¸‚ā¸ļāš‰ā¸™ ā¸‚ā¸­ā¸‡ +ā¸‚ā¸­ ā¸Ŗā¸ąā¸š ā¸Ŗā¸°ā¸Ģā¸§āšˆā¸˛ā¸‡ ā¸Ŗā¸§ā¸Ą ā¸ĸā¸ąā¸‡ ā¸Ąā¸ĩ ā¸Ąā¸˛ā¸ ā¸Ąā¸˛ ā¸žā¸Ŗāš‰ā¸­ā¸Ą ā¸žā¸š ā¸œāšˆā¸˛ā¸™ ā¸œā¸Ĩ ā¸šā¸˛ā¸‡ ā¸™āšˆā¸˛ āš€ā¸›ā¸´ā¸”āš€ā¸œā¸ĸ āš€ā¸›ā¸´ā¸” āš€ā¸™ā¸ˇāšˆā¸­ā¸‡ā¸ˆā¸˛ā¸ āš€ā¸”ā¸ĩā¸ĸā¸§ā¸ā¸ąā¸™ āš€ā¸”ā¸ĩā¸ĸā¸§ āš€ā¸Šāšˆā¸™ āš€ā¸‰ā¸žā¸˛ā¸° āš€ā¸‚āš‰ā¸˛ ā¸–āš‰ā¸˛ +ā¸–ā¸šā¸ ā¸–ā¸ļā¸‡ ā¸•āš‰ā¸­ā¸‡ ā¸•āšˆā¸˛ā¸‡āš† ā¸•āšˆā¸˛ā¸‡ ā¸•āšˆā¸­ ā¸•ā¸˛ā¸Ą ā¸•ā¸ąāš‰ā¸‡āšā¸•āšˆ ā¸•ā¸ąāš‰ā¸‡ ā¸”āš‰ā¸˛ā¸™ ā¸”āš‰ā¸§ā¸ĸ ā¸­ā¸ĩā¸ ā¸­ā¸˛ā¸ˆ ā¸­ā¸­ā¸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ ā¸­ā¸°āš„ā¸Ŗ ā¸­ā¸ĸā¸šāšˆ ā¸­ā¸ĸā¸˛ā¸ ā¸Ģā¸˛ā¸ ā¸Ģā¸Ĩā¸˛ā¸ĸ ā¸Ģā¸Ĩā¸ąā¸‡ā¸ˆā¸˛ā¸ āšā¸•āšˆ āš€ā¸­ā¸‡ āš€ā¸Ģāš‡ā¸™ +āš€ā¸Ĩā¸ĸ āš€ā¸Ŗā¸´āšˆā¸Ą āš€ā¸Ŗā¸˛ āš€ā¸Ąā¸ˇāšˆā¸­ āš€ā¸žā¸ˇāšˆā¸­ āš€ā¸žā¸Ŗā¸˛ā¸° āš€ā¸›āš‡ā¸™ā¸ā¸˛ā¸Ŗ āš€ā¸›āš‡ā¸™ ā¸Ģā¸Ĩā¸ąā¸‡ ā¸Ģā¸Ŗā¸ˇā¸­ ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸Ēāšˆā¸§ā¸™ ā¸Ēāšˆā¸‡ ā¸Ēā¸¸ā¸” ā¸Ēāšā¸˛ā¸Ģā¸Ŗā¸ąā¸š ā¸§āšˆā¸˛ ā¸Ĩā¸‡ ā¸Ŗāšˆā¸§ā¸Ą ā¸Ŗā¸˛ā¸ĸ ā¸‚ā¸“ā¸° ā¸āšˆā¸­ā¸™ ā¸āš‡ ā¸ā¸˛ā¸Ŗ ā¸ā¸ąā¸š ā¸ā¸ąā¸™ +ā¸ā¸§āšˆā¸˛ ā¸ā¸Ĩāšˆā¸˛ā¸§ ā¸ˆā¸ļā¸‡ āš„ā¸§āš‰ āš„ā¸› āš„ā¸”āš‰ āšƒā¸Ģāš‰ āšƒā¸™ āš‚ā¸”ā¸ĸ āšā¸Ģāšˆā¸‡ āšā¸Ĩāš‰ā¸§ āšā¸Ĩā¸° āšā¸Ŗā¸ āšā¸šā¸š āš† ā¸—ā¸ąāš‰ā¸‡ ā¸§ā¸ąā¸™ āš€ā¸‚ā¸˛ āš€ā¸„ā¸ĸ āš„ā¸Ąāšˆ ā¸­ā¸ĸā¸˛ā¸ āš€ā¸ā¸´ā¸™ āš€ā¸ā¸´ā¸™āš† āš€ā¸ā¸ĩāšˆā¸ĸā¸§ā¸ā¸ąā¸™ āš€ā¸ā¸ĩāšˆā¸ĸā¸§ā¸ā¸ąā¸š +āš€ā¸ā¸ĩāšˆā¸ĸā¸§ā¸‚āš‰ā¸­ā¸‡ āš€ā¸ā¸ĩāšˆā¸ĸā¸§āš€ā¸™ā¸ˇāšˆā¸­ā¸‡ āš€ā¸ā¸ĩāšˆā¸ĸā¸§āš† āš€ā¸ā¸ˇā¸­ā¸š āš€ā¸ā¸ˇā¸­ā¸šā¸ˆā¸° āš€ā¸ā¸ˇā¸­ā¸šāš† āšā¸ āšā¸āšˆ āšā¸āš‰āš„ā¸‚ āšƒā¸ā¸Ĩāš‰ āšƒā¸ā¸Ĩāš‰āš† āš„ā¸ā¸Ĩ āš„ā¸ā¸Ĩāš† ā¸‚ā¸“ā¸°āš€ā¸”ā¸ĩā¸ĸā¸§ā¸ā¸ąā¸™ ā¸‚ā¸“ā¸°āšƒā¸” ā¸‚ā¸“ā¸°āšƒā¸”āš† ā¸‚ā¸“ā¸°ā¸—ā¸ĩāšˆ ā¸‚ā¸“ā¸°ā¸™ā¸ąāš‰ā¸™ ā¸‚ā¸“ā¸°ā¸™ā¸ĩāš‰ ā¸‚ā¸“ā¸°ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸‚ā¸§ā¸˛ā¸‡ +ā¸‚ā¸§ā¸˛ā¸‡āš† ā¸‚ā¸ąāš‰ā¸™ āšƒā¸„ā¸Ŗ āšƒā¸„ā¸Ŗāšˆ āšƒā¸„ā¸Ŗāšˆā¸ˆā¸° āšƒā¸„ā¸Ŗāš† ā¸‡āšˆā¸˛ā¸ĸ ā¸‡āšˆā¸˛ā¸ĸāš† āš„ā¸‡ ā¸ˆā¸‡ ā¸ˆā¸” ā¸ˆā¸™ ā¸ˆā¸™ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ ā¸ˆā¸™ā¸ā¸§āšˆā¸˛ ā¸ˆā¸™ā¸‚ā¸“ā¸°ā¸™ā¸ĩāš‰ ā¸ˆā¸™ā¸•ā¸Ĩā¸­ā¸” ā¸ˆā¸™ā¸–ā¸ļā¸‡ ā¸ˆā¸™ā¸—ā¸ąāšˆā¸§ ā¸ˆā¸™ā¸šā¸ąā¸”ā¸™ā¸ĩāš‰ ā¸ˆā¸™āš€ā¸Ąā¸ˇāšˆā¸­ ā¸ˆā¸™āšā¸Ąāš‰ ā¸ˆā¸™āšā¸Ąāš‰ā¸™ +ā¸ˆā¸Ŗā¸” ā¸ˆā¸Ŗā¸”ā¸ā¸ąā¸š ā¸ˆā¸Ŗā¸´ā¸‡ ā¸ˆā¸Ŗā¸´ā¸‡ā¸ˆā¸ąā¸‡ ā¸ˆā¸Ŗā¸´ā¸‡āš† ā¸ˆā¸Ŗā¸´ā¸‡āš†ā¸ˆā¸ąā¸‡āš† ā¸ˆā¸§ā¸™ ā¸ˆā¸§ā¸™ā¸ˆā¸° ā¸ˆā¸§ā¸™āš€ā¸ˆā¸ĩā¸ĸā¸™ ā¸ˆā¸§ā¸š ā¸‹ā¸ļāšˆā¸‡ā¸āš‡ ā¸‹ā¸ļāšˆā¸‡ā¸āš‡ā¸„ā¸ˇā¸­ ā¸‹ā¸ļāšˆā¸‡ā¸ā¸ąā¸™ ā¸‹ā¸ļāšˆā¸‡ā¸ā¸ąā¸™āšā¸Ĩā¸°ā¸ā¸ąā¸™ ā¸‹ā¸ļāšˆā¸‡āš„ā¸”āš‰āšā¸āšˆ 
ā¸‹ā¸ļāšˆā¸‡āš† ā¸“ ā¸”āš‰ā¸§ā¸ĸ ā¸”āš‰ā¸§ā¸ĸā¸ā¸ąā¸™ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Šāšˆā¸™ā¸ā¸ąā¸™ ā¸”āš‰ā¸§ā¸ĸā¸—ā¸ĩāšˆ ā¸”āš‰ā¸§ā¸ĸā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗā¸‰ā¸°ā¸™ā¸ĩāš‰ +ā¸”āš‰ā¸§ā¸ĸāš€ā¸žā¸Ŗā¸˛ā¸° ā¸”āš‰ā¸§ā¸ĸā¸§āšˆā¸˛ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸—ā¸ĩāšˆ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸™ā¸ąāš‰ā¸™ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸™ā¸ĩāš‰ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸āš€ā¸žā¸Ŗā¸˛ā¸° ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸§āšˆā¸˛ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸Ąā¸ˇā¸­ā¸™ā¸ā¸ąā¸™ ā¸”ā¸ąāšˆā¸‡ ā¸”ā¸ąā¸‡ā¸ā¸Ĩāšˆā¸˛ā¸§ ā¸”ā¸ąā¸‡ā¸ā¸ąā¸š ā¸”ā¸ąāšˆā¸‡ā¸ā¸ąā¸š ā¸”ā¸ąā¸‡ā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸”ā¸ąāšˆā¸‡ā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸”ā¸ąā¸‡āš€ā¸āšˆā¸˛ +ā¸”ā¸ąāšˆā¸‡āš€ā¸āšˆā¸˛ ā¸”ā¸ąā¸‡āš€ā¸„ā¸ĸ āšƒā¸”āš† āš„ā¸”āš‰ āš„ā¸”āš‰āšā¸āšˆ āš„ā¸”āš‰āšā¸•āšˆ āš„ā¸”āš‰ā¸—ā¸ĩāšˆ āš„ā¸”āš‰ā¸Ąā¸˛ āš„ā¸”āš‰ā¸Ŗā¸ąā¸š ā¸•ā¸™ ā¸•ā¸™āš€ā¸­ā¸‡ ā¸•ā¸™ā¸¯ ā¸•ā¸Ŗā¸‡ ā¸•ā¸Ŗā¸‡āš† ā¸•ā¸Ĩā¸­ā¸” ā¸•ā¸Ĩā¸­ā¸”ā¸ā¸˛ā¸Ĩ ā¸•ā¸Ĩā¸­ā¸”ā¸ā¸˛ā¸Ĩā¸™ā¸˛ā¸™ ā¸•ā¸Ĩā¸­ā¸”ā¸ˆā¸™ ā¸•ā¸Ĩā¸­ā¸”ā¸–ā¸ļā¸‡ ā¸•ā¸Ĩā¸­ā¸”ā¸—ā¸ąāš‰ā¸‡ +ā¸•ā¸Ĩā¸­ā¸”ā¸—ā¸ąāšˆā¸§ ā¸•ā¸Ĩā¸­ā¸”ā¸—ā¸ąāšˆā¸§ā¸–ā¸ļā¸‡ ā¸•ā¸Ĩā¸­ā¸”ā¸—ā¸ąāšˆā¸§ā¸—ā¸ąāš‰ā¸‡ ā¸•ā¸Ĩā¸­ā¸”ā¸›ā¸ĩ ā¸•ā¸Ĩā¸­ā¸”āš„ā¸› ā¸•ā¸Ĩā¸­ā¸”ā¸Ąā¸˛ ā¸•ā¸Ĩā¸­ā¸”ā¸Ŗā¸°ā¸ĸā¸°āš€ā¸§ā¸Ĩā¸˛ ā¸•ā¸Ĩā¸­ā¸”ā¸§ā¸ąā¸™ ā¸•ā¸Ĩā¸­ā¸”āš€ā¸§ā¸Ĩā¸˛ ā¸•ā¸Ĩā¸­ā¸”ā¸¨ā¸ ā¸•āšˆā¸­ ā¸•āšˆā¸­ā¸ā¸ąā¸™ ā¸–ā¸ļā¸‡āšā¸āšˆ ā¸–ā¸ļā¸‡ā¸ˆā¸° ā¸–ā¸ļā¸‡ā¸šā¸ąā¸”ā¸™ā¸ąāš‰ā¸™ ā¸–ā¸ļā¸‡ā¸šā¸ąā¸”ā¸™ā¸ĩāš‰ +ā¸–ā¸ļā¸‡āš€ā¸Ąā¸ˇāšˆā¸­ ā¸–ā¸ļā¸‡āš€ā¸Ąā¸ˇāšˆā¸­āšƒā¸” ā¸–ā¸ļā¸‡āš€ā¸Ąā¸ˇāšˆā¸­āš„ā¸Ŗ ā¸–ā¸ļā¸‡āšā¸Ąāš‰ ā¸–ā¸ļā¸‡āšā¸Ąāš‰ā¸ˆā¸° ā¸–ā¸ļā¸‡āšā¸Ąāš‰ā¸§āšˆā¸˛ ā¸–ā¸ļā¸‡ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗ ā¸–ā¸ˇā¸­ ā¸–ā¸ˇā¸­ā¸§āšˆā¸˛ ā¸–ā¸šā¸ā¸•āš‰ā¸­ā¸‡ ā¸–ā¸šā¸āš† āš€ā¸–ā¸­ā¸° āš€ā¸–ā¸´ā¸” ā¸—ā¸Ŗā¸‡ ā¸—ā¸§āšˆā¸˛ ā¸—ā¸ąāš‰ā¸‡ā¸„ā¸™ ā¸—ā¸ąāš‰ā¸‡ā¸•ā¸ąā¸§ ā¸—ā¸ąāš‰ā¸‡ā¸—ā¸ĩ ā¸—ā¸ąāš‰ā¸‡ā¸—ā¸ĩāšˆ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™ā¸”āš‰ā¸§ā¸ĸ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™āš€ā¸žā¸Ŗā¸˛ā¸° +ā¸™ā¸­ā¸ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸—ā¸ĩāšˆ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸™ā¸ąāš‰ā¸™ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸™ā¸ĩāš‰ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸§āšˆā¸˛ ā¸™ā¸­ā¸ā¸™ā¸ąāš‰ā¸™ ā¸™ā¸­ā¸āš€ā¸Ģā¸™ā¸ˇā¸­ ā¸™ā¸­ā¸āš€ā¸Ģā¸™ā¸ˇā¸­ā¸ˆā¸˛ā¸ ā¸™āš‰ā¸­ā¸ĸ ā¸™āš‰ā¸­ā¸ĸā¸ā¸§āšˆā¸˛ ā¸™āš‰ā¸­ā¸ĸāš† ā¸™ā¸° ā¸™āšˆā¸° ā¸™ā¸ąā¸āš† ā¸™ā¸ąāšˆā¸™ ā¸™ā¸ąāšˆā¸™āš„ā¸‡ ā¸™ā¸ąāšˆā¸™āš€ā¸›āš‡ā¸™ ā¸™ā¸ąāšˆā¸™āšā¸Ģā¸Ĩā¸° +ā¸™ā¸ąāšˆā¸™āš€ā¸­ā¸‡ ā¸™ā¸ąāš‰ā¸™āš† ā¸™ā¸ąā¸š ā¸™ā¸ąā¸šā¸ˆā¸˛ā¸ā¸™ā¸ąāš‰ā¸™ ā¸™ā¸ąā¸šā¸ˆā¸˛ā¸ā¸™ā¸ĩāš‰ ā¸™ā¸ąā¸šā¸•ā¸ąāš‰ā¸‡āšā¸•āšˆ ā¸™ā¸ąā¸šāšā¸•āšˆ ā¸™ā¸ąā¸šāšā¸•āšˆā¸—ā¸ĩāšˆ ā¸™ā¸ąā¸šāšā¸•āšˆā¸™ā¸ąāš‰ā¸™ āš€ā¸›āš‡ā¸™ā¸•āš‰ā¸™ āš€ā¸›āš‡ā¸™ā¸•āš‰ā¸™āš„ā¸› āš€ā¸›āš‡ā¸™ā¸•āš‰ā¸™ā¸Ąā¸˛ āš€ā¸›āš‡ā¸™āšā¸•āšˆ āš€ā¸›āš‡ā¸™āšā¸•āšˆāš€ā¸žā¸ĩā¸ĸā¸‡ āš€ā¸›āš‡ā¸™ā¸—ā¸ĩ āš€ā¸›āš‡ā¸™ā¸—ā¸ĩāšˆ āš€ā¸›āš‡ā¸™ā¸—ā¸ĩāšˆā¸Ēā¸¸ā¸” āš€ā¸›āš‡ā¸™āš€ā¸žā¸Ŗā¸˛ā¸° +āš€ā¸›āš‡ā¸™āš€ā¸žā¸Ŗā¸˛ā¸°ā¸§āšˆā¸˛ āš€ā¸›āš‡ā¸™āš€ā¸žā¸ĩā¸ĸā¸‡ āš€ā¸›āš‡ā¸™āš€ā¸žā¸ĩā¸ĸā¸‡ā¸§āšˆā¸˛ āš€ā¸›āš‡ā¸™āš€ā¸žā¸ˇāšˆā¸­ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ā¸Ąā¸˛ā¸ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ā¸§āšˆā¸˛ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™āš† āš€ā¸›āš‡ā¸™ā¸­ā¸˛ā¸—ā¸´ āš€ā¸›āš‡ā¸™āš† āš€ā¸›ā¸Ĩā¸ĩāšˆā¸ĸā¸™ āš€ā¸›ā¸Ĩā¸ĩāšˆā¸ĸā¸™āšā¸›ā¸Ĩā¸‡ āš€ā¸›ā¸´ā¸” āš€ā¸›ā¸´ā¸”āš€ā¸œā¸ĸ āš„ā¸›āšˆ ā¸œāšˆā¸˛ā¸™ ā¸œāšˆā¸˛ā¸™āš† +ā¸œā¸´ā¸” ā¸œā¸´ā¸”āš† ā¸œā¸šāš‰ āš€ā¸žā¸ĩā¸ĸā¸‡āš€ā¸žā¸ˇāšˆā¸­ āš€ā¸žā¸ĩā¸ĸā¸‡āš„ā¸Ŗ āš€ā¸žā¸ĩā¸ĸā¸‡āš„ā¸Ģā¸™ āš€ā¸žā¸ˇāšˆā¸­ā¸—ā¸ĩāšˆ āš€ā¸žā¸ˇāšˆā¸­ā¸—ā¸ĩāšˆā¸ˆā¸° āš€ā¸žā¸ˇāšˆā¸­ā¸§āšˆā¸˛ āš€ā¸žā¸ˇāšˆā¸­āšƒā¸Ģāš‰ ā¸ ā¸˛ā¸„ ā¸ ā¸˛ā¸„ā¸¯ ā¸ ā¸˛ā¸ĸ ā¸ ā¸˛ā¸ĸāšƒā¸•āš‰ ā¸ ā¸˛ā¸ĸā¸™ā¸­ā¸ ā¸ ā¸˛ā¸ĸāšƒā¸™ ā¸ ā¸˛ā¸ĸā¸ ā¸˛ā¸„ ā¸ ā¸˛ā¸ĸā¸ ā¸˛ā¸„ā¸Ģā¸™āš‰ā¸˛ ā¸ ā¸˛ā¸ĸā¸Ģā¸™āš‰ā¸˛ ā¸ ā¸˛ā¸ĸā¸Ģā¸Ĩā¸ąā¸‡ +ā¸Ąā¸­ā¸‡ ā¸Ąā¸­ā¸‡ā¸§āšˆā¸˛ ā¸Ąā¸ąā¸ ā¸Ąā¸ąā¸ā¸ˆā¸° ā¸Ąā¸ąā¸™ ā¸Ąā¸ąā¸™āš† ā¸Ąā¸ąāš‰ā¸ĸ ā¸Ąā¸ąāš‰ā¸ĸā¸™ā¸° 
ā¸Ąā¸ąāš‰ā¸ĸā¸™ā¸ąāšˆā¸™ ā¸Ąā¸ąāš‰ā¸ĸāš€ā¸™ā¸ĩāšˆā¸ĸ ā¸Ąā¸ąāš‰ā¸ĸā¸Ĩāšˆā¸° ā¸ĸā¸ˇā¸™ā¸™ā¸˛ā¸™ ā¸ĸā¸ˇā¸™ā¸ĸā¸‡ ā¸ĸā¸ˇā¸™ā¸ĸā¸ąā¸™ ā¸ĸā¸ˇā¸™ā¸ĸā¸˛ā¸§ āš€ā¸ĸā¸­ā¸° āš€ā¸ĸā¸­ā¸°āšā¸ĸā¸° āš€ā¸ĸā¸­ā¸°āš† āšā¸ĸā¸° āšā¸ĸā¸°āš† ā¸Ŗā¸§ā¸” ā¸Ŗā¸§ā¸”āš€ā¸Ŗāš‡ā¸§ ā¸Ŗāšˆā¸§ā¸Ą ā¸Ŗā¸§ā¸Ąā¸ā¸ąā¸™ ā¸Ŗāšˆā¸§ā¸Ąā¸ā¸ąā¸™ +ā¸Ŗā¸§ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸Ŗāšˆā¸§ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸Ŗā¸§ā¸Ąā¸–ā¸ļā¸‡ ā¸Ŗā¸§ā¸Ąā¸—ā¸ąāš‰ā¸‡ ā¸Ŗāšˆā¸§ā¸Ąā¸Ąā¸ˇā¸­ ā¸Ŗā¸§ā¸Ąāš† ā¸Ŗā¸°ā¸ĸā¸° ā¸Ŗā¸°ā¸ĸā¸°āš† ā¸Ŗā¸°ā¸Ģā¸§āšˆā¸˛ā¸‡ ā¸Ŗā¸ąā¸šā¸Ŗā¸­ā¸‡ ā¸Ŗā¸ļ ā¸Ŗā¸ļā¸§āšˆā¸˛ ā¸Ŗā¸ˇā¸­ ā¸Ŗā¸ˇā¸­ā¸§āšˆā¸˛ ā¸Ēā¸´āš‰ā¸™ā¸ā¸˛ā¸Ĩā¸™ā¸˛ā¸™ ā¸Ēā¸ˇā¸šāš€ā¸™ā¸ˇāšˆā¸­ā¸‡ ā¸Ēā¸¸ā¸”āš† ā¸Ēā¸šāšˆ ā¸Ēā¸šā¸‡ ā¸Ēā¸šā¸‡ā¸ā¸§āšˆā¸˛ ā¸Ēā¸šā¸‡ā¸Ēāšˆā¸‡ ā¸Ēā¸šā¸‡ā¸Ēā¸¸ā¸” ā¸Ēā¸šā¸‡āš† āš€ā¸Ēā¸Ąā¸ˇā¸­ā¸™ā¸ā¸ąā¸š +āš€ā¸Ēā¸Ąā¸ˇā¸­ā¸™ā¸§āšˆā¸˛ āš€ā¸Ēā¸Ŗāš‡ā¸ˆ āš€ā¸Ēā¸Ŗāš‡ā¸ˆā¸ā¸ąā¸™ āš€ā¸Ēā¸Ŗāš‡ā¸ˆāšā¸Ĩāš‰ā¸§ āš€ā¸Ēā¸Ŗāš‡ā¸ˆā¸Ēā¸Ąā¸šā¸šā¸Ŗā¸“āšŒ āš€ā¸Ēā¸Ŗāš‡ā¸ˆā¸Ēā¸´āš‰ā¸™ āš€ā¸Ēā¸ĩā¸ĸ āš€ā¸Ēā¸ĩā¸ĸā¸āšˆā¸­ā¸™ āš€ā¸Ēā¸ĩā¸ĸā¸ˆā¸™ āš€ā¸Ēā¸ĩā¸ĸā¸ˆā¸™ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ āš€ā¸Ēā¸ĩā¸ĸā¸ˆā¸™ā¸–ā¸ļā¸‡ āš€ā¸Ēā¸ĩā¸ĸā¸”āš‰ā¸§ā¸ĸ āš€ā¸Ēā¸ĩā¸ĸā¸™ā¸ąāšˆā¸™ āš€ā¸Ēā¸ĩā¸ĸā¸™ā¸ąāšˆā¸™āš€ā¸­ā¸‡ āš€ā¸Ēā¸ĩā¸ĸā¸™ā¸ĩāšˆ āš€ā¸Ēā¸ĩā¸ĸā¸™ā¸ĩāšˆā¸ā¸Ŗā¸°āš„ā¸Ŗ āš€ā¸Ēā¸ĩā¸ĸā¸ĸā¸´āšˆā¸‡ +āš€ā¸Ēā¸ĩā¸ĸā¸ĸā¸´āšˆā¸‡ā¸™ā¸ąā¸ āš€ā¸Ēā¸ĩā¸ĸāšā¸Ĩāš‰ā¸§ āšƒā¸Ģā¸āšˆāš† āšƒā¸Ģāš‰ā¸”ā¸ĩ āšƒā¸Ģāš‰āšā¸”āšˆ āšƒā¸Ģāš‰āš„ā¸› āšƒā¸Ģā¸Ąāšˆ āšƒā¸Ģāš‰ā¸Ąā¸˛ āšƒā¸Ģā¸Ąāšˆāš† āš„ā¸Ģā¸™ āš„ā¸Ģā¸™āš† ā¸­ā¸”ā¸ĩā¸• ā¸­ā¸™ā¸ļāšˆā¸‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš€ā¸Šāšˆā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸”ā¸ĩ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš€ā¸”ā¸ĩā¸ĸā¸§ ā¸­ā¸ĸāšˆā¸˛ā¸‡āšƒā¸” ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸—ā¸ĩāšˆ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™āš‰ā¸­ā¸ĸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™ā¸ąāš‰ā¸™ +ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™ā¸ĩāš‰ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš‚ā¸™āš‰ā¸™ ā¸āš‡ā¸„ā¸ˇā¸­ ā¸āš‡āšā¸„āšˆ ā¸āš‡ā¸ˆā¸° ā¸āš‡ā¸”ā¸ĩ ā¸āš‡āš„ā¸”āš‰ ā¸āš‡ā¸•āšˆā¸­āš€ā¸Ąā¸ˇāšˆā¸­ ā¸āš‡ā¸•ā¸˛ā¸Ą ā¸āš‡ā¸•ā¸˛ā¸Ąāšā¸•āšˆ ā¸āš‡ā¸•ā¸˛ā¸Ąā¸—ā¸ĩ ā¸āš‡āšā¸Ĩāš‰ā¸§āšā¸•āšˆ ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ ā¸ā¸Ŗā¸°ā¸—ā¸ŗ ā¸ā¸Ŗā¸°ā¸™ā¸ąāš‰ā¸™ ā¸ā¸Ŗā¸°ā¸œā¸Ą ā¸ā¸Ĩā¸ąā¸š ā¸ā¸Ĩāšˆā¸˛ā¸§ā¸„ā¸ˇā¸­ ā¸ā¸Ĩā¸¸āšˆā¸Ą ā¸ā¸Ĩā¸¸āšˆā¸Ąā¸āš‰ā¸­ā¸™ +ā¸ā¸Ĩā¸¸āšˆā¸Ąāš† ā¸ā¸§āš‰ā¸˛ā¸‡ ā¸ā¸§āš‰ā¸˛ā¸‡ā¸‚ā¸§ā¸˛ā¸‡ ā¸ā¸§āš‰ā¸˛ā¸‡āš† ā¸āšˆā¸­ā¸™ā¸Ģā¸™āš‰ā¸˛ ā¸āšˆā¸­ā¸™ā¸Ģā¸™āš‰ā¸˛ā¸™ā¸ĩāš‰ ā¸āšˆā¸­ā¸™āš† ā¸ā¸ąā¸™ā¸”ā¸ĩā¸ā¸§āšˆā¸˛ ā¸ā¸ąā¸™ā¸”ā¸ĩāš„ā¸Ģā¸Ą ā¸ā¸ąā¸™āš€ā¸–ā¸­ā¸° ā¸ā¸ąā¸™ā¸™ā¸° ā¸ā¸ąā¸™āšā¸Ĩā¸°ā¸ā¸ąā¸™ ā¸ā¸ąā¸™āš„ā¸Ģā¸Ą ā¸ā¸ąā¸™āš€ā¸­ā¸‡ ā¸ā¸ŗā¸Ĩā¸ąā¸‡ ā¸ā¸ŗā¸Ĩā¸ąā¸‡ā¸ˆā¸° ā¸ā¸ŗā¸Ģā¸™ā¸” ā¸ā¸š āš€ā¸āš‡ā¸š +āš€ā¸ā¸´ā¸” āš€ā¸ā¸ĩāšˆā¸ĸā¸§ā¸‚āš‰ā¸­ā¸‡ āšā¸āšˆ āšā¸āš‰āš„ā¸‚ āšƒā¸ā¸Ĩāš‰ āšƒā¸ā¸Ĩāš‰āš† ā¸‚āš‰ā¸˛ ā¸‚āš‰ā¸˛ā¸‡ ā¸‚āš‰ā¸˛ā¸‡āš€ā¸„ā¸ĩā¸ĸā¸‡ ā¸‚āš‰ā¸˛ā¸‡ā¸•āš‰ā¸™ ā¸‚āš‰ā¸˛ā¸‡ā¸šā¸™ ā¸‚āš‰ā¸˛ā¸‡ā¸Ĩāšˆā¸˛ā¸‡ ā¸‚āš‰ā¸˛ā¸‡āš† ā¸‚ā¸˛ā¸” ā¸‚āš‰ā¸˛ā¸žāš€ā¸ˆāš‰ā¸˛ ā¸‚āš‰ā¸˛ā¸¯ āš€ā¸‚āš‰ā¸˛āšƒā¸ˆ āš€ā¸‚ā¸ĩā¸ĸā¸™ ā¸„ā¸‡ā¸ˆā¸° ā¸„ā¸‡ā¸­ā¸ĸā¸šāšˆ ā¸„ā¸Ŗā¸š ā¸„ā¸Ŗā¸šā¸„ā¸Ŗā¸ąā¸™ ā¸„ā¸Ŗā¸šā¸–āš‰ā¸§ā¸™ +ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸ā¸Ŗā¸°ā¸™ā¸ąāš‰ā¸™ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸āšˆā¸­ā¸™ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸„ā¸Ŗā¸˛ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸„ā¸Ŗā¸˛ā¸§ ā¸„ā¸Ŗā¸ąāš‰ā¸‡āšƒā¸” ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸—ā¸ĩāšˆ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸™ā¸ĩāš‰ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸Ĩā¸° ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸Ģā¸Ĩā¸ąā¸‡ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸Ģā¸Ĩā¸ąā¸‡ā¸Ēā¸¸ā¸” ā¸„ā¸Ŗā¸ąāš‰ā¸‡āš„ā¸Ģā¸™ ā¸„ā¸Ŗā¸ąāš‰ā¸‡āš† ā¸„ā¸Ŗā¸ąā¸™ ā¸„ā¸Ŗā¸ąā¸š ā¸„ā¸Ŗā¸˛ ā¸„ā¸Ŗā¸˛āšƒā¸” ā¸„ā¸Ŗā¸˛ā¸—ā¸ĩāšˆ ā¸„ā¸Ŗā¸˛ā¸™ā¸ąāš‰ā¸™ ā¸„ā¸Ŗā¸˛ā¸™ā¸ĩāš‰ ā¸„ā¸Ŗā¸˛ā¸Ģā¸™ā¸ļāšˆā¸‡ +ā¸„ā¸Ŗā¸˛āš„ā¸Ģā¸™ ā¸„ā¸Ŗā¸˛ā¸§ ā¸„ā¸Ŗā¸˛ā¸§ā¸āšˆā¸­ā¸™ ā¸„ā¸Ŗā¸˛ā¸§āšƒā¸” ā¸„ā¸Ŗā¸˛ā¸§ā¸—ā¸ĩāšˆ ā¸„ā¸Ŗā¸˛ā¸§ā¸™ā¸ąāš‰ā¸™ ā¸„ā¸Ŗā¸˛ā¸§ā¸™ā¸ĩāš‰ ā¸„ā¸Ŗā¸˛ā¸§āš‚ā¸™āš‰ā¸™ ā¸„ā¸Ŗā¸˛ā¸§ā¸Ĩā¸° 
ā¸„ā¸Ŗā¸˛ā¸§ā¸Ģā¸™āš‰ā¸˛ ā¸„ā¸Ŗā¸˛ā¸§ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸„ā¸Ŗā¸˛ā¸§ā¸Ģā¸Ĩā¸ąā¸‡ ā¸„ā¸Ŗā¸˛ā¸§āš„ā¸Ģā¸™ ā¸„ā¸Ŗā¸˛ā¸§āš† ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸ ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸ā¸ąā¸™ ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸ā¸ąā¸™ā¸ā¸ąā¸š +ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸ā¸ąā¸š ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸§āšˆā¸˛ ā¸„ā¸§ā¸Ŗ ā¸„āšˆā¸­ā¸™ ā¸„āšˆā¸­ā¸™ā¸‚āš‰ā¸˛ā¸‡ ā¸„āšˆā¸­ā¸™ā¸‚āš‰ā¸˛ā¸‡ā¸ˆā¸° ā¸„āšˆā¸­ā¸ĸāš„ā¸›ā¸—ā¸˛ā¸‡ ā¸„āšˆā¸­ā¸™ā¸Ąā¸˛ā¸—ā¸˛ā¸‡ ā¸„āšˆā¸­ā¸ĸ ā¸„āšˆā¸­ā¸ĸāš† ā¸„ā¸° ā¸„āšˆā¸° ā¸„ā¸ŗ ā¸„ā¸´ā¸” ā¸„ā¸´ā¸”ā¸§āšˆā¸˛ ā¸„ā¸¸ā¸“ ā¸„ā¸¸ā¸“āš† +āš€ā¸„ā¸ĸāš† āšā¸„āšˆ āšā¸„āšˆā¸ˆā¸° āšā¸„āšˆā¸™ā¸ąāš‰ā¸™ āšā¸„āšˆā¸™ā¸ĩāš‰ āšā¸„āšˆāš€ā¸žā¸ĩā¸ĸā¸‡ āšā¸„āšˆā¸§āšˆā¸˛ āšā¸„āšˆāš„ā¸Ģā¸™ āšƒā¸„ā¸Ŗāšˆ āšƒā¸„ā¸Ŗāšˆā¸ˆā¸° ā¸‡āšˆā¸˛ā¸ĸ ā¸‡āšˆā¸˛ā¸ĸāš† ā¸ˆā¸™ā¸ā¸§āšˆā¸˛ ā¸ˆā¸™āšā¸Ąāš‰ ā¸ˆā¸™āšā¸Ąāš‰ā¸™ ā¸ˆā¸ąā¸‡āš† ā¸ˆā¸§ā¸šā¸ā¸ąā¸š ā¸ˆā¸§ā¸šā¸ˆā¸™ ā¸ˆāš‰ā¸° ā¸ˆāšŠā¸° ā¸ˆā¸°āš„ā¸”āš‰ ā¸ˆā¸ąā¸‡ ā¸ˆā¸ąā¸”ā¸ā¸˛ā¸Ŗ ā¸ˆā¸ąā¸”ā¸‡ā¸˛ā¸™ ā¸ˆā¸ąā¸”āšā¸ˆā¸‡ +ā¸ˆā¸ąā¸”ā¸•ā¸ąāš‰ā¸‡ ā¸ˆā¸ąā¸”ā¸—ā¸ŗ ā¸ˆā¸ąā¸”ā¸Ģā¸˛ ā¸ˆā¸ąā¸”āšƒā¸Ģāš‰ ā¸ˆā¸ąā¸š ā¸ˆāš‰ā¸˛ ā¸ˆāš‹ā¸˛ ā¸ˆā¸˛ā¸ā¸™ā¸ąāš‰ā¸™ ā¸ˆā¸˛ā¸ā¸™ā¸ĩāš‰ ā¸ˆā¸˛ā¸ā¸™ā¸ĩāš‰āš„ā¸› ā¸ˆā¸ŗ ā¸ˆā¸ŗāš€ā¸›āš‡ā¸™ ā¸ˆā¸ŗā¸žā¸§ā¸ ā¸ˆā¸ļā¸‡ā¸ˆā¸° ā¸ˆā¸ļā¸‡āš€ā¸›āš‡ā¸™ ā¸ˆā¸šāšˆāš† ā¸‰ā¸°ā¸™ā¸ąāš‰ā¸™ ā¸‰ā¸°ā¸™ā¸ĩāš‰ ā¸‰ā¸ąā¸™ āš€ā¸‰ā¸āš€ā¸Šāšˆā¸™ āš€ā¸‰ā¸ĸ āš€ā¸‰ā¸ĸāš† āš„ā¸‰ā¸™ ā¸Šāšˆā¸§ā¸‡ā¸āšˆā¸­ā¸™ +ā¸Šāšˆā¸§ā¸‡ā¸•āšˆā¸­āš„ā¸› ā¸Šāšˆā¸§ā¸‡ā¸–ā¸ąā¸”āš„ā¸› ā¸Šāšˆā¸§ā¸‡ā¸—āš‰ā¸˛ā¸ĸ ā¸Šāšˆā¸§ā¸‡ā¸—ā¸ĩāšˆ ā¸Šāšˆā¸§ā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸Šāšˆā¸§ā¸‡ā¸™ā¸ĩāš‰ ā¸Šāšˆā¸§ā¸‡ā¸Ŗā¸°ā¸Ģā¸§āšˆā¸˛ā¸‡ ā¸Šāšˆā¸§ā¸‡āšā¸Ŗā¸ ā¸Šāšˆā¸§ā¸‡ā¸Ģā¸™āš‰ā¸˛ ā¸Šāšˆā¸§ā¸‡ā¸Ģā¸Ĩā¸ąā¸‡ ā¸Šāšˆā¸§ā¸‡āš† ā¸Šāšˆā¸§ā¸ĸ ā¸Šāš‰ā¸˛ ā¸Šāš‰ā¸˛ā¸™ā¸˛ā¸™ ā¸Šā¸˛ā¸§ ā¸Šāš‰ā¸˛āš† āš€ā¸Šāšˆā¸™ā¸āšˆā¸­ā¸™ āš€ā¸Šāšˆā¸™ā¸ā¸ąā¸™ āš€ā¸Šāšˆā¸™āš€ā¸„ā¸ĸ +āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡ āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡ā¸āšˆā¸­ā¸™ āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡āš€ā¸āšˆā¸˛ āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡ā¸—ā¸ĩāšˆ āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡ā¸§āšˆā¸˛ āš€ā¸Šāšˆā¸™āš€ā¸”ā¸ĩā¸ĸā¸§ā¸ā¸ąā¸™ āš€ā¸Šāšˆā¸™āš€ā¸”ā¸ĩā¸ĸā¸§ā¸ā¸ąā¸š āš€ā¸Šāšˆā¸™āšƒā¸” āš€ā¸Šāšˆā¸™ā¸—ā¸ĩāšˆ āš€ā¸Šāšˆā¸™ā¸—ā¸ĩāšˆāš€ā¸„ā¸ĸ āš€ā¸Šāšˆā¸™ā¸—ā¸ĩāšˆā¸§āšˆā¸˛ āš€ā¸Šāšˆā¸™ā¸™ā¸ąāš‰ā¸™ āš€ā¸Šāšˆā¸™ā¸™ā¸ąāš‰ā¸™āš€ā¸­ā¸‡ āš€ā¸Šāšˆā¸™ā¸™ā¸ĩāš‰ āš€ā¸Šāšˆā¸™āš€ā¸Ąā¸ˇāšˆā¸­ āš€ā¸Šāšˆā¸™āš„ā¸Ŗ āš€ā¸Šā¸ˇāšˆā¸­ +āš€ā¸Šā¸ˇāšˆā¸­ā¸–ā¸ˇā¸­ āš€ā¸Šā¸ˇāšˆā¸­ā¸Ąā¸ąāšˆā¸™ āš€ā¸Šā¸ˇāšˆā¸­ā¸§āšˆā¸˛ āšƒā¸Šāšˆ āšƒā¸Šāšˆāš„ā¸Ģā¸Ą āšƒā¸Šāš‰ ā¸‹ā¸° ā¸‹ā¸°ā¸āšˆā¸­ā¸™ ā¸‹ā¸°ā¸ˆā¸™ ā¸‹ā¸°ā¸ˆā¸™ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ ā¸‹ā¸°ā¸ˆā¸™ā¸–ā¸ļā¸‡ ā¸‹ā¸ļāšˆā¸‡āš„ā¸”āš‰āšā¸āšˆ ā¸”āš‰ā¸§ā¸ĸā¸ā¸ąā¸™ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Šāšˆā¸™ā¸ā¸ąā¸™ ā¸”āš‰ā¸§ā¸ĸā¸—ā¸ĩāšˆ ā¸”āš‰ā¸§ā¸ĸāš€ā¸žā¸Ŗā¸˛ā¸° ā¸”āš‰ā¸§ā¸ĸā¸§āšˆā¸˛ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸—ā¸ĩāšˆ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸™ā¸ąāš‰ā¸™ +ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸™ā¸ĩāš‰ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸āš€ā¸žā¸Ŗā¸˛ā¸° ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸§āšˆā¸˛ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸Ąā¸ˇā¸­ā¸™ā¸ā¸ąā¸™ ā¸”ā¸ąā¸‡ā¸ā¸Ĩāšˆā¸˛ā¸§ ā¸”ā¸ąā¸‡ā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸”ā¸ąāšˆā¸‡ā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸”ā¸ąā¸‡āš€ā¸āšˆā¸˛ ā¸”ā¸ąāšˆā¸‡āš€ā¸āšˆā¸˛ ā¸”ā¸ąāšˆā¸‡āš€ā¸„ā¸ĸ ā¸•āšˆā¸˛ā¸‡ā¸āš‡ ā¸•āšˆā¸˛ā¸‡ā¸Ģā¸˛ā¸ ā¸•ā¸˛ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸•ā¸˛ā¸Ąāšā¸•āšˆ ā¸•ā¸˛ā¸Ąā¸—ā¸ĩāšˆ +ā¸•ā¸˛ā¸Ąāš† āš€ā¸•āš‡ā¸Ąāš„ā¸›ā¸”āš‰ā¸§ā¸ĸ āš€ā¸•āš‡ā¸Ąāš„ā¸›ā¸Ģā¸Ąā¸” āš€ā¸•āš‡ā¸Ąāš† āšā¸•āšˆā¸āš‡ āšā¸•āšˆā¸āšˆā¸­ā¸™ āšā¸•āšˆā¸ˆā¸° āšā¸•āšˆāš€ā¸”ā¸´ā¸Ą āšā¸•āšˆā¸•āš‰ā¸­ā¸‡ āšā¸•āšˆā¸–āš‰ā¸˛ āšā¸•āšˆā¸—ā¸§āšˆā¸˛ āšā¸•āšˆā¸—ā¸ĩāšˆ āšā¸•āšˆā¸™ā¸ąāš‰ā¸™ āšā¸•āšˆāš€ā¸žā¸ĩā¸ĸā¸‡ āšā¸•āšˆāš€ā¸Ąā¸ˇāšˆā¸­ āšā¸•āšˆāš„ā¸Ŗ āšā¸•āšˆā¸Ĩā¸° āšā¸•āšˆā¸§āšˆā¸˛ āšā¸•āšˆāš„ā¸Ģā¸™ āšā¸•āšˆā¸­ā¸ĸāšˆā¸˛ā¸‡āšƒā¸” āš‚ā¸• +āš‚ā¸•āš† āšƒā¸•āš‰ ā¸–āš‰ā¸˛ā¸ˆā¸° ā¸–āš‰ā¸˛ā¸Ģā¸˛ā¸ ā¸–ā¸ļā¸‡āšā¸āšˆ 
ā¸–ā¸ļā¸‡āšā¸Ąāš‰ ā¸–ā¸ļā¸‡āšā¸Ąāš‰ā¸ˆā¸° ā¸–ā¸ļā¸‡āšā¸Ąāš‰ā¸§āšˆā¸˛ ā¸–ā¸ļā¸‡ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗ ā¸–ā¸ˇā¸­ā¸§āšˆā¸˛ ā¸–ā¸šā¸ā¸•āš‰ā¸­ā¸‡ ā¸—ā¸§āšˆā¸˛ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™ā¸”āš‰ā¸§ā¸ĸ ā¸—ā¸ąāš‰ā¸‡ā¸›ā¸§ā¸‡ ā¸—ā¸ąāš‰ā¸‡āš€ā¸›āš‡ā¸™ ā¸—ā¸ąāš‰ā¸‡ā¸Ąā¸§ā¸Ĩ ā¸—ā¸ąāš‰ā¸‡ā¸Ēā¸´āš‰ā¸™ ā¸—ā¸ąāš‰ā¸‡ā¸Ģā¸Ąā¸” ā¸—ā¸ąāš‰ā¸‡ā¸Ģā¸Ĩā¸˛ā¸ĸ ā¸—ā¸ąāš‰ā¸‡āš† ā¸—ā¸ąā¸™ +ā¸—ā¸ąā¸™āšƒā¸”ā¸™ā¸ąāš‰ā¸™ ā¸—ā¸ąā¸™ā¸—ā¸ĩ ā¸—ā¸ąā¸™ā¸—ā¸ĩā¸—ā¸ąā¸™āšƒā¸” ā¸—ā¸ąāšˆā¸§ ā¸—ā¸ŗāš„ā¸Ą ā¸—ā¸ŗāš„ā¸Ŗ ā¸—ā¸ŗāšƒā¸Ģāš‰ ā¸—ā¸ŗāš† ā¸—ā¸ĩ ā¸—ā¸ĩāšˆā¸ˆā¸Ŗā¸´ā¸‡ ā¸—ā¸ĩāšˆā¸‹ā¸ļāšˆā¸‡ ā¸—ā¸ĩāš€ā¸”ā¸ĩā¸ĸā¸§ ā¸—ā¸ĩāšƒā¸” ā¸—ā¸ĩāšˆāšƒā¸” ā¸—ā¸ĩāšˆāš„ā¸”āš‰ ā¸—ā¸ĩāš€ā¸–ā¸­ā¸° ā¸—ā¸ĩāšˆāšā¸—āš‰ ā¸—ā¸ĩāšˆāšā¸—āš‰ā¸ˆā¸Ŗā¸´ā¸‡ ā¸—ā¸ĩāšˆā¸™ā¸ąāš‰ā¸™ ā¸—ā¸ĩāšˆā¸™ā¸ĩāš‰ ā¸—ā¸ĩāš„ā¸Ŗ ā¸—ā¸ĩā¸Ĩā¸° ā¸—ā¸ĩāšˆā¸Ĩā¸° +ā¸—ā¸ĩāšˆāšā¸Ĩāš‰ā¸§ ā¸—ā¸ĩāšˆā¸§āšˆā¸˛ ā¸—ā¸ĩāšˆāšā¸Ģāšˆā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸—ā¸ĩāšˆāš„ā¸Ģā¸™ ā¸—ā¸ĩāš† ā¸—ā¸ĩāšˆāš† ā¸—ā¸¸ā¸ā¸„ā¸™ ā¸—ā¸¸ā¸ā¸„ā¸Ŗā¸ąāš‰ā¸‡ ā¸—ā¸¸ā¸ā¸„ā¸Ŗā¸˛ ā¸—ā¸¸ā¸ā¸„ā¸Ŗā¸˛ā¸§ ā¸—ā¸¸ā¸ā¸Šā¸´āš‰ā¸™ ā¸—ā¸¸ā¸ā¸•ā¸ąā¸§ ā¸—ā¸¸ā¸ā¸—ā¸˛ā¸‡ ā¸—ā¸¸ā¸ā¸—ā¸ĩ ā¸—ā¸¸ā¸ā¸—ā¸ĩāšˆ ā¸—ā¸¸ā¸āš€ā¸Ąā¸ˇāšˆā¸­ ā¸—ā¸¸ā¸ā¸§ā¸ąā¸™ ā¸—ā¸¸ā¸ā¸§ā¸ąā¸™ā¸™ā¸ĩāš‰ ā¸—ā¸¸ā¸ā¸Ēā¸´āšˆā¸‡ ā¸—ā¸¸ā¸ā¸Ģā¸™ ā¸—ā¸¸ā¸āšā¸Ģāšˆā¸‡ ā¸—ā¸¸ā¸ā¸­ā¸ĸāšˆā¸˛ā¸‡ +ā¸—ā¸¸ā¸ā¸­ā¸ąā¸™ ā¸—ā¸¸ā¸āš† āš€ā¸—āšˆā¸˛ āš€ā¸—āšˆā¸˛ā¸ā¸ąā¸™ āš€ā¸—āšˆā¸˛ā¸ā¸ąā¸š āš€ā¸—āšˆā¸˛āšƒā¸” āš€ā¸—āšˆā¸˛ā¸—ā¸ĩāšˆ āš€ā¸—āšˆā¸˛ā¸™ā¸ąāš‰ā¸™ āš€ā¸—āšˆā¸˛ā¸™ā¸ĩāš‰ āš€ā¸—āšˆā¸˛āš„ā¸Ŗ āš€ā¸—āšˆā¸˛āš„ā¸Ģā¸Ŗāšˆ āšā¸—āš‰ āšā¸—āš‰ā¸ˆā¸Ŗā¸´ā¸‡ āš€ā¸˜ā¸­ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸§āšˆā¸˛ ā¸™āš‰ā¸­ā¸ĸ ā¸™āš‰ā¸­ā¸ĸā¸ā¸§āšˆā¸˛ ā¸™āš‰ā¸­ā¸ĸāš† ā¸™āšˆā¸° ā¸™ā¸ąāš‰ā¸™āš„ā¸§ ā¸™ā¸ąā¸šāšā¸•āšˆā¸™ā¸ĩāš‰ ā¸™ā¸˛ā¸‡ +ā¸™ā¸˛ā¸‡ā¸Ēā¸˛ā¸§ ā¸™āšˆā¸˛ā¸ˆā¸° ā¸™ā¸˛ā¸™ ā¸™ā¸˛ā¸™āš† ā¸™ā¸˛ā¸ĸ ā¸™ā¸ŗ ā¸™ā¸ŗā¸žā¸˛ ā¸™ā¸ŗā¸Ąā¸˛ ā¸™ā¸´ā¸” ā¸™ā¸´ā¸”ā¸Ģā¸™āšˆā¸­ā¸ĸ ā¸™ā¸´ā¸”āš† ā¸™ā¸ĩāšˆ ā¸™ā¸ĩāšˆāš„ā¸‡ ā¸™ā¸ĩāšˆā¸™ā¸˛ ā¸™ā¸ĩāšˆāšā¸™āšˆā¸° ā¸™ā¸ĩāšˆāšā¸Ģā¸Ĩā¸° ā¸™ā¸ĩāš‰āšā¸Ģā¸Ĩāšˆ ā¸™ā¸ĩāšˆāš€ā¸­ā¸‡ ā¸™ā¸ĩāš‰āš€ā¸­ā¸‡ ā¸™ā¸šāšˆā¸™ ā¸™ā¸šāš‰ā¸™ āš€ā¸™āš‰ā¸™ āš€ā¸™ā¸ĩāšˆā¸ĸ +āš€ā¸™ā¸ĩāšˆā¸ĸāš€ā¸­ā¸‡ āšƒā¸™ā¸Šāšˆā¸§ā¸‡ āšƒā¸™ā¸—ā¸ĩāšˆ āšƒā¸™āš€ā¸Ąā¸ˇāšˆā¸­ āšƒā¸™ā¸Ŗā¸°ā¸Ģā¸§āšˆā¸˛ā¸‡ ā¸šā¸™ ā¸šā¸­ā¸ ā¸šā¸­ā¸āšā¸Ĩāš‰ā¸§ ā¸šā¸­ā¸ā¸§āšˆā¸˛ ā¸šāšˆā¸­ā¸ĸ ā¸šāšˆā¸­ā¸ĸā¸ā¸§āšˆā¸˛ ā¸šāšˆā¸­ā¸ĸā¸„ā¸Ŗā¸ąāš‰ā¸‡ ā¸šāšˆā¸­ā¸ĸāš† ā¸šā¸ąā¸”ā¸”ā¸Ĩ ā¸šā¸ąā¸”āš€ā¸”ā¸ĩāš‹ā¸ĸā¸§ā¸™ā¸ĩāš‰ ā¸šā¸ąā¸”ā¸™ā¸ąāš‰ā¸™ ā¸šā¸ąā¸”ā¸™ā¸ĩāš‰ ā¸šāš‰ā¸˛ā¸‡ ā¸šā¸˛ā¸‡ā¸ā¸§āšˆā¸˛ +ā¸šā¸˛ā¸‡ā¸‚ā¸“ā¸° ā¸šā¸˛ā¸‡ā¸„ā¸Ŗā¸ąāš‰ā¸‡ ā¸šā¸˛ā¸‡ā¸„ā¸Ŗā¸˛ ā¸šā¸˛ā¸‡ā¸„ā¸Ŗā¸˛ā¸§ ā¸šā¸˛ā¸‡ā¸—ā¸ĩ ā¸šā¸˛ā¸‡ā¸—ā¸ĩāšˆ ā¸šā¸˛ā¸‡āšā¸Ģāšˆā¸‡ ā¸šā¸˛ā¸‡āš† ā¸›ā¸ā¸´ā¸šā¸ąā¸•ā¸´ ā¸›ā¸Ŗā¸°ā¸ā¸­ā¸š ā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗ ā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗā¸‰ā¸°ā¸™ā¸ĩāš‰ ā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗāšƒā¸” ā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸›ā¸Ŗā¸°ā¸Ąā¸˛ā¸“ ā¸›ā¸Ŗā¸°ā¸Ēā¸š ā¸›ā¸Ŗā¸ąā¸š +ā¸›ā¸Ŗā¸˛ā¸ā¸ ā¸›ā¸Ŗā¸˛ā¸ā¸ā¸§āšˆā¸˛ ā¸›ā¸ąā¸ˆā¸ˆā¸¸ā¸šā¸ąā¸™ ā¸›ā¸´ā¸” āš€ā¸›āš‡ā¸™ā¸”āš‰ā¸§ā¸ĸ āš€ā¸›āš‡ā¸™ā¸”ā¸ąā¸‡ āš€ā¸›āš‡ā¸™ā¸•āš‰ā¸™ āš€ā¸›āš‡ā¸™āšā¸•āšˆ āš€ā¸›āš‡ā¸™āš€ā¸žā¸ˇāšˆā¸­ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ā¸Ąā¸˛ā¸ āš€ā¸›āš‡ā¸™ā¸­ā¸˛ā¸—ā¸´ ā¸œāšˆā¸˛ā¸™āš† ā¸œā¸šāš‰ ā¸œā¸šāš‰āšƒā¸” āš€ā¸œā¸ˇāšˆā¸­ āš€ā¸œā¸ˇāšˆā¸­ā¸ˆā¸° āš€ā¸œā¸ˇāšˆā¸­ā¸—ā¸ĩāšˆ āš€ā¸œā¸ˇāšˆā¸­ā¸§āšˆā¸˛ ā¸āšˆā¸˛ā¸ĸ +ā¸āšˆā¸˛ā¸ĸāšƒā¸” ā¸žā¸šā¸§āšˆā¸˛ ā¸žā¸ĸā¸˛ā¸ĸā¸˛ā¸Ą ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸ā¸ąā¸™ ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸ā¸ąā¸š ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸—ā¸ąāš‰ā¸‡ ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸—ā¸ĩāšˆ ā¸žā¸Ŗāš‰ā¸­ā¸Ąāš€ā¸žā¸ĩā¸ĸā¸‡ ā¸žā¸§ā¸ ā¸žā¸§ā¸ā¸ā¸ąā¸™ ā¸žā¸§ā¸ā¸ā¸š ā¸žā¸§ā¸āšā¸ ā¸žā¸§ā¸āš€ā¸‚ā¸˛ ā¸žā¸§ā¸ā¸„ā¸¸ā¸“ ā¸žā¸§ā¸ā¸‰ā¸ąā¸™ ā¸žā¸§ā¸ā¸—āšˆā¸˛ā¸™ +ā¸žā¸§ā¸ā¸—ā¸ĩāšˆ ā¸žā¸§ā¸āš€ā¸˜ā¸­ 
ā¸žā¸§ā¸ā¸™ā¸ąāš‰ā¸™ ā¸žā¸§ā¸ā¸™ā¸ĩāš‰ ā¸žā¸§ā¸ā¸™ā¸šāš‰ā¸™ ā¸žā¸§ā¸āš‚ā¸™āš‰ā¸™ ā¸žā¸§ā¸ā¸Ąā¸ąā¸™ ā¸žā¸§ā¸ā¸Ąā¸ļā¸‡ ā¸žā¸­ ā¸žā¸­ā¸ā¸ąā¸™ ā¸žā¸­ā¸„ā¸§ā¸Ŗ ā¸žā¸­ā¸ˆā¸° ā¸žā¸­ā¸”ā¸ĩ ā¸žā¸­ā¸•ā¸ąā¸§ ā¸žā¸­ā¸—ā¸ĩ ā¸žā¸­ā¸—ā¸ĩāšˆ ā¸žā¸­āš€ā¸žā¸ĩā¸ĸā¸‡ ā¸žā¸­āšā¸Ĩāš‰ā¸§ ā¸žā¸­ā¸Ēā¸Ą ā¸žā¸­ā¸Ēā¸Ąā¸„ā¸§ā¸Ŗ +ā¸žā¸­āš€ā¸Ģā¸Ąā¸˛ā¸° ā¸žā¸­āš† ā¸žā¸˛ ā¸žā¸ļā¸‡ ā¸žā¸ļāšˆā¸‡ ā¸žā¸ˇāš‰ā¸™āš† ā¸žā¸šā¸” āš€ā¸žā¸Ŗā¸˛ā¸°ā¸‰ā¸°ā¸™ā¸ąāš‰ā¸™ āš€ā¸žā¸Ŗā¸˛ā¸°ā¸§āšˆā¸˛ āš€ā¸žā¸´āšˆā¸‡ āš€ā¸žā¸´āšˆā¸‡ā¸ˆā¸° āš€ā¸žā¸´āšˆā¸Ą āš€ā¸žā¸´āšˆā¸Ąāš€ā¸•ā¸´ā¸Ą āš€ā¸žā¸ĩā¸ĸā¸‡ āš€ā¸žā¸ĩā¸ĸā¸‡āšā¸„āšˆ āš€ā¸žā¸ĩā¸ĸā¸‡āšƒā¸” āš€ā¸žā¸ĩā¸ĸā¸‡āšā¸•āšˆ āš€ā¸žā¸ĩā¸ĸā¸‡ā¸žā¸­ āš€ā¸žā¸ĩā¸ĸā¸‡āš€ā¸žā¸Ŗā¸˛ā¸° +āš€ā¸žā¸ˇāšˆā¸­ā¸§āšˆā¸˛ āš€ā¸žā¸ˇāšˆā¸­āšƒā¸Ģāš‰ ā¸ ā¸˛ā¸ĸāšƒā¸•āš‰ ā¸Ąā¸­ā¸‡ā¸§āšˆā¸˛ ā¸Ąā¸ąāšŠā¸ĸ ā¸Ąā¸˛ā¸ā¸ā¸§āšˆā¸˛ ā¸Ąā¸˛ā¸ā¸Ąā¸˛ā¸ĸ ā¸Ąā¸´ ā¸Ąā¸´ā¸‰ā¸°ā¸™ā¸ąāš‰ā¸™ ā¸Ąā¸´āšƒā¸Šāšˆ ā¸Ąā¸´āš„ā¸”āš‰ ā¸Ąā¸ĩāšā¸•āšˆ ā¸Ąā¸ļā¸‡ ā¸Ąā¸¸āšˆā¸‡ ā¸Ąā¸¸āšˆā¸‡āš€ā¸™āš‰ā¸™ ā¸Ąā¸¸āšˆā¸‡ā¸Ģā¸Ąā¸˛ā¸ĸ āš€ā¸Ąā¸ˇāšˆā¸­ā¸āšˆā¸­ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸ąāš‰ā¸‡ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸āšˆā¸­ā¸™ +āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸˛ā¸§ā¸āšˆā¸­ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸˛ā¸§ā¸—ā¸ĩāšˆ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸˛ā¸§ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸ˇā¸™ āš€ā¸Ąā¸ˇāšˆā¸­āš€ā¸Šāš‰ā¸˛ āš€ā¸Ąā¸ˇāšˆā¸­āšƒā¸” āš€ā¸Ąā¸ˇāšˆā¸­ā¸™ā¸ąāš‰ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­ā¸™ā¸ĩāš‰ āš€ā¸Ąā¸ˇāšˆā¸­āš€ā¸ĸāš‡ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­āš„ā¸Ŗ āš€ā¸Ąā¸ˇāšˆā¸­ā¸§ā¸ąā¸™ā¸§ā¸˛ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­ā¸§ā¸˛ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­āš„ā¸Ģā¸Ŗāšˆ āšā¸Ąāš‰ āšā¸Ąāš‰ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ āšā¸Ąāš‰āšā¸•āšˆ āšā¸Ąāš‰ā¸™ā¸§āšˆā¸˛ āšā¸Ąāš‰ā¸§āšˆā¸˛ +āš„ā¸Ąāšˆā¸„āšˆā¸­ā¸ĸ āš„ā¸Ąāšˆā¸„āšˆā¸­ā¸ĸā¸ˆā¸° āš„ā¸Ąāšˆā¸„āšˆā¸­ā¸ĸāš€ā¸›āš‡ā¸™ āš„ā¸Ąāšˆāšƒā¸Šāšˆ āš„ā¸Ąāšˆāš€ā¸›āš‡ā¸™āš„ā¸Ŗ āš„ā¸Ąāšˆā¸§āšˆā¸˛ ā¸ĸā¸ ā¸ĸā¸āšƒā¸Ģāš‰ ā¸ĸā¸­ā¸Ą ā¸ĸā¸­ā¸Ąā¸Ŗā¸ąā¸š ā¸ĸāšˆā¸­ā¸Ą ā¸ĸāšˆā¸­ā¸ĸ ā¸ĸā¸ąā¸‡ā¸„ā¸‡ ā¸ĸā¸ąā¸‡ā¸‡ā¸ąāš‰ā¸™ ā¸ĸā¸ąā¸‡ā¸‡ā¸ĩāš‰ ā¸ĸā¸ąā¸‡āš‚ā¸‡āš‰ā¸™ ā¸ĸā¸ąā¸‡āš„ā¸‡ ā¸ĸā¸ąā¸‡ā¸ˆā¸° ā¸ĸā¸ąā¸‡āšā¸•āšˆ ā¸ĸā¸˛ā¸ +ā¸ĸā¸˛ā¸§ ā¸ĸā¸˛ā¸§ā¸™ā¸˛ā¸™ ā¸ĸā¸´āšˆā¸‡ ā¸ĸā¸´āšˆā¸‡ā¸ā¸§āšˆā¸˛ ā¸ĸā¸´āšˆā¸‡ā¸‚ā¸ļāš‰ā¸™ ā¸ĸā¸´āšˆā¸‡ā¸‚ā¸ļāš‰ā¸™āš„ā¸› ā¸ĸā¸´āšˆā¸‡ā¸ˆā¸™ ā¸ĸā¸´āšˆā¸‡ā¸ˆā¸° ā¸ĸā¸´āšˆā¸‡ā¸™ā¸ąā¸ ā¸ĸā¸´āšˆā¸‡āš€ā¸Ąā¸ˇāšˆā¸­ ā¸ĸā¸´āšˆā¸‡āšā¸Ĩāš‰ā¸§ ā¸ĸā¸´āšˆā¸‡āšƒā¸Ģā¸āšˆ ā¸Ŗāšˆā¸§ā¸Ąā¸ā¸ąā¸™ ā¸Ŗā¸§ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸Ŗāšˆā¸§ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸Ŗā¸ˇā¸­ā¸§āšˆā¸˛ āš€ā¸Ŗāš‡ā¸§ āš€ā¸Ŗāš‡ā¸§āš† āš€ā¸Ŗā¸˛āš† āš€ā¸Ŗā¸ĩā¸ĸā¸ āš€ā¸Ŗā¸ĩā¸ĸā¸š āš€ā¸Ŗā¸ˇāšˆā¸­ā¸ĸ +āš€ā¸Ŗā¸ˇāšˆā¸­ā¸ĸāš† āš„ā¸Ŗ ā¸Ĩāš‰ā¸§ā¸™ ā¸Ĩāš‰ā¸§ā¸™ā¸ˆā¸™ ā¸Ĩāš‰ā¸§ā¸™āšā¸•āšˆ ā¸Ĩā¸° ā¸Ĩāšˆā¸˛ā¸Ēā¸¸ā¸” āš€ā¸Ĩāš‡ā¸ āš€ā¸Ĩāš‡ā¸ā¸™āš‰ā¸­ā¸ĸ āš€ā¸Ĩāš‡ā¸āš† āš€ā¸Ĩāšˆā¸˛ā¸§āšˆā¸˛ āšā¸Ĩāš‰ā¸§ā¸ā¸ąā¸™ āšā¸Ĩāš‰ā¸§āšā¸•āšˆ āšā¸Ĩāš‰ā¸§āš€ā¸Ēā¸Ŗāš‡ā¸ˆ ā¸§ā¸ąā¸™āšƒā¸” ā¸§ā¸ąā¸™ā¸™ā¸ąāš‰ā¸™ ā¸§ā¸ąā¸™ā¸™ā¸ĩāš‰ ā¸§ā¸ąā¸™āš„ā¸Ģā¸™ ā¸Ēā¸šā¸˛ā¸ĸ ā¸Ēā¸Ąā¸ąā¸ĸ ā¸Ēā¸Ąā¸ąā¸ĸā¸āšˆā¸­ā¸™ +ā¸Ēā¸Ąā¸ąā¸ĸā¸™ā¸ąāš‰ā¸™ ā¸Ēā¸Ąā¸ąā¸ĸā¸™ā¸ĩāš‰ ā¸Ēā¸Ąā¸ąā¸ĸāš‚ā¸™āš‰ā¸™ ā¸Ēāšˆā¸§ā¸™āš€ā¸ā¸´ā¸™ ā¸Ēāšˆā¸§ā¸™ā¸”āš‰ā¸­ā¸ĸ ā¸Ēāšˆā¸§ā¸™ā¸”ā¸ĩ ā¸Ēāšˆā¸§ā¸™āšƒā¸” ā¸Ēāšˆā¸§ā¸™ā¸—ā¸ĩāšˆ ā¸Ēāšˆā¸§ā¸™ā¸™āš‰ā¸­ā¸ĸ ā¸Ēāšˆā¸§ā¸™ā¸™ā¸ąāš‰ā¸™ ā¸Ēāšˆā¸§ā¸™ā¸Ąā¸˛ā¸ ā¸Ēāšˆā¸§ā¸™āšƒā¸Ģā¸āšˆ ā¸Ēā¸ąāš‰ā¸™ ā¸Ēā¸ąāš‰ā¸™āš† ā¸Ēā¸˛ā¸Ąā¸˛ā¸Ŗā¸– ā¸Ēā¸ŗā¸„ā¸ąā¸ ā¸Ēā¸´āšˆā¸‡ +ā¸Ēā¸´āšˆā¸‡āšƒā¸” ā¸Ēā¸´āšˆā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸Ēā¸´āšˆā¸‡ā¸™ā¸ĩāš‰ ā¸Ēā¸´āšˆā¸‡āš„ā¸Ģā¸™ ā¸Ēā¸´āš‰ā¸™ āš€ā¸Ēā¸Ŗāš‡ā¸ˆāšā¸Ĩāš‰ā¸§ āš€ā¸Ēā¸ĩā¸ĸā¸”āš‰ā¸§ā¸ĸ āš€ā¸Ēā¸ĩā¸ĸāšā¸Ĩāš‰ā¸§ āšā¸Ēā¸”ā¸‡ āšā¸Ēā¸”ā¸‡ā¸§āšˆā¸˛ ā¸Ģā¸™ ā¸Ģā¸™ā¸­ ā¸Ģā¸™ā¸­ā¸ĸ ā¸Ģā¸™āšˆā¸­ā¸ĸ ā¸Ģā¸Ąā¸” ā¸Ģā¸Ąā¸”ā¸ā¸ąā¸™ ā¸Ģā¸Ąā¸”ā¸Ēā¸´āš‰ā¸™ ā¸Ģā¸Ŗā¸ˇā¸­āš„ā¸‡ ā¸Ģā¸Ŗā¸ˇā¸­āš€ā¸›ā¸Ĩāšˆā¸˛ ā¸Ģā¸Ŗā¸ˇā¸­āš„ā¸Ąāšˆ ā¸Ģā¸Ŗā¸ˇā¸­ā¸ĸā¸ąā¸‡ +ā¸Ģā¸Ŗā¸ˇā¸­āš„ā¸Ŗ ā¸Ģā¸˛ā¸āšā¸Ąāš‰ 
ā¸Ģā¸˛ā¸āšā¸Ąāš‰ā¸™ ā¸Ģā¸˛ā¸āšā¸Ąāš‰ā¸™ā¸§āšˆā¸˛ ā¸Ģā¸˛ā¸ā¸§āšˆā¸˛ ā¸Ģā¸˛ā¸„ā¸§ā¸˛ā¸Ą ā¸Ģā¸˛āšƒā¸Šāšˆ ā¸Ģā¸˛ā¸Ŗā¸ˇā¸­ āš€ā¸Ģā¸•ā¸¸ āš€ā¸Ģā¸•ā¸¸ā¸œā¸Ĩ āš€ā¸Ģā¸•ā¸¸ā¸™ā¸ąāš‰ā¸™ āš€ā¸Ģā¸•ā¸¸ā¸™ā¸ĩāš‰ āš€ā¸Ģā¸•ā¸¸āš„ā¸Ŗ āš€ā¸Ģāš‡ā¸™āšā¸āšˆ āš€ā¸Ģāš‡ā¸™ā¸„ā¸§ā¸Ŗ āš€ā¸Ģāš‡ā¸™ā¸ˆā¸° āš€ā¸Ģāš‡ā¸™ā¸§āšˆā¸˛ āš€ā¸Ģā¸Ĩā¸ˇā¸­ āš€ā¸Ģā¸Ĩā¸ˇā¸­āš€ā¸ā¸´ā¸™ āš€ā¸Ģā¸Ĩāšˆā¸˛ +āš€ā¸Ģā¸Ĩāšˆā¸˛ā¸™ā¸ąāš‰ā¸™ āš€ā¸Ģā¸Ĩāšˆā¸˛ā¸™ā¸ĩāš‰ āšā¸Ģāšˆā¸‡āšƒā¸” āšā¸Ģāšˆā¸‡ā¸™ā¸ąāš‰ā¸™ āšā¸Ģāšˆā¸‡ā¸™ā¸ĩāš‰ āšā¸Ģāšˆā¸‡āš‚ā¸™āš‰ā¸™ āšā¸Ģāšˆā¸‡āš„ā¸Ģā¸™ āšā¸Ģā¸Ĩā¸° āšƒā¸Ģāš‰āšā¸āšˆ āšƒā¸Ģā¸āšˆ āšƒā¸Ģā¸āšˆāš‚ā¸• ā¸­ā¸ĸāšˆā¸˛ā¸‡āš€ā¸Šāšˆā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸”ā¸ĩ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš€ā¸”ā¸ĩā¸ĸā¸§ ā¸­ā¸ĸāšˆā¸˛ā¸‡āšƒā¸” ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸—ā¸ĩāšˆ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™āš‰ā¸­ā¸ĸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™ā¸ĩāš‰ +ā¸­ā¸ĸāšˆā¸˛ā¸‡āš‚ā¸™āš‰ā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸Ąā¸˛ā¸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸ĸā¸´āšˆā¸‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗā¸āš‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗā¸āš‡āš„ā¸”āš‰ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗāš€ā¸Ēā¸ĩā¸ĸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸Ĩā¸° ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ģā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš† ā¸­ā¸ąā¸™ ā¸­ā¸ąā¸™ā¸ˆā¸° ā¸­ā¸ąā¸™āšƒā¸” ā¸­ā¸ąā¸™āš„ā¸”āš‰āšā¸āšˆ ā¸­ā¸ąā¸™ā¸—ā¸ĩāšˆ +ā¸­ā¸ąā¸™ā¸—ā¸ĩāšˆā¸ˆā¸Ŗā¸´ā¸‡ ā¸­ā¸ąā¸™ā¸—ā¸ĩāšˆā¸ˆā¸° ā¸­ā¸ąā¸™āš€ā¸™ā¸ˇāšˆā¸­ā¸‡ā¸Ąā¸˛ā¸ˆā¸˛ā¸ ā¸­ā¸ąā¸™ā¸Ĩā¸° ā¸­ā¸ąā¸™āš„ā¸Ģā¸™ ā¸­ā¸ąā¸™āš† ā¸­ā¸˛ā¸ˆā¸ˆā¸° ā¸­ā¸˛ā¸ˆāš€ā¸›āš‡ā¸™ ā¸­ā¸˛ā¸ˆāš€ā¸›āš‡ā¸™ā¸”āš‰ā¸§ā¸ĸ ā¸­ā¸ˇāšˆā¸™ ā¸­ā¸ˇāšˆā¸™āš† āš€ā¸­āš‡ā¸‡ āš€ā¸­ā¸˛ ā¸¯ ā¸¯ā¸Ĩ ā¸¯ā¸Ĩā¸¯ +""".split()) \ No newline at end of file diff --git a/spacy/th/tag_map.py b/spacy/th/tag_map.py new file mode 100644 index 000000000..e225f7289 --- /dev/null +++ b/spacy/th/tag_map.py @@ -0,0 +1,81 @@ +# encoding: utf8 +# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) +from __future__ import unicode_literals + +from ..symbols import * + +TAG_MAP = { + #NOUN + "NOUN": {POS: NOUN}, + "NCMN": {POS: NOUN}, + "NTTL": {POS: NOUN}, + "CNIT": {POS: NOUN}, + "CLTV": {POS: NOUN}, + "CMTR": {POS: NOUN}, + "CFQC": {POS: NOUN}, + "CVBL": {POS: NOUN}, + #PRON + "PRON": {POS: PRON}, + "NPRP": {POS: PRON}, + # ADJ + "ADJ": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, + # ADV + "ADV": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, + # INT + "INT": {POS: INTJ}, + # PRON + "PROPN": {POS: PROPN}, + "PPRS": {POS: PROPN}, + "PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, + # DET + "DET": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, + # NUM + "NUM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, + # AUX + "AUX": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, + # ADP + "ADP": {POS: ADP}, + "RPRE": {POS: ADP}, + # CCONJ + "CCONJ": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, + # SCONJ + "SCONJ": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: SCONJ}, + "JCMP": {POS: SCONJ}, + # PART + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, + # PUNCT + "PUNCT": {POS: PUNCT}, + "PUNC": {POS: PUNCT} +} \ No newline at end of file diff --git a/spacy/th/tokenizer_exceptions.py b/spacy/th/tokenizer_exceptions.py new file mode 100644 index 000000000..7e3967aed --- 
/dev/null +++ b/spacy/th/tokenizer_exceptions.py @@ -0,0 +1,80 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + "ā¸Ą.ā¸„.": [ + {ORTH: "ā¸Ą.ā¸„.", LEMMA: "ā¸Ąā¸ā¸Ŗā¸˛ā¸„ā¸Ą"} + ], + "ā¸.ā¸ž.": [ + {ORTH: "ā¸.ā¸ž.", LEMMA: "ā¸ā¸¸ā¸Ąā¸ ā¸˛ā¸žā¸ąā¸™ā¸˜āšŒ"} + ], + "ā¸Ąā¸ĩ.ā¸„.": [ + {ORTH: "ā¸Ąā¸ĩ.ā¸„.", LEMMA: "ā¸Ąā¸ĩā¸™ā¸˛ā¸„ā¸Ą"} + ], + "āš€ā¸Ą.ā¸ĸ.": [ + {ORTH: "āš€ā¸Ą.ā¸ĸ.", LEMMA: "āš€ā¸Ąā¸Šā¸˛ā¸ĸā¸™"} + ], + "ā¸ž.ā¸„.": [ + {ORTH: "ā¸ž.ā¸„.", LEMMA: "ā¸žā¸¤ā¸Šā¸ ā¸˛ā¸„ā¸Ą"} + ], + "ā¸Ąā¸´.ā¸ĸ.": [ + {ORTH: "ā¸Ąā¸´.ā¸ĸ.", LEMMA: "ā¸Ąā¸´ā¸–ā¸¸ā¸™ā¸˛ā¸ĸā¸™"} + ], + "ā¸.ā¸„.": [ + {ORTH: "ā¸.ā¸„.", LEMMA: "ā¸ā¸Ŗā¸ā¸Žā¸˛ā¸„ā¸Ą"} + ], + "ā¸Ē.ā¸„.": [ + {ORTH: "ā¸Ē.ā¸„.", LEMMA: "ā¸Ēā¸´ā¸‡ā¸Ģā¸˛ā¸„ā¸Ą"} + ], + "ā¸.ā¸ĸ.": [ + {ORTH: "ā¸.ā¸ĸ.", LEMMA: "ā¸ā¸ąā¸™ā¸ĸā¸˛ā¸ĸā¸™"} + ], + "ā¸•.ā¸„.": [ + {ORTH: "ā¸•.ā¸„.", LEMMA: "ā¸•ā¸¸ā¸Ĩā¸˛ā¸„ā¸Ą"} + ], + "ā¸ž.ā¸ĸ.": [ + {ORTH: "ā¸ž.ā¸ĸ.", LEMMA: "ā¸žā¸¤ā¸¨ā¸ˆā¸´ā¸ā¸˛ā¸ĸā¸™"} + ], + "ā¸˜.ā¸„.": [ + {ORTH: "ā¸˜.ā¸„.", LEMMA: "ā¸˜ā¸ąā¸™ā¸§ā¸˛ā¸„ā¸Ą"} + ] +} + + +# exceptions mapped to a single token containing only ORTH property +# example: {"string": [{ORTH: "string"}]} +# converted using strings_to_exc() util +''' +ORTH_ONLY = [ + "a.", + "b.", + "c.", + "d.", + "e.", + "f.", + "g.", + "h.", + "i.", + "j.", + "k.", + "l.", + "m.", + "n.", + "o.", + "p.", + "q.", + "r.", + "s.", + "t.", + "u.", + "v.", + "w.", + "x.", + "y.", + "z." +] +''' \ No newline at end of file From 39bb5690f0e1398b75407f70e89f88da4f9c3738 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 21 Sep 2017 00:36:02 +0700 Subject: [PATCH 133/195] update th --- spacy/th/__init__.py | 4 +--- spacy/th/tokenizer_exceptions.py | 37 +------------------------------- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/spacy/th/__init__.py b/spacy/th/__init__.py index 0b6f8cf76..0ed5268c6 100644 --- a/spacy/th/__init__.py +++ b/spacy/th/__init__.py @@ -25,6 +25,4 @@ class Thai(Language): raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " "https://github.com/wannaphongcom/pythainlp/") words = [x for x in list(word_tokenize(text,"newmm"))] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) - -__all__ = ['Thai'] \ No newline at end of file + return Doc(self.vocab, words=words, spaces=[False]*len(words)) \ No newline at end of file diff --git a/spacy/th/tokenizer_exceptions.py b/spacy/th/tokenizer_exceptions.py index 7e3967aed..0f933f1c1 100644 --- a/spacy/th/tokenizer_exceptions.py +++ b/spacy/th/tokenizer_exceptions.py @@ -42,39 +42,4 @@ TOKENIZER_EXCEPTIONS = { "ā¸˜.ā¸„.": [ {ORTH: "ā¸˜.ā¸„.", LEMMA: "ā¸˜ā¸ąā¸™ā¸§ā¸˛ā¸„ā¸Ą"} ] -} - - -# exceptions mapped to a single token containing only ORTH property -# example: {"string": [{ORTH: "string"}]} -# converted using strings_to_exc() util -''' -ORTH_ONLY = [ - "a.", - "b.", - "c.", - "d.", - "e.", - "f.", - "g.", - "h.", - "i.", - "j.", - "k.", - "l.", - "m.", - "n.", - "o.", - "p.", - "q.", - "r.", - "s.", - "t.", - "u.", - "v.", - "w.", - "x.", - "y.", - "z." 
-] -''' \ No newline at end of file +} \ No newline at end of file From 1abf472068ef700c66da4dc0f4beadb3ccd7c718 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 21 Sep 2017 12:56:58 +0700 Subject: [PATCH 134/195] add th test --- spacy/tests/conftest.py | 6 ++++++ spacy/tests/th/test_tokenizer.py | 13 +++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 spacy/tests/th/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6e00b1513..c9652b08d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -15,6 +15,7 @@ from ..fi import Finnish from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian +from ..th import Thai from ..tokens import Doc @@ -101,6 +102,11 @@ def he_tokenizer(): def nb_tokenizer(): return Norwegian.Defaults.create_tokenizer() +@pytest.fixture +def th_tokenizer(): + pythainlp = pytest.importorskip("pythainlp") + return Thai.Defaults.create_tokenizer() + @pytest.fixture def stringstore(): return StringStore() diff --git a/spacy/tests/th/test_tokenizer.py b/spacy/tests/th/test_tokenizer.py new file mode 100644 index 000000000..851c6f067 --- /dev/null +++ b/spacy/tests/th/test_tokenizer.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +TOKENIZER_TESTS = [ + ("ā¸„ā¸¸ā¸“ā¸Ŗā¸ąā¸ā¸œā¸Ąāš„ā¸Ģā¸Ą", ['ā¸„ā¸¸ā¸“', 'ā¸Ŗā¸ąā¸', 'ā¸œā¸Ą', 'āš„ā¸Ģā¸Ą']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_thai_tokenizer(th_tokenizer, text, expected_tokens): + tokens = [token.text for token in th_tokenizer(text)] + assert tokens == expected_tokens From 425c09488d1370d217b46521e2942b4b04a4e254 Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 08:56:34 +0800 Subject: [PATCH 135/195] Update word-vectors-similarities.jade add ``` import spacy nlp = spacy.load('en') ``` --- website/docs/usage/word-vectors-similarities.jade | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index 3cc0a67a8..3fd6326d1 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -21,10 +21,12 @@ p +code. 
import numpy + import spacy + nlp = spacy.load('en') apples, and_, oranges = nlp(u'apples and oranges') print(apples.vector.shape) - # (1,) + # (300,) apples.similarity(oranges) p From 923c4c2fb2863858c18d262de53746f42c9aa6ae Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 09:50:46 +0800 Subject: [PATCH 136/195] Update punctuation.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add `……` --- spacy/language_data/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 58ec73f2d..3b5307496 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -36,7 +36,7 @@ _HYPHENS = r""" LIST_ELLIPSES = [ r'\.\.+', - "…" + "… ……" ] From 6f450306c3429d19472e7ae25bcbcd7f8b835e2d Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 10:53:22 +0800 Subject: [PATCH 137/195] Update customizing-tokenizer.jade update some codes: - `me` -> `-PRON` - `TAG` -> `POS` - `create_tokenizer` function --- website/docs/usage/customizing-tokenizer.jade | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index ca5be9ef1..c7f717380 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -40,7 +40,9 @@ p { ORTH: u'me'}]) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] - assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] + # Pronoun lemma is returned as -PRON- + # More details please see: https://spacy.io/docs/usage/troubleshooting#pron-lemma + assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that'] p | The special case doesn't have to match an entire whitespace-delimited @@ -57,7 +59,7 @@ p +code.
nlp.tokenizer.add_special_case(u'...gimme...?', [{ - ORTH: u'...gimme...?', LEMMA: u'give', POS: u'VB'}]) + ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) assert len(nlp(u'...gimme...?')) == 1 p From b6ebedd09c03648c8bd3a448bd15ab87ce1631e4 Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Mon, 25 Sep 2017 13:13:25 -0700 Subject: [PATCH 139/195] Document Tokenizer(token_match) and clarify tokenizer_pseudo_code Closes #835 In the `tokenizer_pseudo_code` I put the `special_cases` kwarg before `find_prefix` because this now matches the order the args are used in the pseudocode, and it also matches spacy's actual code. --- website/docs/usage/customizing-tokenizer.jade | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index c2f840a27..173521a33 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -87,8 +87,8 @@ p | algorithm in Python, optimized for readability rather than performance: +code. - def tokenizer_pseudo_code(text, find_prefix, find_suffix, - find_infixes, special_cases): + def tokenizer_pseudo_code(text, special_cases, + find_prefix, find_suffix, find_infixes): tokens = [] for substring in text.split(' '): suffixes = [] @@ -140,7 +140,7 @@ p p | Let's imagine you wanted to create a tokenizer for a new language. There - | are four things you would need to define: + | are five things you would need to define: +list("numbers") +item @@ -162,6 +162,11 @@ p | A function #[code infixes_finditer], to handle non-whitespace | separators, such as hyphens etc. + +item + | (Optional) A boolean function #[code token_match] matching strings + | that should never be split, overriding the previous rules. + | Useful for things like URLs or numbers. + p | You shouldn't usually need to create a #[code Tokenizer] subclass. 
| Standard usage is to use #[code re.compile()] to build a regular @@ -175,11 +180,15 @@ p prefix_re = re.compile(r'''[\[\("']''') suffix_re = re.compile(r'''[\]\)"']''') infix_re = re.compile(r'''[-~]''') + simple_url_re = re.compile(r'''^https?://''') def create_tokenizer(nlp): - return Tokenizer(nlp.vocab, rules={}, + return Tokenizer(nlp.vocab, + rules={}, prefix_search=prefix_re.search, suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer) + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match + ) nlp = spacy.load('en', create_make_doc=create_tokenizer) From 259ed027af0e4584956b7d00c37a3beb9d5b8d98 Mon Sep 17 00:00:00 2001 From: Vincent Genty Date: Tue, 26 Sep 2017 15:46:04 +0200 Subject: [PATCH 140/195] Fixed NER model loading bug --- spacy/syntax/parser.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index b9de1e114..48edb6d22 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -147,6 +147,9 @@ cdef class Parser: # TODO: remove this shim when we don't have to support older data if 'labels' in cfg and 'actions' not in cfg: cfg['actions'] = cfg.pop('labels') + # Convert string keys to int + if cfg.get('actions'): + cfg['actions'] = {int(action_name): labels for action_name, labels in cfg['actions'].items()} # TODO: remove this shim when we don't have to support older data for action_name, labels in dict(cfg.get('actions', {})).items(): # We need this to be sorted From a9362f1c73fd7197548f6d32ed997600d15f9ff2 Mon Sep 17 00:00:00 2001 From: Ondrej Kokes Date: Wed, 4 Oct 2017 12:55:07 +0200 Subject: [PATCH 141/195] Fixing links to SyntaxNet --- website/docs/api/index.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade index 24f3d4458..7e3f1a906 100644 --- a/website/docs/api/index.jade +++ b/website/docs/api/index.jade @@ -6,7 +6,7 @@ include ../../_includes/_mixins p | Here's a quick comparison of the functionalities offered by spaCy, - | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet], + | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet], | #[+a("http://www.nltk.org/py-modindex.html") NLTK] and | #[+a("http://stanfordnlp.github.io/CoreNLP/") CoreNLP]. @@ -107,7 +107,7 @@ p p | In 2016, Google released their - | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet] + | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet] | library, setting a new state of the art for syntactic dependency parsing | accuracy. SyntaxNet's algorithm is very similar to spaCy's. 
The main | difference is that SyntaxNet uses a neural network while spaCy uses a @@ -129,7 +129,7 @@ p +cell=data +row - +cell #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") Parsey McParseface] + +cell #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") Parsey McParseface] each data in [ 94.15, 89.08, 94.77 ] +cell=data From e81a608173e78b10da5984cf0d2632de29f407f1 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:47:48 -0400 Subject: [PATCH 142/195] Regression test for lemmatizer exceptions -- demonstrate issue #1387 --- spacy/tests/regression/test_issue1387.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 spacy/tests/regression/test_issue1387.py diff --git a/spacy/tests/regression/test_issue1387.py b/spacy/tests/regression/test_issue1387.py new file mode 100644 index 000000000..c5f01d145 --- /dev/null +++ b/spacy/tests/regression/test_issue1387.py @@ -0,0 +1,22 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc + +import pytest + +def test_issue1387(): + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" + From ffb50d21a043a1028a7a8ac3f354483ec100fce6 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:49:02 -0400 Subject: [PATCH 143/195] Lemmatizer honors exceptions: Fix #1387 --- spacy/lemmatizer.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d7541c56b..1112bcee3 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -78,15 +78,16 @@ def lemmatize(string, index, exceptions, rules): # forms.append(string) forms.extend(exceptions.get(string, [])) oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[:len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) if not forms: forms.extend(oov_forms) if not forms: From b0d271809dab5146fdc45cfcfab2e467b8a9347e Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:49:28 -0400 Subject: [PATCH 144/195] Unit test for lemmatizer exceptions -- copied from regression test for #1387 --- spacy/tests/tagger/test_lemmatizer.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 5db0d0b2c..91ed7d2f1 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -47,3 +47,20 @@ def test_tagger_lemmatizer_lemma_assignment(EN): assert all(t.lemma_ == '' for t in doc) EN.tagger(doc) assert all(t.lemma_ != '' for t in doc) + + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc +def 
test_tagger_lemmatizer_exceptions(): + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" From e77d8886f7bad951341060fee328eaa7ab4e927e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Oct 2017 22:22:04 +0200 Subject: [PATCH 145/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 995f6901f..97c53c3d2 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -26,7 +26,9 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Geovedi, [@geovedi](https://github.com/geovedi) * Jim Regan, [@jimregan](https://github.com/jimregan) +* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) @@ -41,6 +43,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) +* Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * RaphaÃĢl Bournhonesque, [@raphael0202](https://github.com/raphael0202) * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort) @@ -51,11 +54,15 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) +* Vimos Tan, [@Vimos](https://github.com/Vimos) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) * Wah Loon Keng, [@kengz](https://github.com/kengz) +* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom) * Willem van Hage, [@wrvhage](https://github.com/wrvhage) * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker) +* Yam, [@hscspring](https://github.com/hscspring) * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) +* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh) * Yubing Dong, [@tomtung](https://github.com/tomtung) * Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) From e04e11070f78ea827ddce40e62ee9ce8c7f38489 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 17:45:45 -0400 Subject: [PATCH 146/195] Contributor agreement for Orion Montoya @mdcclv --- .github/contributors/mdcclv.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/mdcclv.md diff --git a/.github/contributors/mdcclv.md b/.github/contributors/mdcclv.md new file mode 100644 index 000000000..14ebfae26 --- /dev/null +++ b/.github/contributors/mdcclv.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on 
the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------------- | +| Name | Orion Montoya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 04-10-2017 | +| GitHub username | mdcclv | +| Website (optional) | http://www.mdcclv.com/ | From 763b54cbc38120f63c308b4d519c9fb2cb2408ae Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 6 Oct 2017 16:30:44 +0700 Subject: [PATCH 147/195] Update adding-languages.jade Fixed misspellings --- website/docs/usage/adding-languages.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 7d893b4eb..02dfb79ca 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -525,13 +525,13 @@ p | └── oov_prob # optional ├── pos/ # optional | ├── model # via nlp.tagger.model.dump(path) - | └── config.json # via Langage.train + | └── config.json # via Language.train ├── deps/ # optional | ├── model # via nlp.parser.model.dump(path) - | └── config.json # via Langage.train + | └── config.json # via Language.train └── ner/ # optional ├── model # via nlp.entity.model.dump(path) - └── config.json # via Langage.train + └── config.json # via Language.train p | This creates a spaCy data directory with a vocabulary model, ready to be From e89689a31d69180b9ee22603b488a3594a8383dc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 6 Oct 2017 18:02:40 +0200 Subject: [PATCH 148/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 97c53c3d2..9e210bd4c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -43,6 +43,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) +* Orion Montoya, [@mdcclv](https://github.com/mdcclv) * Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * RaphaÃĢl Bournhonesque, 
[@raphael0202](https://github.com/raphael0202) From efe0800f91dd35d114cbcdf64845bdafa34de9f5 Mon Sep 17 00:00:00 2001 From: Yam Date: Mon, 9 Oct 2017 21:39:15 -0500 Subject: [PATCH 149/195] Update training.jade fix several changes --- website/docs/usage/training.jade | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 8a5c111bd..3a15ae2a1 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -33,12 +33,14 @@ p from spacy.vocab import Vocab from spacy.pipeline import EntityRecognizer from spacy.tokens import Doc + from spacy.gold import GoldParse vocab = Vocab() entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC']) doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + gold = GoldParse(doc, entities=['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + entity.update(doc, gold) entity.model.end_training() @@ -65,13 +67,14 @@ p.o-inline-list from spacy.vocab import Vocab from spacy.pipeline import DependencyParser from spacy.tokens import Doc + from spacy.gold import GoldParse vocab = Vocab() parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct']) doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'), - (1, 'punct')]) + gold = GoldParse(doc, [1,1,3,1,1], ['nsubj', 'ROOT', 'compound', 'dobj', 'punct']) + parser.update(doc, gold) parser.model.end_training() @@ -120,7 +123,7 @@ p +code. from spacy.vocab import Vocab - from spacy.pipeline import Tagger + from spacy.tagger import Tagger from spacy.tagger import P2_orth, P1_orth from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth From 3452d6ce521943fb0bb02f59d3d9e3a1bac218c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 11 Oct 2017 11:24:00 +0200 Subject: [PATCH 150/195] Resolve issue #1078 by simplifying URL pattern - avoid catastrophic backtracking - reduce character range of host name, domain name and TLD identifier --- spacy/language_data/tokenizer_exceptions.py | 6 +++--- spacy/tests/tokenizer/test_urls.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py index b84adb2c4..9d5187d83 100644 --- a/spacy/language_data/tokenizer_exceptions.py +++ b/spacy/language_data/tokenizer_exceptions.py @@ -32,11 +32,11 @@ _URL_PATTERN = ( r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" # host name - r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)" + r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)" # domain name - r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*" + r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*" # TLD identifier - r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" + r"(?:\.(?:[a-z]{2,}))" r")" # port number r"(?::\d{2,5})?" 
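To see what this simplification accepts, the rewritten host name / domain name / TLD fragment can be exercised on its own. The following is a minimal sketch, assuming only Python 3 and the standard-library re module; it is not spaCy's full _URL_PATTERN, which as the surrounding context shows also carries an IP-address alternative and port handling. The sample host names are taken from URLs in the accompanying tests.

    import re

    # Just the three rewritten pieces from the hunk above, joined into one pattern.
    host_fragment = re.compile(
        r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"    # host name
        r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*"  # domain name
        r"(?:\.(?:[a-z]{2,}))"              # TLD identifier
    )

    for host in ["code.google.com", "a.b-c.de", "a.b--c.de"]:
        # All three should print True with the simplified pattern.
        print(host, bool(host_fragment.fullmatch(host)))

The narrower [a-z0-9] classes give up the internationalised-domain coverage of the old \u00a1-\uffff ranges (the corresponding URLs are marked as expected failures in the test changes that follow), in exchange for removing the nested (?:...-*)* quantifiers that allowed catastrophic backtracking.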
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 959067110..3bb6521f1 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -33,13 +33,10 @@ URLS_SHOULD_MATCH = [ "http://userid:password@example.com/", "http://142.42.1.1/", "http://142.42.1.1:8080/", - "http://⌘.ws", - "http://⌘.ws/", "http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/unicode_(âœĒ)_in_parens", "http://foo.com/(something)?after=parens", - "http://â˜ē.damowmow.com/", "http://code.google.com/events/#&product=browser", "http://j.mp", "ftp://foo.bar/baz", @@ -49,14 +46,17 @@ URLS_SHOULD_MATCH = [ "http://a.b-c.de", "http://223.255.255.254", "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 - "http://âœĒdf.ws/123", - "http://➡.ws/䨚", - "http://Ų…ØĢاŲ„.ØĨØŽØĒØ¨Ø§Øą", - "http://䞋子.æĩ‹č¯•", - "http://ā¤‰ā¤Ļā¤žā¤šā¤°ā¤Ŗ.ā¤Ēā¤°āĨ€ā¤•āĨā¤ˇā¤ž", pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), + pytest.mark.xfail("http://⌘.ws"), + pytest.mark.xfail("http://⌘.ws/"), + pytest.mark.xfail("http://â˜ē.damowmow.com/"), + pytest.mark.xfail("http://âœĒdf.ws/123"), + pytest.mark.xfail("http://➡.ws/䨚"), + pytest.mark.xfail("http://Ų…ØĢاŲ„.ØĨØŽØĒØ¨Ø§Øą"), + pytest.mark.xfail("http://䞋子.æĩ‹č¯•"), + pytest.mark.xfail("http://ā¤‰ā¤Ļā¤žā¤šā¤°ā¤Ŗ.ā¤Ēā¤°āĨ€ā¤•āĨā¤ˇā¤ž"), ] URLS_SHOULD_NOT_MATCH = [ @@ -83,7 +83,6 @@ URLS_SHOULD_NOT_MATCH = [ "http://foo.bar/foo(bar)baz quux", "ftps://foo.bar/", "http://-error-.invalid/", - "http://-a.b.co", "http://a.b-.co", "http://0.0.0.0", "http://10.1.1.0", @@ -99,6 +98,7 @@ URLS_SHOULD_NOT_MATCH = [ pytest.mark.xfail("foo.com"), pytest.mark.xfail("http://1.1.1.1.1"), pytest.mark.xfail("http://www.foo.bar./"), + pytest.mark.xfail("http://-a.b.co"), ] From 2a78f4d6345084fda788a7f94beff963026b0e83 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:23:19 +0700 Subject: [PATCH 151/195] updated .gitignore file added excluding PyCharm's idea directory --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 84ced41f8..ecd8ed39f 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,7 @@ Desktop.ini # Other *.tgz + + +# JetBrains PyCharm +.idea/ \ No newline at end of file From 7b9491679ffa235ce6cc3f8d3f94b00c14d40655 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:24:20 +0700 Subject: [PATCH 152/195] added russian language support --- spacy/ru/__init__.py | 56 ++++++++++++++++++++++++++++++++ spacy/ru/language_data.py | 18 ++++++++++ spacy/ru/stop_words.py | 54 ++++++++++++++++++++++++++++++ spacy/ru/tokenizer_exceptions.py | 29 +++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 spacy/ru/__init__.py create mode 100644 spacy/ru/language_data.py create mode 100644 spacy/ru/stop_words.py create mode 100644 spacy/ru/tokenizer_exceptions.py diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py new file mode 100644 index 000000000..d8f38e199 --- /dev/null +++ b/spacy/ru/__init__.py @@ -0,0 +1,56 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from ..language import Language +from ..attrs import LANG +from ..tokens import Doc +from .language_data import * + + +class RussianTokenizer(object): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 
library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + _morph = MorphAnalyzer() + + def __init__(self, spacy_tokenizer, cls, nlp=None): + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) + self._spacy_tokenizer = spacy_tokenizer + + def __call__(self, text): + words = [self._normalize(RussianTokenizer._get_word(token)) + for token in self._spacy_tokenizer(text)] + + return Doc(self.vocab, words, [False] * len(words)) + + @staticmethod + def _get_word(token): + return token.lemma_ if len(token.lemma_) > 0 else token.text + + @classmethod + def _normalize(cls, word): + return cls._morph.parse(word)[0].normal_form + + +class RussianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ru' + + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + + @classmethod + def create_tokenizer(cls, nlp=None): + tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp) + return RussianTokenizer(tokenizer, cls, nlp) + + +class Russian(Language): + lang = 'ru' + + Defaults = RussianDefaults diff --git a/spacy/ru/language_data.py b/spacy/ru/language_data.py new file mode 100644 index 000000000..75ca41b65 --- /dev/null +++ b/spacy/ru/language_data.py @@ -0,0 +1,18 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) + + +__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] \ No newline at end of file diff --git a/spacy/ru/stop_words.py b/spacy/ru/stop_words.py new file mode 100644 index 000000000..ddb28af86 --- /dev/null +++ b/spacy/ru/stop_words.py @@ -0,0 +1,54 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +Đ° + +ĐąŅƒĐ´ĐĩĐŧ ĐąŅƒĐ´ĐĩŅ‚ ĐąŅƒĐ´ĐĩŅ‚Đĩ ĐąŅƒĐ´ĐĩŅˆŅŒ ĐąŅƒĐ´Ņƒ ĐąŅƒĐ´ŅƒŅ‚ ĐąŅƒĐ´ŅƒŅ‡Đ¸ ĐąŅƒĐ´ŅŒ ĐąŅƒĐ´ŅŒŅ‚Đĩ ĐąŅ‹ ĐąŅ‹Đģ ĐąŅ‹ĐģĐ° ĐąŅ‹Đģи ĐąŅ‹ĐģĐž +ĐąŅ‹Ņ‚ŅŒ + +в ваĐŧ ваĐŧи ваŅ вĐĩŅŅŒ вО вОŅ‚ вŅĐĩ вŅŅ‘ вŅĐĩĐŗĐž вŅĐĩĐš вŅĐĩĐŧ вŅŅ‘Đŧ вŅĐĩĐŧи вŅĐĩĐŧŅƒ вŅĐĩŅ… вŅĐĩŅŽ +вŅĐĩŅ вŅŅŽ вŅŅ вŅ‹ + +Đ´Đ° Đ´ĐģŅ Đ´Đž + +ĐĩĐŗĐž ĐĩдиĐŧ ĐĩĐ´ŅŅ‚ ĐĩĐĩ ĐĩŅ‘ ĐĩĐš ĐĩĐģ ĐĩĐģĐ° ĐĩĐŧ ĐĩĐŧŅƒ ĐĩĐŧŅŠ ĐĩŅĐģи ĐĩŅŅ‚ ĐĩŅŅ‚ŅŒ ĐĩŅˆŅŒ ĐĩŅ‰Đĩ ĐĩŅ‰Ņ‘ ĐĩŅŽ + +ĐļĐĩ + +Са + +и иС иĐģи иĐŧ иĐŧи иĐŧŅŠ иŅ… + +Đē ĐēĐ°Đē ĐēĐĩĐŧ ĐēĐž ĐēĐžĐŗĐ´Đ° ĐēĐžĐŗĐž ĐēĐžĐŧ ĐēĐžĐŧŅƒ ĐēĐžĐŧŅŒŅ ĐēĐžŅ‚ĐžŅ€Đ°Ņ ĐēĐžŅ‚ĐžŅ€ĐžĐŗĐž ĐēĐžŅ‚ĐžŅ€ĐžĐĩ ĐēĐžŅ‚ĐžŅ€ĐžĐš ĐēĐžŅ‚ĐžŅ€ĐžĐŧ +ĐēĐžŅ‚ĐžŅ€ĐžĐŧŅƒ ĐēĐžŅ‚ĐžŅ€ĐžŅŽ ĐēĐžŅ‚ĐžŅ€ŅƒŅŽ ĐēĐžŅ‚ĐžŅ€Ņ‹Đĩ ĐēĐžŅ‚ĐžŅ€Ņ‹Đš ĐēĐžŅ‚ĐžŅ€Ņ‹Đŧ ĐēĐžŅ‚ĐžŅ€Ņ‹Đŧи ĐēĐžŅ‚ĐžŅ€Ņ‹Ņ… ĐēŅ‚Đž + +ĐŧĐĩĐŊŅ ĐŧĐŊĐĩ ĐŧĐŊОК ĐŧĐŊĐžŅŽ ĐŧĐžĐŗ ĐŧĐžĐŗи ĐŧĐžĐŗиŅ‚Đĩ ĐŧĐžĐŗĐģĐ° ĐŧĐžĐŗĐģи ĐŧĐžĐŗĐģĐž ĐŧĐžĐŗŅƒ ĐŧĐžĐŗŅƒŅ‚ ĐŧĐžĐĩ ĐŧĐžŅ‘ ĐŧĐžĐĩĐŗĐž +ĐŧĐžĐĩĐš ĐŧĐžĐĩĐŧ ĐŧĐžŅ‘Đŧ ĐŧĐžĐĩĐŧŅƒ ĐŧĐžĐĩŅŽ ĐŧĐžĐļĐĩĐŧ ĐŧĐžĐļĐĩŅ‚ ĐŧĐžĐļĐĩŅ‚Đĩ ĐŧĐžĐļĐĩŅˆŅŒ ĐŧОи ĐŧОК ĐŧОиĐŧ ĐŧОиĐŧи ĐŧОиŅ… +ĐŧĐžŅ‡ŅŒ ĐŧĐžŅŽ ĐŧĐžŅ ĐŧŅ‹ + +ĐŊĐ° ĐŊĐ°Đŧ ĐŊĐ°Đŧи ĐŊĐ°Ņ ĐŊĐ°ŅĐ° ĐŊĐ°Ņˆ ĐŊĐ°ŅˆĐ° ĐŊĐ°ŅˆĐĩ ĐŊĐ°ŅˆĐĩĐŗĐž ĐŊĐ°ŅˆĐĩĐš ĐŊĐ°ŅˆĐĩĐŧ ĐŊĐ°ŅˆĐĩĐŧŅƒ ĐŊĐ°ŅˆĐĩŅŽ ĐŊĐ°ŅˆĐ¸ ĐŊĐ°ŅˆĐ¸Đŧ +ĐŊĐ°ŅˆĐ¸Đŧи ĐŊĐ°ŅˆĐ¸Ņ… ĐŊĐ°ŅˆŅƒ ĐŊĐĩ ĐŊĐĩĐŗĐž ĐŊĐĩĐĩ ĐŊĐĩŅ‘ ĐŊĐĩĐš ĐŊĐĩĐŧ ĐŊŅ‘Đŧ ĐŊĐĩĐŧŅƒ ĐŊĐĩŅ‚ ĐŊĐĩŅŽ ĐŊиĐŧ ĐŊиĐŧи ĐŊиŅ… ĐŊĐž + +Đž Ой ОдиĐŊ ОдĐŊĐ° ОдĐŊи ОдĐŊиĐŧ ОдĐŊиĐŧи ОдĐŊиŅ… ОдĐŊĐž ОдĐŊĐžĐŗĐž ОдĐŊОК ОдĐŊĐžĐŧ ОдĐŊĐžĐŧŅƒ ОдĐŊĐžŅŽ +ОдĐŊŅƒ ĐžĐŊ ĐžĐŊĐ° ĐžĐŊĐĩ ĐžĐŊи ĐžĐŊĐž ĐžŅ‚ + +ĐŋĐž ĐŋŅ€Đ¸ + +Ņ ŅĐ°Đŧ 
ŅĐ°ĐŧĐ° ŅĐ°Đŧи ŅĐ°ĐŧиĐŧ ŅĐ°ĐŧиĐŧи ŅĐ°ĐŧиŅ… ŅĐ°ĐŧĐž ŅĐ°ĐŧĐžĐŗĐž ŅĐ°ĐŧĐžĐŧ ŅĐ°ĐŧĐžĐŧŅƒ ŅĐ°ĐŧŅƒ ŅĐ˛ĐžĐĩ ŅĐ˛ĐžŅ‘ +ŅĐ˛ĐžĐĩĐŗĐž ŅĐ˛ĐžĐĩĐš ŅĐ˛ĐžĐĩĐŧ ŅĐ˛ĐžŅ‘Đŧ ŅĐ˛ĐžĐĩĐŧŅƒ ŅĐ˛ĐžĐĩŅŽ ŅĐ˛ĐžĐ¸ ŅĐ˛ĐžĐš ŅĐ˛ĐžĐ¸Đŧ ŅĐ˛ĐžĐ¸Đŧи ŅĐ˛ĐžĐ¸Ņ… ŅĐ˛ĐžŅŽ ŅĐ˛ĐžŅ +ŅĐĩĐąĐĩ ŅĐĩĐąŅ ŅĐžĐąĐžĐš ŅĐžĐąĐžŅŽ + +Ņ‚Đ° Ņ‚Đ°Đē Ņ‚Đ°ĐēĐ°Ņ Ņ‚Đ°ĐēиĐĩ Ņ‚Đ°ĐēиĐŧ Ņ‚Đ°ĐēиĐŧи Ņ‚Đ°ĐēиŅ… Ņ‚Đ°ĐēĐžĐŗĐž Ņ‚Đ°ĐēĐžĐĩ Ņ‚Đ°ĐēОК Ņ‚Đ°ĐēĐžĐŧ Ņ‚Đ°ĐēĐžĐŧŅƒ Ņ‚Đ°ĐēĐžŅŽ +Ņ‚Đ°ĐēŅƒŅŽ Ņ‚Đĩ Ņ‚ĐĩĐąĐĩ Ņ‚ĐĩĐąŅ Ņ‚ĐĩĐŧ Ņ‚ĐĩĐŧи Ņ‚ĐĩŅ… Ņ‚Đž Ņ‚ОйОК Ņ‚ОйОŅŽ Ņ‚ĐžĐŗĐž Ņ‚ОК Ņ‚ĐžĐģŅŒĐēĐž Ņ‚ĐžĐŧ Ņ‚ĐžĐŧĐ°Ņ… Ņ‚ĐžĐŧŅƒ +Ņ‚ĐžŅ‚ Ņ‚ĐžŅŽ Ņ‚Ņƒ Ņ‚Ņ‹ + +Ņƒ ŅƒĐļĐĩ + +Ņ‡ĐĩĐŗĐž Ņ‡ĐĩĐŧ Ņ‡Ņ‘Đŧ Ņ‡ĐĩĐŧŅƒ Ņ‡Ņ‚Đž Ņ‡Ņ‚ОйŅ‹ + +ŅŅ‚Đ° ŅŅ‚и ŅŅ‚иĐŧ ŅŅ‚иĐŧи ŅŅ‚иŅ… ŅŅ‚Đž ŅŅ‚ĐžĐŗĐž ŅŅ‚ОК ŅŅ‚ĐžĐŧ ŅŅ‚ĐžĐŧŅƒ ŅŅ‚ĐžŅ‚ ŅŅ‚ĐžŅŽ ŅŅ‚Ņƒ + +Ņ +""".split()) \ No newline at end of file diff --git a/spacy/ru/tokenizer_exceptions.py b/spacy/ru/tokenizer_exceptions.py new file mode 100644 index 000000000..8df57a402 --- /dev/null +++ b/spacy/ru/tokenizer_exceptions.py @@ -0,0 +1,29 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TOKENIZER_EXCEPTIONS = { + "ПĐŊ.": [ + {ORTH: "ПĐŊ.", LEMMA: "ПоĐŊĐĩĐ´ĐĩĐģŅŒĐŊиĐē"} + ], + "ВŅ‚.": [ + {ORTH: "ВŅ‚.", LEMMA: "ВŅ‚ĐžŅ€ĐŊиĐē"} + ], + "ĐĄŅ€.": [ + {ORTH: "ĐĄŅ€.", LEMMA: "ĐĄŅ€ĐĩĐ´Đ°"} + ], + "ЧŅ‚.": [ + {ORTH: "ЧŅ‚.", LEMMA: "ЧĐĩŅ‚вĐĩŅ€Đŗ"} + ], + "ПŅ‚.": [ + {ORTH: "ПŅ‚.", LEMMA: "ПŅŅ‚ĐŊиŅ†Đ°"} + ], + "ĐĄĐą.": [ + {ORTH: "ĐĄĐą.", LEMMA: "ĐĄŅƒĐąĐąĐžŅ‚Đ°"} + ], + "ВŅ.": [ + {ORTH: "ВŅ.", LEMMA: "ВоŅĐēŅ€ĐĩŅĐĩĐŊŅŒĐĩ"} + ], +} \ No newline at end of file From f81dd284eb2e8c09c55a4fc37abb3e00e278f0a8 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:28:34 +0700 Subject: [PATCH 153/195] updated spacy/__init__.py registered russian language via set_lang_class --- spacy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index f0d5ea0fc..1e5faf504 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,13 +7,13 @@ from .cli.info import info from .glossary import explain from .about import __version__ -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th +from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th, ru _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese, - th.Thai) + th.Thai, ru.Russian) for _lang in _languages: From 622b6d627078f5a5bc14ebb2840a64ec3db5d118 Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 13:57:29 +0700 Subject: [PATCH 154/195] updated Russian tokenizer moved the trying to import pymorph into __init__ --- spacy/ru/__init__.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py index d8f38e199..12b480a8a 100644 --- a/spacy/ru/__init__.py +++ b/spacy/ru/__init__.py @@ -8,17 +8,19 @@ from .language_data import * class RussianTokenizer(object): - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Russian tokenizer requires the pymorphy2 library: " - "try to fix it with " - "pip install pymorphy2==0.8") - - _morph = MorphAnalyzer() + _morph = None def __init__(self, spacy_tokenizer, cls, nlp=None): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer) + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) self._spacy_tokenizer = spacy_tokenizer @@ -36,6 +38,12 @@ class RussianTokenizer(object): def _normalize(cls, word): return cls._morph.parse(word)[0].normal_form + @classmethod + def _create_morph(cls, morph_analyzer_class): + if not cls._morph: + cls._morph = morph_analyzer_class() + return cls._morph + class RussianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) From a229b6e0ded3b1255fd77e00c197fa35c9030e5b Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 14:04:37 +0700 Subject: [PATCH 155/195] added tests for Russian language added tests of creating Russian Language instance and Russian tokenizer --- spacy/tests/conftest.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 90b947702..718a8265c 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -16,7 +16,7 @@ from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian from ..th import Thai - +from ..ru import Russian from ..tokens import Doc from ..strings import StringStore @@ -30,7 +30,7 @@ import pytest # These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali, Norwegian] + Swedish, Hungarian, Finnish, Bengali, Norwegian, Russian] @pytest.fixture(params=LANGUAGES) @@ -53,6 +53,7 @@ def en_vocab(): def en_parser(): return English.Defaults.create_parser() + @pytest.fixture def es_tokenizer(): return Spanish.Defaults.create_tokenizer() @@ -83,11 +84,13 @@ def ja_tokenizer(): pytest.importorskip("MeCab") return Japanese.Defaults.create_tokenizer() + @pytest.fixture def japanese(): pytest.importorskip("MeCab") return Japanese() + @pytest.fixture def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() @@ -102,15 +105,30 @@ def bn_tokenizer(): def he_tokenizer(): return Hebrew.Defaults.create_tokenizer() + @pytest.fixture def nb_tokenizer(): 
return Norwegian.Defaults.create_tokenizer() + @pytest.fixture def th_tokenizer(): pythainlp = pytest.importorskip("pythainlp") return Thai.Defaults.create_tokenizer() + +@pytest.fixture +def ru_tokenizer(): + pytest.importorskip("pymorphy2") + return Russian.Defaults.create_tokenizer() + + +@pytest.fixture +def russian(): + pytest.importorskip("pymorphy2") + return Russian() + + @pytest.fixture def stringstore(): return StringStore() @@ -118,7 +136,7 @@ def stringstore(): @pytest.fixture def en_entityrecognizer(): - return English.Defaults.create_entity() + return English.Defaults.create_entity() @pytest.fixture @@ -130,6 +148,7 @@ def lemmatizer(): def text_file(): return StringIO() + @pytest.fixture def text_file_b(): return BytesIO() @@ -149,11 +168,11 @@ def DE(): def pytest_addoption(parser): parser.addoption("--models", action="store_true", - help="include tests that require full models") + help="include tests that require full models") parser.addoption("--vectors", action="store_true", - help="include word vectors tests") + help="include word vectors tests") parser.addoption("--slow", action="store_true", - help="include slow tests") + help="include slow tests") def pytest_runtest_setup(item): From 6fb9d75bd2a9ed049300b4237bec23d7a09e6845 Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 15:51:03 +0700 Subject: [PATCH 156/195] fixed test with creating tokenizer --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 718a8265c..de0facf49 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -30,7 +30,7 @@ import pytest # These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali, Norwegian, Russian] + Swedish, Hungarian, Finnish, Bengali, Norwegian] @pytest.fixture(params=LANGUAGES) From ce00405afc176bd02363a7d703c3e61ef52fb851 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2017 21:00:15 +0700 Subject: [PATCH 157/195] Create yuukos.md --- .github/contributors/yuukos.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/yuukos.md diff --git a/.github/contributors/yuukos.md b/.github/contributors/yuukos.md new file mode 100644 index 000000000..aecafeecb --- /dev/null +++ b/.github/contributors/yuukos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. 
+ +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alexey Kim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13-12-2017 | +| GitHub username | yuukos | +| Website (optional) | | From 95836abee1c311bb95d291d0357f29b9f4e98e1c Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2017 21:02:19 +0700 Subject: [PATCH 158/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9e210bd4c..edd1ed30d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,7 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! * Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexey Kim, [@yuukos](https://github.com/yuukos) * Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) From a31d33be06b3a2c933bb1b0d4859778616065cb8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sat, 14 Oct 2017 19:28:04 +0900 Subject: [PATCH 159/195] Contributor agreement --- .github/contributors/polm.md | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/polm.md diff --git a/.github/contributors/polm.md b/.github/contributors/polm.md new file mode 100644 index 000000000..a2aa0cb65 --- /dev/null +++ b/.github/contributors/polm.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Paul McCann | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-14 | +| GitHub username | polm | +| Website (optional) | http://dampfkraft.com| From 43eedf73f2aaf506e158115dfb328fb60bd91943 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 15 Oct 2017 23:33:25 +0900 Subject: [PATCH 160/195] [ja] Stash tokenizer output for speed Before this commit, the Mecab tokenizer had to be called twice when creating a Doc- once during tokenization and once during tagging. This creates a JapaneseDoc wrapper class for Doc that stashes the parsed tokenizer output to remove redundant processing. -POLM --- spacy/ja/__init__.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 2f85406c0..b2ec281f7 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -16,6 +16,13 @@ from collections import namedtuple ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) +class JapaneseDoc(Doc): + def __init__(self, detailed_tokens, vocab, words=None, spaces=None, orths_and_spaces=None): + super(JapaneseDoc, self).__init__(vocab, words, spaces, orths_and_spaces) + # This saves tokenizer output so mecab doesn't have to be called again + # when determining POS tags. + self.detailed_tokens = detailed_tokens + def try_mecab_import(): """Mecab is required for Japanese support, so check for it. @@ -34,8 +41,9 @@ class JapaneseTokenizer(object): self.tokenizer = MeCab.Tagger() def __call__(self, text): - words = [x.surface for x in detailed_tokens(self.tokenizer, text)] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) + dtokens = detailed_tokens(self.tokenizer, text) + words = [x.surface for x in dtokens] + return JapaneseDoc(dtokens, self.vocab, words=words, spaces=[False]*len(words)) def resolve_pos(token): """If necessary, add a field to the POS tag for UD mapping. @@ -91,7 +99,7 @@ class JapaneseTagger(object): # 1. get raw JP tags # 2. add features to tags as necessary for UD - dtokens = detailed_tokens(self.tokenizer, tokens.text) + dtokens = tokens.detailed_tokens rawtags = list(map(resolve_pos, dtokens)) self.tagger.tag_from_strings(tokens, rawtags) @@ -112,8 +120,7 @@ class Japanese(Language): Defaults = JapaneseDefaults def make_doc(self, text): - words = [str(t) for t in self.tokenizer(text)] - doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + jdoc = self.tokenizer(text) tagger = JapaneseDefaults.create_tagger(self.tokenizer) - tagger(doc) - return doc + tagger(jdoc) + return jdoc From 71ae8013ec5e981c9b44699afd82162c6f6c625b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 16 Oct 2017 00:24:34 +0900 Subject: [PATCH 161/195] [ja] Use user_details instead of a wrapper class Instead of using a JapaneseDoc wrapper class to store Mecab output, stash it in `user_data`. 
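A minimal sketch of the hand-off this enables (illustration only, not code from this patch): the tokenizer-side step parks its detailed analyses on the Doc, and the tagger-side step later reads them back instead of re-running MeCab. The key name mirrors DETAILS_KEY in spacy/ja/__init__.py; the empty Vocab and the sample words are assumptions made only to keep the snippet runnable, and MeCab itself is not required to run it.

    # coding: utf8
    from __future__ import unicode_literals
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    DETAILS_KEY = 'mecab_details'  # the key both sides agree on

    words = ['日本語', 'です']
    doc = Doc(Vocab(), words=words, spaces=[False] * len(words))

    # Tokenizer side: stash the per-token analyses on the Doc it returns.
    doc.user_data[DETAILS_KEY] = [{'surface': w} for w in words]

    # Tagger side: read them back later without calling the tokenizer again.
    print([t['surface'] for t in doc.user_data[DETAILS_KEY]])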
-POLM --- spacy/ja/__init__.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index b2ec281f7..26e39a593 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -16,12 +16,7 @@ from collections import namedtuple ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) -class JapaneseDoc(Doc): - def __init__(self, detailed_tokens, vocab, words=None, spaces=None, orths_and_spaces=None): - super(JapaneseDoc, self).__init__(vocab, words, spaces, orths_and_spaces) - # This saves tokenizer output so mecab doesn't have to be called again - # when determining POS tags. - self.detailed_tokens = detailed_tokens +DETAILS_KEY = 'mecab_details' def try_mecab_import(): """Mecab is required for Japanese support, so check for it. @@ -43,7 +38,10 @@ class JapaneseTokenizer(object): def __call__(self, text): dtokens = detailed_tokens(self.tokenizer, text) words = [x.surface for x in dtokens] - return JapaneseDoc(dtokens, self.vocab, words=words, spaces=[False]*len(words)) + doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + # stash details tokens for tagger to use + doc.user_data[DETAILS_KEY] = dtokens + return doc def resolve_pos(token): """If necessary, add a field to the POS tag for UD mapping. @@ -99,7 +97,7 @@ class JapaneseTagger(object): # 1. get raw JP tags # 2. add features to tags as necessary for UD - dtokens = tokens.detailed_tokens + dtokens = tokens.user_data[DETAILS_KEY] rawtags = list(map(resolve_pos, dtokens)) self.tagger.tag_from_strings(tokens, rawtags) From 241d19a3e6f78918bc8296d574a1e65e4ce9381f Mon Sep 17 00:00:00 2001 From: yuukos Date: Mon, 16 Oct 2017 13:37:05 +0700 Subject: [PATCH 162/195] fixed Russian Tokenizer - added trailing space flags for tokens --- spacy/ru/__init__.py | 20 +++++++++++++++++--- spacy/ru/language_data.py | 2 +- spacy/ru/stop_words.py | 2 +- spacy/ru/tokenizer_exceptions.py | 3 ++- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py index 12b480a8a..8789cd6e5 100644 --- a/spacy/ru/__init__.py +++ b/spacy/ru/__init__.py @@ -25,15 +25,29 @@ class RussianTokenizer(object): self._spacy_tokenizer = spacy_tokenizer def __call__(self, text): - words = [self._normalize(RussianTokenizer._get_word(token)) - for token in self._spacy_tokenizer(text)] + get_norm = RussianTokenizer._get_norm + has_space = RussianTokenizer._has_space - return Doc(self.vocab, words, [False] * len(words)) + words_with_space_flags = [(get_norm(token), has_space(token, text)) + for token in self._spacy_tokenizer(text)] + + words, spaces = map(lambda s: list(s), zip(*words_with_space_flags)) + + return Doc(self.vocab, words, spaces) @staticmethod def _get_word(token): return token.lemma_ if len(token.lemma_) > 0 else token.text + @staticmethod + def _has_space(token, text): + pos_after_token = token.idx + len(token.text) + return pos_after_token < len(text) and text[pos_after_token] == ' ' + + @classmethod + def _get_norm(cls, token): + return cls._normalize(cls._get_word(token)) + @classmethod def _normalize(cls, word): return cls._morph.parse(word)[0].normal_form diff --git a/spacy/ru/language_data.py b/spacy/ru/language_data.py index 75ca41b65..d33d388fd 100644 --- a/spacy/ru/language_data.py +++ b/spacy/ru/language_data.py @@ -15,4 +15,4 @@ TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) -__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] \ No 
newline at end of file +__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] diff --git a/spacy/ru/stop_words.py b/spacy/ru/stop_words.py index ddb28af86..2d89b7726 100644 --- a/spacy/ru/stop_words.py +++ b/spacy/ru/stop_words.py @@ -51,4 +51,4 @@ STOP_WORDS = set(""" ŅŅ‚Đ° ŅŅ‚и ŅŅ‚иĐŧ ŅŅ‚иĐŧи ŅŅ‚иŅ… ŅŅ‚Đž ŅŅ‚ĐžĐŗĐž ŅŅ‚ОК ŅŅ‚ĐžĐŧ ŅŅ‚ĐžĐŧŅƒ ŅŅ‚ĐžŅ‚ ŅŅ‚ĐžŅŽ ŅŅ‚Ņƒ Ņ -""".split()) \ No newline at end of file +""".split()) diff --git a/spacy/ru/tokenizer_exceptions.py b/spacy/ru/tokenizer_exceptions.py index 8df57a402..f444f3df6 100644 --- a/spacy/ru/tokenizer_exceptions.py +++ b/spacy/ru/tokenizer_exceptions.py @@ -26,4 +26,5 @@ TOKENIZER_EXCEPTIONS = { "ВŅ.": [ {ORTH: "ВŅ.", LEMMA: "ВоŅĐēŅ€ĐĩŅĐĩĐŊŅŒĐĩ"} ], -} \ No newline at end of file +} + From b47b4e2654f69498b68c06a0b6464db4e924d268 Mon Sep 17 00:00:00 2001 From: Ramanan Balakrishnan Date: Wed, 18 Oct 2017 14:43:47 +0530 Subject: [PATCH 163/195] Support single value for attribute list in doc.to_scalar conversion --- spacy/tokens/doc.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aca35a73f..9a644b86d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -496,13 +496,19 @@ cdef class Doc: cdef int i, j cdef attr_id_t feature cdef np.ndarray[attr_t, ndim=2] output + cdef np.ndarray[attr_t, ndim=1] output_1D # Make an array from the attributes --- otherwise our inner loop is Python # dict iteration. + if( type(py_attr_ids) is not list ): + py_attr_ids = [ py_attr_ids ] cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.c[i], feature) + if( len(attr_ids) == 1 ): + output_1D = output.reshape((self.length)) + return output_1D return output def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): From 8bd9b05fdc212e55b7714bb20594d8bb51657ba9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:13:36 +0200 Subject: [PATCH 164/195] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8a9ab517b..7cc47296c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,7 +70,7 @@ The [spaCy developer resources](https://github.com/explosion/spacy-dev-resources ### Contributor agreement -If you've made a substantial contribution to spaCy, you should fill in the [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that your contribution can be used across the project. If you agree to be bound by the terms of the agreement, fill in the [template]((.github/CONTRIBUTOR_AGREEMENT.md)) and include it with your pull request, or sumit it separately to [`.github/contributors/`](/.github/contributors). The name of the file should be your GitHub username, with the extension `.md`. For example, the user +If you've made a substantial contribution to spaCy, you should fill in the [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that your contribution can be used across the project. If you agree to be bound by the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md) and include it with your pull request, or sumit it separately to [`.github/contributors/`](/.github/contributors). The name of the file should be your GitHub username, with the extension `.md`. 
For example, the user example_user would create the file `.github/contributors/example_user.md`. From 5a4b5b362c27f1948187915e2349c35db8a5d64c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:29:10 +0200 Subject: [PATCH 165/195] Create shuvanon.md --- .github/contributors/shuvanon.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/shuvanon.md diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md new file mode 100644 index 000000000..c915d48bf --- /dev/null +++ b/.github/contributors/shuvanon.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Shuvanon Razik | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 3/12/2017 | +| GitHub username | shuvanon | +| Website (optional) | | From e787045cf55db5b68d878a291793b6e3786d6633 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 18 Oct 2017 14:31:57 +0200 Subject: [PATCH 166/195] Revert "filled up CONTRIBUTOR_AGREEMENT.md" This reverts commit 8a2d22222dec5cf910df5a378cbcd9ea2ab53ec4. --- .github/CONTRIBUTOR_AGREEMENT.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index c915d48bf..668b9dba2 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [x] I am signing on behalf of myself as an individual and no other person + * [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect my contributions. 
@@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Shuvanon Razik | +| Name | | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 3/12/2017 | -| GitHub username | shuvanon | +| Date | | +| GitHub username | | | Website (optional) | | From 9162ecb43ff2883f271da2a7d5cab17615288ac3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:36:19 +0200 Subject: [PATCH 167/195] Update CONTRIBUTOR_AGREEMENT.md --- .github/CONTRIBUTOR_AGREEMENT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index 668b9dba2..f34603065 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -88,7 +88,7 @@ U.S. Federal law. Any choice of law rules will not apply. mark both statements: * [ ] I am signing on behalf of myself as an individual and no other person - or entity, including my employer, has or will have rights with respect my + or entity, including my employer, has or will have rights with respect to my contributions. * [ ] I am signing on behalf of my employer or a legal entity and I have the From 0b239ee6461a77f41d50aab64040b4f97f5949a5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:37:08 +0200 Subject: [PATCH 168/195] Create ines.md --- .github/contributors/ines.md | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/ines.md diff --git a/.github/contributors/ines.md b/.github/contributors/ines.md new file mode 100644 index 000000000..5cd57b07e --- /dev/null +++ b/.github/contributors/ines.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ines Montani | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017/10/18 | +| GitHub username | ines | +| Website (optional) | https://explosion.ai | From 3357588b9fb6156cfcd48e3b9e556e413b5b9e27 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:41:31 +0200 Subject: [PATCH 169/195] Create honnibal.md --- .github/contributors/honnibal.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/honnibal.md diff --git a/.github/contributors/honnibal.md b/.github/contributors/honnibal.md new file mode 100644 index 000000000..3a700b7dd --- /dev/null +++ b/.github/contributors/honnibal.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthew Honnibal | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017-10-18 | +| GitHub username | honnibal | +| Website (optional) | https://explosion.ai | From e7b78370d99a59a80119ae1641b97ebbbb60088b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:41:38 +0200 Subject: [PATCH 170/195] Add note on origin of manually moved agreement See 8a2d22222dec5cf910df5a378cbcd9ea2ab53ec4 --- .github/contributors/shuvanon.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md index c915d48bf..82d02d8d2 100644 --- a/.github/contributors/shuvanon.md +++ b/.github/contributors/shuvanon.md @@ -1,3 +1,5 @@ + + # spaCy contributor agreement This spaCy Contributor Agreement (**"SCA"**) is based on the From f39fc34c95746d0f2ec8ad8105e76bdecc8aed33 Mon Sep 17 00:00:00 2001 From: demfier Date: Wed, 18 Oct 2017 22:32:58 +0530 Subject: [PATCH 171/195] Add minor update in README --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 0f3efc146..0fd807388 100644 --- a/README.rst +++ b/README.rst @@ -16,7 +16,7 @@ MIT license. .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy :alt: Travis Build Status - + .. image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square :target: https://ci.appveyor.com/project/explosion/spacy :alt: Appveyor Build Status @@ -100,7 +100,7 @@ Top Performance * Fastest in the world: <50ms per document. No faster system has ever been announced. -* Accuracy within 1% of the current state of the art on all tasks performed +* Accuracy within 1% of the current state-of-the-art on all tasks performed (parsing, named entity recognition, part-of-speech tagging). The only more accurate systems are an order of magnitude slower or more. @@ -254,7 +254,7 @@ details. pip install -r requirements.txt pip install -e . -Compared to regular install via pip `requirements.txt `_ +Compared to a regular install via pip, `requirements.txt `_ additionally installs developer dependencies such as Cython. Instead of the above verbose commands, you can also use the following From 772c8035f779ad2043c6ef3a3c2db8bebbebd9cd Mon Sep 17 00:00:00 2001 From: demfier Date: Wed, 18 Oct 2017 23:12:24 +0530 Subject: [PATCH 172/195] Sign SCA --- .github/contributors/demfier.md | 106 ++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/demfier.md diff --git a/.github/contributors/demfier.md b/.github/contributors/demfier.md new file mode 100644 index 000000000..1a730fc78 --- /dev/null +++ b/.github/contributors/demfier.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Gaurav Sahu | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-18 | +| GitHub username | demfier | +| Website (optional) | | From 44c61fde25af968b69b1f171e5681caceb16baed Mon Sep 17 00:00:00 2001 From: John Haley Date: Thu, 19 Oct 2017 08:56:28 -0700 Subject: [PATCH 173/195] Fix Keras install in keras_parikeh_entailment The master branch of Keras doesn't work with this example anymore so this pins Keras to version 1.2.2 for this example. --- examples/keras_parikh_entailment/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/keras_parikh_entailment/README.md b/examples/keras_parikh_entailment/README.md index adc80ce89..25324c095 100644 --- a/examples/keras_parikh_entailment/README.md +++ b/examples/keras_parikh_entailment/README.md @@ -47,13 +47,13 @@ First, install [Keras](https://keras.io/), [spaCy](https://spacy.io) and the spa English models (about 1GB of data): ```bash -pip install https://github.com/fchollet/keras/archive/master.zip +pip install https://github.com/fchollet/keras/archive/1.2.2.zip pip install spacy python -m spacy.en.download ``` ⚠ī¸ **Important:** In order for the example to run, you'll need to install Keras from -the master branch (and not via `pip install keras`). For more info on this, see +the 1.2.2 release (and not via `pip install keras`). For more info on this, see [#727](https://github.com/explosion/spaCy/issues/727). You'll also want to get Keras working on your GPU. This will depend on your From 989814c4b6690c29e1d6a0f1ffa79a8579960b9c Mon Sep 17 00:00:00 2001 From: John Haley Date: Thu, 19 Oct 2017 09:11:16 -0700 Subject: [PATCH 174/195] Create johnhaley81.md --- .github/contributors/johnhaley81.md | 106 ++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/johnhaley81.md diff --git a/.github/contributors/johnhaley81.md b/.github/contributors/johnhaley81.md new file mode 100644 index 000000000..277b3126c --- /dev/null +++ b/.github/contributors/johnhaley81.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | John Haley | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 19/10/2017 | +| GitHub username | johnhaley81 | +| Website (optional) | | From 5941aa96a12771ec3ca500c4df68b7cea0c25af1 Mon Sep 17 00:00:00 2001 From: Ramanan Balakrishnan Date: Wed, 18 Oct 2017 15:52:17 +0530 Subject: [PATCH 175/195] Support strings for attribute list in doc.to_array --- .github/contributors/ramananbalakrishnan.md | 106 ++++++++++++++++++++ spacy/tests/doc/test_array.py | 20 ++++ spacy/tokens/doc.pyx | 29 ++++-- 3 files changed, 146 insertions(+), 9 deletions(-) create mode 100644 .github/contributors/ramananbalakrishnan.md diff --git a/.github/contributors/ramananbalakrishnan.md b/.github/contributors/ramananbalakrishnan.md new file mode 100644 index 000000000..37492fb3d --- /dev/null +++ b/.github/contributors/ramananbalakrishnan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ramanan Balakrishnan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-18 | +| GitHub username | ramananbalakrishnan | +| Website (optional) | | diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index dd87aa763..ff10394d1 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab): assert feats_array[0][0] != feats_array[0][1] +def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab): + text = "An example sentence" + tokens = en_tokenizer(text) + example = tokens.vocab["example"] + assert example.orth != example.shape + feats_array = tokens.to_array((ORTH, SHAPE)) + feats_array_stringy = tokens.to_array(("ORTH", "SHAPE")) + assert feats_array_stringy[0][0] == feats_array[0][0] + assert feats_array_stringy[0][1] == feats_array[0][1] + + +def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab): + text = "An example sentence" + tokens = en_tokenizer(text) + example = tokens.vocab["example"] + assert example.orth != example.shape + feats_array = tokens.to_array(ORTH) + assert feats_array.shape == (3,) + + def test_doc_array_tag(en_tokenizer): text = "A nice sentence." pos = ['DET', 'ADJ', 'NOUN', 'PUNCT'] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 9a644b86d..4f3b06946 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -16,6 +16,7 @@ from .token cimport Token from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t +from ..attrs import IDS from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE @@ -474,10 +475,13 @@ cdef class Doc: @cython.boundscheck(False) cpdef np.ndarray to_array(self, object py_attr_ids): - """ - Given a list of M attribute IDs, export the tokens to a numpy - `ndarray` of shape (N, M), where `N` is the length - of the document. The values will be 32-bit integers. + """Export given token attributes to a numpy `ndarray`. + + If `attr_ids` is a sequence of M attributes, the output array will + be of shape `(N, M)`, where N is the length of the `Doc` + (in tokens). If `attr_ids` is a single attribute, the output shape will + be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) + or string name (e.g. 'LEMMA' or 'lemma'). Example: from spacy import attrs @@ -486,22 +490,29 @@ cdef class Doc: np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) Arguments: - attr_ids (list[int]): A list of attribute ID ints. + attr_ids (list[]): A list of attributes (int IDs or string names). Returns: feat_array (numpy.ndarray[long, ndim=2]): A feature matrix, with one row per word, and one column per attribute - indicated in the input attr_ids. + indicated in the input `attr_ids`. 
""" cdef int i, j cdef attr_id_t feature cdef np.ndarray[attr_t, ndim=2] output cdef np.ndarray[attr_t, ndim=1] output_1D + # Handle scalar/list inputs of strings/ints for py_attr_ids + if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ): + py_attr_ids = [ py_attr_ids ] + py_attr_ids_input = [] + for py_attr_id in py_attr_ids: + if( type(py_attr_id) is int ): + py_attr_ids_input.append(py_attr_id) + else: + py_attr_ids_input.append(IDS[py_attr_id.upper()]) # Make an array from the attributes --- otherwise our inner loop is Python # dict iteration. - if( type(py_attr_ids) is not list ): - py_attr_ids = [ py_attr_ids ] - cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) + cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.int32) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): From fbccc8c87d5456bb6b84730cbdd69abcccc64142 Mon Sep 17 00:00:00 2001 From: Ramanan Balakrishnan Date: Fri, 20 Oct 2017 14:23:48 +0530 Subject: [PATCH 176/195] Update documentation on doc.to_array --- website/docs/api/doc.jade | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 1c2911f52..59752b2a0 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -176,9 +176,14 @@ p +tag method p - | Export the document annotations to a numpy array of shape #[code N*M] - | where #[code N] is the length of the document and #[code M] is the number - | of attribute IDs to export. The values will be 32-bit integers. + | Export given token attributes to a numpy #[code ndarray]. + | If #[code attr_ids] is a sequence of #[code M] attributes, + | the output array will be of shape #[code (N, M)], where #[code N] + | is the length of the #[code Doc] (in tokens). If #[code attr_ids] is + | a single attribute, the output shape will be #[code (N,)]. You can + | specify attributes by integer ID (e.g. #[code spacy.attrs.LEMMA]) + | or string name (e.g. 'LEMMA' or 'lemma'). The values will be 32-bit + | integers. +aside-code("Example"). from spacy import attrs @@ -186,19 +191,26 @@ p # All strings mapped to integers, for easy export to numpy np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) + np_array = doc.to_array("POS") +table(["Name", "Type", "Description"]) +row +cell #[code attr_ids] - +cell ints - +cell A list of attribute ID ints. + +cell int or string + +cell + | A list of attributes (int IDs or string names) or + | a single attribute (int ID or string name) +footrow +cell return - +cell #[code numpy.ndarray[ndim=2, dtype='int32']] + +cell + | #[code numpy.ndarray[ndim=2, dtype='int32']] or + | #[code numpy.ndarray[ndim=1, dtype='int32']] +cell | The exported attributes as a 2D numpy array, with one row per - | token and one column per attribute. + | token and one column per attribute (when #[code attr_ids] is a + | list), or as a 1D numpy array, with one item per attribute (when + | #[code attr_ids] is a single value). +h(2, "count_by") Doc.count_by +tag method From c0799430a7d126dcc0105898fca29d3e5ceff50a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 11:17:00 +0200 Subject: [PATCH 177/195] Make small changes to Doc.to_array * Change type-check logic to 'hasattr' (Python type-checking is brittle) * Small 'house style' edits, mostly making code more terse. 
--- spacy/tokens/doc.pyx | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4f3b06946..66936c4a5 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -477,11 +477,11 @@ cdef class Doc: cpdef np.ndarray to_array(self, object py_attr_ids): """Export given token attributes to a numpy `ndarray`. - If `attr_ids` is a sequence of M attributes, the output array will - be of shape `(N, M)`, where N is the length of the `Doc` - (in tokens). If `attr_ids` is a single attribute, the output shape will - be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) - or string name (e.g. 'LEMMA' or 'lemma'). + If `attr_ids` is a sequence of M attributes, the output array will + be of shape `(N, M)`, where N is the length of the `Doc` + (in tokens). If `attr_ids` is a single attribute, the output shape will + be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) + or string name (e.g. 'LEMMA' or 'lemma'). Example: from spacy import attrs @@ -499,28 +499,25 @@ cdef class Doc: """ cdef int i, j cdef attr_id_t feature + cdef np.ndarray[attr_t, ndim=1] attr_ids, output_1D cdef np.ndarray[attr_t, ndim=2] output - cdef np.ndarray[attr_t, ndim=1] output_1D # Handle scalar/list inputs of strings/ints for py_attr_ids - if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ): - py_attr_ids = [ py_attr_ids ] - py_attr_ids_input = [] - for py_attr_id in py_attr_ids: - if( type(py_attr_id) is int ): - py_attr_ids_input.append(py_attr_id) - else: - py_attr_ids_input.append(IDS[py_attr_id.upper()]) - # Make an array from the attributes --- otherwise our inner loop is Python + if not hasattr(py_attr_ids, '__iter__'): + py_attr_ids = [py_attr_ids] + + # Allow strings, e.g. 'lemma' or 'LEMMA' + convert_id = lambda id_: IDS[id_.upper()] if hasattr(id_, 'upper') else id_ + # Make an array from the attributes --- otherwise inner loop would be Python # dict iteration. - cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.int32) + attr_ids = numpy.asarray((convert_id(id_) for id_ in py_attr_ids), + dtype=numpy.int32) + output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.c[i], feature) - if( len(attr_ids) == 1 ): - output_1D = output.reshape((self.length)) - return output_1D - return output + # Handle 1d case + return output if len(attr_ids) >= 2 else output.reshape((self.length,)) def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): """ From 658536b5ce0c90a4baa7d1c25e7d3fad363f4d4d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 11:35:10 +0200 Subject: [PATCH 178/195] Fix to_array compile error --- spacy/tokens/doc.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 66936c4a5..ce2a82cd0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -499,10 +499,10 @@ cdef class Doc: """ cdef int i, j cdef attr_id_t feature - cdef np.ndarray[attr_t, ndim=1] attr_ids, output_1D + cdef np.ndarray[attr_t, ndim=1] attr_ids cdef np.ndarray[attr_t, ndim=2] output # Handle scalar/list inputs of strings/ints for py_attr_ids - if not hasattr(py_attr_ids, '__iter__'): + if not hasattr(py_attr_ids, '__iter__'): py_attr_ids = [py_attr_ids] # Allow strings, e.g. 
'lemma' or 'LEMMA' From 7a46792376773f0f7ed55f9a1e71d0512c5eed2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 11:53:47 +0200 Subject: [PATCH 179/195] Fix compile error Closures not allowed in cpdef --- spacy/tokens/doc.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ce2a82cd0..3b2ef80fa 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -506,12 +506,11 @@ cdef class Doc: py_attr_ids = [py_attr_ids] # Allow strings, e.g. 'lemma' or 'LEMMA' - convert_id = lambda id_: IDS[id_.upper()] if hasattr(id_, 'upper') else id_ + py_attr_ids = [(IDS[id_.toupper()] if hasattr(id_, 'upper') else id_) + for id_ in py_attr_ids] # Make an array from the attributes --- otherwise inner loop would be Python # dict iteration. - attr_ids = numpy.asarray((convert_id(id_) for id_ in py_attr_ids), - dtype=numpy.int32) - + attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): From dbc276e3b2ca64a6f72d612629d773a9f44e13da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 13:02:13 +0200 Subject: [PATCH 180/195] Fix 'toupper()' -> 'upper()' --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3b2ef80fa..1bc8745c4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -506,7 +506,7 @@ cdef class Doc: py_attr_ids = [py_attr_ids] # Allow strings, e.g. 'lemma' or 'LEMMA' - py_attr_ids = [(IDS[id_.toupper()] if hasattr(id_, 'upper') else id_) + py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) for id_ in py_attr_ids] # Make an array from the attributes --- otherwise inner loop would be Python # dict iteration. 
From 80edc905f7490fbaabce47743879d6478e0dfcf1 Mon Sep 17 00:00:00 2001 From: mayukh18 Date: Sun, 22 Oct 2017 13:16:39 +0530 Subject: [PATCH 181/195] added a few bengali pronouns --- spacy/bn/morph_rules.py | 15 ++++++++++++++- spacy/bn/stop_words.py | 4 ++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/bn/morph_rules.py b/spacy/bn/morph_rules.py index efa5a6185..b63379325 100644 --- a/spacy/bn/morph_rules.py +++ b/spacy/bn/morph_rules.py @@ -11,11 +11,11 @@ MORPH_RULES = { 'āĻ•āĻŋ': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¸ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'}, 'āĻ•āĻŋāĻ¸ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, - 'āĻ•āĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¤āĻžāĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'}, 'āĻ¸ā§āĻŦā§ŸāĻ‚': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'āĻ•ā§‹āĻ¨āĻ—ā§āĻ˛ā§‹': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¤ā§āĻŽāĻŋ': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, + 'āĻ¤ā§āĻ‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, 'āĻ¤āĻžāĻĻā§‡āĻ°āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'}, 'āĻ†āĻŽāĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One ', 'PronType': 'Prs', 'Case': 'Nom'}, 'āĻ¯āĻŋāĻ¨āĻŋ': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'}, @@ -23,12 +23,15 @@ MORPH_RULES = { 'āĻ•ā§‹āĻ¨': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ•āĻžāĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¤ā§‹āĻŽāĻžāĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, + 'āĻ¤ā§‹āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, 'āĻ–ā§‹āĻĻ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¯āĻžāĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'}, 'āĻ¯ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'}, 'āĻ¤ā§‹āĻŽāĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, + 'āĻ¤ā§‹āĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, 'āĻ¤ā§‹āĻŽāĻžāĻĻā§‡āĻ°āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, + 'āĻ¤ā§‹āĻĻā§‡āĻ°āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, 'āĻ†āĻĒāĻ¨': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'āĻ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'}, 'āĻ¨āĻŋāĻœ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, @@ -41,6 +44,10 @@ MORPH_RULES = { 'āĻ†āĻŽāĻžāĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'āĻŽā§‹āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, + 'āĻŽā§‹āĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, 'āĻ¤āĻžāĻ°': {LEMMA: 
PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, 'āĻ¤ā§‹āĻŽāĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', @@ -49,7 +56,13 @@ MORPH_RULES = { 'Case': 'Nom'}, 'āĻ¤ā§‹āĻŽāĻžāĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'āĻ¤ā§‹āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, 'āĻ¤āĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'āĻ•āĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, + 'āĻ¤ā§‹āĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, + 'āĻ¯āĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, }, } diff --git a/spacy/bn/stop_words.py b/spacy/bn/stop_words.py index 5b513da7b..ca0ae934a 100644 --- a/spacy/bn/stop_words.py +++ b/spacy/bn/stop_words.py @@ -22,7 +22,7 @@ STOP_WORDS = set(""" āĻŸāĻŋ āĻ āĻŋāĻ• āĻ¤āĻ–āĻ¨ āĻ¤āĻ¤ āĻ¤āĻĨāĻž āĻ¤āĻŦā§ āĻ¤āĻŦā§‡ āĻ¤āĻž āĻ¤āĻžāĻāĻ•ā§‡ āĻ¤āĻžāĻāĻĻā§‡āĻ° āĻ¤āĻžāĻāĻ° āĻ¤āĻžāĻāĻ°āĻž āĻ¤āĻžāĻāĻšāĻžāĻ°āĻž āĻ¤āĻžāĻ‡ āĻ¤āĻžāĻ“ āĻ¤āĻžāĻ•ā§‡ āĻ¤āĻžāĻ¤ā§‡ āĻ¤āĻžāĻĻā§‡āĻ° āĻ¤āĻžāĻ° āĻ¤āĻžāĻ°āĻĒāĻ° āĻ¤āĻžāĻ°āĻž āĻ¤āĻžāĻ°āĻ‡ āĻ¤āĻžāĻšāĻ˛ā§‡ āĻ¤āĻžāĻšāĻž āĻ¤āĻžāĻšāĻžāĻ¤ā§‡ āĻ¤āĻžāĻšāĻžāĻ° āĻ¤āĻŋāĻ¨āĻ‡ -āĻ¤āĻŋāĻ¨āĻŋ āĻ¤āĻŋāĻ¨āĻŋāĻ“ āĻ¤ā§āĻŽāĻŋ āĻ¤ā§āĻ˛ā§‡ āĻ¤ā§‡āĻŽāĻ¨ āĻ¤ā§‹ āĻ¤ā§‹āĻŽāĻžāĻ° +āĻ¤āĻŋāĻ¨āĻŋ āĻ¤āĻŋāĻ¨āĻŋāĻ“ āĻ¤ā§āĻŽāĻŋ āĻ¤ā§āĻ˛ā§‡ āĻ¤ā§‡āĻŽāĻ¨ āĻ¤ā§‹ āĻ¤ā§‹āĻŽāĻžāĻ° āĻ¤ā§āĻ‡ āĻ¤ā§‹āĻ°āĻž āĻ¤ā§‹āĻ° āĻ¤ā§‹āĻŽāĻžāĻĻā§‡āĻ° āĻ¤ā§‹āĻĻā§‡āĻ° āĻĨāĻžāĻ•āĻŦā§‡ āĻĨāĻžāĻ•āĻŦā§‡āĻ¨ āĻĨāĻžāĻ•āĻž āĻĨāĻžāĻ•āĻžāĻ¯āĻŧ āĻĨāĻžāĻ•ā§‡ āĻĨāĻžāĻ•ā§‡āĻ¨ āĻĨā§‡āĻ•ā§‡ āĻĨā§‡āĻ•ā§‡āĻ‡ āĻĨā§‡āĻ•ā§‡āĻ“ āĻĨāĻžāĻ•āĻžā§Ÿ āĻĻāĻŋāĻ•ā§‡ āĻĻāĻŋāĻ¤ā§‡ āĻĻāĻŋāĻ¯āĻŧā§‡ āĻĻāĻŋāĻ¯āĻŧā§‡āĻ›ā§‡ āĻĻāĻŋāĻ¯āĻŧā§‡āĻ›ā§‡āĻ¨ āĻĻāĻŋāĻ˛ā§‡āĻ¨ āĻĻāĻŋā§Ÿā§‡ āĻĻā§ āĻĻā§āĻŸāĻŋ āĻĻā§āĻŸā§‹ āĻĻā§‡āĻ“āĻ¯āĻŧāĻž āĻĻā§‡āĻ“āĻ¯āĻŧāĻžāĻ° āĻĻā§‡āĻ–āĻ¤ā§‡ āĻĻā§‡āĻ–āĻž āĻĻā§‡āĻ–ā§‡ āĻĻā§‡āĻ¨ āĻĻā§‡āĻ¯āĻŧ āĻĻā§‡āĻļā§‡āĻ° āĻĻā§āĻŦāĻžāĻ°āĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡ āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻĻā§‡ā§Ÿ āĻĻā§‡āĻ“ā§ŸāĻž āĻĻā§‡āĻ“ā§ŸāĻžāĻ° āĻĻāĻŋāĻ¨ āĻĻā§āĻ‡ @@ -32,7 +32,7 @@ STOP_WORDS = set(""" āĻĢāĻ˛ā§‡ āĻĢāĻŋāĻ°ā§‡ āĻĢā§‡āĻ° āĻŦāĻ›āĻ° āĻŦāĻĻāĻ˛ā§‡ āĻŦāĻ°āĻ‚ āĻŦāĻ˛āĻ¤ā§‡ āĻŦāĻ˛āĻ˛ āĻŦāĻ˛āĻ˛ā§‡āĻ¨ āĻŦāĻ˛āĻž āĻŦāĻ˛ā§‡ āĻŦāĻ˛ā§‡āĻ›ā§‡āĻ¨ āĻŦāĻ˛ā§‡āĻ¨ āĻŦāĻ¸ā§‡ āĻŦāĻšā§ āĻŦāĻž āĻŦāĻžāĻĻā§‡ āĻŦāĻžāĻ° āĻŦāĻŋāĻ¨āĻž āĻŦāĻŋāĻ­āĻŋāĻ¨ā§āĻ¨ āĻŦāĻŋāĻļā§‡āĻˇ āĻŦāĻŋāĻˇāĻ¯āĻŧāĻŸāĻŋ āĻŦā§‡āĻļ āĻŦā§āĻ¯āĻŦāĻšāĻžāĻ° āĻŦā§āĻ¯āĻžāĻĒāĻžāĻ°ā§‡ āĻŦāĻ•ā§āĻ¤āĻŦā§āĻ¯ āĻŦāĻ¨ āĻŦā§‡āĻļāĻŋ āĻ­āĻžāĻŦā§‡ āĻ­āĻžāĻŦā§‡āĻ‡ -āĻŽāĻ¤ āĻŽāĻ¤ā§‹ āĻŽāĻ¤ā§‹āĻ‡ āĻŽāĻ§ā§āĻ¯āĻ­āĻžāĻ—ā§‡ āĻŽāĻ§ā§āĻ¯ā§‡ āĻŽāĻ§ā§āĻ¯ā§‡āĻ‡ āĻŽāĻ§ā§āĻ¯ā§‡āĻ“ āĻŽāĻ¨ā§‡ āĻŽāĻžāĻ¤ā§āĻ° āĻŽāĻžāĻ§ā§āĻ¯āĻŽā§‡ āĻŽāĻžāĻ¨ā§āĻˇ āĻŽāĻžāĻ¨ā§āĻˇā§‡āĻ° āĻŽā§‹āĻŸ āĻŽā§‹āĻŸā§‡āĻ‡ +āĻŽāĻ¤ āĻŽāĻ¤ā§‹ āĻŽāĻ¤ā§‹āĻ‡ āĻŽāĻ§ā§āĻ¯āĻ­āĻžāĻ—ā§‡ āĻŽāĻ§ā§āĻ¯ā§‡ āĻŽāĻ§ā§āĻ¯ā§‡āĻ‡ āĻŽāĻ§ā§āĻ¯ā§‡āĻ“ āĻŽāĻ¨ā§‡ āĻŽāĻžāĻ¤ā§āĻ° āĻŽāĻžāĻ§ā§āĻ¯āĻŽā§‡ āĻŽāĻžāĻ¨ā§āĻˇ āĻŽāĻžāĻ¨ā§āĻˇā§‡āĻ° āĻŽā§‹āĻŸ āĻŽā§‹āĻŸā§‡āĻ‡ āĻŽā§‹āĻĻā§‡āĻ° āĻŽā§‹āĻ° āĻ¯āĻ–āĻ¨ āĻ¯āĻ¤ āĻ¯āĻ¤āĻŸāĻž āĻ¯āĻĨā§‡āĻˇā§āĻŸ āĻ¯āĻĻāĻŋ āĻ¯āĻĻāĻŋāĻ“ āĻ¯āĻž āĻ¯āĻžāĻāĻ° āĻ¯āĻžāĻāĻ°āĻž āĻ¯āĻžāĻ“āĻ¯āĻŧāĻž āĻ¯āĻžāĻ“āĻ¯āĻŧāĻžāĻ° āĻ¯āĻžāĻ•ā§‡ āĻ¯āĻžāĻšā§āĻ›ā§‡ āĻ¯āĻžāĻ¤ā§‡ āĻ¯āĻžāĻĻā§‡āĻ° āĻ¯āĻžāĻ¨ āĻ¯āĻžāĻŦā§‡ āĻ¯āĻžāĻ¯āĻŧ āĻ¯āĻžāĻ° āĻ¯āĻžāĻ°āĻž āĻ¯āĻžā§Ÿ āĻ¯āĻŋāĻ¨āĻŋ āĻ¯ā§‡ āĻ¯ā§‡āĻ–āĻžāĻ¨ā§‡ āĻ¯ā§‡āĻ¤ā§‡ 
āĻ¯ā§‡āĻ¨ āĻ¯ā§‡āĻŽāĻ¨ āĻ°āĻ•āĻŽ āĻ°āĻ¯āĻŧā§‡āĻ›ā§‡ āĻ°āĻžāĻ–āĻž āĻ°ā§‡āĻ–ā§‡ āĻ°ā§Ÿā§‡āĻ›ā§‡ From 5c7c08c2e3acf49673d6e3d914b4259328792b12 Mon Sep 17 00:00:00 2001 From: Jeroen Bobbeldijk Date: Sun, 22 Oct 2017 15:35:46 +0200 Subject: [PATCH 182/195] Add myself to contributors --- .github/contributors/jerbob92.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jerbob92.md diff --git a/.github/contributors/jerbob92.md b/.github/contributors/jerbob92.md new file mode 100644 index 000000000..bb0430d14 --- /dev/null +++ b/.github/contributors/jerbob92.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jeroen Bobbeldijk | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22-10-2017 | +| GitHub username | jerbob92 | +| Website (optional) | | From 80a9652617a2cf7b76bd9bfb57f6d60ec393a8d8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 24 Oct 2017 15:48:22 +1100 Subject: [PATCH 183/195] DOC "OP" key in token spec --- website/docs/usage/rule-based-matching.jade | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index db7c70608..cf27a6a94 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -59,6 +59,10 @@ p +h(2, "quantifiers") Using quantifiers +p + | Token specifiers may have quantifiers attached to them, by setting the "OP" key + | to one of the following values: + +table([ "Name", "Description", "Example"]) +row +cell #[code !] @@ -80,10 +84,18 @@ p +cell match 0 or 1 times +cell optional, max one +p + | Thus the following matcher will match "Hello, World" or "Hello world" or "Hello - - world": + ++code. 
+ matcher = Matcher(nlp.vocab) + matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True, 'OP': '*'}, {LOWER: "world"}]) + p | There are no nested or scoped quantifiers. You can build those | behaviours with acceptors and | #[+api("matcher#add_entity") #[code on_match]] callbacks. + | All the operators are greedy: they will match as many tokens as possible. +h(2, "acceptor-functions") Acceptor functions From 1b64a44d85e542c62ddeaf1f89622dcf3fa72229 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 24 Oct 2017 11:48:20 +0200 Subject: [PATCH 184/195] Add dependency patterns example --- examples/dependency_patterns.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 examples/dependency_patterns.py diff --git a/examples/dependency_patterns.py b/examples/dependency_patterns.py new file mode 100644 index 000000000..776e045b7 --- /dev/null +++ b/examples/dependency_patterns.py @@ -0,0 +1,33 @@ +''' +Match a dependency pattern. See https://github.com/explosion/spaCy/pull/1120 + +We start by creating a DependencyTree for the Doc. This class models the document +dependency tree. Then we compile the query into a Pattern using the PatternParser. +The syntax is quite simple: + +we define a node named 'fox', that must match in the dep tree a token +whose orth_ is 'fox'. an anonymous token whose lemma is 'quick' must have fox +as parent, with a dep_ matching the regex am.* another anonymous token whose +orth_ matches the regex brown|yellow has fox as parent, with whathever dep_ +DependencyTree.match returns a list of PatternMatch. Notice that we can assign +names to anonymous or defined nodes ([word:fox]=f). We can get the Token mapped +to the fox node using match['f']. +''' +import spacy +from spacy.pattern import PatternParser, DependencyTree + +nlp = spacy.load('en') +doc = nlp("The quick brown fox jumped over the lazy dog.") +tree = DependencyTree(doc) + +query = """fox [word:fox]=f + [lemma:quick]=q >/am.*/ fox + [word:/brown|yellow/] > fox""" + +pattern = PatternParser.parse(query) +matches = tree.match(pattern) + +assert len(matches) == 1 +match = matches[0] + +assert match['f'] == doc[3] From a2e7e9be9883e34774cf425d76bb2974eb3ed215 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:12:47 +0200 Subject: [PATCH 185/195] Update landing --- website/index.jade | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/website/index.jade b/website/index.jade index df5428316..5ce00e2c1 100644 --- a/website/index.jade +++ b/website/index.jade @@ -33,7 +33,7 @@ include _includes/_mixins | spaCy is designed to help you do real work — to build real | products, or gather real insights. The library respects | your time, and tries to avoid wasting it. It's easy to - | install, and its API is simple and productive. I like to + | install, and its API is simple and productive. We like to | think of spaCy as the Ruby on Rails of Natural Language | Processing. 
@@ -102,7 +102,9 @@ include _includes/_mixins +item GIL-free #[strong multi-threading] +item Efficient binary serialization +item Easy #[strong deep learning] integration - +item Statistical models for #[strong English] and #[strong German] + +item + | Statistical models for #[strong English], + | #[strong German], #[strong French] and #[strong Spanish] +item State-of-the-art speed +item Robust, rigorously evaluated accuracy From fdd8dacb752b718a918908156e5ce316d3f36559 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:13:52 +0200 Subject: [PATCH 186/195] Fix compilation of color utility class names --- website/_harp.json | 2 +- website/assets/css/_base/_utilities.sass | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index 37a0b54dd..5a44f1a81 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -81,7 +81,7 @@ } ], - "V_CSS": "1.7", + "V_CSS": "1.8", "V_JS": "1.2", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index 2c40858a8..49e98064b 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -125,7 +125,7 @@ .u-border-dotted border-top: 1px dotted $color-subtle -@each $name, $color in (theme: $color-theme, subtle: $color-subtle-dark, light: $color-back, red: $color-red, green: $color-green, yellow: $color-yellow) +@each $name, $color in (theme: $color-theme, subtle: $color-subtle-dark, light: $color-back, 'red': $color-red, 'green': $color-green, 'yellow': $color-yellow) .u-color-#{$name} color: $color From 91dbee1b8fc86e1c9678ede0404711142d8ac628 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:17:03 +0200 Subject: [PATCH 187/195] Add BILUO docs to NER annotation scheme --- website/docs/api/annotation.jade | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index d4b01a819..155c4d13b 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -86,6 +86,31 @@ include _annotation/_dep-labels include _annotation/_named-entities + | showed that the minimal #[strong Begin], #[strong In], #[strong Out] + | scheme was more difficult to learn than the #[strong BILUO] scheme that + | we use, which explicitly marks boundary tokens. + ++table(["Tag", "Description"]) + +row + +cell #[code #[span.u-color-theme B] EGIN] + +cell The first token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme I] N] + +cell An inner token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme L] AST] + +cell The final token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme U] NIT] + +cell A single-token entity. + + +row + +cell #[code #[span.u-color-theme O] UT] + +cell A non-entity token. 
+ +h(2, "json-input") JSON input format for training p From 0e081d0167087f9f7f1d768956ccd2feb4a7ce11 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:17:54 +0200 Subject: [PATCH 188/195] Update JSON training format docs (resolves #1291) --- website/docs/api/annotation.jade | 73 ++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 155c4d13b..b5a17de89 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -86,6 +86,25 @@ include _annotation/_dep-labels include _annotation/_named-entities ++h(3, "biluo") BILUO Scheme + +p + | spaCy translates the character offsets into this scheme, in order to + | decide the cost of each action given the current state of the entity + | recogniser. The costs are then used to calculate the gradient of the + | loss, to train the model. The exact algorithm is a pastiche of + | well-known methods, and is not currently described in any single + | publication. The model is a greedy transition-based parser guided by a + | linear model whose weights are learned using the averaged perceptron + | loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] + | imitation learning strategy. The transition system is equivalent to the + | BILOU tagging scheme. + ++aside("Why BILUO, not IOB?") + | There are several coding schemes for encoding entity annotations as + | token tags. These coding schemes are equally expressive, but not + | necessarily equally learnable. + | #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth] | showed that the minimal #[strong Begin], #[strong In], #[strong Out] | scheme was more difficult to learn than the #[strong BILUO] scheme that | we use, which explicitly marks boundary tokens. @@ -114,29 +133,39 @@ include _annotation/_named-entities +h(2, "json-input") JSON input format for training p - | spaCy takes training data in the following format: + | spaCy takes training data in JSON format. The built-in + | #[+a("/docs/usage/cli#convert") #[code convert] command] helps you + | convert the #[code .conllu] format used by the + | #[+a("https://github.com/UniversalDependencies") Universal Dependencies corpora] + | to spaCy's training format. + ++aside("Annotating entities") + | Named entities are provided in the #[+a("#biluo") BILUO] + | notation. Tokens outside an entity are set to #[code "O"] and tokens + | that are part of an entity are set to the entity label, prefixed by the + | BILUO marker. For example #[code "B-ORG"] describes the first token of + | a multi-token #[code ORG] entity and #[code "U-PERSON"] a single + | token representing a #[code PERSON] entity +code("Example structure"). 
- doc: { - id: string, - paragraphs: [{ - raw: string, - sents: [int], - tokens: [{ - start: int, - tag: string, - head: int, - dep: string - }], - ner: [{ - start: int, - end: int, - label: string - }], - brackets: [{ - start: int, - end: int, - label: string + [{ + "id": int, # ID of the document within the corpus + "paragraphs": [{ # list of paragraphs in the corpus + "raw": string, # raw text of the paragraph + "sentences": [{ # list of sentences in the paragraph + "tokens": [{ # list of tokens in the sentence + "id": int, # index of the token in the document + "dep": string, # dependency label + "head": int, # offset of token head relative to token index + "tag": string, # part-of-speech tag + "orth": string, # verbatim text of the token + "ner": string # BILUO label, e.g. "O" or "B-ORG" + }], + "brackets": [{ # phrase structure (NOT USED by current models) + "first": int, # index of first token + "last": int, # index of last token + "label": string # phrase label + }] }] }] - } + }] From 90601cf1b38bcaab121d8c413979c1813e6f4314 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:22:37 +0200 Subject: [PATCH 189/195] Fix formatting --- website/docs/usage/rule-based-matching.jade | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index cf27a6a94..4f8172797 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -23,7 +23,7 @@ p +code. from spacy.matcher import Matcher from spacy.attrs import IS_PUNCT, LOWER - + matcher = Matcher(nlp.vocab) matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}]) @@ -60,8 +60,8 @@ p +h(2, "quantifiers") Using quantifiers p - | Token specifiers may have quantifiers attached to them, by setting the "OP" key - | to one of the following values: + | Token specifiers may have quantifiers attached to them, by setting the + | #[code "OP"] key to one of the following values: +table([ "Name", "Description", "Example"]) +row @@ -85,8 +85,9 @@ p +cell optional, max one p - | Thus the following matcher will match "Hello, World" or "Hello world" or "Hello - - world": - + | Thus the following matcher will match "Hello, World" or "Hello world" or + | "Hello - - world": + +code. matcher = Matcher(nlp.vocab) matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True, 'OP': '*'}, {LOWER: "world"}]) From ebd2e5ff54eb28e276f5ec135e3a81e30cae4cf4 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:22:46 +0200 Subject: [PATCH 190/195] Fix matcher docs (resolves #1453) --- website/docs/usage/rule-based-matching.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index 4f8172797..1be219f0e 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -18,7 +18,7 @@ p Here's a minimal example. We first add a pattern that specifies three tokens: p | Once we've added the pattern, we can use the #[code matcher] as a - | callable, to receive a list of #[code (ent_id, start, end)] tuples. + | callable, to receive a list of #[code (ent_id, label, start, end)] tuples. +code. 
from spacy.matcher import Matcher From b51dcee3ce047410697d4fd0fc2406bcf5021cdd Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:25:49 +0200 Subject: [PATCH 191/195] Fix unicode in lightning tour example (resolves #1356) --- website/docs/usage/lightning-tour.jade | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 2fd390d26..6c98cf3f3 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -58,13 +58,13 @@ p assert token.shape_ == 'Xxxxx' for lexeme in nlp.vocab: if lexeme.is_alpha: - lexeme.shape_ = 'W' + lexeme.shape_ = u'W' elif lexeme.is_digit: - lexeme.shape_ = 'D' + lexeme.shape_ = u'D' elif lexeme.is_punct: - lexeme.shape_ = 'P' + lexeme.shape_ = u'P' else: - lexeme.shape_ = 'M' + lexeme.shape_ = u'M' assert token.shape_ == 'W' +h(2, "examples-numpy-arrays") Export to numpy arrays From dee289613399ea975d57d87c61269506d9db3687 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 24 Oct 2017 21:52:12 +0200 Subject: [PATCH 192/195] Update PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e97a7ea16..feda380e6 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,20 +1,19 @@ - + ## Description - - + +### Types of changes + -## Types of changes - -- [ ] **Bug fix** (non-breaking change fixing an issue) -- [ ] **New feature** (non-breaking change adding functionality to spaCy) -- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) -- [ ] **Documentation** (addition to documentation of spaCy) - -## Checklist: - -- [ ] My change requires a change to spaCy's documentation. -- [ ] I have updated the documentation accordingly. -- [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. +## Checklist + +- [ ] I have submitted the spaCy Contributor Agreement. +- [ ] I ran the tests, and all new and existing tests passed. +- [ ] My changes don't require a change to the documentation, or if they do, I've added all required details. From 3dc3f10a40e494e68b418fbf5d925cc4654cf547 Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Thu, 2 Nov 2017 09:28:26 -0700 Subject: [PATCH 193/195] Contributing agreement - IamJeffG --- .github/contributors/IamJeffG.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/IamJeffG.md diff --git a/.github/contributors/IamJeffG.md b/.github/contributors/IamJeffG.md new file mode 100644 index 000000000..030e711a2 --- /dev/null +++ b/.github/contributors/IamJeffG.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jeffrey Gerard | +| Company name (if applicable) | cephalo-ai / wellio | +| Title or role (if applicable) | Senior Data Scientist| +| Date | 11/02/2017 | +| GitHub username | IamJeffG | +| Website (optional) | | From fcc3b84be576007ecc653445463b11024583f319 Mon Sep 17 00:00:00 2001 From: uwol Date: Sun, 5 Nov 2017 12:47:44 +0100 Subject: [PATCH 194/195] added contributor agreement --- .github/uwol.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/uwol.md diff --git a/.github/uwol.md b/.github/uwol.md new file mode 100644 index 000000000..ddc82d220 --- /dev/null +++ b/.github/uwol.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ulrich Wolffgang | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-11-05 | +| GitHub username | uwol | +| Website (optional) | https://uwol.github.io/ | From 6c477d864b0f46b6101fc9d61abecd50eb922916 Mon Sep 17 00:00:00 2001 From: uwol Date: Sun, 5 Nov 2017 12:49:35 +0100 Subject: [PATCH 195/195] added contributor agreement --- .github/{ => contributors}/uwol.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/{ => contributors}/uwol.md (100%) diff --git a/.github/uwol.md b/.github/contributors/uwol.md similarity index 100% rename from .github/uwol.md rename to .github/contributors/uwol.md
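
As a quick usage sketch of the `Doc.to_array` behaviour introduced in patches 175–180 of this series: the call now accepts string attribute names (upper- or lower-case) as well as integer IDs, and a single attribute returns a 1D array instead of an `(N, 1)` matrix. The snippet below mirrors the tests added in `spacy/tests/doc/test_array.py`; it assumes a spaCy build that already includes these patches and an installed English model loadable as `'en'` (the model name is an assumption — substitute whichever model you have).

```python
# Minimal sketch of Doc.to_array with integer IDs, string names and a scalar attribute.
import spacy
from spacy.attrs import ORTH, SHAPE

nlp = spacy.load('en')                      # assumption: an English model is installed
doc = nlp(u'An example sentence')           # 3 tokens

# Integer attribute IDs, as before: one row per token, one column per attribute.
by_int = doc.to_array([ORTH, SHAPE])        # shape (3, 2), dtype int32

# String names are accepted too; case does not matter.
by_str = doc.to_array(('ORTH', 'shape'))    # same values as by_int

# A single attribute returns a 1D array of length len(doc).
by_scalar = doc.to_array('ORTH')            # shape (3,)

assert by_int.shape == (3, 2)
assert by_scalar.shape == (3,)
assert (by_int[:, 0] == by_str[:, 0]).all()
```

Accepting string names keeps call sites readable without forcing an import of `spacy.attrs`, while the scalar form avoids an awkward `(N, 1)` reshape when only one attribute is needed.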